In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
import pickle
from sklearn.linear_model import ElasticNet
def train_models_and_save(df_drug):
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()

    # Encode categorical features
    categorical_features = [feature for feature in df_drug.columns if df_drug[feature].dtypes == 'O']
    for feature in categorical_features:
        df_drug[feature] = label_encoder.fit_transform(df_drug[feature])

    # Define X and y
    X = df_drug.drop("Drug", axis=1)
    y = df_drug["Drug"]

    # Train Decision Tree model
    dt_model = DecisionTreeClassifier(criterion="entropy")
    dt_model.fit(X, y)

    # Train SVM model
    svm_model = SVC(kernel='rbf')
    svm_model.fit(X, y)

    # Train Logistic Regression model
    lr_model = LogisticRegression()
    lr_model.fit(X, y)

    # Save models to disk
    with open('dt_model.pkl', 'wb') as dt_file:
        pickle.dump(dt_model, dt_file)

    with open('svm_model.pkl', 'wb') as svm_file:
        pickle.dump(svm_model, svm_file)

    with open('lr_model.pkl', 'wb') as lr_file:
        pickle.dump(lr_model, lr_file)

def predict_drug_with_models(Age, Sex, BP, Cholesterol, Na_to_K):
    # Load models from disk
    with open('dt_model.pkl', 'rb') as dt_file:
        dt_model = pickle.load(dt_file)

    with open('svm_model.pkl', 'rb') as svm_file:
        svm_model = pickle.load(svm_file)

    with open('lr_model.pkl', 'rb') as lr_file:
        lr_model = pickle.load(lr_file)

    # Transform categorical variables to numerical values
    Sex = gender_map[Sex]
    BP = bp_map[BP]
    Cholesterol = cholestol_map[Cholesterol]

    # Make predictions using all models
    dt_prediction = dt_model.predict([[Age, Sex, BP, Cholesterol, Na_to_K]])[0]
    svm_prediction = svm_model.predict([[Age, Sex, BP, Cholesterol, Na_to_K]])[0]
    lr_prediction = lr_model.predict([[Age, Sex, BP, Cholesterol, Na_to_K]])[0]

    # Map numerical predictions to drug names
    dt_drug = drug_map[dt_prediction]
    svm_drug = drug_map[svm_prediction]
    lr_drug = drug_map[lr_prediction]

    return dt_drug, svm_drug, lr_drug

def calculate_accuracy(model, X, y):
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
    return cv_results.mean(), cv_results.std()

# Load the dataset
df_drug = pd.read_csv("/content/drug200.csv")

# Train models and save to disk
train_models_and_save(df_drug)

# Example usage
predicted_drug_dt, predicted_drug_svm, predicted_drug_lr = predict_drug_with_models(54, "M", "LOW", "NORMAL", 14)
print("Predicted drug using Decision Tree:", predicted_drug_dt)
print("Predicted drug using SVM:", predicted_drug_svm)
print("Predicted drug using Logistic Regression:", predicted_drug_lr)

# Calculate accuracy for Decision Tree model
X = df_drug.drop("Drug", axis=1)
y = df_drug["Drug"]
dt_model = DecisionTreeClassifier(criterion="entropy")
dt_accuracy_mean, dt_accuracy_std = calculate_accuracy(dt_model, X, y)
print("Accuracy using Decision Tree:", dt_accuracy_mean, dt_accuracy_std)

# Calculate accuracy for SVM model
svm_model = SVC(kernel='rbf')
svm_accuracy_mean, svm_accuracy_std = calculate_accuracy(svm_model, X, y)
print("Accuracy using SVM:", svm_accuracy_mean, svm_accuracy_std)

# Calculate accuracy for Logistic Regression model
lr_model = LogisticRegression()
lr_accuracy_mean, lr_accuracy_std = calculate_accuracy(lr_model, X, y)
print("Accuracy using Logistic Regression:", lr_accuracy_mean, lr_accuracy_std)


#Naive bayes classifer implemented
# Initialize the Naive Bayes classifier (Gaussian Naive Bayes for continuous features)
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X,y)

# Predictions
y_pred = nb_classifier.predict(X)

# Model evaluation
accuracy = accuracy_score(y,y_pred)
print("Accuracy using Naive Bayes Classfier:", accuracy)






STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Predicted drug using Decision Tree: drugX
Predicted drug using SVM: drugX
Predicted drug using Logistic Regression: drugX
Accuracy using Decision Tree: 0.99 0.012247448713915901
Accuracy using SVM: 0.705 0.07314369419163898
Accuracy using Logistic Regression: 0.86 0.046368092477478536
Accuracy using Naive Bayes Classfier: 0.865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
dataset = pd.read_csv("/content/drug200.csv")
datah = dataset.replace('?')
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
datahNew = datah.dropna()
datahNew

x = datahNew[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']]
y = datahNew['Drug']

print (pd.unique(x['Sex']))
print (pd.unique(x['BP']))
print (pd.unique(x['Cholesterol']))

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
x['Sex'] = le.fit_transform(x['Sex'])
x['BP'] = le.fit_transform(x['BP'])
x['Cholesterol'] = le.fit_transform(x['Cholesterol'])

x.shape

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25, random_state = 0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 2,metric = 'euclidean',p=2)
classifier.fit(x_train, y_train)


import matplotlib.pyplot as plt
neighbors = np.arange(1, 10)
train_accuracy = np.empty(len(neighbors))
test_accuracy =np.empty(len(neighbors))
#Loop over x values

for i, k in enumerate (neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)

#Compute traning and test data accuracy

train_accuracy[1]=  knn.score(x_train, y_train)
test_accuracy[1] = knn.score(x_test, y_test)

error_rate = []
for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(x_train,y_train)
    pred_i= knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))
print("Minimum error:-",min(error_rate),"at k=",error_rate.index(min(error_rate)))

acc = []
from sklearn import metrics
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors=i).fit(x_train,y_train)
    yhat = neigh.predict(x_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
print("Maximum accuracy:",max(acc), "at K", acc.index(max(acc)))

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2, metric="euclidean", p=2)
classifier.fit(x_train,y_train)
KNeighborsClassifier(metric="euclidean", n_neighbors=2)

y_pred = classifier.predict(x_test)
df_pred = pd.DataFrame(y_pred)
df_pred
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
df_cm=pd.DataFrame(cm)
df_cm
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
from sklearn.metrics import accuracy_score
print("Accuracy using KNN:-",accuracy_score(y_test,y_pred))

['F' 'M']
['HIGH' 'LOW' 'NORMAL']
['HIGH' 'NORMAL']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Sex'] = le.fit_transform(x['Sex'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['BP'] = le.fit_transform(x['BP'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Cholesterol'] = le.fit_transform(x['Cholesterol'])


Minimum error:- 0.08 at k= 0
Maximum accuracy: 0.92 at K 0
              precision    recall  f1-score   support

       DrugY       0.96      0.96      0.96        25
       drugA       0.83      1.00      0.91         5
       drugB       0.33      1.00      0.50         1
       drugC       1.00      1.00      1.00         3
       drugX       1.00      0.81      0.90        16

    accuracy                           0.92        50
   macro avg       0.83      0.95      0.85        50
weighted avg       0.95      0.92      0.93        50

Accuracy using KNN:- 0.92


In [None]:
from sklearn.ensemble import RandomForestClassifier

RFclassifier = RandomForestClassifier(max_leaf_nodes=30)
RFclassifier.fit(x_train, y_train)

y_pred = RFclassifier.predict(x_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import accuracy_score
RFAcc = accuracy_score(y_pred,y_test)
print('Random Forest accuracy is: {:.2f}%'.format(RFAcc*100))


#ridge regression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score


# Split the dataset into training and test sets


# Define and train the Ridge Classifier model
ridge_clf = make_pipeline(StandardScaler(), RidgeClassifier(alpha=1.0))
ridge_clf.fit(x_train, y_train)

# Predict drug activity/toxicity on the test set
y_pred = ridge_clf.predict(x_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using RidgeClassifier:",accuracy)



              precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        25
       drugA       1.00      0.80      0.89         5
       drugB       0.50      1.00      0.67         1
       drugC       1.00      1.00      1.00         3
       drugX       1.00      1.00      1.00        16

    accuracy                           0.98        50
   macro avg       0.90      0.96      0.91        50
weighted avg       0.99      0.98      0.98        50

[[25  0  0  0  0]
 [ 0  4  1  0  0]
 [ 0  0  1  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0 16]]
Random Forest accuracy is: 98.00%
Accuracy using RidgeClassifier: 0.84
