In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the drug-classification CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')

# Exploratory Data Analysis

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Preview the first few rows of the raw data.
data.head()

In [None]:
# Count missing values in each column.
data.isnull().sum()

In [None]:
# Repeat the per-column missing-value count using isna().
data.isna().sum()

No missing or null values in data

The dataset has 200 samples (rows) and 6 columns (5 features plus the label)

## Feature analysis

In [None]:
# Summary of the columns: names, non-null counts and dtypes.
data.info()

## Age:
### Age of the patient

In [None]:
# Use the Series' own reductions rather than the Python builtins: they are
# vectorized, skip missing values, and match how the Na_to_K range is computed.
print('Maximum age:', data['Age'].max())
print('Minimum age:', data['Age'].min())

In [None]:
# Distribution of patient age. distplot is deprecated in seaborn; a density-scaled
# histogram with a KDE overlay is the equivalent figure.
sb.histplot(data['Age'], kde=True, stat='density')

Age ranges from 15 to 74

## Sex:
### Sex of the patient

In [None]:
# Number of rows for each Sex value.
data.Sex.value_counts()

In [None]:
# Bar chart of the row count for each Sex value.
sb.countplot(x = data.Sex)

Sex ratio seems to be balanced.

Since Sex is a categorical variable, we convert it to numeric codes using scikit-learn's `LabelEncoder`.

## BP:

### Blood Pressure of patient

In [None]:
# Number of rows for each blood-pressure level.
data.BP.value_counts()

In [None]:
# Bar chart of the row count for each blood-pressure level.
sb.countplot(x = data.BP)

BP ratio seems to be balanced.

Since BP is a categorical variable, we convert it to numeric codes using scikit-learn's `LabelEncoder`.

## Cholesterol:
### Cholesterol of the patient

In [None]:
# Number of rows for each cholesterol level.
data.Cholesterol.value_counts()

In [None]:
# Bar chart of the row count for each cholesterol level.
sb.countplot(x = data.Cholesterol)

Cholesterol ratio seems to be balanced.

Since Cholesterol is a categorical variable, we convert it to numeric codes using scikit-learn's `LabelEncoder`.

## Na_to_K:
### Sodium-Potassium ratio in patient's blood

In [None]:
# Report the extremes of the sodium-to-potassium ratio column.
print("Maximum Sodium-Potassium ratio:", data['Na_to_K'].max())
print("Minimum Sodium-Potassium ratio:", data['Na_to_K'].min())

In [None]:
# Distribution of the sodium-to-potassium ratio. distplot is deprecated in seaborn;
# a density-scaled histogram with a KDE overlay is the equivalent figure.
sb.histplot(data.Na_to_K, kde=True, stat='density')

Sodium-Potassium ratio ranges from 6.269 to 38.247

## Drug:
### Drug administered to the patient

In [None]:
# Number of rows for each drug (the target label).
data.Drug.value_counts()

In [None]:
# Bar chart of the row count for each drug. The column is passed as x= explicitly:
# newer seaborn versions treat the first positional argument as `data`, and the
# other countplot cells in this notebook already use the keyword form.
sb.countplot(x = data.Drug)

Drug is the target column(value) or label.

# Relationship between features and target value:

## Age - Drug

In [None]:
# Age of each patient, grouped by the drug administered.
# No manual legend: the x-axis ticks already name each drug, and a legend built from
# value_counts().index (frequency order) is not guaranteed to match the order in
# which the categories are drawn, so it could attach the wrong name to a colour.
sb.swarmplot(x = "Drug", y = "Age", data = data)
plt.title("Age to Drug")

In [None]:
# Oldest patient given drugA and youngest patient given drugB.
print("Maximum Age for administering Drug A:", data.loc[data.Drug == "drugA", "Age"].max())
print("Minimum Age for administering Drug B:", data.loc[data.Drug == "drugB", "Age"].min())

Drug A is administered to patients below 50 years.

Drug B is administered to patients above 51 years.

## Sex - Drug

In [None]:
# Count rows for every (Drug, Sex) pair, then draw grouped bars per drug.
sex_drug = (
    data.groupby(['Drug', 'Sex'])
        .size()
        .reset_index(name = 'Count')
)
sb.barplot(data = sex_drug, x = 'Drug', y = 'Count', hue = 'Sex').set_title('Sex to Drug')

From this graph, we find Sex is not an important feature for classification.

## BP - Drug

In [None]:
# Count rows for every (Drug, BP) pair, then draw grouped bars per drug.
BP_drug = (
    data.groupby(['Drug', 'BP'])
        .size()
        .reset_index(name = 'Count')
)
sb.barplot(data = BP_drug, x = 'Drug', y = 'Count', hue = 'BP').set_title('BP to Drug')

Drug A and Drug B are administered only to patients who have HIGH blood pressure.

Drug C is administered only for people who have LOW blood pressure.

BP is an important feature for classification.

## Cholesterol - Drug

In [None]:
# Count rows for every (Drug, Cholesterol) pair, then draw grouped bars per drug.
# Stored under its own name so the blood-pressure table (BP_drug) built earlier is
# not silently overwritten with cholesterol counts.
cholesterol_drug = data.groupby(['Drug','Cholesterol']).size().reset_index(name = 'Count')
sb.barplot(x = 'Drug',y = 'Count', hue = 'Cholesterol', data = cholesterol_drug)
plt.title('Cholesterol to Drug')

Drug C is only administered for patients with HIGH Cholesterol.

Cholesterol is an important feature for classifying Drug C.

## Na_to_K - Drug

In [None]:
# Sodium-to-potassium ratio of each patient, grouped by the drug administered.
sb.swarmplot(data = data, x = "Drug", y = "Na_to_K").set_title("Na_to_K - Drug")

In [None]:
# Smallest Na_to_K ratio among rows labelled DrugY; this motivates the threshold feature below.
print("Minimum value of Na_to_K for Drug Y:",data.Na_to_K[data.Drug == "DrugY"].min())

Drug Y is administered to patients whose Na_to_K ratio is greater than 15.

We can create a new feature from this feature for better classification of Drug Y.

# Data preprocessing:

## Feature Engineering

In [None]:
data['Na_to_K>15'] = np.where(data['Na_to_K'] > 15, 1, 0)

## Label Encoding

In [None]:
from sklearn import preprocessing 
# Encoder instance used to turn categorical labels into integer codes.
label_encode = preprocessing.LabelEncoder() 

In [None]:
# Columns to convert into integer codes.
label_encode_list = ['Sex','BP','Cholesterol','Na_to_K>15','Drug']

# Fit a separate encoder for each column and keep it. Re-fitting one shared encoder
# would discard every mapping except the last, leaving no way to translate the
# integer codes back to their original labels (label_encoders[col].classes_).
label_encoders = {}
for col in label_encode_list:
    label_encoders[col] = preprocessing.LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [None]:
# Preview the table after label encoding.
data.head()

# Train-Test split for the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column from the feature columns.
y = data['Drug']
x = data.drop(columns = ['Drug'])

In [None]:
# Shuffled 80/20 hold-out split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    shuffle = True,
    random_state = 42,
)


In [None]:

# Convert the targets to flat 1-D arrays, the shape scikit-learn classifiers expect
# for y (an (n, 1) column vector triggers a DataConversionWarning on fit).
# np.asarray(...) accepts both a Series and an ndarray, so this cell can be re-run
# safely, whereas .values does not exist on an ndarray.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [None]:
# Show the dimensions of each piece of the train/test split.
for caption, part in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(caption, part.shape)

Dataset is split into training and test data in 4:1 ratio

# Model for the data

## KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours model with the library's default hyperparameters;
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier().fit(x_train, y_train)
knnPred = knn.predict(x_test)

In [None]:
# Accuracy of the baseline k-NN model on the held-out test split.
knn.score(x_test,y_test)

In [None]:
# Search space for k-NN: neighbourhood size, Minkowski power (1 or 2) and vote weighting.
grid = dict(
    n_neighbors = np.arange(1,120),
    p = np.arange(1,3),
    weights = ['uniform','distance'],
)
# 5-fold cross-validated grid search on the training split only.
knn_cv = GridSearchCV(estimator = knn, param_grid = grid, cv = 5).fit(x_train, y_train)
knnCvPred = knn_cv.predict(x_test)
knn_cv.score(x_test, y_test)


In [None]:
# Hyperparameter combination selected by the k-NN grid search.
knn_cv.best_params_

## Decision Tree Classifier

In [None]:
from sklearn import tree
# Baseline decision tree using entropy as the split criterion. The seed is fixed so
# the fitted tree (and the GridSearchCV clones derived from this estimator) are
# reproducible across runs, matching the seeded random forest in this notebook.
dt = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 42)
dt.fit(x_train, y_train)
dtPred = dt.predict(x_test)

In [None]:
# Accuracy of the baseline decision tree on the held-out test split.
dt.score(x_test,y_test)

In [None]:
# Search space for the decision tree: split criterion and maximum depth 1-4.
grid = dict(
    criterion = ['gini','entropy'],
    max_depth = np.arange(1,5),
)
# 5-fold cross-validated grid search on the training split only.
dt_cv = GridSearchCV(estimator = dt, param_grid = grid, cv = 5).fit(x_train, y_train)
dtCvPred = dt_cv.predict(x_test)

In [None]:
# Selected decision-tree hyperparameters, followed by the tuned model's test accuracy.
print(dt_cv.best_params_)
dt_cv.score(x_test,y_test)

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# export_graphviz expects class names in ascending order of the encoded target.
# LabelEncoder assigns codes in sorted label order (uppercase sorts before
# lowercase), so code 0..4 correspond to the names in exactly this order.
class_names = ['DrugY','drugA','drugB','drugC','drugX']
feature_names = ['Age','Sex','BP','Cholesterol','Na_to_K','Na_to_K>15']

# Render the baseline decision tree as a Graphviz diagram.
dot_data = export_graphviz(dt, out_file=None, filled=True, rounded=True,
                                feature_names=feature_names,
                                class_names=class_names)
graph = graphviz.Source(dot_data)
graph

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# export_graphviz expects class names in ascending order of the encoded target.
# LabelEncoder assigns codes in sorted label order (uppercase sorts before
# lowercase), so code 0..4 correspond to the names in exactly this order.
class_names = ['DrugY','drugA','drugB','drugC','drugX']
feature_names = ['Age','Sex','BP','Cholesterol','Na_to_K','Na_to_K>15']

# Render the best tree found by the grid search as a Graphviz diagram.
dot_data = export_graphviz(dt_cv.best_estimator_, out_file=None, filled=True, rounded=True,
                                feature_names=feature_names,
                                class_names=class_names)
graph = graphviz.Source(dot_data)
graph

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with a fixed seed; fit() returns the estimator itself,
# so construction and training are chained.
rfc = RandomForestClassifier(random_state = 42).fit(x_train, y_train)
rfcPred = rfc.predict(x_test)

In [None]:
# Mean 5-fold cross-validation score on the training split, then the test-split accuracy.
print(cross_val_score(estimator = rfc, X = x_train, y = y_train, cv = 5).mean())
rfc.score(x_test, y_test)

In [None]:
# Search space for the random forest: 100-900 trees in steps of 100, and the split criterion.
grid = dict(
    n_estimators = np.arange(100,1000,100),
    criterion = ['gini','entropy'],
)

# 5-fold cross-validated grid search on the training split only.
rfc_cv = GridSearchCV(estimator = rfc, param_grid = grid, cv = 5).fit(x_train, y_train)
rfcCvPred = rfc_cv.predict(x_test)

In [None]:
# Best cross-validation score, the hyperparameters that achieved it,
# and the tuned forest's accuracy on the held-out test split.
print(rfc_cv.best_score_)
print(rfc_cv.best_params_)
print(rfc_cv.score(x_test,y_test))

# Performance metrics of Models


## Accuracy


### Without GSCV

In [None]:
# Test-split accuracy of each baseline (untuned) model.
acc_knn = knn.score(x_test,y_test)
acc_dt = dt.score(x_test,y_test)
acc_rfc = rfc.score(x_test,y_test)

In [None]:
# Printed in the order: k-NN, decision tree, random forest.
print(acc_knn,acc_dt,acc_rfc)

### With GSCV

In [None]:
# Test-split accuracy of each grid-search-tuned model.
acc_cv_knn = knn_cv.score(x_test,y_test)
acc_cv_dt = dt_cv.score(x_test,y_test)
acc_cv_rfc = rfc_cv.score(x_test,y_test)

In [None]:
# Printed in the order: k-NN, decision tree, random forest.
print(acc_cv_knn,acc_cv_dt,acc_cv_rfc)

### Confusion Matrix Plot

### Without GSCV


In [None]:
# plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay.from_estimator
# is its replacement and takes the same estimator / X / y / cmap / display_labels arguments.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

# Display labels must follow the encoded class order. LabelEncoder assigns codes in
# sorted label order (uppercase sorts before lowercase), i.e. exactly this order.
class_names = ['DrugY','drugA','drugB','drugC','drugX']

dispKnnConfMat = ConfusionMatrixDisplay.from_estimator(knn, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
dispKnnConfMat.ax_.set_title('Confusion Matrix for Knn')

dispdtConfMat = ConfusionMatrixDisplay.from_estimator(dt, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
dispdtConfMat.ax_.set_title('Confusion Matrix for Decision Tree')

disprfcConfMat = ConfusionMatrixDisplay.from_estimator(rfc, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
disprfcConfMat.ax_.set_title('Confusion Matrix for Random Forest Classifier')
plt.show()

### With GSCV

In [None]:
# plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay.from_estimator
# is its replacement and takes the same estimator / X / y / cmap / display_labels arguments.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

# Display labels must follow the encoded class order. LabelEncoder assigns codes in
# sorted label order (uppercase sorts before lowercase), i.e. exactly this order.
class_names = ['DrugY','drugA','drugB','drugC','drugX']

dispKnnConfMat = ConfusionMatrixDisplay.from_estimator(knn_cv, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
dispKnnConfMat.ax_.set_title('Confusion Matrix for Knn')

dispdtConfMat = ConfusionMatrixDisplay.from_estimator(dt_cv, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
dispdtConfMat.ax_.set_title('Confusion Matrix for Decision Tree')

disprfcConfMat = ConfusionMatrixDisplay.from_estimator(rfc_cv, x_test, y_test, cmap=plt.cm.Blues, display_labels=class_names)
disprfcConfMat.ax_.set_title('Confusion Matrix for Random Forest Classifier')
plt.show()

## Report

### Without GSCV

In [None]:
# target_names must follow the encoded class order. LabelEncoder assigns codes in
# sorted label order (uppercase sorts before lowercase), i.e. exactly this order;
# any other order attributes each row of the report to the wrong drug.
class_names = ['DrugY','drugA','drugB','drugC','drugX']

# Per-class precision/recall/F1 for the baseline (untuned) models.
report_knn = metrics.classification_report(y_test, knnPred,target_names=class_names)
report_dt = metrics.classification_report(y_test, dtPred,target_names=class_names)
report_rfc= metrics.classification_report(y_test, rfcPred,target_names=class_names)
print(report_knn,report_dt,report_rfc,sep = '\n\n')

### With GSCV

In [None]:
# target_names must follow the encoded class order. LabelEncoder assigns codes in
# sorted label order (uppercase sorts before lowercase), i.e. exactly this order;
# any other order attributes each row of the report to the wrong drug.
class_names = ['DrugY','drugA','drugB','drugC','drugX']

# Per-class precision/recall/F1 for the grid-search-tuned models.
report_knn = metrics.classification_report(y_test, knnCvPred,target_names=class_names)
report_dt = metrics.classification_report(y_test, dtCvPred,target_names=class_names)
report_rfc= metrics.classification_report(y_test, rfcCvPred,target_names=class_names)
print(report_knn,report_dt,report_rfc,sep = '\n\n')