## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import plot_tree

## Loading the dataset

In [None]:
df = pd.read_csv('../input/drug-classification/drug200.csv')

## Exploratory Data Analysis

In [None]:
df.head()

In [None]:
df.shape

The dataset has 200 records and 6 attributes.

In [None]:
df.info()

The dataset has 1 float, 1 integer and 4 object columns.

### Attribute Description:-

<ol>
    <li>Age: stores the age of patient</li>
    <li>Sex: stores the gender of patients</li>
    <li>BP: contains blood pressure of patient</li>
    <li>Cholesterol: stores cholesterol of patient</li>
    <li>Na_to_K: contains Sodium to Potassium ratio in blood</li>
    <li>Drug: contains the drug type given to patients</li>
</ol>

In [None]:
# Check for missing values

df.isnull().sum()

The dataset doesn't have any null values.

In [None]:
# Check for duplicates: select the rows that repeat an earlier row

is_repeated = df.duplicated()
duplicate = df.loc[is_repeated]
duplicate

No duplicate records are present.

In [None]:
# Report the lowest and highest value of the Age column

youngest = df['Age'].min()
oldest = df['Age'].max()
print("Minimum Age is {} years".format(youngest))
print("Maximum Age is {} years".format(oldest))

In [None]:
# List the distinct values of every categorical attribute

for column in ['Sex', 'BP', 'Cholesterol', 'Drug']:
    print(column + ":", df[column].unique())

In [None]:
# Count patients per (Sex, Cholesterol) pair and draw them as grouped bars

df_Sex_Cholesterol = (
    df.groupby(["Sex", "Cholesterol"])
      .size()
      .reset_index(name="Count")
)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=df_Sex_Cholesterol, x="Sex", y="Count", hue="Cholesterol", ax=ax)
ax.set_title("Sex - Cholesterol", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b> It is observed that both Female and Male suffer from high cholesterol levels.

In [None]:
# Count patients per (Sex, BP) pair and draw them as grouped bars

df_Sex_BP = (
    df.groupby(["Sex", "BP"])
      .size()
      .reset_index(name="Count")
)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=df_Sex_BP, x="Sex", y="Count", hue="BP", ax=ax)
ax.set_title("Sex - BP", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>Both Female and Male suffer from high blood pressure levels.</li>
    <li>More males suffer from high blood pressure than females.</li>
    <li>Almost the same number of males and females have normal blood pressure.</li>
</ul>    

In [None]:
# Analyze Drug based on Age

plt.figure(figsize = (10,6))
sns.swarmplot(x = "Drug", y = "Age", data = df)
# No legend on purpose: the x tick labels already name each drug.
# plt.legend(df.Drug.value_counts().index) attached the labels in frequency
# order, which is not guaranteed to match the order in which the categories
# are drawn, so the legend could name the swarms incorrectly.
plt.title("Age vs Drug", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>drugA is only given to patients whose age is between 20 to 50 years.</li>
    <li>drugB is only given to patients above 50 years.</li>
    <li>DrugY, drugC and drugX can be given to all age groups.</li>    
</ul>    

In [None]:
# Count patients per (Drug, Sex) pair and draw them as grouped bars

df_Sex_Drug = (
    df.groupby(["Drug", "Sex"])
      .size()
      .reset_index(name="Count")
)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=df_Sex_Drug, x="Drug", y="Count", hue="Sex", ax=ax)
ax.set_title("Sex - Drug", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>DrugY is given to more female patients.</li>
    <li>drugA, drugB and drugC are given to more male patients than female patients.</li>
    <li>drugX is given equally to male and female patients.</li>
</ul>    

In [None]:
# Analyze Drug based on Na_to_K

plt.figure(figsize = (10,6))
sns.swarmplot(x = "Drug", y = "Na_to_K", data = df)
# No legend on purpose: the x tick labels already name each drug.
# plt.legend(df.Drug.value_counts().index) attached the labels in frequency
# order, which is not guaranteed to match the order in which the categories
# are drawn, so the legend could name the swarms incorrectly.
plt.title("Na_to_K vs Drug", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>DrugY is only given to patients whose Na_to_K ratio is greater than 15.</li>
    <li>drugC, drugX, drugA and drugB are given to patients whose Na_to_K ratio is between 5 and 15.</li>    
</ul>    

In [None]:
# Analyze Drug based on Cholesterol

# Stored under its own name: reusing df_Sex_Drug here would silently replace
# the Drug/Sex counts with a table that has nothing to do with Sex.
df_Drug_Cholesterol = df.groupby(["Drug", "Cholesterol"]).size().reset_index(name = "Count")

plt.figure(figsize = (9,5))
sns.barplot(x = "Drug", y = "Count", hue = "Cholesterol", data = df_Drug_Cholesterol)
plt.title("Cholesterol - Drug", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>DrugY and drugA are given to more patients whose Cholesterol level is High.</li>
    <li>drugB can be given to patients whose Cholesterol level is High or Normal.</li> 
    <li>drugC is only given to those patients who have High Cholesterol level.</li>
    <li>drugX is given to more patients whose Cholesterol level is Normal.</li>
</ul>    

In [None]:
# Analyze Drug based on BP

# Stored under its own name: reusing df_Sex_Drug here would silently replace
# an earlier table with counts that have nothing to do with Sex.
df_Drug_BP = df.groupby(["Drug", "BP"]).size().reset_index(name = "Count")

plt.figure(figsize = (9,5))
sns.barplot(x = "Drug", y = "Count", hue = "BP", data = df_Drug_BP)
plt.title("BP - Drug", fontsize=20, fontweight='bold')
plt.legend(loc='upper center')
plt.show()

<b>Observation:-</b>

<ul>
    <li>DrugY is given to more patients whose BP is High.</li>
    <li>drugA and drugB can be given to only those patients whose BP is High.</li> 
    <li>drugC is only given to those patients who have Low BP.</li>
    <li>drugX is given to more patients whose BP is Normal.</li>
</ul>    

In [None]:
# Distribution of Drug

sns.countplot(x='Drug', data=df)
plt.xlabel('Drug')
# Title and plt.show() added to match the other plot cells; without
# plt.show() the cell's last expression echoes a matplotlib Text repr.
plt.title("Distribution of Drug", fontsize=20, fontweight='bold')
plt.show()

<b>Observation:-</b>

<ul>
    <li>DrugY is given to most patients followed by drugX.</li>
    <li>drugA, drugB and drugC are given to fewer patients.</li> 
</ul>    

In [None]:
# Encoding categorical variables

from sklearn.preprocessing import LabelEncoder

def label_encoder(x):
    """Integer-encode column ``x`` of the global ``df`` in place.

    Parameters
    ----------
    x : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    LabelEncoder
        The encoder fitted on the column. Its ``classes_`` attribute and
        ``inverse_transform`` method map the integer codes back to the
        original labels.
    """
    le = LabelEncoder()
    df[x] = le.fit_transform(df[x])
    return le

lables = ['Sex', 'BP', 'Cholesterol', 'Drug']

# Keep every fitted encoder, keyed by column name. Without them the integer
# class labels shown in later reports and confusion matrices cannot be
# translated back to drug names.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes
# carry no ordinal meaning (e.g. for BP) - confirm this is acceptable for
# distance-based models.
encoders = {}
for i in lables:
    encoders[i] = label_encoder(i)

In [None]:
df

Hence, all the categorical attributes are encoded.

## Model

### Splitting the data into training and testing set

In [None]:
X = df.iloc[:, :-1]        # independent variable
y = df.iloc[:, -1]         # dependent variable

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### 1. Random Forest Classifier

In [None]:
# without tuning hyperparameters

# Baseline random forest with default hyperparameters and a fixed seed.
clf1 = RandomForestClassifier(random_state=42)
# 5-fold cross-validation on the training split only; with no `scoring`
# argument the estimator's default scorer is used.
accuracies = cross_val_score(clf1, X_train, y_train, cv=5)
# cross_val_score fits clones, so clf1 itself still has to be fitted
# before it can score or predict on the test split.
clf1.fit(X_train, y_train)

# "Train Score" is the mean of the five cross-validation scores, not the
# score of clf1 on the data it was fitted on.
print("Train Score:", np.mean(accuracies))
print("Test Score:", clf1.score(X_test, y_test))

In [None]:
# Predicting values

y_pred1 = clf1.predict(X_test)
y_pred1

In [None]:
# F1 score

clf1_f1_score = f1_score(y_test, y_pred1, average='weighted')
clf1_f1_score

In [None]:
print(classification_report(y_test, y_pred1))

In [None]:
cm = confusion_matrix(y_test, y_pred1)
cm

In [None]:
# using GridSearchCV

# 99 forest sizes x 2 split criteria, each evaluated with 5-fold CV
# (990 fits plus the final refit), so this cell is the slow one.
grid = {'n_estimators': np.arange(1, 100), 'criterion':['gini','entropy']}

clf2 = RandomForestClassifier(random_state=42)
clf2_cv = GridSearchCV(clf2, grid, cv=5)
clf2_cv.fit(X_train, y_train)

print("Hyperparameters Used:", clf2_cv.best_params_)
# best_score_ is the mean cross-validated score of the selected
# hyperparameters. The previous np.mean(accuracies) re-printed the CV score
# of the untuned clf1, so the tuned model's score was never shown.
print("Train Score:", clf2_cv.best_score_)
print("Test Score:", clf2_cv.score(X_test, y_test))

In [None]:
y_pred2 = clf2_cv.predict(X_test)
y_pred2

In [None]:
# F1 score

clf2_f1_score = f1_score(y_test, y_pred2, average='weighted')
clf2_f1_score

In [None]:
print(classification_report(y_test, y_pred2))

In [None]:
cm = confusion_matrix(y_test, y_pred2)
cm

**Hence, Random Forest Classifier shows a 100% accuracy with and without tuning hyperparameters.**

### 2. Decision Tree Classifier

In [None]:
# Seeded like the random forests above so that Restart & Run All gives the
# same tree (and the same plot) on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
y_pred_dt = dt.predict(X_test)
y_pred_dt

In [None]:
# F1 score

dt_f1_score = f1_score(y_test, y_pred_dt, average='weighted')
dt_f1_score

In [None]:
print(classification_report(y_test, y_pred_dt))

In [None]:
cm = confusion_matrix(y_test, y_pred_dt)
cm

**Hence, Decision Tree Classifier shows a 100% accuracy.**

In [None]:
# Visualising the graph

plt.figure(figsize = (20,10))
# Use the columns of X (what the tree was trained on); df.columns also
# contains the target column 'Drug', which is not a feature. A plain list is
# passed because some scikit-learn releases reject a pandas Index here.
dec_tree = plot_tree(decision_tree=dt, feature_names = list(X.columns), filled = True , precision = 4, rounded = True)
plt.show()

### 3. KNN Classifier

In [None]:
# Search the neighbourhood size with cross-validation.
# NOTE(review): KNN is distance based and the features are on different
# scales (Age in years, Na_to_K ratio, small integer category codes);
# consider standardising the features before fitting.
grid = {'n_neighbors': np.arange(1,100)}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, grid)
knn_cv.fit(X_train, y_train)

print("Hyperparameters Used:", knn_cv.best_params_)
# best_score_ is the mean cross-validated score of the selected
# n_neighbors. The previous np.mean(accuracies) printed the Random Forest
# CV score computed earlier, which has nothing to do with KNN.
print("Train Score:", knn_cv.best_score_)
print("Test Score:", knn_cv.score(X_test, y_test))

In [None]:
y_pred_knn = knn_cv.predict(X_test)
y_pred_knn

In [None]:
# F1 score

knn_f1_score = f1_score(y_test, y_pred_knn, average='weighted')
knn_f1_score

In [None]:
print(classification_report(y_test, y_pred_knn))

In [None]:
# Confusion matrix of the tuned KNN on the test split, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred_knn)

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues', ax=ax)

ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# The test accuracy is put in the title so the figure stands alone
all_sample_title = 'Accuracy Score: {0}'.format(knn_cv.score(X_test, y_test))
ax.set_title(all_sample_title, size=15)

<b>Observation:-</b>

<ul>
    <li>It can be observed that maximum errors have occurred in predicting class label 4.</li>
    <li>Label 3 is also wrongly predicted as class 1 and 2.</li>
</ul>

**K Nearest Neighbors Classifier only gave an accuracy of 64.62% with hyperparameter tuning.**