In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative path to the drug200 dataset CSV.
DATA_PATH = '../input/drug-classification/drug200.csv'
df = pd.read_csv(DATA_PATH)

**Predicted label:**

- Drug type

**Features:**

- Age of patient
- Sex of patient
- Blood Pressure Levels (BP)
- Cholesterol Levels
- Na to Potassium Ratio

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Summary statistics for the columns pandas treats as numeric.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Distinct raw values of the target column, inspected before it is shortened in the next cell.
df['Drug'].unique()

In [None]:
# Keep only the last character of each drug label.
# The vectorized .str accessor replaces the element-wise lambda: it yields the
# same result for string labels and returns NaN for a missing/non-string entry
# instead of raising.
df['Drug'] = df['Drug'].str[-1]

In [None]:
# Show the target column after trimming each label to its last character.
df['Drug']

In [None]:
# Re-check the frame now that the Drug column has been rewritten.
df.head()

# Exploratory Data Analysis

In [None]:
# Number of rows per drug label.
sns.countplot(data=df, x='Drug')

In [None]:
# Rows per drug label, split into one bar per Sex value.
sns.countplot(data=df, x='Drug', hue='Sex')

In [None]:
# Age distribution for each drug label (horizontal boxes), split by Sex.
sns.boxplot(data=df, x='Age', y='Drug', hue='Sex')

In [None]:
# Number of rows per Cholesterol level.
sns.countplot(data=df, x='Cholesterol')

In [None]:
# Number of rows per blood-pressure level.
sns.countplot(data=df, x='BP')

In [None]:
# Number of rows per Sex value.
sns.countplot(data=df, x='Sex')

In [None]:
# Joint distribution of Age and Na_to_K, drawn as hex bins.
sns.jointplot(
    x='Age',
    y='Na_to_K',
    kind='hex',
    data=df,
)

In [None]:
# Grid of Age-by-Sex box plots: one row per BP level, one column per Cholesterol level.
sns.catplot(
    data=df,
    kind='box',
    x='Sex',
    y='Age',
    row='BP',
    col='Cholesterol',
)

In [None]:
# Age histograms (15 bins, with a KDE overlay), one panel per Sex value.
sns.displot(data=df, x='Age', col='Sex', bins=15, kde=True)

In [None]:
# Age vs. Na_to_K coloured by drug label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
sns.scatterplot(data=df, x='Age', y='Na_to_K', hue='Drug', ax=ax)
# Anchor the legend outside the plotting area so it does not cover points.
ax.legend(bbox_to_anchor=(1.3, 0.7), title='Drug Type')

# Feature Engineering 

In [None]:
# Distinct BP values, listed to decide how to encode the column.
df['BP'].unique()

In [None]:
# Distinct Cholesterol values, listed to decide how to encode the column.
df['Cholesterol'].unique()

### Ordinal Encoding

In [None]:
# Ordinal codes for the blood-pressure levels, in increasing order.
bp_map = dict(LOW=0, NORMAL=1, HIGH=2)

In [None]:
# Swap each BP label for its ordinal code from bp_map; values without a mapping are left as they are.
df['BP'] = df['BP'].replace(to_replace=bp_map)

In [None]:
# Confirm BP now holds the ordinal codes.
df.head()

### One-Hot Encoding

In [None]:
# One-hot encode the nominal columns; drop_first=True omits the first level of each.
categorical_cols = ['Cholesterol', 'Sex']
dummie_df = pd.get_dummies(df[categorical_cols], drop_first=True)

In [None]:
# Inspect the generated indicator columns.
dummie_df

In [None]:
# Append the indicator columns to the right of the existing frame (aligned on the index).
df = pd.concat(objs=[df, dummie_df], axis='columns')

In [None]:
# Check that the indicator columns were appended.
df.head()

In [None]:
# The original text columns are redundant now that their indicator columns exist.
df = df.drop(columns=['Sex', 'Cholesterol'])

In [None]:
# Check the frame after dropping the raw Sex and Cholesterol columns.
df.head()

In [None]:
# Put the features first and the Drug target last.
column_order = ['Age', 'Sex_M', 'BP', 'Cholesterol_NORMAL', 'Na_to_K', 'Drug']
df = df[column_order]

In [None]:
# Check the new column order.
df.head()

In [None]:
# Give the two indicator columns friendlier names; every other column keeps its name.
df = df.rename(columns={'Sex_M': 'Sex_Male', 'Cholesterol_NORMAL': 'Cholesterol_Normal'})

In [None]:
# Final look at the model-ready frame.
df.head()

# Random Forests Using GridSearchCV and Parameter Search

In [None]:
# Feature matrix: every column except the Drug target.
X = df.drop(columns='Drug')

In [None]:
# Target vector: the Drug label for each row.
y = df.loc[:, 'Drug']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=101,
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the forest's n_estimators hyperparameter in the grid search.
n_estimators = [64,100,128,200]

In [None]:
# Candidate values for the forest's max_features hyperparameter in the grid search.
max_features = [2,3,4,5]

In [None]:
# Try the forest both with and without bootstrap sampling.
bootstrap = [True,False]

In [None]:
# Hyperparameter grid handed to GridSearchCV: every combination of these lists is tried.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    bootstrap=bootstrap,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator for the grid search. Seeded so the search gives the same result on
# every Restart & Run All; 101 matches the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=101)

In [None]:
# Cross-validated search over param_grid using the random-forest base estimator.
grid = GridSearchCV(estimator=rfc, param_grid=param_grid)

In [None]:
# Run the search on the training split only; the test split stays untouched until evaluation.
grid.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the search.
grid.best_params_

In [None]:
# Predict drug labels for the held-out rows with the fitted search object.
predictions = grid.predict(X_test)

In [None]:
# Predicted labels for the test split.
predictions

In [None]:
# True labels for the test split, for eyeball comparison with the predictions.
y_test

# Performance Evaluation

In [None]:
from sklearn.metrics import classification_report,accuracy_score

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Importing it on one line with the other metrics made the whole import fail,
    # so classification_report and accuracy_score were lost too. Provide the same
    # name on top of the replacement API so the cells that call it keep working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw a confusion matrix for a fitted estimator.

        Forwards to ConfusionMatrixDisplay.from_estimator and returns the
        resulting display object; extra keyword arguments are passed through.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Confusion matrix of the fitted search object on the held-out test split.
plot_confusion_matrix(grid,X_test,y_test)

In [None]:
# Per-class precision/recall/F1 on the test split; printed so the text table keeps its layout.
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Sweep the number of trees from 1 to 199 and record, for each forest size,
# the test error rate and the raw count of misclassified test rows.
# NOTE(review): this scores on the test split, so treat the curve as a
# diagnostic only rather than a way to choose n_estimators.
errors = []
misclassifications = []

for n in range(1,200):
    # Fixed seed (same value as the train/test split) so the curve is identical
    # on every run; unseeded forests make the curve jitter between runs.
    rfc = RandomForestClassifier(n_estimators=n,max_features=2,random_state=101)
    rfc.fit(X_train,y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test,preds)
    n_missed = np.sum(preds != y_test)

    errors.append(err)
    misclassifications.append(n_missed)

In [None]:
# Test error rate against forest size. The x values are derived from the number
# of recorded results (1..len(errors)) rather than repeating the sweep's range.
plt.plot(range(1, len(errors) + 1), errors)
plt.xlabel('Number of trees (n_estimators)')
plt.ylabel('Test error rate (1 - accuracy)');

In [None]:
# Misclassified test rows against forest size. The x values are derived from the
# number of recorded results rather than repeating the sweep's range.
plt.plot(range(1, len(misclassifications) + 1), misclassifications)
plt.xlabel('Number of trees (n_estimators)')
plt.ylabel('Misclassified test rows');

In [None]:
# Performance starts to level off at around 13 decision trees