In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the Kaggle input mount and print its full path,
# so the dataset location is visible in the cell output.
for folder, _, files_here in os.walk('/kaggle/input'):
    for name in files_here:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import plot_confusion_matrix ,accuracy_score

In [None]:
# Load the drug-classification CSV from the Kaggle input directory and preview the first rows.
DATA_PATH='../input/drug-classification/drug200.csv'
df=pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Class balance check: number of rows for each distinct value of the 'Drug' column.
drug_counts=df['Drug'].value_counts()
drug_counts

# Encoding Categorical Variables

In [None]:
# Names of the columns whose dtype is 'object'; these are the ones label-encoded in the next cell.
categorical_cols=[name for name,col_dtype in df.dtypes.items() if col_dtype=='object']
categorical_cols

In [None]:
# Integer-encode every object-dtype column on a copy, leaving `df` untouched.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so the
# code -> original-value mapping of each column stays available afterwards
# (label_encoders[col].classes_ / .inverse_transform(...)). Re-fitting one shared
# encoder would discard every mapping except the last one.
label_encoders={}
labelled_df=df.copy()
for col in categorical_cols:
    label_encoders[col]=LabelEncoder()
    labelled_df[col]=label_encoders[col].fit_transform(df[col])
# Backward compatibility: `label_encoder` remains bound to the encoder fitted on the
# last categorical column (or an unfitted one if there are no categorical columns).
label_encoder=label_encoders[categorical_cols[-1]] if categorical_cols else LabelEncoder()
labelled_df.head()

# Train Test Split

In [None]:
# Separate the target from the predictors: 'Drug' is the label, every other column is a feature.
TARGET='Drug'
y=labelled_df[TARGET]
X=labelled_df.drop(columns=[TARGET])
y.head()

In [None]:
# Standardise the feature columns: fit the scaler on X, then transform the same X.
scaler=StandardScaler()
scaler.fit(X)
feature_set=scaler.transform(X)
feature_set.shape

In [None]:
# Hold out 20% of the rows for validation (fixed seed, so the partition is repeatable).
# The split is made on the unscaled features and the scaler is then fitted on the
# training rows only: fitting StandardScaler on all rows before splitting lets the
# validation rows influence the mean/variance used to scale the training data.
# The globally scaled `feature_set` is therefore intentionally not used here.
X_train_raw,X_val_raw,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=0)
scaler=StandardScaler().fit(X_train_raw)
# transform() returns NumPy arrays, the same type the downstream cells received before.
X_train=scaler.transform(X_train_raw)
X_val=scaler.transform(X_val_raw)
X_train.shape

# K-Nearest Neighbors Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep K = 1 .. Ks-1. For each K, record the validation accuracy and the
# standard error of the per-sample hit indicator (std / sqrt(n)).
Ks=10
k_values=range(1,Ks)
mean_acc=np.zeros(len(k_values))
std_acc=np.zeros(len(k_values))
ConfusionMx=[]
for slot,n in enumerate(k_values):
    # Fit on the training split, predict the validation split.
    neigh=KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train,y_train)
    yhat=neigh.predict(X_val)

    mean_acc[slot]=metrics.accuracy_score(y_val,yhat)
    hits=(yhat==y_val)
    std_acc[slot]=np.std(hits)/np.sqrt(yhat.shape[0])
mean_acc

In [None]:
# Validation accuracy for each K, with a shaded band of one std_acc either side.
k_range=range(1,Ks)
plt.plot(k_range,mean_acc,'g',label='Accuracy')
# The band spans 1 x std_acc, so the legend entry must say 1x (not 3x).
# Labels are attached to the artists themselves so each legend entry is tied to
# the element it describes rather than to drawing order.
plt.fill_between(k_range,mean_acc-1*std_acc,mean_acc+1*std_acc,alpha=0.10,label='+/- 1xstd')
plt.legend()
plt.ylabel('Accuracy')
plt.xlabel('Number of neighbors(K)')
plt.show()

In [None]:
# Report the highest validation accuracy from the sweep and the K that produced it.
# Index 0 of mean_acc corresponds to K = 1, hence the +1.
best_k=mean_acc.argmax()+1
knn_acc=mean_acc.max()
print('The best accuracy = ',knn_acc,'with K = ',best_k)

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings: fit on the training split,
# then measure accuracy on the validation split.
nb=GaussianNB().fit(X_train,y_train)
pred=nb.predict(X_val)
nb_acc=accuracy_score(y_val,pred)
nb_acc

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver, C=0.01): fit on the training split,
# then measure accuracy on the validation split.
lr=LogisticRegression(solver='liblinear',C=0.01).fit(X_train,y_train)
pred=lr.predict(X_val)
lr_acc=accuracy_score(y_val,pred)
lr_acc

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validated search over the tree depth (1..10), run on the training split.
# - criterion='entropy' so the search tunes the same kind of tree that is trained
#   in the following cell (the default criterion is 'gini').
# - random_state is fixed because scikit-learn permutes the candidate features at
#   each split; without a seed the selected depth can differ between runs.
model=DecisionTreeClassifier(criterion='entropy',random_state=0)
parameters=[{'max_depth':list(range(1,11))}]
clf=GridSearchCV(model,parameters,cv=5,scoring='accuracy')
clf.fit(X_train,y_train)
print(clf.best_params_)

**So, max_depth is 4**

In [None]:
# Final decision tree: entropy criterion, depth capped at 4, scored on the validation split.
# random_state is fixed because scikit-learn permutes the candidate features at each
# split, so an unseeded tree (and its accuracy) can change from run to run.
clf=DecisionTreeClassifier(criterion='entropy',max_depth=4,random_state=0)
clf.fit(X_train,y_train)
pred=clf.predict(X_val)
# accuracy_score's signature is (y_true, y_pred).
tree_acc=accuracy_score(y_val,pred)
tree_acc

**The decision tree reaches 100% accuracy on the validation set.**

In [None]:
# Confusion matrix of the fitted tree on the validation split.
# Tick labels are derived from the data instead of being typed by hand: LabelEncoder
# numbers classes in sorted order of the original values, so position i of the sorted
# unique 'Drug' values is the original name of encoded class i.
# Assumes 'Drug' was one of the label-encoded (object-dtype) columns — confirm.
drug_names=sorted(df['Drug'].unique())
cm_kwargs=dict(
    labels=clf.classes_,  # pin the row/column order to the classes the tree knows
    display_labels=[drug_names[code] for code in clf.classes_],
    cmap=plt.cm.Reds,
)
if hasattr(metrics,'ConfusionMatrixDisplay') and hasattr(metrics.ConfusionMatrixDisplay,'from_estimator'):
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement.
    metrics.ConfusionMatrixDisplay.from_estimator(clf,X_val,y_val,**cm_kwargs)
else:
    # Older scikit-learn without from_estimator: fall back to the legacy helper.
    plot_confusion_matrix(clf,X_val,y_val,**cm_kwargs)

# Accuracy Scores

In [None]:
# Gather the validation accuracy of each model into one table and rank them, best first.
model_accuracies={
    'KNearest Neighbors':knn_acc,
    'Gaussian Naive Bayes':nb_acc,
    'Logistic Regression':lr_acc,
    'Decision Tree':tree_acc,
}
scores=pd.DataFrame({
    'Models':list(model_accuracies.keys()),
    'Accuracy':list(model_accuracies.values()),
})
scores.sort_values(by='Accuracy',ascending=False)

**So, we use the Decision Tree classification model to predict drugs, with 100% accuracy on the validation set.**

# Visualization

In [None]:
# Draw the fitted decision tree.
# The tree was trained on a bare NumPy array, so feature and class names are passed
# explicitly; without them the nodes show only generic feature indices and no class name.
# NOTE: split thresholds are in standardised units, because the tree was fitted on
# scaled features.
# Assumes 'Drug' was label-encoded, so encoded class i is the i-th sorted original
# value — confirm.
drug_names=sorted(df['Drug'].unique())
plt.figure(figsize=(14,7))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=[str(drug_names[code]) for code in clf.classes_],
    rounded=True,
    filled=True,
)
# Render the figure and suppress the list-of-annotations repr that plot_tree returns.
plt.show()