In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the drug-classification CSV inside the Kaggle input directory.
csv_path = '../input/drug-classification/drug200.csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
summary_stats

In [None]:
# Age overview: overall distribution (left) and spread per drug (right).
# One row, two columns. Note that plt.subplots(12, ...) would instead build
# twelve stacked axes, which then sit underneath the two panels.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(14, 6))

# sns.distplot is deprecated in recent seaborn releases; it is kept here so the
# histogram + KDE rendering stays the same.
sns.distplot(df['Age'], color='pink', ax=ax_dist)
ax_dist.set_xlim([10, 80])
ax_dist.set_title('Age Distribution of Respondents')

sns.boxplot(y='Age', x='Drug', data=df, ax=ax_box)
ax_box.set_title('Distribution of Age by Drug')

# Remove the top/right spines from both panels at once.
sns.despine(fig=fig)

# The majority of respondents are between the ages of 35 and 40. Moreover, comparing the different drugs, Drug A appears more suitable for younger people, whereas Drug B is more appropriate for older people.

In [None]:
# Gender overview: share of each gender (left) and, within each gender,
# the proportion of patients on each drug (right).
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(14, 6))

df['Sex'].value_counts().plot(kind='pie', ax=ax_pie)
ax_pie.set_title('Gender Distribution')

# normalize='columns' makes each gender's column sum to 1 across the drugs.
drug_by_sex = pd.crosstab(index=df['Drug'], columns=df['Sex'], normalize='columns')
# ax= is required: DataFrame.plot opens a brand-new figure when no axes is
# given, which would leave the right-hand panel empty.
drug_by_sex.plot(kind='bar', ax=ax_bar)
ax_bar.set_title('Drug Distribution by Gender')



# There is some correlation with gender: females tend to prefer Drug Y, while males tend to prefer Drug B.

In [None]:
# Blood-pressure overview: share of each BP level (left) and, within each
# BP level, the proportion of patients on each drug (right).
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(14, 6))

df['BP'].value_counts().plot(kind='pie', ax=ax_pie)
ax_pie.set_title('BP Distribution')

# normalize='columns' makes each BP level's column sum to 1 across the drugs.
drug_by_bp = pd.crosstab(index=df['Drug'], columns=df['BP'], normalize='columns')
# ax= is required: DataFrame.plot opens a brand-new figure when no axes is
# given, which would leave the right-hand panel empty.
drug_by_bp.plot(kind='barh', ax=ax_bar)
ax_bar.set_title('Drug Distribution by BP')


In [None]:
# Na_to_K overview: overall distribution (left) and spread per drug (right).
# One row, two columns. Note that plt.subplots(12, ...) would instead build
# twelve stacked axes, which then sit underneath the two panels.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(14, 6))

# sns.distplot is deprecated in recent seaborn releases; it is kept here so the
# histogram + KDE rendering stays the same.
sns.distplot(df['Na_to_K'], color='blue', ax=ax_dist)
ax_dist.set_xlim([0, 40])
ax_dist.set_title('Na_to_K Distribution of Respondents')

sns.boxplot(y='Na_to_K', x='Drug', data=df, ax=ax_box)
ax_box.set_title('Distribution of NA_K by Drug')

# Remove the top/right spines from both panels at once.
sns.despine(fig=fig)

# Data Preprocessing

In [None]:
# Convert the target to a categorical dtype and keep its integer codes in a
# separate column; the codes are what the classifiers are trained on.
drug_as_category = df['Drug'].astype('category')
df['Drug'] = drug_as_category
df['Drug_codes'] = drug_as_category.cat.codes

In [None]:
# Preview the frame with the new Drug_codes column instead of dumping all rows.
df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns to expand into indicator (dummy) columns.
cat_variables = ['Sex', 'BP', 'Cholesterol']

# drop_first=True drops the first level of each variable, so k levels
# become k-1 indicator columns.
df = pd.get_dummies(data=df, columns=cat_variables, drop_first=True)


In [None]:
# Preview the frame with the new indicator columns instead of dumping all rows.
df.head()

# Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
# Features are every column except the target in its two encodings.
X = df.drop(['Drug_codes', 'Drug'], axis=1)
y = df['Drug_codes']

# random_state makes the split (and every metric below) reproducible across
# kernel restarts; stratify=y keeps the class proportions the same in train
# and test so that no drug class goes missing from either side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Baseline multiclass logistic regression.
# The features are not scaled, so the solver can need more than the default
# 100 iterations to converge; a higher cap avoids stopping early.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [None]:
# Per-class precision, recall and F1 for the predictions currently in y_pred.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Tick labels come from the categorical target, so they always follow the same
# order as the integer codes stored in Drug_codes.
drug_labels = list(df['Drug'].cat.categories)

# labels= pins the matrix to one row/column per drug class, even when a class
# is absent from y_test and y_pred, so the tick labels cannot shift.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(drug_labels))))

ax = sns.heatmap(cm, annot=True, cmap='viridis', xticklabels=drug_labels, yticklabels=drug_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted drug')
ax.set_ylabel('Actual drug')
ax.set_title('Confusion Matrix: Logistic Regression');


In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; fit() returns the estimator,
# so construction and training can be chained.
svc = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svc.predict(X_test)

svc_report = classification_report(y_test, y_pred)
print(svc_report)

In [None]:
# Tick labels come from the categorical target, so they always follow the same
# order as the integer codes stored in Drug_codes.
drug_labels = list(df['Drug'].cat.categories)

# labels= pins the matrix to one row/column per drug class, even when a class
# is absent from y_test and y_pred, so the tick labels cannot shift.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(drug_labels))))

ax = sns.heatmap(cm, annot=True, cmap='viridis', xticklabels=drug_labels, yticklabels=drug_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted drug')
ax.set_ylabel('Actual drug')
ax.set_title('Confusion Matrix: Linear SVC');


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Features are randomly permuted at each split, so a fixed random_state is
# needed for the fitted tree (and its feature importances) to be reproducible.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X_train, y_train)
y_pred = dc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Tick labels come from the categorical target, so they always follow the same
# order as the integer codes stored in Drug_codes.
drug_labels = list(df['Drug'].cat.categories)

# labels= pins the matrix to one row/column per drug class, even when a class
# is absent from y_test and y_pred, so the tick labels cannot shift.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(drug_labels))))

ax = sns.heatmap(cm, annot=True, cmap='viridis', xticklabels=drug_labels, yticklabels=drug_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted drug')
ax.set_ylabel('Actual drug')
ax.set_title('Confusion Matrix: Decision Tree');


In [None]:
# Feature importances of the fitted decision tree.
imp = dc.feature_importances_
# Take the names from the training columns so that each importance is always
# paired with the feature it belongs to, whatever the column order or count.
features = list(X.columns)

ax = pd.Series(imp, index=features).sort_values(ascending=False).plot(kind='barh')
ax.set_xlabel('Importance')
ax.set_title('Decision Tree Feature Importances');

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bootstrapping and per-split feature sampling are random, so a fixed
# random_state is needed for the forest and its metrics to be reproducible.
rc = RandomForestClassifier(random_state=42)
rc.fit(X_train, y_train)
y_pred = rc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the fitted random forest (rc), not the decision
# tree (dc) from the earlier cell.
imp = rc.feature_importances_
# Take the names from the training columns so that each importance is always
# paired with the feature it belongs to, whatever the column order or count.
features = list(X.columns)

ax = pd.Series(imp, index=features).sort_values(ascending=False).plot(kind='barh')
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importances');

In [None]:
# Tick labels come from the categorical target, so they always follow the same
# order as the integer codes stored in Drug_codes.
drug_labels = list(df['Drug'].cat.categories)

# labels= pins the matrix to one row/column per drug class, even when a class
# is absent from y_test and y_pred, so the tick labels cannot shift.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(drug_labels))))

ax = sns.heatmap(cm, annot=True, cmap='viridis', xticklabels=drug_labels, yticklabels=drug_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted drug')
ax.set_ylabel('Actual drug')
ax.set_title('Confusion Matrix: Random Forest');
