In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize']=(8,6)

In [None]:
# Read the drug-classification CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/drug-classification/drug200.csv')
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Print the per-column summary of the frame (dtypes and non-null counts)
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Summary statistics, transposed so each described column becomes a row
df.describe().transpose()

In [None]:
# How many rows carry each Drug label
df['Drug'].value_counts()

In [None]:
# Bar chart of the row count for each Drug label
sns.countplot(data=df, x='Drug')

In [None]:
# Histogram of Age (20 bins) with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` is its replacement. stat='density'
# and cut=3 reproduce what distplot drew: a density-normalised histogram and
# a KDE curve that extends past the data range.
sns.histplot(df['Age'], bins=20, kde=True, stat='density', kde_kws=dict(cut=3))

**The `Age` feature appears to follow an approximately normal distribution.**

In [None]:
# Sex: overall counts on the left, counts within each Drug label on the right
fig, (ax_left, ax_right) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
sns.countplot(data=df, x='Sex', ax=ax_left)
ax_left.set_title('Male Female Count')
sns.countplot(data=df, x='Drug', hue='Sex', ax=ax_right)
ax_right.set_title('Male Female Count For Each Drug')

In [None]:
# BP: overall counts on the left, counts within each Drug label on the right
fig, (ax_left, ax_right) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
sns.countplot(data=df, x='BP', ax=ax_left)
ax_left.set_title('BP Count')
sns.countplot(data=df, x='Drug', hue='BP', ax=ax_right)
ax_right.set_title('BP Count For Each Drug')

In [None]:
# Cholesterol: overall counts on the left, counts within each Drug label on the right
fig, (ax_left, ax_right) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
sns.countplot(data=df, x='Cholesterol', ax=ax_left)
ax_left.set_title('Cholesterol Count')
sns.countplot(data=df, x='Drug', hue='Cholesterol', ax=ax_right)
ax_right.set_title('Cholesterol Count For Each Drug')

In [None]:
# Histogram of Na_to_K (40 bins) with a KDE overlay on a fresh 8x6 figure.
# `sns.distplot` is deprecated; `histplot` is its replacement. stat='density'
# and cut=3 reproduce what distplot drew: a density-normalised histogram and
# a KDE curve that extends past the data range.
plt.figure(figsize=(8, 6))
sns.histplot(df['Na_to_K'], bins=40, kde=True, stat='density', kde_kws=dict(cut=3))

**The `Na_to_K` feature follows a skewed distribution.**

In [None]:
# Box plot of Age to look for outliers
df['Age'].plot.box()

In [None]:
# Box plot of Na_to_K to look for outliers
df['Na_to_K'].plot.box()

In [None]:
from scipy import stats

# Standardised (z-score) values of Na_to_K, displayed for inspection.
na_to_k = df['Na_to_K']
value = stats.zscore(na_to_k, axis=0)
value

In [None]:
# Quartiles of Na_to_K. Despite their names, IQR1 and IQR3 hold the 25th and
# 75th percentiles; IQR is the interquartile range between them.
IQR1, IQR3 = np.quantile(df['Na_to_K'], [0.25, 0.75])
IQR = IQR3 - IQR1
IQR

In [None]:
# Rows whose Na_to_K lies above the upper fence (third quartile + 1.5 * IQR).
# Only the upper side is checked here.
upper_fence = IQR3 + 1.5 * IQR
is_high = df['Na_to_K'] > upper_fence
df_outlier = df.loc[is_high]
df_outlier

In [None]:
# Remove the rows flagged in df_outlier.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: once the rows are gone, a second run is a no-op
# instead of raising KeyError for the missing index labels.
df = df.drop(index=df_outlier.index, errors='ignore')
df.shape

In [None]:
# Target is the Drug column; the features are every other column.
y = df['Drug']
X = df.drop(columns='Drug')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype (string) feature column with integer codes,
# using a fresh LabelEncoder for each column.
categorical_features = list(X.select_dtypes('object').columns)
for column in categorical_features:
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column])

In [None]:
# Preview the feature frame after encoding
X.head(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaler is fit on all rows of X here,
# i.e. before any train/test separation has taken place.
feature_scaler = StandardScaler()
X_norm = feature_scaler.fit_transform(X)
X_norm[:5]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
# The split is done on the encoded but unscaled features, and the scaler is
# then fit on the training rows only, so the test set's mean and variance
# never influence the preprocessing (no train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split and predict the test split.
# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the reported metrics, are identical on every Restart & Run All.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
yhat = rfc.predict(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Fraction of test rows whose predicted label matches the true label
test_accuracy = accuracy_score(y_test, yhat)
print('Accuracy Score:', test_accuracy)

In [None]:
# Confusion matrix for the current predictions in yhat (the random forest, when
# cells run in order). Passing an explicit label list fixes the row/column
# order, and the same list is used for the tick labels so each cell can be
# read as (true drug, predicted drug).
class_labels = sorted(y.unique())
cm = confusion_matrix(y_test, yhat, labels=class_labels)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted drug', ylabel='True drug',
       title='Random Forest confusion matrix');

In [None]:
# Text report of per-class metrics for the current predictions
report = classification_report(y_true=y_test, y_pred=yhat)
print(report)

In [None]:
# Weighted-average F1 score for the current predictions
f1_score(y_true=y_test, y_pred=yhat, average='weighted')

In [None]:
from sklearn.svm import SVC

# Fit an RBF-kernel support vector classifier on the training split, then
# overwrite yhat with its predictions for the test split.
svc = SVC(kernel='rbf').fit(X_train, y_train)
yhat = svc.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=yhat)

In [None]:
# Confusion matrix for the current predictions in yhat (the SVC, when cells run
# in order). Passing an explicit label list fixes the row/column order, and
# the same list is used for the tick labels so each cell can be read as
# (true drug, predicted drug).
class_labels = sorted(y.unique())
cm = confusion_matrix(y_test, yhat, labels=class_labels)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted drug', ylabel='True drug',
       title='SVC confusion matrix');

In [None]:
# Text report of per-class metrics for the current predictions
report = classification_report(y_true=y_test, y_pred=yhat)
print(report)

In [None]:
# Weighted-average F1 score for the current predictions
f1_score(y_true=y_test, y_pred=yhat, average='weighted')