# AIM: CLASSIFY DRUGS

STEPS:
* 1) EXPLORATORY DATA ANALYSIS
* 2) FEATURE SELECTION
* 3) FEATURE SCALING
* 4) MODEL BUILDING



# MODEL SUMMARY

* DECISION TREE - 100% ACCURATE ON TEST SET
* NAIVE BAYES - 100% ACCURATE ON TEST SET
* RANDOM FOREST - 100% ACCURATE ON TEST SET

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the drug200 CSV from the read-only "../input/" directory into a DataFrame.
df = pd.read_csv("../input/drug-classification/drug200.csv")

# EXPLORATORY DATA ANALYSIS AND FEATURE ENGINEERING

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Check the number of rows and columns.
df.shape

In [None]:
# Summary statistics for the numerical variables.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Check the data type of each column.
df.dtypes

In [None]:
# Collect the names of the numerical (non-object dtype) columns.
num_vars = [name for name, dtype in df.dtypes.items() if dtype != 'O']
num_vars

In [None]:
# Collect the names of the categorical (object dtype) columns.
cat_vars = [name for name, dtype in df.dtypes.items() if dtype == 'O']
cat_vars

In [None]:
# Show the distinct labels present in each categorical column.
for unique_labels in (df[name].unique() for name in cat_vars):
    print(unique_labels)

In [None]:
# Distribution of Age, with a kernel density estimate overlaid.
sns.displot(data=df, x='Age', kde=True)
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [None]:
# How does age relate to the type of drug people take?
sns.barplot(data=df, x="Drug", y="Age")

In [None]:
# How does the sodium-to-potassium level relate to the kind of drug?
sns.barplot(data=df, x='Drug', y='Na_to_K')

In [None]:
# Evaluate the distribution of Na_to_K.
sns.histplot(data=df, x='Na_to_K', color='yellow')

In [None]:
# How does age relate to Na_to_K levels?
sns.scatterplot(data=df, x='Age', y='Na_to_K')

In [None]:
# Count the rows for every (Drug, Cholesterol) combination.
cholesterol_drug = (
    df.groupby(["Drug", "Cholesterol"])
    .size()
    .rename("Count")
    .reset_index()
)
cholesterol_drug

In [None]:
# Plot how many records fall into each Drug/Cholesterol pair.
# `hue` keeps the cholesterol levels as separate bars; without it seaborn
# collapses them into one averaged bar per drug, hiding the relationship
# this chart is meant to show. The y-axis holds counts, so label it as such.
sns.barplot(data=cholesterol_drug, x='Drug', y='Count', hue='Cholesterol')
plt.ylabel('Count')

In [None]:
# Na_to_K per drug, split by Sex.
ax = sns.barplot(
    data=df,
    x="Drug",
    y="Na_to_K",
    hue="Sex",
)

In [None]:
# Age for each BP category.
ax = sns.barplot(
    data=df,
    x="BP",
    y="Age",
)

In [None]:
# Number of rows recorded for each drug.
df['Drug'].value_counts()

In [None]:
# Visualise the per-drug row counts.
drug_counts = df['Drug'].value_counts()
sns.barplot(x=drug_counts.index, y=drug_counts)

In [None]:
# Add a binary feature: 1 where Na_to_K is at least 15.015, otherwise 0.
df['na/ka>15'] = (df['Na_to_K'] >= 15.015).astype('int64')

In [None]:
# Encode Cholesterol as a number (HIGH -> 1, NORMAL -> 0).
CH_mapping = {
    'HIGH': 1,
    'NORMAL': 0,
}
df['Cholesterol'] = df['Cholesterol'].map(CH_mapping)

In [None]:
# Encode BP as an ordered number (LOW -> 0, NORMAL -> 1, HIGH -> 2).
BP_mapping = {
    'HIGH': 2,
    'LOW': 0,
    'NORMAL': 1,
}
df['BP'] = df.BP.map(BP_mapping)

In [None]:
# Encode the drug labels as integer class ids.
drug_mapping = {
    'DrugY': 0,
    'drugC': 2,
    'drugX': 1,
    'drugA': 3,
    'drugB': 4,
}
df['Drug'] = df['Drug'].map(drug_mapping)

In [None]:
# Convert the remaining non-numeric columns to indicator columns with
# get_dummies; drop_first=True drops the first level of each encoded column.
df = pd.get_dummies(df,drop_first=True)

# FEATURE SELECTION AND SCALING

In [None]:
# Check how the columns correlate with each other and with the target.
corr_table = df.corr()
matrix_corr = corr_table.index
plt.figure(figsize=(20, 26))
sns.heatmap(corr_table, annot=True, cmap='YlGn_r')

In [None]:
# Divide into independent (feature) and dependent (target) variables.
# 'Drug' is the target and must NOT be among the features: including it lets
# every model read the answer directly (target leakage), which makes the
# reported test accuracy meaningless.
feature_cols = ['Age', 'BP', 'Cholesterol', 'Na_to_K', 'na/ka>15', 'Sex_M']
x = df[feature_cols]
y = df['Drug']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Standardise the features: fit the scaler on the training split only, then
# apply that same transformation to both the training and the test split.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)

# MODEL BUILDING

DECISION TREE MODEL

In [None]:
# Create the decision tree classifier; random_state is fixed for reproducibility.
clf = tree.DecisionTreeClassifier(random_state=4)

In [None]:
# Fit the decision tree on the scaled training data.
clf.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict the labels of the scaled test data with the decision tree.
y_pred_t = clf.predict(X=x_test_scaled)

In [None]:
# Report the decision tree's test-set accuracy as a percentage.
tree_accuracy_pct = accuracy_score(y_test, y_pred_t) * 100
print(tree_accuracy_pct, "% accuracy")

In [None]:
# Evaluate the decision tree: per-class precision, recall and F1 on the test set.
tree_report = classification_report(y_true=y_test, y_pred=y_pred_t)
print(tree_report)

# RANDOM FOREST

In [None]:
# Initialise the random forest classifier; random_state is fixed for reproducibility.
clf = RandomForestClassifier(random_state=0)

In [None]:
# Fit the random forest on the scaled training data.
clf.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict the labels of the scaled test data with the random forest.
pred_rf = clf.predict(X=x_test_scaled)

In [None]:
# Per-class precision, recall and F1 of the untuned random forest on the test set.
rf_report = classification_report(y_true=y_test, y_pred=pred_rf)
print(rf_report)

In [None]:
# Candidate hyper-parameter values for the random forest grid search.

# Split-quality criterion.
criterion = ['gini', 'entropy']
# Number of features considered at each split. 'auto' is intentionally not
# used: it was deprecated in scikit-learn 1.1 and removed in 1.3, and for
# classifiers it was identical to 'sqrt'. None means "use all features".
max_features = ['sqrt', None]
# Maximum depth of each tree.
max_depth = [1, 2, 5, 10, 15, 20, 25, 30]
# Minimum number of samples required to split a node.
min_samples_split = [5, 10, 15, 20, 25, 60, 100]
# Minimum number of samples required in each leaf.
min_samples_leaf = [1, 2, 3, 5, 12]

In [None]:
# Parameter grid: estimator argument name -> list of candidate values.
params_dict = dict(
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Exhaustive 3-fold cross-validated search over params_dict, using all CPU cores.
model_grid = GridSearchCV(
    estimator=clf,
    param_grid=params_dict,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the scaled training data.
model_grid.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict the test labels with the grid-searched random forest.
pred_rf = model_grid.predict(X=x_test_scaled)

In [None]:
# Per-class precision, recall and F1 of the tuned random forest on the test set.
tuned_rf_report = classification_report(y_true=y_test, y_pred=pred_rf)
print(tuned_rf_report)

# NAIVE BAYES

In [None]:
# Instantiate the Gaussian naive Bayes classifier with its default settings.
clf = GaussianNB()

In [None]:
# Fit naive Bayes on the scaled training data.
clf.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict the labels of the scaled test data with naive Bayes.
y_pred_n = clf.predict(X=x_test_scaled)

In [None]:
# Per-class precision, recall and F1 of naive Bayes on the test set.
nb_report = classification_report(y_true=y_test, y_pred=y_pred_n)
print(nb_report)

# CONCLUSION

DECISION TREE AND NAIVE BAYES GAVE ME 100% ACCURACY, BUT MY RANDOM FOREST GAVE 98% ACCURACY.