In [None]:
import operator as op
import random
random.seed(123)

import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm  import SVC
import sklearn.metrics as skm

import utils_data_prepping as udp
import utils_clf_models as clf

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,8)})


from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=ConvergenceWarning,
                        module="sklearn")

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the drug-classification CSV and preview its first rows.
# The path is absolute and points at the Kaggle input mount, so this cell
# only resolves on Kaggle (or on a machine that mirrors that layout).
df = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Distinct values of the target column 'Drug'.
df['Drug'].unique()

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Categorical Encoding
# Replace each string-valued categorical column with an integer-coded
# "<name>_cat" column and drop the original.
# Only the columns that are still present are processed, so re-running this
# cell after the originals have been dropped is a no-op instead of a KeyError.
# NOTE: the codes are arbitrary integer ids produced by pandas' category
# dtype; they do not encode a meaningful order (e.g. for 'BP').
cat_vars = ['Sex', 'BP', 'Cholesterol']
pending = [col for col in cat_vars if col in df.columns]
df = df.assign(
    **{col + "_cat": df[col].astype('category').cat.codes for col in pending}
).drop(columns=pending)
df.head()

# Exploratory Data Analysis

## 1. Distribution of target variable

In [None]:
# Number of rows per drug label, to see how balanced the target classes are.
sns.histplot(data=df, x='Drug')
plt.show()

## 2. Correlation heatmap of the numeric features

In [None]:
# Correlation matrix of the numeric columns, computed once and reused.
# The string-valued target ('Drug') is excluded explicitly: recent pandas
# versions raise on non-numeric columns in .corr() instead of dropping them.
corr = df.select_dtypes(include='number').corr()

# Masking to show only one side of the matrix: cells where the mask is True
# are hidden, so the diagonal and the upper triangle are blanked out.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Actual correlation matrix as a heatmap
sns.heatmap(corr, annot=True, mask=mask, cmap="YlGnBu")
plt.show()

## 3. Relationship between Drug Category and Na-to-K (Sodium-to-Potassium) Ratio

In [None]:
# Distribution of Na_to_K, with the bars coloured by drug label (hue='Drug').
sns.histplot(data=df, x='Na_to_K', hue='Drug')
plt.show()

# Model Building & Evaluation

In [None]:
# Dictionary with model scores: maps a short model name (e.g. 'knn') to that
# model's rounded hold-out accuracy. It is filled in by each model section
# below and tabulated at the end of the notebook.
models = {}

## 1. K-Nearest Neighbors

In [None]:
class knn():
    """K-nearest-neighbours workflow: split, choose k by cross-validation, score on the hold-out set."""

    def __init__(self, df, target):
        # Keep the full frame and the name of its label column.
        self.target = target
        self.df = df

    def pre_processing(self):
        """Split features and labels into train/test parts (20% test, seed 2).

        Returns self so that the call can be chained.
        """
        features = self.df.drop([self.target], axis=1).values
        labels = self.df[self.target].values

        parts = train_test_split(features, labels, test_size=0.2, random_state=2)
        self.X_train, self.X_test, self.Y_train, self.Y_test = parts
        return self

    def fit_pred_acc(self):
        """Pick k in 1..9 by 20-fold CV accuracy, refit, and evaluate on the test split.

        Prints the classification report for the test split and returns the
        test accuracy rounded to two decimals.
        """
        # Mean cross-validated accuracy on the training split for each candidate k.
        cv_accuracy = {}
        for k in range(1, 10):
            candidate = KNeighborsClassifier(n_neighbors=k)
            fold_scores = cross_val_score(candidate, self.X_train, self.Y_train,
                                          cv=20, scoring='accuracy')
            cv_accuracy[k] = fold_scores.mean()

        # max() keeps the first maximal entry, i.e. the smallest k among ties.
        k_best = max(cv_accuracy, key=cv_accuracy.get)

        best_model = KNeighborsClassifier(n_neighbors=k_best)
        best_model.fit(self.X_train, self.Y_train)
        predictions = best_model.predict(self.X_test)

        report = skm.classification_report(self.Y_test, predictions)
        print(report)
        accuracy = skm.accuracy_score(self.Y_test, predictions)
        return round(accuracy, 2)

# Build the KNN workflow, split the data, then record its hold-out accuracy.
model = knn(df, 'Drug')
model = model.pre_processing()
models['knn'] = model.fit_pred_acc()

In [None]:
# Using utils
# Same KNN experiment, routed through the project helper modules
# (utils_data_prepping as udp, utils_clf_models as clf).
X, y = udp.pre_processing(df, 'Drug')
clf1 = clf.Classifier(X, y, 'knn')
# NOTE(review): the arguments look like (test fraction, random seed) — confirm against utils_clf_models.
clf1.preprocess_split(0.3, 62)
clf1.fit_predict()
# NOTE(review): the label says "training set"; verify that metrics() really
# reports training-set (not test-set) scores.
print('For the training set:')
clf1.metrics(printing=True)

## 2. Naive Bayes

In [None]:
class nb():
    """Gaussian Naive Bayes workflow: split the data, fit, and score on the hold-out set."""

    def __init__(self, df, target):
        """Store the input frame and the name of its label column.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature columns plus the label column.
        target : str
            Name of the label column in ``df``.
        """
        self.df = df
        self.target = target

    def pre_processing(self):
        """Split features and labels into train/test parts (25% test, seed 9).

        Returns
        -------
        nb
            ``self``, so that the call can be chained.
        """
        X = self.df.drop([self.target], axis=1).values
        Y = self.df[self.target].values

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.25, random_state=9)
        return self

    def fit_pred_acc(self):
        """Fit GaussianNB on the training split and evaluate it on the test split.

        Prints the per-class classification report and returns the test
        accuracy rounded to two decimals.
        """
        estimator = GaussianNB()
        estimator.fit(self.X_train, self.Y_train)
        pred = estimator.predict(self.X_test)

        # No hard-coded target_names: the report names its rows after the
        # labels actually present, so it cannot mislabel a class and does not
        # fail when a split happens to miss one of the classes.
        print(skm.classification_report(self.Y_test, pred))
        return round(skm.accuracy_score(self.Y_test, pred), 2)

# Split the data and store the Gaussian NB hold-out accuracy under 'nb'.
model = nb(df, 'Drug').pre_processing()
models['nb'] = model.fit_pred_acc()

In [None]:
# Using utils
# Same Naive Bayes experiment, routed through the project helper modules
# (utils_data_prepping as udp, utils_clf_models as clf).
X, y = udp.pre_processing(df, 'Drug')
# NOTE(review): 'guass_nb' looks like a misspelling of 'gauss_nb'; it has to
# match the key expected by clf.Classifier, so confirm there before changing it.
clf1 = clf.Classifier(X, y, 'guass_nb')
# NOTE(review): the arguments look like (test fraction, random seed) — confirm against utils_clf_models.
clf1.preprocess_split(0.3, 142)
clf1.fit_predict()
# NOTE(review): the label says "training set"; verify that metrics() really
# reports training-set (not test-set) scores.
print('For the training set:')
clf1.metrics(printing=True)

## 3. Logistic Regression

In [None]:
class logreg():
    """Logistic-regression workflow: split the data, fit, and score on the hold-out set."""

    def __init__(self, df, target):
        """Store the input frame and the name of its label column.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature columns plus the label column.
        target : str
            Name of the label column in ``df``.
        """
        self.df = df
        self.target = target

    def pre_processing(self):
        """Split features and labels into train/test parts (25% test, seed 16).

        Returns
        -------
        logreg
            ``self``, so that the call can be chained.
        """
        X = self.df.drop([self.target], axis=1).values
        Y = self.df[self.target].values

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.25, random_state=16)
        return self

    def fit_pred_acc(self):
        """Fit LogisticRegression on the training split and evaluate it on the test split.

        Prints the per-class classification report and returns the test
        accuracy rounded to two decimals.
        """
        # NOTE(review): the features are passed in unscaled and the notebook
        # silences ConvergenceWarning globally, so a non-converged fit would go
        # unnoticed here — consider checking max_iter / n_iter_.
        estimator = LogisticRegression()
        estimator.fit(self.X_train, self.Y_train)
        pred = estimator.predict(self.X_test)

        # No hard-coded target_names: the report names its rows after the
        # labels actually present, so it cannot mislabel a class and does not
        # fail when a split happens to miss one of the classes.
        print(skm.classification_report(self.Y_test, pred))
        return round(skm.accuracy_score(self.Y_test, pred), 2)
# Split the data and store the logistic-regression hold-out accuracy under 'logreg'.
model = logreg(df, 'Drug').pre_processing()
models['logreg'] = model.fit_pred_acc()

In [None]:
# Using utils
# Same logistic-regression experiment, routed through the project helper
# modules (utils_data_prepping as udp, utils_clf_models as clf).
X, y = udp.pre_processing(df, 'Drug')
clf1 = clf.Classifier(X, y, 'log_reg')
# NOTE(review): the arguments look like (test fraction, random seed) — confirm against utils_clf_models.
clf1.preprocess_split(0.3, 62)
clf1.fit_predict()
# NOTE(review): the label says "training set"; verify that metrics() really
# reports training-set (not test-set) scores.
print('For the training set:')
clf1.metrics(printing=True)

## 4. Decision Tree

In [None]:
class tree():
    """Decision-tree workflow: split the data, fit, and score on the hold-out set."""

    def __init__(self, df, target):
        """Store the input frame and the name of its label column.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature columns plus the label column.
        target : str
            Name of the label column in ``df``.
        """
        self.df = df
        self.target = target

    def pre_processing(self):
        """Split features and labels into train/test parts (30% test, seed 2).

        Returns
        -------
        tree
            ``self``, so that the call can be chained.
        """
        X = self.df.drop([self.target], axis=1).values
        Y = self.df[self.target].values

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=2)
        return self

    def fit_pred_acc(self):
        """Fit a DecisionTreeClassifier on the training split and evaluate it on the test split.

        Prints the per-class classification report and returns the test
        accuracy rounded to two decimals.
        """
        # Fixed seed: the tree breaks ties between equally good splits at
        # random, so without it repeated runs can produce different trees
        # (and a different accuracy) from the same training data.
        estimator = DecisionTreeClassifier(random_state=2)
        estimator.fit(self.X_train, self.Y_train)
        pred = estimator.predict(self.X_test)

        # No hard-coded target_names: the report names its rows after the
        # labels actually present, so it cannot mislabel a class and does not
        # fail when a split happens to miss one of the classes.
        print(skm.classification_report(self.Y_test, pred))
        return round(skm.accuracy_score(self.Y_test, pred), 2)
    
# Split the data and store the decision-tree hold-out accuracy under 'tree'.
model = tree(df, 'Drug').pre_processing()
models['tree'] = model.fit_pred_acc()

In [None]:
# Using utils
# Same decision-tree experiment, routed through the project helper modules
# (utils_data_prepping as udp, utils_clf_models as clf).
X, y = udp.pre_processing(df, 'Drug')
clf1 = clf.Classifier(X, y, 'tree')
# NOTE(review): the arguments look like (test fraction, random seed) — confirm against utils_clf_models.
clf1.preprocess_split(0.3, 142)
clf1.fit_predict()
# NOTE(review): the label says "training set"; verify that metrics() really
# reports training-set (not test-set) scores.
print('For the training set:')
clf1.metrics(printing=True)

## 5. Support Vector Classifier

In [None]:
class svc():
    """Support-vector-classifier workflow: split the data, fit, and score on the hold-out set."""

    def __init__(self, df, target):
        """Store the input frame and the name of its label column.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature columns plus the label column.
        target : str
            Name of the label column in ``df``.
        """
        self.df = df
        self.target = target

    def pre_processing(self):
        """Split features and labels into train/test parts (15% test, seed 64).

        Returns
        -------
        svc
            ``self``, so that the call can be chained.
        """
        X = self.df.drop([self.target], axis=1).values
        Y = self.df[self.target].values

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.15, random_state=64)
        return self

    def fit_pred_acc(self):
        """Fit SVC(gamma='auto') on the training split and evaluate it on the test split.

        Prints the per-class classification report and returns the test
        accuracy rounded to two decimals.
        """
        estimator = SVC(gamma='auto')
        estimator.fit(self.X_train, self.Y_train)
        pred = estimator.predict(self.X_test)

        # No hard-coded target_names: the report names its rows after the
        # labels actually present, so it cannot mislabel a class and does not
        # fail when this small test split happens to miss one of the classes.
        print(skm.classification_report(self.Y_test, pred))
        return round(skm.accuracy_score(self.Y_test, pred), 2)
    
# Split the data and store the SVC hold-out accuracy under 'svc'.
model = svc(df, 'Drug').pre_processing()
models['svc'] = model.fit_pred_acc()

In [None]:
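# Using utils (disabled): the SVC run through the utils wrapper is commented out below — TODO: re-enable or delete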
# X, y = udp.pre_processing(df, 'Drug')
# clf1 = clf.Classifier(X, y, 'svc')
# clf1.preprocess_split(0.3, 62)
# clf1.fit_predict()
# print('For the training set:')
# clf1.metrics(printing=True)

In [None]:
# Tabulate the recorded accuracies: one row per model, a single 'accuracy' column.
# NOTE: each model in this notebook is scored on its own train/test split
# (different test_size and random_state), so the rows are not a like-for-like
# comparison.
scores_df = pd.DataFrame(
    list(models.values()),
    index=list(models.keys()),
    columns=['accuracy'],
)
scores_df