In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Read the drug-classification CSV (path is relative to the notebook's
# working directory) and display the resulting frame.
csv_path = '../input/drug-classification/drug200.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Distinct target labels found in the 'Drug' column.
{label for label in df['Drug']}

In [None]:
# Reference only: integer class id -> drug label. The literal is displayed
# but never assigned to a name, so no other cell can read it.
{0:'DrugY', 1:'drugA', 2:'drugB', 3:'drugC', 4:'drugX'}

## Identifying Missing Values

In [None]:
# Summary statistics of the raw frame; by default describe() reports only the
# numeric columns, and its 'count' row is the number of non-null values.
df.describe()

In [None]:
# Per-column dtype and non-null count; a non-null count below the number of
# rows would indicate missing values in that column.
df.info()

## Handling Categorical Variables

In [None]:
# Features: every column except the target, with the categorical columns
# expanded into one-hot indicator columns by get_dummies().
X = pd.get_dummies(df.drop(columns='Drug'))

# Target: the raw drug label column.
Y = df['Drug']

In [None]:
# Encode the drug label as an integer class id.
# The mapping is built from df['Drug'] rather than from Y, so the cell can be
# re-run safely: mapping Y onto itself fails on a second run because Y then
# already holds integers, which have no .strip().
drug_to_code = {'DrugY': 0, 'drugA': 1, 'drugB': 2, 'drugC': 3, 'drugX': 4}
drug_labels = df['Drug'].str.strip()

# Fail loudly on labels outside the mapping instead of silently assigning
# them to class 4, which is what a catch-all "else 4" branch would do.
unknown_labels = set(drug_labels) - set(drug_to_code)
if unknown_labels:
    raise ValueError(f'Unexpected drug labels: {unknown_labels}')

Y = drug_labels.map(drug_to_code)

In [None]:
# Display the feature matrix (pandas truncates long frames in the output).
X

In [None]:
# Display the target series.
Y

In [None]:
# Dtypes and non-null counts of the feature columns.
X.info()

In [None]:
# Number of missing values per feature column; all zeros means nothing needs
# to be imputed before training.
X.isnull().sum()

In [None]:
# Summary statistics of the feature matrix.
X.describe()

## Identifying the occurrence of each category

In [None]:
# Number of samples per target class, most frequent first; shows whether the
# classes are balanced.
Y.value_counts()

In [None]:
# Print the shape of the feature matrix, then of the target.
for dataset in (X, Y):
    print(dataset.shape)

In [None]:
# Labelled shapes of features and target, followed by the per-class counts.
for tag, dataset in (('X=', X), ('Y=', Y)):
    print(tag, dataset.shape)
print(Y.value_counts())

## Splitting data into Train and Test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for the final evaluation. Stratifying on Y keeps
# the class proportions the same in both parts; the fixed seed makes the
# split reproducible.
split_options = dict(train_size=0.9, random_state=42, stratify=Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [None]:
# Report the shape of each part of the split.
split_parts = (
    ('x train:', x_train),
    ('x test', x_test),
    ('y train', y_train),
    ('y test', y_test),
)
for caption, part in split_parts:
    print(caption, part.shape)

## Training

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
# Standardise the training features and keep the fitted scaler (scc) so the
# same transformation can be applied to the test split later.
#
# By default fit_transform() returns a NumPy array, so x_train stops being a
# DataFrame after the first run. The guard relies on that to make the cell
# idempotent: running it again would otherwise refit scc on already-scaled
# data (mean ~0, std ~1), after which scc.transform() leaves the test
# features essentially unscaled.
if isinstance(x_train, pd.DataFrame):
    scc = StandardScaler()
    x_train = scc.fit_transform(x_train)

In [None]:
# Single-step pipeline. BaggingClassifier is only a placeholder: every grid
# below replaces the 'classifier' step with the estimator being searched.
#pipe=Pipeline([('standardscaler', StandardScaler()),('classifier', BaggingClassifier())])
pipe=Pipeline([('classifier', BaggingClassifier())])

# Any of the grids below can be enabled, but searching all algorithms takes
# a very long time. DecisionTreeClassifier gave better accuracy and f1-score
# than the others, so it is the only grid left active.

grid_param=[
                #{
                #    'classifier': [LogisticRegression()],
                #    # NOTE: 'l1' needs a solver that supports it (e.g. solver='saga');
                #    # the default solver only accepts 'l2'.
                #    'classifier__penalty':['l2','l1'],
                #    'classifier__C':np.logspace(0,4,10)
                #},
                #{
                #    'classifier':[SVC()],
                #    'classifier__kernel':['rbf'],
                #    # 0 is left out of both lists: C must be strictly positive and
                #    # gamma=0 is not a usable kernel width.
                #    'classifier__gamma':[0.001,0.0001,0.01,0.1,1,10,100,1000,10000],
                #    'classifier__C':[0.001,0.0001,0.01,0.1,1,10,100,1000,10000],
                #},
                {
                    # Fixed seed so repeated searches build identical trees.
                    'classifier':[DecisionTreeClassifier(random_state=42)],
                    # max_leaf_nodes must be None or at least 2; a value of 1 is
                    # rejected at fit time, so the range starts at 2.
                    'classifier__max_leaf_nodes':np.arange(2,11)
                },
                #{
                #    'classifier':[RandomForestClassifier()],
                #    'classifier__n_estimators':np.arange(50,1050,50),
                #    'classifier__max_leaf_nodes':np.arange(2,11)
                #},
                #{
                #    'classifier':[XGBClassifier()],
                #    'classifier__n_estimators':np.arange(50,1050,50),
                    #'classifier__max_leaf_nodes':np.arange(1,11)
                #    'classifier__max_depth':np.arange(1,21)
                #}
]

In [None]:
from sklearn.model_selection import StratifiedKFold

# Three stratified folds without shuffling (the defaults), so the fold
# assignment is deterministic and no random_state is needed.
skf = StratifiedKFold(n_splits=3)

In [None]:
# Exhaustive search over grid_param, scored by macro-averaged F1 on the
# stratified folds and run on all available cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    scoring='f1_macro',
    cv=skf,
    n_jobs=-1,
    verbose=4,
)
grid_search.fit(x_train, y_train)

# fit() returns the search object itself, so best_model is simply another
# name for the fitted grid_search.
best_model = grid_search

In [None]:
# Pipeline configured with the best-scoring parameter combination.
print(best_model.best_estimator_)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
print(best_model.best_params_)

In [None]:
# Mean cross-validated score of the best combination, in the metric given by
# the search's `scoring` argument.
print(best_model.best_score_)

In [None]:
#pd.DataFrame(best_model.cv_results_)

In [None]:
# Per-class sample counts of the training target, then of the test target.
for target_part in (y_train, y_test):
    print(target_part.value_counts())

In [None]:
# List every searched candidate from best to worst mean CV score.
# The (score, params) pairs are kept in a list: a dict keyed by score would
# silently drop every candidate whose score ties with a later one.
cv_results = best_model.cv_results_
scored_candidates = zip(cv_results['mean_test_score'], cv_results['params'])


def _rank_key(candidate):
    """Sort key: highest score first, NaN (failed fit) scores last."""
    mean_score = candidate[0]
    if math.isnan(mean_score):
        return (1, 0.0)
    return (0, -mean_score)


# sorted() is stable, so tied candidates keep their original search order.
for mean_score, params in sorted(scored_candidates, key=_rank_key):
    print(mean_score, params)

In [None]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for the training rows: with cv=5 each row is
# predicted by a copy of the best pipeline fitted on the other four folds.
y_train_pred = cross_val_predict(
    estimator=best_model.best_estimator_,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the out-of-fold predicted classes.
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the out-of-fold training predictions.
train_report = classification_report(y_true=y_train, y_pred=y_train_pred)
print(train_report)

# Evaluation

In [None]:
# Apply the scaler that was fitted on the training split; it is deliberately
# not refitted on the test data.
x_test_scaled = scc.transform(x_test)

# best_estimator_ is the pipeline that the search refitted on the whole
# training split with the winning parameters (GridSearchCV's default refit).
final_estimator = best_model.best_estimator_
y_test_pred = final_estimator.predict(x_test_scaled)

In [None]:
# Rows are the true classes, columns the predicted classes on the held-out set.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Per-class precision, recall and F1 on the held-out test split.
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)