In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report 

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Path of the drug-classification CSV, relative to the notebook's working directory.
DATA_PATH = "../input/drug-classification/drug200.csv"
data = pd.read_csv(DATA_PATH)

from sklearn.preprocessing import LabelEncoder

def label_encoder(y):
    """Label-encode one column of the notebook-level ``data`` frame in place.

    Parameters
    ----------
    y : str
        Name of the column in ``data`` to encode. The column is overwritten
        with the integer codes produced by ``LabelEncoder.fit_transform``.

    Returns
    -------
    LabelEncoder
        The fitted encoder. Keeping it makes the integer codes reversible
        (``classes_`` / ``inverse_transform``); callers that only want the
        in-place side effect can ignore the return value.
    """
    le = LabelEncoder()
    data[y] = le.fit_transform(data[y])
    return le

# Optional engineered feature (currently disabled): flag rows whose Na_to_K is at least 15.015.
#data['Na_to_K_Bigger_Than_15'] = [1 if i >=15.015 else 0 for i in data.Na_to_K]

# Only the categorical columns are label-encoded. "Na_to_K" is a numeric ratio
# (the disabled line above thresholds it at 15.015); passing it through
# LabelEncoder would replace each value by its rank among the observed values
# and discard the real magnitudes, so it is left untouched.
label_list = ["Sex","BP","Cholesterol","Drug"]

for l in label_list:
    label_encoder(l)

# Features are every column except the target "Drug"; 33% of the rows are held out for testing.
X, y = data.drop(['Drug'], axis=1), data['Drug']
train_X, test_X, train_y, test_y= train_test_split(X,y, test_size=0.33, random_state=101)
print(train_X.shape)
print(test_X.shape)

# Accumulates one [algorithm, train_score, test_score] row per model in later cells.
final_results = []

# Kept as the cell's last expression so the encoded frame is actually displayed.
data.head()

In [None]:
# Pairwise relationships between every column of the encoded frame.
sns.set_style('darkgrid')
features = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug']
sns.pairplot(data[features])

In [None]:
# Scatter matrix of all columns, with the pairwise correlation coefficient
# written onto each panel above the diagonal.
sns.set_style('darkgrid')
axes = pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (10,7), diagonal = 'kde' ,s=80)
corr = data.corr().values

for ax in axes.ravel():
    # Tick styling is applied per subplot: plt.xticks / plt.yticks act only on
    # the current axes, which would leave every other panel unchanged.
    ax.tick_params(axis='x', labelsize=10, labelrotation=0)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel(ax.get_xlabel(),fontsize = 15, rotation = 60)
    ax.set_ylabel(ax.get_ylabel(),fontsize = 15, rotation = 60)
# put the correlation between each pair of variables on each graph
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" %corr[i, j], (0.8, 0.8), xycoords="axes fraction", ha="center", va="center")

In [None]:
def dt(ccp_alpha):
    """Fit a pruned decision tree and report its accuracy.

    The tree is trained on the notebook-level ``train_X`` / ``train_y`` with
    the given cost-complexity pruning strength.

    Returns ``[ccp_alpha, train accuracy, test accuracy]``.
    """
    tree = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(train_X, train_y)
    train_acc = tree.score(train_X, train_y)
    test_acc = tree.score(test_X, test_y)
    return [ccp_alpha, train_acc, test_acc]

# Candidate pruning strengths come from the cost-complexity pruning path of an
# unpruned tree fitted on the training split.
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(train_X, train_y)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

print(len(ccp_alphas), len(impurities))

# One row per candidate alpha: [ccp, train_score, test_score].
columns = ['ccp', 'train_score', 'test_score']
results = pd.DataFrame([dt(alpha) for alpha in ccp_alphas], columns=columns)

# Show complete frames (no column/row truncation) for the rest of the notebook.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

In [None]:
# Step plot of train and test accuracy for every candidate ccp_alpha.
fig, ax = plt.subplots()
for score_column, series_label in (('train_score', "train"), ('test_score', "test")):
    ax.plot(ccp_alphas, results[score_column], marker='o', label=series_label,
            drawstyle="steps-post")
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
ax.legend()
plt.show()

In [None]:
# idxmax() returns an index *label*, so the matching row is fetched with the
# label-based .loc accessor rather than the positional .iloc.
# NOTE(review): the alpha is selected on the test split, so the reported test
# score is an optimistic estimate.
best_result = results.loc[results['test_score'].idxmax()]
print(f'For decision tree, pick ccp_alpha={best_result["ccp"]}, train_score={best_result["train_score"]}, test_scores={best_result["test_score"]}')

final_results.append(['decision tree', best_result["train_score"], best_result["test_score"]])

In [None]:
# Search space for k-nearest neighbours: neighbour count, Minkowski power and vote weighting.
grid = {
    'n_neighbors': np.arange(1, 90),
    'p': np.arange(1, 3),
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated grid search on the training split.
knn = KNeighborsClassifier(algorithm="auto")
knn_cv = GridSearchCV(knn, grid, cv=5).fit(train_X, train_y)

# "Train Score" is the mean cross-validated score of the best candidate (best_score_).
for label, value in (
    ("Hyperparameters:", knn_cv.best_params_),
    ("Train Score:", knn_cv.best_score_),
    ("Test Score:", knn_cv.score(test_X, test_y)),
):
    print(label, value)

In [None]:
# Summary row for KNN: mean cross-validated score and held-out test accuracy.
knn_row = ['KNN', knn_cv.best_score_, knn_cv.score(test_X, test_y)]
final_results.append(knn_row)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

def boosted_dt(ccp_alpha):
    """Fit a gradient-boosted classifier and report its accuracy.

    The model is trained on the notebook-level ``train_X`` / ``train_y`` with
    the given cost-complexity pruning strength.

    Returns ``[ccp_alpha, train accuracy, test accuracy]``.
    """
    booster = GradientBoostingClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(train_X, train_y)
    train_acc = booster.score(train_X, train_y)
    test_acc = booster.score(test_X, test_y)
    return [ccp_alpha, train_acc, test_acc]

# No separate pruning path is computed for the boosted model; the sweep reuses
# the ccp_alphas (and impurities) obtained in the decision-tree cell above.
print(len(ccp_alphas), len(impurities))

results=[]
for ccp_alpha in ccp_alphas:
    # Must call boosted_dt (not dt) so the scores recorded below really belong
    # to GradientBoostingClassifier rather than to the single decision tree.
    results.append(boosted_dt(ccp_alpha))

columns=['ccp', 'train_score', 'test_score']
results = pd.DataFrame(results, columns=columns)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# idxmax() returns an index label, hence the label-based .loc lookup.
best_result = results.loc[results['test_score'].idxmax()]
print(f'For GradientBoostingClassifier, pick ccp_alpha={best_result["ccp"]}, train_score={best_result["train_score"]}, test_scores={best_result["test_score"]}')
final_results.append(['GradientBoostingClassifier', best_result["train_score"], best_result["test_score"]])

In [None]:
# Search space for the support-vector classifier.
grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ["linear", "poly", "rbf", "sigmoid"],
    'degree': [1, 3, 5, 7],
    'gamma': [0.01, 1],
}

# 5-fold cross-validated grid search on the training split.
svm = SVC()
svm_cv = GridSearchCV(svm, grid, cv=5).fit(train_X, train_y)
svm_test_score = svm_cv.score(test_X, test_y)

# "Train Score" is the mean cross-validated score of the best candidate (best_score_).
print("Best Parameters:", svm_cv.best_params_)
print("Train Score:", svm_cv.best_score_)
print("Test Score:", svm_test_score)
final_results.append(['Support Vector Machines', svm_cv.best_score_, svm_test_score])

In [None]:
def test_mlp(hls, alpha):
    """Fit an identity-activation MLP and report its accuracy.

    ``hls`` is passed as ``hidden_layer_sizes`` and ``alpha`` as the
    regularisation strength; training uses the notebook-level
    ``train_X`` / ``train_y``.

    Returns ``[hls, alpha, train accuracy, test accuracy]``.
    """
    mlp = MLPClassifier(
        hidden_layer_sizes=hls,
        activation='identity',
        solver='lbfgs',
        alpha=alpha,
        max_iter=200,
        random_state=0,
    )
    mlp.fit(train_X, np.ravel(train_y, order='C'))
    return [hls, alpha, mlp.score(train_X, train_y), mlp.score(test_X, test_y)]

# Try single-hidden-layer networks with 1 to 19 units at a fixed alpha of 0.0001.
results=[]
for hidden_layer_sizes in [(i,) for i in range(1,20)]:
    results.append(test_mlp(hidden_layer_sizes, 0.0001))

columns=['hidden_layer_sizes',  'alpha', 'train_score', 'test_score']
results = pd.DataFrame(results, columns=columns)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# idxmax() returns an index label, so the row is fetched with label-based .loc
# instead of positional .iloc.
best_result = results.loc[results['test_score'].idxmax()]
results

In [None]:
# Summary row for the best network found in the hidden-layer-size sweep.
mlp_row = ['Neural Network', best_result["train_score"], best_result["test_score"]]
final_results.append(mlp_row)

In [None]:
# One row per algorithm. For the grid-searched models (KNN, SVM) the
# 'train_score' entry holds GridSearchCV.best_score_ rather than a plain
# training-set accuracy.
summary_columns = ['algorithm', 'train_score', 'test_score']
pd.DataFrame(data=final_results, columns=summary_columns)