In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Part 1: Importing and Visualizing Data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Read the drug-classification CSV into a DataFrame.
csv_path = '../input/drug-classification/drug200.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Show pandas' default summary statistics for the loaded frame.
df.describe()

In [None]:
# Count of patients per age value, drawn on a wide figure.
# The column is named through `x=` together with `data=df`: from seaborn 0.12
# on, the first positional argument of countplot is `data`, so passing the
# Series positionally no longer selects it as the x variable.
plt.figure(figsize=(20, 10))

sns.countplot(x='Age', data=df)

In [None]:
# Count of patients per sex. `x=`/`data=` keywords are used because seaborn
# 0.12+ binds the first positional argument to `data`, not to `x`.
sns.countplot(x='Sex', data=df)

In [None]:
# Count of patients per blood-pressure category. `x=`/`data=` keywords are used
# because seaborn 0.12+ binds the first positional argument to `data`, not `x`.
sns.countplot(x='BP', data=df)

In [None]:
# Count of patients per cholesterol category. `x=`/`data=` keywords are used
# because seaborn 0.12+ binds the first positional argument to `data`, not `x`.
sns.countplot(x='Cholesterol', data=df)

In [None]:
# Count of patients per drug, split by sex.
# The x variable is given by column name: passing the Series positionally while
# also passing `data=df` supplies `data` twice on seaborn 0.12+ (where the
# first positional parameter is `data`) and raises a TypeError.
sns.countplot(x='Drug', hue='Sex', data=df)

In [None]:
# Box plot of the Na_to_K values grouped by the Sex column.
sns.boxplot(data=df, x='Sex', y='Na_to_K')

In [None]:
# Box plot of the Na_to_K values grouped by the Drug column.
sns.boxplot(data=df, x='Drug', y='Na_to_K')

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every text column of df in place.
# A fresh LabelEncoder is fitted per column and stored in `encoders`, so each
# column's integer codes can be mapped back to the original labels with
# encoders[col].inverse_transform(...). Refitting one shared encoder would
# overwrite its classes_ on every column and keep only the last mapping.
# `le` still ends up bound to the encoder fitted on the last encoded column.
encoders = {}
le = LabelEncoder()
for i in list(df.columns):
    # Accept classic object columns as well as pandas' dedicated string dtype,
    # which does not compare equal to 'object'.
    if pd.api.types.is_object_dtype(df[i]) or pd.api.types.is_string_dtype(df[i]):
        le = LabelEncoder()
        df[i] = le.fit_transform(df[i])
        encoders[i] = le

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
corr_matrix = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True)

# Part 2: Model (Decision Tree)

In [None]:
# Preview the first five rows of df again before modelling.
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the 'Drug' column; the features are every other column of df.
y = df['Drug']
X = df.drop(columns='Drug')

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
from sklearn import tree
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.30
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision-tree classifier with default hyper-parameters.
# random_state is fixed because the tree builder breaks ties between equally
# good splits at random; without a seed the fitted tree (and therefore the
# report and the plot below) can differ from run to run.
dtree = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the decision tree on the training split.
dtree.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows with the fitted tree.
predictions = dtree.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test split.
dtree_report = classification_report(y_true=y_test, y_pred=predictions)
print(dtree_report)

In [None]:
# Names of the columns the tree was trained on, in training order.
# Taken from X (df without 'Drug') rather than df.columns[1:], which drops the
# first column of df and keeps 'Drug', so the names would not line up with the
# columns the model actually saw.
features = list(X.columns)

In [None]:
# Draw the fitted decision tree on a wide figure, with filled and rounded node
# boxes labelled by the names in `features`.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    dtree,
    feature_names=features,
    filled=True,
    rounded=True,
)


# Part 4: Building a Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees fitted on the training split.
# random_state is fixed because the forest relies on random bootstrap samples
# and random feature subsets; without a seed every run yields a different model
# and different metrics.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows with the fitted forest.
rfc_pred = rfc.predict(X=X_test)

In [None]:
# Confusion matrix of the random forest on the test split (rows: true, columns: predicted).
rfc_cm = confusion_matrix(y_true=y_test, y_pred=rfc_pred)
print(rfc_cm)

In [None]:
# Same confusion matrix as an annotated heatmap.
rfc_cm_for_plot = confusion_matrix(y_true=y_test, y_pred=rfc_pred)
sns.heatmap(rfc_cm_for_plot, annot=True)


In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
rfc_report = classification_report(y_true=y_test, y_pred=rfc_pred)
print(rfc_report)

# Part 5: Building A Model (Logistic Regression)

In [None]:
# Rebuild target ('Drug') and feature matrix (all other columns) from df.
y = df['Drug']
X = df.drop(columns='Drug')

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
from sklearn.linear_model import LogisticRegression
# Hold out 35% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.35
)

In [None]:
# Logistic-regression classifier.
# max_iter is raised from the default of 100: the features are fed in unscaled,
# and the solver is likely to hit the default iteration cap before converging
# (scikit-learn then emits a ConvergenceWarning and returns a half-fitted model).
logmodel = LogisticRegression(max_iter=1000)


In [None]:
# Fit the logistic-regression model on the training split.
logmodel.fit(X=X_train, y=y_train)


In [None]:
# Predict the target for the held-out rows with the fitted logistic regression.
predictions = logmodel.predict(X=X_test)


In [None]:
# Annotated heatmap of the logistic-regression confusion matrix on the test split.
logreg_cm = confusion_matrix(y_true=y_test, y_pred=predictions)
sns.heatmap(logreg_cm, annot=True)


In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
logreg_report = classification_report(y_true=y_test, y_pred=predictions)
print(logreg_report)


# Part 6: Building A Model (KNN)

In [None]:
from sklearn.model_selection import train_test_split
# Rebuild target ('Drug') and feature matrix (all other columns) from df,
# then preview the first five feature rows.
y = df['Drug']
X = df.drop(columns='Drug')
X.head(n=5)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.25
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline k-nearest-neighbours model that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows with the fitted KNN model.
pred = knn.predict(X=X_test)



In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Confusion matrix of the KNN predictions on the test split.
knn_cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(knn_cm)

In [None]:
# Same confusion matrix as an annotated heatmap.
knn_cm_for_plot = confusion_matrix(y_true=y_test, y_pred=pred)
sns.heatmap(knn_cm_for_plot, annot=True)


In [None]:
# Per-class precision / recall / F1 of the KNN model on the test split.
knn_report = classification_report(y_true=y_test, y_pred=pred)
print(knn_report)


In [None]:
# Misclassification rate on the test split for every K from 1 to 39.
# One model is fitted per K, so this cell takes a moment to run.
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    misclassified = knn.predict(X_test) != y_test
    # Mean of the boolean mask = fraction of wrongly predicted rows.
    error_rate.append(np.mean(misclassified))

In [None]:
# Plot the test error rate against K (dashed blue line, red-filled markers).
k_values = range(1, 40)
line_style = dict(color='blue', linestyle='dashed', marker='o',
                  markerfacecolor='red', markersize=10)

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, **line_style)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# Refit KNN with K=23 and print its confusion matrix and classification report,
# separated by blank lines.
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

for section in (
    'WITH K=23',
    '\n',
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
):
    print(section)

# Part 7: Building a Model (SVM)

In [None]:
from sklearn.model_selection import train_test_split
# Rebuild target ('Drug') and feature matrix (all other columns) from df.
y = df['Drug']
X = df.drop(columns='Drug')

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.25
)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Support-vector classifier with default hyper-parameters, fitted on the
# unscaled training split.
svc_model = SVC()
svc_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows with the default SVC.
y_predict = svc_model.predict(X=X_test)


In [None]:
# Confusion matrix of the default SVC on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Annotated heatmap of the confusion matrix stored in `cm`.
sns.heatmap(data=cm, annot=True)


In [None]:
# Per-class precision / recall / F1 of the default SVC on the test split.
svc_report = classification_report(y_true=y_test, y_pred=y_predict)
print(svc_report)


In [None]:
# Min-max scale the features column by column.
# The minimum and range are learned from the training split only and then
# reused for the test split. Scaling the test split with its own min/range
# would put the two splits through different transformations, so the model
# would be evaluated on inputs that do not match what it was trained on
# (and test-set statistics would leak into preprocessing).
min_train = X_train.min()
range_train = (X_train - min_train).max()
X_train_scaled = (X_train - min_train) / range_train

# Test values outside the training range land slightly outside [0, 1]; that is expected.
X_test_scaled = (X_test - min_train) / range_train

In [None]:
# Hyper-parameter grid for the RBF-kernel SVC: 4 values of C x 4 values of gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, .1, .01, .001],
    kernel=['rbf'],
)


In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Cross-validated grid search over `param_grid`; refit=True retrains the best
# configuration on the whole training data, verbose=4 logs each fit.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    verbose=4,
    refit=True,
)


In [None]:
# Run the grid search on the scaled training features.
grid.fit(X=X_train_scaled, y=y_train)


In [None]:
# Show the parameter combination the grid search selected as best.
grid.best_params_


In [None]:
# Scale the test split with the statistics learned from the training split
# (min_train / range_train, computed where X_train_scaled is built), so the
# grid-searched model sees test features on the same scale it was trained on.
# Using the test split's own min/range would apply a different transformation.
X_test_scaled = (X_test - min_train) / range_train

In [None]:
# Predict the target for the scaled held-out rows with the refitted best estimator.
grid_predictions = grid.predict(X=X_test_scaled)


In [None]:
# Confusion matrix of the grid-searched SVC on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=grid_predictions)

In [None]:
# Annotated heatmap of the confusion matrix stored in `cm`.
sns.heatmap(data=cm, annot=True)


In [None]:
# Per-class precision / recall / F1 of the grid-searched SVC on the test split.
grid_report = classification_report(y_true=y_test, y_pred=grid_predictions)
print(grid_report)