In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line (nothing is printed when the directory is empty or missing).
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## visualization
import seaborn as sns
import matplotlib.pyplot as plt

## preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

## model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

## pipeline
from sklearn.pipeline import Pipeline

import random

# random.seed() returns None, so keep the seed value itself in `seed`
# instead of the call's return value.
seed = 100
random.seed(seed)
# scikit-learn helpers called without an explicit random_state (e.g. train_test_split)
# draw from NumPy's global RNG, so it has to be seeded as well for a
# Restart & Run All to give the same splits every time.
np.random.seed(seed)

## Loading Dataset and Basic Analysis

In [None]:
# Location of the drug-classification CSV inside the Kaggle input directory.
path = '../input/drug-classification/drug200.csv'

# Read the raw table once; later cells work on copies of it.
dataLoad = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
dataLoad.head(5)

In [None]:
# Print the per-column dtype and non-null count summary of the raw frame.
dataLoad.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
dataLoad.describe()

In [None]:
# Number of missing values in each column of the raw frame.
dataLoad.isna().sum()

## Preprocessing

In [None]:
# Work on a copy so the raw frame stays untouched.
dataCopy = dataLoad.copy()

# 'Drug' is the label to predict; every other column is a predictor.
target = dataCopy['Drug']
features = dataCopy.drop(columns='Drug')

features.head()

In [None]:
# Treat every low-cardinality column (fewer than 5 distinct values) as categorical.
OH_cols = [col for col in features.columns if features[col].nunique() < 5]

encoder = OneHotEncoder()
encodedData = encoder.fit_transform(features[OH_cols]).toarray()
encodedLabel = encoder.categories_

# Build one output name per encoded column.
# - Iterate over however many categorical columns were found instead of assuming
#   exactly three.
# - Prefix each category with its source column: different columns may share
#   category values (e.g. the same level name in two columns), which would
#   otherwise produce duplicate, ambiguous column names.
featLabel = [
    f'{col}_{category}'
    for col, categories in zip(OH_cols, encodedLabel)
    for category in categories
]

# Reuse the index of `features` so the later column-wise concat aligns row by row
# even if the frame does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(encodedData, columns=featLabel, index=features.index)
encoded_df.head()

In [None]:
# Final feature table: the two numeric columns followed by the one-hot columns.
numeric_part = features[['Age', 'Na_to_K']]
df_features = pd.concat([numeric_part, encoded_df], axis=1)
df_features.head()

In [None]:
# Map each drug name to an integer class code; `labelenc` keeps the mapping
# so predictions can be translated back later.
labelenc = LabelEncoder()
drug_codes = labelenc.fit_transform(target)

# Store the codes as a single-column frame named like the original label.
targetEncoded = pd.DataFrame({'Drug': drug_codes})
targetEncoded.head()

In [None]:
# Both splits hold out the same fraction.
split_kwargs = {'test_size': 0.2}

# First split: keep a final test set aside (lower-case names).
x_train, x_test, y_train, y_test = train_test_split(df_features, targetEncoded, **split_kwargs)
# Second split: carve a validation set out of the remaining training data (upper-case names).
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y_train, **split_kwargs)

# Report the train/validation shapes, features first, then labels.
for fit_part, valid_part in ((X_train, X_valid), (Y_train, Y_valid)):
    print(fit_part.shape, valid_part.shape)

In [None]:
# Learn the min/max of every feature from the training split only ...
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# ... and apply that same mapping to the validation split. Calling fit_transform here
# would re-fit the scaler on the validation data, putting the two splits on different
# scales, which distorts the distances KNN relies on.
X_valid = scaler.transform(X_valid)

X_train.shape, X_valid.shape

## Baseline Model

In [None]:
# Baseline: KNN with scikit-learn's default hyperparameters, scored on the validation split.
y_fit = Y_train.values.ravel()
modelBl = KNeighborsClassifier()
modelBl.fit(X_train, y_fit)

predBl = modelBl.predict(X_valid)
baseline_accuracy = accuracy_score(y_true=Y_valid.values.ravel(), y_pred=predBl)
print(baseline_accuracy)

In [None]:
# Per-class precision / recall / F1 of the baseline on the validation split.
baseline_report = classification_report(y_true=Y_valid.values.ravel(), y_pred=predBl)
print(baseline_report)

In [None]:
# Confusion matrix of the baseline predictions (rows: true class, columns: predicted class).
cmBl = confusion_matrix(y_true=Y_valid.values.ravel(), y_pred=predBl)
ConfusionMatrixDisplay(confusion_matrix=cmBl).plot()

In [None]:
# How many validation rows fall into each encoded drug class.
Y_valid.value_counts()

## Exploratory Data Analysis

In [None]:
# Class balance of the label: one bar per drug, bar height = number of rows.
drug = target.value_counts()
sns.barplot(x=drug.index, y=drug)

In [None]:
# Numeric columns to inspect for spread and outliers.
cols = ['Age', 'Na_to_K']

# One stand-alone box plot per numeric column.
for col in cols:
    sns.boxplot(data=dataCopy, y=col)
    plt.show()

In [None]:
# Same numeric columns, now broken down by drug class.
for u in cols:
    sns.boxplot(data=dataCopy, x='Drug', y=u)
    plt.show()

## Removing Outliers

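Next, we deal with the outliers in the data: outlying `Na_to_K` values are blanked out and then replaced by the column mean. This matters because KNN is a distance-based model and is therefore very sensitive to outliers.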

In [None]:
## removing outliers on Na_to_K variable

# Tukey fences: anything further than 1.5 * IQR outside the quartiles counts as an outlier.
Q1 = dataCopy['Na_to_K'].quantile(0.25)
Q3 = dataCopy['Na_to_K'].quantile(0.75)

IQR = Q3 - Q1

lower_fence = Q1 - IQR*1.5
upper_fence = Q3 + IQR*1.5
outlier_mask = (dataCopy['Na_to_K'] < lower_fence) | (dataCopy['Na_to_K'] > upper_fence)

# Blank out the outliers with a single .loc assignment. The previous element-wise
# `dataCopy['Na_to_K'][i] = ...` is chained indexing: it raises SettingWithCopyWarning
# and does not modify the frame at all once pandas copy-on-write is active.
dataCopy.loc[outlier_mask, 'Na_to_K'] = np.nan

# Number of values that were flagged (now NaN); they are filled in by the imputer next.
dataCopy['Na_to_K'].isnull().sum()

In [None]:
## Imputing nan value with mean
imputer = SimpleImputer(strategy='mean')

# The imputer expects a 2-D input, hence the double brackets around the column name.
na_to_k_filled = imputer.fit_transform(dataCopy[['Na_to_K']])
naClean = pd.DataFrame(na_to_k_filled, columns=['Na_to_K'])

# Sanity check: no missing values should remain.
naClean.isna().sum()

In [None]:
# Remove the original Na_to_K column; the cleaned version is attached in the next cell.
df_feature = df_features.drop(columns='Na_to_K')
df_feature.head()

In [None]:
# Give the cleaned column the same index as the feature table so concat aligns row by row.
naClean.index = df_feature.index

# Feature table with the outlier-cleaned Na_to_K appended as the last column.
df_Feature = pd.concat(objs=[df_feature, naClean], axis='columns')
df_Feature.head()

In [None]:
# Both splits hold out the same fraction.
split_kwargs = {'test_size': 0.2}

# Re-split using the outlier-cleaned feature table: test set first (lower-case names) ...
x_train, x_test, y_train, y_test = train_test_split(df_Feature, targetEncoded, **split_kwargs)
# ... then a validation set from what is left (upper-case names).
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y_train, **split_kwargs)

# Report the train/validation shapes, features first, then labels.
for fit_part, valid_part in ((X_train, X_valid), (Y_train, Y_valid)):
    print(fit_part.shape, valid_part.shape)

In [None]:
# Learn the min/max of every feature from the training split only ...
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# ... and apply that same mapping to the validation split. Calling fit_transform here
# would re-fit the scaler on the validation data, putting the two splits on different
# scales, which distorts the distances KNN relies on.
X_valid = scaler.transform(X_valid)

X_train.shape, X_valid.shape

## Model Optimization

Next, we look for the best hyperparameters for our KNN model. Here I search for the optimal number of neighbours `k` using a cross-validated grid search (`GridSearchCV`).

In [None]:
# Candidate neighbourhood sizes to compare.
params = {'n_neighbors': [3, 5, 7, 9]}

# 5-fold cross-validated search over the candidates, run on the training split.
model = KNeighborsClassifier()
knnGrid = GridSearchCV(estimator=model, param_grid=params, cv=5)
knnGrid.fit(X_train, Y_train.values.ravel())

# Best mean cross-validation score, then the parameters that achieved it.
for search_result in (knnGrid.best_score_, knnGrid.best_params_):
    print(search_result)

In [None]:
# Build the tuned model from whatever the grid search selected instead of a
# hard-coded n_neighbors, so this cell stays correct when the data, the split
# or the parameter grid changes.
optmodel = KNeighborsClassifier(**knnGrid.best_params_)
optmodel.fit(X_train, Y_train.values.ravel())

# Score the tuned model on the validation split.
optpred = optmodel.predict(X_valid)
print(accuracy_score(Y_valid.values.ravel(), optpred))

In [None]:
# Per-class precision / recall / F1 of the tuned model on the validation split.
tuned_report = classification_report(y_true=Y_valid.values.ravel(), y_pred=optpred)
print(tuned_report)

In [None]:
# Confusion matrix of the tuned model on the validation split
# (rows: true class, columns: predicted class).
cmopt = confusion_matrix(y_true=Y_valid.values.ravel(), y_pred=optpred)
cmviz = ConfusionMatrixDisplay(confusion_matrix=cmopt).plot()

## Making pipeline

In [None]:
# Scaling + KNN bundled together, so the scaler is always fitted on exactly the data
# the model is trained on and re-applied unchanged at prediction time.
# The model step is a fresh, unfitted estimator with the same hyperparameters as
# `optmodel`: passing `optmodel` itself would make `my_pipeline.fit` re-fit that
# already-evaluated object in place on a different training set.
my_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsClassifier(**optmodel.get_params())),
])

# Train on the full training portion (train + validation rows) and score on the
# held-out test set.
my_pipeline.fit(x_train, y_train.values.ravel())
pred = my_pipeline.predict(x_test)

print(accuracy_score(y_test.values.ravel(), pred))

In [None]:
# Per-class precision / recall / F1 of the pipeline on the held-out test set.
test_report = classification_report(y_true=y_test.values.ravel(), y_pred=pred)
print(test_report)

In [None]:
# Confusion matrix of the pipeline on the held-out test set
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test.values.ravel(), y_pred=pred)
cmviz = ConfusionMatrixDisplay(confusion_matrix=cm).plot()