In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Load the drug-classification CSV into the working DataFrame `df`.
# NOTE(review): the relative path looks like a Kaggle input mount — adjust it when running elsewhere.
df = pd.read_csv('../input/drug-classification/drug200.csv')

In [None]:
# Preview the first rows to see the column names and value formats.
df.head()

In [None]:
### Check the shape (rows, columns) of the dataset.
df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
## Count the missing values in each column.
df.isnull().sum()
## Observation: there are no null values.

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows per class of the 'Drug' column (used as the target later on).
df['Drug'].value_counts()

In [None]:
### Univariate analysis

# Distribution of the Age column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot are
# the replacements) — confirm against the installed seaborn version before upgrading.
plt.figure(figsize=(10,8))
sns.distplot(df['Age'])
plt.show()

In [None]:
# Step-style histogram of Age with 30 bins; 'Age' is resolved as a column of `data`.
plt.hist(x='Age',bins=30,histtype='step',color='#BA6C5D',data=df)
### Observation: ages between 45 and 50 are the most frequent in this dataset.

In [None]:
## Check the Sex column
sns.countplot(x='Sex',palette='pastel',orient='v',data=df)
## Observation: there are slightly more male than female patients in the dataset.

In [None]:
# Check the BP column
sns.countplot(x='BP',data=df)
## Observation: patients with high BP are the largest group in the dataset.

In [None]:
## Check the Cholesterol column
sns.countplot(x='Cholesterol',data=df)
## Observation: Cholesterol is roughly balanced, with slightly more high-cholesterol patients.

In [None]:
## Check the output variable: Drug

sns.countplot(x='Drug',data=df)
## Observation: Drug Y has more samples than any other class, so the dataset is imbalanced.
## An oversampling technique is a reasonable way to compensate.

In [None]:
### Bivariate analysis: each feature against the target 'Drug'
df.head()

In [None]:
## Age vs. target variable (horizontal box plot, one box per drug)
sns.boxplot(x='Age',y='Drug',orient="h",data=df)

## Observation: Drug B is used by the 50-70 age group and Drug A by the 20-50 age group,
## whereas Drug Y is used across all ages in this dataset.

In [None]:
### Sex vs. Drug
ax = sns.countplot(x='Drug', hue="Sex",palette="Set1", data=df)
## Observation: males receive drugA, drugB and drugC more often than females,
## while drugX is split equally between males and females.

In [None]:
## BP vs. Drug
ax = sns.countplot(x='Drug', hue="BP", data=df)
## Observations:
## - Drug A and Drug B are given to people with high blood pressure.
## - Drug X is not taken by people with high blood pressure.
## - Drug C is only taken by people with low blood pressure.

In [None]:
## Drug vs. Cholesterol
ax = sns.countplot(x='Drug', hue="Cholesterol", palette="Set2",data=df)
## Observations:
## - Drug Y is the most used drug among both high- and normal-cholesterol patients.
## - Drug C is only given to patients with high cholesterol.

In [None]:
### Na_to_K vs. Drug, with points coloured by BP
# Explicit figure/axes so the plot, its legend and its title are all tied to one Axes.
fig, ax = plt.subplots(figsize=(9, 5))
sns.swarmplot(data=df, x="Drug", y="Na_to_K", hue="BP", ax=ax)
ax.legend()
ax.set_title("Na_to_K -- BP -- Drug")
plt.show()

In [None]:
# Bucket the continuous Na_to_K ratio into six labelled ranges.
# Intervals are right-closed: (-inf, 10], (10, 15], (15, 20], (20, 25], (25, 30], (30, inf),
# so every ratio at or below 10 (including any below 5) falls in the '5-10' group.
na_to_k_bins = [-np.inf, 10, 15, 20, 25, 30, np.inf]
na_to_k_labels = ['5-10', '10-15', '15-20', '20-25', '25-30', '30+']

# pd.cut is vectorised and always yields exactly one entry per row: a missing ratio becomes
# a missing group instead of silently shortening the list and breaking the assignment below.
# astype(object) keeps plain string labels rather than a Categorical; tolist() keeps
# `na_to_k_groups` a Python list that is assigned positionally.
na_to_k_groups = (
    pd.cut(df['Na_to_K'], bins=na_to_k_bins, labels=na_to_k_labels, right=True)
    .astype(object)
    .tolist()
)

df['Na_to_K_groups'] = na_to_k_groups

In [None]:
# Confirm the new 'Na_to_K_groups' column was added.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(y, frame=None):
    """Label-encode one column in place and return the fitted encoder.

    Parameters
    ----------
    y : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame whose column is overwritten with integer codes. Defaults to the
        notebook-level ``df``, so existing ``label_encoder(col)`` calls behave
        exactly as before.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The encoder fitted on the column. Keep it when the codes must be mapped
        back to the original labels (``inverse_transform``), e.g. for the target.
    """
    target = df if frame is None else frame
    le = LabelEncoder()
    # fit_transform learns the distinct values and replaces them with integer codes.
    target[y] = le.fit_transform(target[y])
    return le

In [None]:
# Categorical columns that need integer codes before modelling.
# 'Na_to_K' is intentionally not listed: it is a continuous numeric column that is dropped
# right after this step, so encoding it would be wasted work and would make this cell fail
# with a KeyError when it is re-run after the drop.
label_list = ["Sex","BP","Cholesterol","Na_to_K_groups","Drug"]

for column in label_list:
    label_encoder(column)

In [None]:
# Drop the raw ratio; its binned version 'Na_to_K_groups' is used as the feature instead.
# Rebinding (no inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op instead of raising KeyError for the missing column.
df = df.drop(columns=['Na_to_K'], errors='ignore')

In [None]:
# Inspect the frame after encoding and after removing 'Na_to_K'.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the encoded 'Drug' column.
X = df.drop(['Drug'],axis=1)
y = df.Drug

# 70/30 split with a fixed seed. stratify=y keeps the class proportions of the imbalanced
# target the same in both parts, so rare drugs are not under-represented in the test set
# by chance.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 42,shuffle=True,stratify=y)

In [None]:
# Training features before oversampling.
X_train

In [None]:
## Requires imbalanced-learn; if it is not installed, run: pip install "imbalanced_learn==0.6.2"

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes of the TRAINING split only (the test split stays untouched).
# A fixed random_state makes the synthetic samples, and every score computed from them,
# reproducible across runs.
# NOTE(review): this rebinds X_train/y_train to the resampled data; the pre-SMOTE split is
# only recoverable by re-running the train_test_split cell.
X_train,y_train = SMOTE(random_state=42).fit_resample(X_train,y_train)

In [None]:
# Training features after SMOTE oversampling (row count grows with the synthetic samples).
X_train

In [None]:
## Baseline model: logistic regression
from sklearn import linear_model, naive_bayes, neighbors, svm

# fit() returns the estimator itself, so construction and training can be chained.
log_reg = linear_model.LogisticRegression(max_iter=7000).fit(X_train, y_train)

# Predictions and accuracy (as a percentage) on the held-out test split.
log_reg_preds = log_reg.predict(X_test)
log_reg_acc = log_reg.score(X_test, y_test) * 100
print('Logistic Regression Predictions: \n', log_reg_preds, '\n Accuracy:', log_reg_acc, '%')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier()

# 5-fold cross-validation on the training data.
# NOTE(review): X_train has already been oversampled with SMOTE at this point, so the
# validation folds can contain synthetic points — treat this estimate as optimistic.
accuracies = cross_val_score(knn, X_train, y_train, cv=5)

# Fit on the full training set so `knn` can be scored on the test split afterwards.
knn.fit(X_train,y_train)

# The value printed is the mean validation accuracy over the folds, not a training score.
print("Mean CV accuracy (5-fold):",np.mean(accuracies))



In [None]:
# Accuracy of the fitted baseline k-NN on the held-out test split.
print("Test Score:",knn.score(X_test,y_test))

In [None]:
# Search space for k-NN: neighbourhood size, Minkowski power (1 or 2) and vote weighting.
grid = dict(
    n_neighbors=np.arange(1, 120),
    p=np.arange(1, 3),
    weights=['uniform', 'distance'],
)

# Exhaustive 5-fold grid search over every combination in `grid`;
# fit() returns the search object itself.
knn = KNeighborsClassifier(algorithm="auto")
knn_cv = GridSearchCV(estimator=knn, param_grid=grid, cv=5).fit(X_train, y_train)

print("Hyperparameters:", knn_cv.best_params_)

In [None]:
# Build the final classifier from the hyperparameters the grid search actually selected,
# instead of copying them by hand: hand-copied values silently go stale whenever the data,
# the resampling or the search grid changes.
knn = KNeighborsClassifier(algorithm = "auto", **knn_cv.best_params_)

In [None]:
# Train the tuned k-NN on the (oversampled) training data.
knn.fit(X_train,y_train)

In [None]:
# Accuracy of the tuned k-NN on the held-out test split.
knn.score(X_test,y_test)

In [None]:
# Accuracy on the training data itself, for comparison with the test accuracy.
knn.score(X_train,y_train)