In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [16]:
df = pd.read_csv("C:/Users/Administrator.DAI-PC2/Desktop/ML/Day 5/Kyphosis.csv")
le = LabelEncoder()
y = le.fit_transform(df["Kyphosis"])
X = df.drop("Kyphosis", axis = 1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=24, stratify=y)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=24)

In [18]:
#minmax scaling
std_mm = MinMaxScaler()
X_scl_trn = std_mm.fit_transform(X_train)
X_scl_tst = std_mm.transform(X_test)


knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_scl_trn, y_train)

y_pred = knn.predict(X_scl_tst)
y_pred_prob = knn.predict_proba(X_scl_tst)

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob[:,1]))
print(log_loss(y_test,y_pred_prob))

0.76
0.845
1.7425616149199414


In [19]:
#Standard scaling
std_scaler = StandardScaler()
X_scl_trn = std_scaler.fit_transform(X_train)
X_scl_tst = std_scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_scl_trn, y_train)

y_pred = knn.predict(X_scl_tst)
y_pred_prob = knn.predict_proba(X_scl_tst)

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob[:,1]))
print(log_loss(y_test,y_pred_prob))

0.8
0.875
0.3170340836795815


In [25]:
#pipeline for standard
pipe_std = Pipeline([('SCL', std_scaler), ('KNN',knn)])
pipe_std.fit(X_train, y_train)

y_pred = pipe_std.predict(X_test)
y_pred_prob = pipe_std.predict_proba(X_test)

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob[:,1]))
print(log_loss(y_test,y_pred_prob))

0.8
0.875
0.3170340836795815


In [33]:
#minmax using pipeline
pipe_mm = Pipeline([('Rishi', std_mm), ('KNN',knn)])
pipe_mm.fit(X_train, y_train)

y_pred = pipe_mm.predict(X_test)
y_pred_prob = pipe_mm.predict_proba(X_test)

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob[:,1]))
print(log_loss(y_test,y_pred_prob))

0.76
0.89
0.35098190714145305


In [32]:
#standard scaling with pipeline and gcv
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=24)
std_scaler = StandardScaler()
knn = KNeighborsClassifier()
pipe_std = Pipeline([('SCL', std_scaler), ('KNN',knn)])
params = {'KNN__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
gcv = GridSearchCV(pipe_std, param_grid = params, cv = kfold, scoring = 'neg_log_loss')
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

-0.3545562027841026
{'KNN__n_neighbors': 10}


In [35]:
#minamx scaling with pipeline and gcv
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=24)
knn = KNeighborsClassifier()
pipe_mm = Pipeline([('SCL', std_mm), ('Rishi',knn)])
params = {'Rishi__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
gcv = GridSearchCV(pipe_mm, param_grid = params, cv = kfold, scoring = 'neg_log_loss')
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

-0.3541342613432673
{'Rishi__n_neighbors': 9}


In [36]:
#gcv, pipeline with all three scaling together
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=24)
knn = KNeighborsClassifier()
pipe_std = Pipeline([('SCL', None), ('KNN',knn)])
params = {'KNN__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'SCL':[std_scaler, std_mm, None]}
gcv = GridSearchCV(pipe_std, param_grid = params, cv = kfold, scoring = 'neg_log_loss')
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

-0.3541342613432673
{'KNN__n_neighbors': 9, 'SCL': MinMaxScaler()}
