In [118]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import os
%matplotlib inline

In [119]:
## Import the datafile
os.chdir("D:/K2Analytics/datafile")
train = pd.read_csv("DEV_SAMPLE_1HOT_ENCODED.csv")
test = pd.read_csv("HOLD_SAMPLE_1HOT_ENCODED.csv")
train.head() 

Unnamed: 0,Cust_ID,Target,Age,Balance,Cnt_Txns,SCR,Holding_Period,Occ_PROF,Occ_SAL,Occ_SELF_EMP,Occ_SENP,F,M,O
0,C16505,0,41,91519.92,38,926,15,0,0,1,0,0,1,0
1,C17241,0,52,117288.96,17,768,13,0,1,0,0,0,1,0
2,C18802,0,31,259827.44,8,816,5,0,0,0,1,1,0,0
3,C19289,0,45,26677.55,14,353,18,1,0,0,0,1,0,0
4,C14028,0,39,43440.31,1,751,31,0,0,0,1,1,0,0


In [120]:
train.shape

(14000, 14)

In [121]:
## List the columns in Dev Sample
train.columns

Index(['Cust_ID', 'Target', 'Age', 'Balance', 'Cnt_Txns', 'SCR',
       'Holding_Period', 'Occ_PROF', 'Occ_SAL', 'Occ_SELF_EMP', 'Occ_SENP',
       'F', 'M', 'O'],
      dtype='object')

In [122]:
## drop Customer ID, Target Variables
train_pv = train.drop(labels = ["Cust_ID", "Target"], axis = 1)
train_pv.head()

Unnamed: 0,Age,Balance,Cnt_Txns,SCR,Holding_Period,Occ_PROF,Occ_SAL,Occ_SELF_EMP,Occ_SENP,F,M,O
0,41,91519.92,38,926,15,0,0,1,0,0,1,0
1,52,117288.96,17,768,13,0,1,0,0,0,1,0
2,31,259827.44,8,816,5,0,0,0,1,1,0,0
3,45,26677.55,14,353,18,1,0,0,0,1,0,0
4,39,43440.31,1,751,31,0,0,0,1,1,0,0


In [123]:
## scaling all variables
train_z_trf = train_pv.apply(zscore)
train_z_trf.shape

(14000, 12)

In [124]:
## Store the normalized features data into np array
X_train = np.array(train_z_trf)
X_train.shape

(14000, 12)

In [125]:
## Capture the target variable into a pandas series akin to array
y_train = train['Target']

In [126]:
## Fit the model
NNH.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=21, p=2,
           weights='uniform')

In [127]:
# Compute and print AUC score
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train["Target"],pd.DataFrame(NNH.predict_proba(X_train))[1])
auc

0.84293099647790726

In [128]:
## Data Preparation for the Hold Out Sample
## Predict the labels on Test Data
test_pv = test.drop(labels = ["Cust_ID", "Target"], axis = 1)
X_test = test_pv.apply(zscore)
h_auc = roc_auc_score(test["Target"],pd.DataFrame(NNH.predict_proba(X_test))[1])
h_auc

0.73860253810589516

### Train AUC = 0.85, Test AUC = 0.74
### Conclusion: Above Model is Overfitting
# GridSearchCV helps Parameter Tuning and Optimize the Model

In [146]:
from sklearn.model_selection import GridSearchCV

k = np.arange(151,163,2)
knn = KNeighborsClassifier()
parameters = {'n_neighbors' : k, 'algorithm' : ['kd_tree']}
#parameters = {'n_neighbors' : k, 'algorithm' : ['kd_tree', 'ball_tree']}
GS = GridSearchCV(knn, parameters, scoring = 'roc_auc', cv=3, verbose = 10)

In [147]:
GS.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] algorithm=kd_tree, n_neighbors=151 ..............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=kd_tree, n_neighbors=151, score=0.7531262478181009, total=   1.9s
[CV] algorithm=kd_tree, n_neighbors=151 ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.6s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=151, score=0.7698504329572292, total=   1.8s
[CV] algorithm=kd_tree, n_neighbors=151 ..............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.0s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=151, score=0.7734870382918622, total=   1.9s
[CV] algorithm=kd_tree, n_neighbors=153 ..............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.6s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=153, score=0.7528547225993406, total=   1.8s
[CV] algorithm=kd_tree, n_neighbors=153 ..............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   22.1s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=153, score=0.770442255256523, total=   1.9s
[CV] algorithm=kd_tree, n_neighbors=153 ..............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   27.8s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=153, score=0.7743158899934528, total=   2.1s
[CV] algorithm=kd_tree, n_neighbors=155 ..............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   33.8s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=155, score=0.7529882034841934, total=   1.9s
[CV] algorithm=kd_tree, n_neighbors=155 ..............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   39.6s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=155, score=0.7708252997615598, total=   2.0s
[CV] algorithm=kd_tree, n_neighbors=155 ..............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   45.3s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=155, score=0.7746140936239319, total=   2.0s
[CV] algorithm=kd_tree, n_neighbors=157 ..............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   51.3s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=157, score=0.7530021790469237, total=   2.0s
[CV] algorithm=kd_tree, n_neighbors=157 ..............................
[CV]  algorithm=kd_tree, n_neighbors=157, score=0.7701125460623139, total=   2.1s
[CV] algorithm=kd_tree, n_neighbors=157 ..............................
[CV]  algorithm=kd_tree, n_neighbors=157, score=0.7738252692552914, total=   2.3s
[CV] algorithm=kd_tree, n_neighbors=159 ..............................
[CV]  algorithm=kd_tree, n_neighbors=159, score=0.7520986161340741, total=   2.3s
[CV] algorithm=kd_tree, n_neighbors=159 ..............................
[CV]  algorithm=kd_tree, n_neighbors=159, score=0.770718629139904, total=   2.0s
[CV] algorithm=kd_tree, n_neighbors=159 ..............................
[CV]  algorithm=kd_tree, n_neighbors=159, score=0.7749874914584531, total=   1.9s
[CV] algorithm=kd_tree, n_neighbors=161 ..............................
[CV]  algorithm=kd_tree, n_neighbors=161, score=0.7522412239170364, total=   2.0s
[

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'algorithm': ['kd_tree'], 'n_neighbors': array([151, 153, 155, 157, 159, 161])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [148]:
GS.best_params_

{'algorithm': 'kd_tree', 'n_neighbors': 155}

In [132]:
GS.best_score_

0.76567308644487397

In [133]:
## Hold Out Model Performance - AUC
hold_out['prob'] = pd.DataFrame(GS.predict_proba(X_test))[1]
h_auc = roc_auc_score(hold_out["Target"],hold_out["prob"])
h_auc

0.76549527809529661

## RandomSearchCV

In [152]:
from sklearn.model_selection import RandomizedSearchCV
k = np.arange(51,201,2)
knn = KNeighborsClassifier()
parameters = {'n_neighbors' : k, 'algorithm' : ['kd_tree', 'ball_tree']}
RS = RandomizedSearchCV(knn, parameters, n_iter=10, scoring = 'roc_auc', cv=3, verbose = 10)

In [153]:
RS.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] algorithm=kd_tree, n_neighbors=121 ..............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=kd_tree, n_neighbors=121, score=0.7545674420727185, total=   1.6s
[CV] algorithm=kd_tree, n_neighbors=121 ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=121, score=0.7676673930156412, total=   1.7s
[CV] algorithm=kd_tree, n_neighbors=121 ..............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.6s remaining:    0.0s


[CV]  algorithm=kd_tree, n_neighbors=121, score=0.7733317894219195, total=   1.5s
[CV] algorithm=ball_tree, n_neighbors=169 ............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.2s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=169, score=0.7522882844854141, total=   1.4s
[CV] algorithm=ball_tree, n_neighbors=169 ............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   18.6s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=169, score=0.7699411315071932, total=   1.5s
[CV] algorithm=ball_tree, n_neighbors=169 ............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.0s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=169, score=0.7738184074267858, total=   1.5s
[CV] algorithm=ball_tree, n_neighbors=83 .............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   27.4s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=83, score=0.7494977353884066, total=   1.3s
[CV] algorithm=ball_tree, n_neighbors=83 .............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   31.2s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=83, score=0.7684177951695891, total=   1.3s
[CV] algorithm=ball_tree, n_neighbors=83 .............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   35.4s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=83, score=0.7704766969444851, total=   1.3s
[CV] algorithm=ball_tree, n_neighbors=115 ............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   39.3s remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=115, score=0.7538894846725154, total=   1.4s
[CV] algorithm=ball_tree, n_neighbors=115 ............................
[CV]  algorithm=ball_tree, n_neighbors=115, score=0.7680301872154974, total=   1.5s
[CV] algorithm=ball_tree, n_neighbors=115 ............................
[CV]  algorithm=ball_tree, n_neighbors=115, score=0.7723139515268997, total=   1.4s
[CV] algorithm=kd_tree, n_neighbors=51 ...............................
[CV]  algorithm=kd_tree, n_neighbors=51, score=0.742183382200267, total=   1.2s
[CV] algorithm=kd_tree, n_neighbors=51 ...............................
[CV]  algorithm=kd_tree, n_neighbors=51, score=0.7656252495636201, total=   1.1s
[CV] algorithm=kd_tree, n_neighbors=51 ...............................
[CV]  algorithm=kd_tree, n_neighbors=51, score=0.7622965396370665, total=   1.1s
[CV] algorithm=kd_tree, n_neighbors=99 ...............................
[CV]  algorithm=kd_tree, n_neighbors=99, score=0.7515284702177907, total=   1.4s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'algorithm': ['kd_tree', 'ball_tree'], 'n_neighbors': array([ 51,  53, ..., 197, 199])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=10)

In [154]:
RS.best_params_

{'algorithm': 'kd_tree', 'n_neighbors': 133}