In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import os
%matplotlib inline

In [2]:
## Import the datafile
os.chdir("D:/K2Analytics/datafile")
dev = pd.read_csv("DEV_SAMPLE.csv")
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


In [3]:
dev.shape

(14000, 10)

In [4]:
dev.dtypes

Cust_ID            object
Target              int64
Age                 int64
Gender             object
Balance           float64
Occupation         object
No_OF_CR_TXNS       int64
AGE_BKT            object
SCR                 int64
Holding_Period      int64
dtype: object

In [5]:
dev.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Cust_ID,14000,14000.0,C5890,1.0,,,,,,,
Target,14000,,,,0.0882143,0.283616,0.0,0.0,0.0,0.0,1.0
Age,14000,,,,38.4666,9.53708,21.0,30.0,38.0,46.25,55.0
Gender,14000,3.0,M,10017.0,,,,,,,
Balance,14000,,,,144987.0,169261.0,0.0,23729.4,79059.2,213225.0,1246970.0
Occupation,14000,4.0,SAL,4101.0,,,,,,,
No_OF_CR_TXNS,14000,,,,16.7959,13.0176,0.0,7.0,14.0,22.0,50.0
AGE_BKT,14000,7.0,31-35,2505.0,,,,,,,
SCR,14000,,,,556.279,261.464,100.0,330.0,559.0,785.25,999.0
Holding_Period,14000,,,,15.2706,8.96702,1.0,7.0,16.0,23.0,31.0


In [6]:
## Target Variable Frequency Distribution
freq = dev['Target'].value_counts().to_frame()
freq.reset_index(inplace=True)
freq.columns = [freq.columns[1], 'count']
freq['prop'] = freq['count'] / sum(freq['count'])
freq

Unnamed: 0,Target,count,prop
0,0,12765,0.911786
1,1,1235,0.088214


In [7]:
## Creating the K Nearest Neighbour Classifier Object
## weights options are 'uniform' 'distance'
## weights = 'distance' - weighs points by inverse of their distance
NNH = KNeighborsClassifier(n_neighbors = 21, weights = 'uniform', 
                           metric = 'euclidean')

In [8]:
## Dummy 1 / 0 matrix for Occupation Variable
df_occ = pd.get_dummies(data = dev['Occupation'], dummy_na = True)
df_occ = df_occ.rename(columns = {'SELF-EMP' : 'SELF_EMP'})
dev = pd.concat([dev, df_occ.iloc[:, 0:4]], axis = 1)
dev.head()
## Likewise one may convert Gender Variable into 1 / 0 matrix

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period,PROF,SAL,SELF_EMP,SENP
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15,0.0,0.0,1.0,0.0
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13,0.0,1.0,0.0,0.0
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5,0.0,0.0,0.0,1.0
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18,1.0,0.0,0.0,0.0
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31,0.0,0.0,0.0,1.0


In [9]:
## List the columns in Dev Sample
dev.columns

Index(['Cust_ID', 'Target', 'Age', 'Gender', 'Balance', 'Occupation',
       'No_OF_CR_TXNS', 'AGE_BKT', 'SCR', 'Holding_Period', 'PROF', 'SAL',
       'SELF_EMP', 'SENP'],
      dtype='object')

In [10]:
## drop Customer ID, Target, Gender, Occupation and Age Bucket Variables
dev_pv = dev.drop(labels = ["Cust_ID", "Target", "Gender", 
                          "Occupation", "AGE_BKT"], axis = 1)
dev_pv.head()

Unnamed: 0,Age,Balance,No_OF_CR_TXNS,SCR,Holding_Period,PROF,SAL,SELF_EMP,SENP
0,41,91519.92,38,926,15,0.0,0.0,1.0,0.0
1,52,117288.96,17,768,13,0.0,1.0,0.0,0.0
2,31,259827.44,8,816,5,0.0,0.0,0.0,1.0
3,45,26677.55,14,353,18,1.0,0.0,0.0,0.0
4,39,43440.31,1,751,31,0.0,0.0,0.0,1.0


In [11]:
## scaling all variables
df_z = dev_pv.apply(zscore)
df_z.shape

(14000, 9)

In [12]:
## Store the normalized features data into np array
X_train = np.array(df_z)
X_train.shape

(14000, 9)

In [13]:
## Capture the target variable into a pandas series akin to array
y_train = dev['Target']

In [14]:
## Fit the model
NNH.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=21, p=2,
           weights='uniform')

In [15]:
## Get the probability
dev['prob'] = pd.DataFrame(NNH.predict_proba(X_train))[1]

In [16]:
## Model Performance Measure
## Deciling
dev['decile']=pd.qcut(dev.prob.rank(method='first'), 10, labels=False)

## Rank Order Table and KS Statistics
def Response_Rate(X,y,Target):
    
    Rank=X.groupby('decile').apply(lambda x: pd.Series([
        np.min(x[y]),
        np.max(x[y]),
        np.mean(x[y]),
        np.size(x[y]),
        np.sum(x[Target]),
        np.size(x[Target][x[Target]==0]),
        ],
        index=(["min_prob","max_prob","avg_prob",
                "cnt","cnt_resp","cnt_non_resp"])
        )).reset_index()
    Rank=Rank.sort_values(by='decile',ascending=False)
    Rank["rrate"]=round(Rank["cnt_resp"]*100/Rank["cnt"],2)
    Rank["cum_resp"]=np.cumsum(Rank["cnt_resp"])
    Rank["cum_non_resp"]=np.cumsum(Rank["cnt_non_resp"])
    Rank["cum_resp_pct"]=round(Rank["cum_resp"]*100/np.sum(Rank["cnt_resp"]),2)
    Rank["cum_non_resp_pct"]=round(
            Rank["cum_non_resp"]*100/np.sum(Rank["cnt_non_resp"]),2)
    Rank["KS"] = round(Rank["cum_resp_pct"] - Rank["cum_non_resp_pct"],2)
    Rank
    return(Rank)


RRate = Response_Rate(dev,"prob","Target")
RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.190476,0.857143,0.349796,1400.0,503.0,897.0,35.93,503.0,897.0,40.73,7.03,33.7
8,8,0.142857,0.190476,0.168844,1400.0,255.0,1145.0,18.21,758.0,2042.0,61.38,16.0,45.38
7,7,0.095238,0.142857,0.120544,1400.0,189.0,1211.0,13.5,947.0,3253.0,76.68,25.48,51.2
6,6,0.095238,0.095238,0.095238,1400.0,131.0,1269.0,9.36,1078.0,4522.0,87.29,35.42,51.87
5,5,0.047619,0.095238,0.052959,1400.0,60.0,1340.0,4.29,1138.0,5862.0,92.15,45.92,46.23
4,4,0.047619,0.047619,0.047619,1400.0,60.0,1340.0,4.29,1198.0,7202.0,97.0,56.42,40.58
3,3,0.0,0.047619,0.03034,1400.0,37.0,1363.0,2.64,1235.0,8565.0,100.0,67.1,32.9
2,2,0.0,0.0,0.0,1400.0,0.0,1400.0,0.0,1235.0,9965.0,100.0,78.07,21.93
1,1,0.0,0.0,0.0,1400.0,0.0,1400.0,0.0,1235.0,11365.0,100.0,89.03,10.97
0,0,0.0,0.0,0.0,1400.0,0.0,1400.0,0.0,1235.0,12765.0,100.0,100.0,0.0


In [17]:
# Compute and print AUC score
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(dev["Target"],dev["prob"])
auc

0.84159431390552675

In [18]:
## Data Preparation for the Hold Out Sample
## Predict the labels on Test Data
hold_out = pd.read_csv("HOLDOUT_SAMPLE.csv")


hold_out_occ = pd.get_dummies(data = hold_out['Occupation'], 
                                  dummy_na = True)
hold_out_occ = hold_out_occ.rename(columns = {'SELF-EMP' : 'SELF_EMP'})
hold_out = pd.concat([hold_out, 
                              hold_out_occ.iloc[:, 0:4]], axis = 1)


hold_out_pv = hold_out.drop(labels = ["Cust_ID", "Target", "Gender", 
                          "Occupation", "AGE_BKT"], axis = 1)
X_test = hold_out_pv.apply(zscore)
X_test.shape

(6000, 9)

In [19]:
## Hold Out Model Performance
## Get the probability

hold_out['prob'] = pd.DataFrame(NNH.predict_proba(X_test))[1]
## Deciling
hold_out['decile']=pd.qcut(hold_out.prob.rank(method='first'), 
        10, labels=False)

h_RRate = Response_Rate(hold_out,"prob","Target")
h_RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.238095,0.857143,0.348492,600.0,168.0,432.0,28.0,168.0,432.0,33.73,7.85,25.88
8,8,0.142857,0.238095,0.175,600.0,68.0,532.0,11.33,236.0,964.0,47.39,17.52,29.87
7,7,0.095238,0.142857,0.118095,600.0,74.0,526.0,12.33,310.0,1490.0,62.25,27.08,35.17
6,6,0.095238,0.095238,0.095238,600.0,38.0,562.0,6.33,348.0,2052.0,69.88,37.3,32.58
5,5,0.047619,0.095238,0.049683,600.0,34.0,566.0,5.67,382.0,2618.0,76.71,47.58,29.13
4,4,0.047619,0.047619,0.047619,600.0,30.0,570.0,5.0,412.0,3188.0,82.73,57.94,24.79
3,3,0.0,0.047619,0.03127,600.0,25.0,575.0,4.17,437.0,3763.0,87.75,68.39,19.36
2,2,0.0,0.0,0.0,600.0,26.0,574.0,4.33,463.0,4337.0,92.97,78.83,14.14
1,1,0.0,0.0,0.0,600.0,20.0,580.0,3.33,483.0,4917.0,96.99,89.37,7.62
0,0,0.0,0.0,0.0,600.0,15.0,585.0,2.5,498.0,5502.0,100.0,100.0,0.0


In [20]:
# Compute and print AUC score
from sklearn.metrics import roc_auc_score
h_auc = roc_auc_score(hold_out["Target"],hold_out["prob"])
h_auc

0.71784338371296896

In [21]:
from sklearn.grid_search import GridSearchCV

k = np.arange(151,175,2)
knn = KNeighborsClassifier(algorithm = 'kd_tree')
parameters = {'n_neighbors' : k}
##GS = GridSearchCV(knn, parameters, scoring = my_auc, cv=10)
GS = GridSearchCV(knn, parameters, scoring = 'roc_auc', cv=10)



In [22]:
GS.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [23]:
GS.best_params_

{'n_neighbors': 159}

In [24]:
GS.best_score_

0.7660942185708379

In [25]:
## Hold Out Model Performance - AUC
hold_out['prob'] = pd.DataFrame(GS.predict_proba(X_test))[1]
h_auc = roc_auc_score(hold_out["Target"],hold_out["prob"])
h_auc

0.76003943071449742

In [26]:
## Rank Ordering on Development Sample
dev['prob'] = pd.DataFrame(GS.predict_proba(X_train))[1]
dev['decile']=pd.qcut(dev.prob.rank(method='first'), 10, labels=False)
RRate = Response_Rate(dev,"prob","Target")
RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.176101,0.584906,0.278684,1400.0,465.0,935.0,33.21,465.0,935.0,37.65,7.32,30.33
8,8,0.125786,0.176101,0.147444,1400.0,210.0,1190.0,15.0,675.0,2125.0,54.66,16.65,38.01
7,7,0.09434,0.125786,0.110907,1400.0,164.0,1236.0,11.71,839.0,3361.0,67.94,26.33,41.61
6,6,0.075472,0.09434,0.086195,1400.0,121.0,1279.0,8.64,960.0,4640.0,77.73,36.35,41.38
5,5,0.062893,0.075472,0.068679,1400.0,84.0,1316.0,6.0,1044.0,5956.0,84.53,46.66,37.87
4,4,0.044025,0.062893,0.053926,1400.0,77.0,1323.0,5.5,1121.0,7279.0,90.77,57.02,33.75
3,3,0.037736,0.044025,0.041096,1400.0,56.0,1344.0,4.0,1177.0,8623.0,95.3,67.55,27.75
2,2,0.025157,0.037736,0.030022,1400.0,32.0,1368.0,2.29,1209.0,9991.0,97.89,78.27,19.62
1,1,0.012579,0.025157,0.020485,1400.0,18.0,1382.0,1.29,1227.0,11373.0,99.35,89.1,10.25
0,0,0.0,0.012579,0.008562,1400.0,8.0,1392.0,0.57,1235.0,12765.0,100.0,100.0,0.0


In [27]:
## Deciling
hold_out['decile']=pd.qcut(hold_out.prob.rank(method='first'), 
        10, labels=False)

h_RRate = Response_Rate(hold_out,"prob","Target")
h_RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.176101,0.578616,0.271541,600.0,169.0,431.0,28.17,169.0,431.0,33.94,7.83,26.11
8,8,0.125786,0.176101,0.147474,600.0,83.0,517.0,13.83,252.0,948.0,50.6,17.23,33.37
7,7,0.09434,0.125786,0.111237,600.0,63.0,537.0,10.5,315.0,1485.0,63.25,26.99,36.26
6,6,0.075472,0.09434,0.08587,600.0,59.0,541.0,9.83,374.0,2026.0,75.1,36.82,38.28
5,5,0.062893,0.075472,0.068092,600.0,36.0,564.0,6.0,410.0,2590.0,82.33,47.07,35.26
4,4,0.044025,0.062893,0.053637,600.0,30.0,570.0,5.0,440.0,3160.0,88.35,57.43,30.92
3,3,0.037736,0.044025,0.041111,600.0,28.0,572.0,4.67,468.0,3732.0,93.98,67.83,26.15
2,2,0.025157,0.037736,0.030241,600.0,11.0,589.0,1.83,479.0,4321.0,96.18,78.54,17.64
1,1,0.018868,0.025157,0.021572,600.0,12.0,588.0,2.0,491.0,4909.0,98.59,89.22,9.37
0,0,0.0,0.018868,0.008784,600.0,7.0,593.0,1.17,498.0,5502.0,100.0,100.0,0.0
