In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from scipy.stats import zscore
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import os
%matplotlib inline

In [2]:
## Import the datafile
os.chdir("D:/K2Analytics/datafile")
dev = pd.read_csv("DEV_SAMPLE.csv")
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


In [3]:
dev['is_male']= np.where((dev['Gender']) == 'M', 1, 0)
dev['is_self_emp']= np.where((dev['Occupation']) == 'SELF-EMP', 1, 0)
dev_nb = dev[['Cust_ID', 'is_male', 'is_self_emp', 'Target']]
dev_nb.shape

(14000, 4)

In [4]:
dev_nb.head()

Unnamed: 0,Cust_ID,is_male,is_self_emp,Target
0,C16505,1,1,0
1,C17241,1,0,0
2,C18802,0,0,0
3,C19289,0,0,0
4,C14028,0,0,0


In [5]:
dev_nb.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Cust_ID,14000,14000.0,C3589,1.0,,,,,,,
is_male,14000,,,,0.7155,0.45122,0.0,0.0,1.0,1.0,1.0
is_self_emp,14000,,,,0.17,0.375648,0.0,0.0,0.0,0.0,1.0
Target,14000,,,,0.0882143,0.283616,0.0,0.0,0.0,0.0,1.0


In [6]:
## Target Variable Frequency Distribution
freq = dev_nb['Target'].value_counts().to_frame()
freq.reset_index(inplace=True)
freq.columns = [freq.columns[1], 'count']
freq['prop'] = freq['count'] / sum(freq['count'])
freq

Unnamed: 0,Target,count,prop
0,0,12765,0.911786
1,1,1235,0.088214


In [7]:
## Store the normalized features data into np array
X_train = np.array(dev_nb[['is_male', 'is_self_emp']])
X_train.shape

(14000, 2)

In [8]:
## Capture the target variable into a pandas series akin to array
y_train = dev['Target']

In [9]:
from sklearn.naive_bayes import BernoulliNB
NB = BernoulliNB()
NB.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
## Get the probability
dev_nb['prob'] = pd.DataFrame(NB.predict_proba(X_train))[1]
dev_nb[['is_male','is_self_emp','prob']].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,is_male,is_self_emp,prob
0,1,1,0.164815
1,1,0,0.078329
2,0,0,0.063694
3,0,0,0.063694
4,0,0,0.063694
5,1,0,0.078329
6,1,0,0.078329
7,0,1,0.136413
8,1,0,0.078329
9,1,1,0.164815


In [11]:
dev_nb['Target0']= np.where((dev_nb['Target']) == 0, 1, 0)
aggregations = {
    'Target0': { 'cnt_T0' : 'sum'},
    'Target' : { 'cnt_T1' : 'sum',
                'obs' : 'count',
                'prob' : 'mean'}
}
dev_nb.groupby(['is_male', 'is_self_emp']).agg(aggregations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Unnamed: 1_level_0,Target,Target,Target,Target0
Unnamed: 0_level_1,Unnamed: 1_level_1,obs,prob,cnt_T1,cnt_T0
is_male,is_self_emp,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,0,3346,0.06575,220,3126
0,1,637,0.128728,82,555
1,0,8274,0.077592,642,7632
1,1,1743,0.166954,291,1452


In [12]:
pd.crosstab(dev_nb['is_male'], dev_nb['Target'], rownames=['is_male'], colnames=['Target'])

Target,0,1
is_male,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3681,302
1,9084,933


In [13]:
pd.crosstab(dev_nb['is_self_emp'], dev_nb['Target'], rownames=['is_self_emp'], colnames=['Target'])

Target,0,1
is_self_emp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10758,862
1,2007,373


In [14]:
###########################################
###########################################
######  Gaussian Naive Bayes Model  ######

In [15]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import os
%matplotlib inline

In [16]:
## Import the datafile
os.chdir("D:/K2Analytics/datafile")
dev = pd.read_csv("DEV_SAMPLE.csv")
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


In [17]:
## Dummy 1 / 0 matrix for Occupation Variable
df_occ = pd.get_dummies(data = dev['Occupation'], dummy_na = True)
df_occ = df_occ.rename(columns = {'SELF-EMP' : 'SELF_EMP'})
dev = pd.concat([dev, df_occ.iloc[:, 0:4]], axis = 1)
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period,PROF,SAL,SELF_EMP,SENP
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15,0.0,0.0,1.0,0.0
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13,0.0,1.0,0.0,0.0
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5,0.0,0.0,0.0,1.0
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18,1.0,0.0,0.0,0.0
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31,0.0,0.0,0.0,1.0


In [18]:
## Dummy 1 / 0 matrix for Occupation Variable
df_gen = pd.get_dummies(data = dev['Gender'], dummy_na = True)
dev = pd.concat([dev, df_gen.iloc[:, 1:2]], axis = 1)
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period,PROF,SAL,SELF_EMP,SENP,M
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15,0.0,0.0,1.0,0.0,1.0
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13,0.0,1.0,0.0,0.0,1.0
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5,0.0,0.0,0.0,1.0,0.0
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18,1.0,0.0,0.0,0.0,0.0
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31,0.0,0.0,0.0,1.0,0.0


In [19]:
## drop Customer ID, Target, Gender, Occupation and Age Bucket Variables
dev_pv = dev.drop(labels = ["Cust_ID", "Target",  "Gender", "Occupation", "AGE_BKT" ], axis = 1)
dev_pv.head()

Unnamed: 0,Age,Balance,No_OF_CR_TXNS,SCR,Holding_Period,PROF,SAL,SELF_EMP,SENP,M
0,41,91519.92,38,926,15,0.0,0.0,1.0,0.0,1.0
1,52,117288.96,17,768,13,0.0,1.0,0.0,0.0,1.0
2,31,259827.44,8,816,5,0.0,0.0,0.0,1.0,0.0
3,45,26677.55,14,353,18,1.0,0.0,0.0,0.0,0.0
4,39,43440.31,1,751,31,0.0,0.0,0.0,1.0,0.0


In [20]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [21]:
## Store the normalized features data into np array
X_train = np.array(dev_pv)
X_train.shape

(14000, 10)

In [22]:
## Capture the target variable into a pandas series akin to array
y_train = dev['Target']

In [23]:
GNB = clf.fit(X_train, y_train)

In [24]:
## Get the probability
dev['prob'] = pd.DataFrame(GNB.predict_proba(X_train))[1]

In [25]:
## Model Performance Measure
## Deciling
dev['decile']=pd.qcut(dev.prob.rank(method='first'), 10, labels=False)

## Rank Order Table and KS Statistics
def Response_Rate(X,y,Target):
    
    Rank=X.groupby('decile').apply(lambda x: pd.Series([
        np.min(x[y]),
        np.max(x[y]),
        np.mean(x[y]),
        np.size(x[y]),
        np.sum(x[Target]),
        np.size(x[Target][x[Target]==0]),
        ],
        index=(["min_prob","max_prob","avg_prob",
                "cnt","cnt_resp","cnt_non_resp"])
        )).reset_index()
    Rank=Rank.sort_values(by='decile',ascending=False)
    Rank["rrate"]=round(Rank["cnt_resp"]*100/Rank["cnt"],2)
    Rank["cum_resp"]=np.cumsum(Rank["cnt_resp"])
    Rank["cum_non_resp"]=np.cumsum(Rank["cnt_non_resp"])
    Rank["cum_resp_pct"]=round(Rank["cum_resp"]*100/np.sum(Rank["cnt_resp"]),2)
    Rank["cum_non_resp_pct"]=round(
            Rank["cum_non_resp"]*100/np.sum(Rank["cnt_non_resp"]),2)
    Rank["KS"] = round(Rank["cum_resp_pct"] - Rank["cum_non_resp_pct"],2)
    Rank
    return(Rank)


RRate = Response_Rate(dev,"prob","Target")
RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.2097451,0.582578,0.294614,1400.0,362.0,1038.0,25.86,362.0,1038.0,29.31,8.13,21.18
8,8,0.1504595,0.209656,0.176878,1400.0,240.0,1160.0,17.14,602.0,2198.0,48.74,17.22,31.52
7,7,0.1172877,0.150434,0.132793,1400.0,179.0,1221.0,12.79,781.0,3419.0,63.24,26.78,36.46
6,6,0.09261111,0.117278,0.10442,1400.0,112.0,1288.0,8.0,893.0,4707.0,72.31,36.87,35.44
5,5,0.07185779,0.092581,0.082084,1400.0,101.0,1299.0,7.21,994.0,6006.0,80.49,47.05,33.44
4,4,0.05424886,0.071853,0.062744,1400.0,78.0,1322.0,5.57,1072.0,7328.0,86.8,57.41,29.39
3,3,0.0398152,0.054243,0.046964,1400.0,54.0,1346.0,3.86,1126.0,8674.0,91.17,67.95,23.22
2,2,0.02715647,0.039805,0.033199,1400.0,48.0,1352.0,3.43,1174.0,10026.0,95.06,78.54,16.52
1,1,0.01552272,0.027147,0.02125,1400.0,25.0,1375.0,1.79,1199.0,11401.0,97.09,89.31,7.78
0,0,6.441853e-07,0.015517,0.007993,1400.0,36.0,1364.0,2.57,1235.0,12765.0,100.0,100.0,0.0


In [26]:
# Compute and print AUC score
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(dev["Target"],dev["prob"])
auc

0.73853994110287013

In [27]:
## Data Preparation for the Hold Out Sample
## Predict the labels on Test Data
hold_out = pd.read_csv("HOLDOUT_SAMPLE.csv")


hold_out_occ = pd.get_dummies(data = hold_out['Occupation'], 
                                  dummy_na = True)
hold_out_occ = hold_out_occ.rename(columns = {'SELF-EMP' : 'SELF_EMP'})
hold_out = pd.concat([hold_out, 
                              hold_out_occ.iloc[:, 0:4]], axis = 1)

hold_out_gen = pd.get_dummies(data = hold_out['Gender'], dummy_na = True)
hold_out = pd.concat([hold_out, hold_out_gen.iloc[:, 1:2]], axis = 1)


hold_out_pv = hold_out.drop(labels = ["Cust_ID", "Target", "Gender", 
                          "Occupation", "AGE_BKT"], axis = 1)

X_test = np.array(hold_out_pv)
X_test.shape

(6000, 10)

In [28]:
## Hold Out Model Performance
## Get the probability

hold_out['prob'] = pd.DataFrame(GNB.predict_proba(X_test))[1]
## Deciling
hold_out['decile']=pd.qcut(hold_out.prob.rank(method='first'), 
        10, labels=False)

h_RRate = Response_Rate(hold_out,"prob","Target")
h_RRate

Unnamed: 0,decile,min_prob,max_prob,avg_prob,cnt,cnt_resp,cnt_non_resp,rrate,cum_resp,cum_non_resp,cum_resp_pct,cum_non_resp_pct,KS
9,9,0.2008252,0.578104,0.282574,600.0,147.0,453.0,24.5,147.0,453.0,29.52,8.23,21.29
8,8,0.1443331,0.200792,0.168825,600.0,102.0,498.0,17.0,249.0,951.0,50.0,17.28,32.72
7,7,0.112135,0.144319,0.127342,600.0,71.0,529.0,11.83,320.0,1480.0,64.26,26.9,37.36
6,6,0.08812169,0.112105,0.099992,600.0,46.0,554.0,7.67,366.0,2034.0,73.49,36.97,36.52
5,5,0.06860335,0.087994,0.07789,600.0,44.0,556.0,7.33,410.0,2590.0,82.33,47.07,35.26
4,4,0.05313628,0.068564,0.060749,600.0,18.0,582.0,3.0,428.0,3172.0,85.94,57.65,28.29
3,3,0.03878353,0.053128,0.045712,600.0,22.0,578.0,3.67,450.0,3750.0,90.36,68.16,22.2
2,2,0.0266795,0.038744,0.032621,600.0,19.0,581.0,3.17,469.0,4331.0,94.18,78.72,15.46
1,1,0.01547558,0.026676,0.021065,600.0,15.0,585.0,2.5,484.0,4916.0,97.19,89.35,7.84
0,0,7.750388e-07,0.015465,0.007494,600.0,14.0,586.0,2.33,498.0,5502.0,100.0,100.0,0.0
