# Objective

To predict the likelihood that digital prospects buy the Limited Basic/Blast or Limited Basic/Performance tier when they select the 
Video/HSD product mix.

This is a binary classification model.

## Business use case

This model is to be used by the digital marketing team for prospect offers.

# Data 

Data : Input variables are aggregated 10-session data for prospects who converted on both dotcom and mdotcom

    Raw data source: digital_modeling.omn_vw_visit_activity_dotcom + digital_modeling.omn_vw_visit_activity_mdotcom    
    Data source for model: digital_modeling.anonymous_digital_klondike_pml_p2_data
        
Target : Limited Basic/Performance - 1, Limited Basic/Blast - 0

Comparing trends m-o-m :   

    Dev --> Jan'19    
    OOT --> Feb'19  

Customers with Video/HSD and Limited Basic/Blast + Limited Basic/Performance tiers :    

    Jan'19 - 2,593   
    Feb'19 - 1,530


## Target Distribution

53.88% - Limited Basic/Performance (1)    
46.12% - Limited Basic/Blast (0)  
 

### Where condition & Target definition

<a id='links'></a>
[Execution](#execution)   
[Step1 Model](#step1)   
[Step1 Final Model](#1final)       
[Step1 OOT](#1oot)     

[Data Prep](#data_make)   
[Final Data mapping](#datamap)    
[WOE & IV](#WOE&IV)   
[Categ Corr](#catg_corr)  
[Encoder](#Encoder)  
[Model fit](#model_fit)    
[Metrics](#metrics)   
[ROC AUC](#roc_auc)   
[KS & Deciles](#ks_deciles)   
[Plots](#plots)  
[Final o/p variables](#score_decile_segment)

<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>

In [1]:
import pandas as pd
from datetime import datetime
from numpy import *
from sklearn import preprocessing
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.metrics import *
from sklearn import linear_model
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.stats import binned_statistic
import warnings
from sklearn.externals import joblib
import statsmodels.api as sm
#pd.options.display.float_format = '{:.0f}'.format

import inspect
#inspect.getargspec(KNN)
% matplotlib inline


  from pandas.core import datetools
UsageError: Line magic function `%` not found.


<a id = 'datamap'></a>

In [2]:
def data_prep(df, var_drop, target):
    """Split ``df`` into imputed numerical and categorical modelling frames.

    Columns of dtype float64/int64 are treated as numerical and columns of
    object dtype as categorical.  Every column named in ``var_drop`` -- and
    the target itself -- is excluded from the predictor lists.  The target is
    then appended exactly once as the last column of each returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing predictors and the target column.
    var_drop : collection of str
        Column names that must not be used as predictors.
    target : str
        Name of the target column.

    Returns
    -------
    num_df : pandas.DataFrame
        Numerical predictors with missing values replaced by 0, plus target.
    cat_df : pandas.DataFrame
        Categorical predictors with missing / single-space values replaced by
        'NA' and all values upper-cased, plus target.
    num_var, cat_var : pandas.Index
        Names of the retained numerical / categorical predictors.
    """
    # The target is appended explicitly below, so it must never also be kept
    # as a predictor: that would create two columns with the same name and
    # leak the label into the feature list.
    excluded = set(var_drop) | {target}

    num = df.select_dtypes(include=['float64', 'int64']).columns
    cat = df.select_dtypes(include=['O']).columns

    num_var = num.drop([x for x in num if x in excluded])
    cat_var = cat.drop([x for x in cat if x in excluded])

    target_df = pd.DataFrame(df[target])
    num_df = pd.concat([df[num_var].fillna(0), target_df], axis=1)

    cat_clean = (df[cat_var].fillna('NA').replace(' ', 'NA')
                 .apply(lambda x: x.astype(str).str.upper()))
    cat_df = pd.concat([cat_clean, target_df], axis=1)

    return num_df, cat_df, num_var, cat_var

In [3]:
def datamap(data_full, varb, enc_map, woe_map):
    """Build the scoring matrix for the variables listed in ``varb``.

    Numerical columns (float64/int64) get missing values replaced by the
    sentinel 999999; object columns get blanks / missing values replaced by
    'NA' and are upper-cased.  Names in ``varb['Var_name']`` are then resolved:

    * ``<col>_LE``  -> ``enc_map[<col>].transform(...)`` (fitted label encoder)
    * ``<col>_WOE`` -> the 'WOE' value looked up in the table of ``woe_map``
      that has a ``<col>`` column
    * anything else -> the cleaned column itself

    Parameters
    ----------
    data_full : pandas.DataFrame
        Raw data to be mapped.
    varb : pandas.DataFrame
        Must contain a 'Var_name' column with the model variable names.
    enc_map : mapping of str -> fitted encoder exposing ``transform``
    woe_map : sequence of pandas.DataFrame
        One table per categorical variable with columns [<col>, 'WOE'].

    Returns
    -------
    pandas.DataFrame
        Columns ordered as raw variables, ``_LE`` variables, ``_WOE``
        variables; indexed like ``data_full``; remaining gaps (e.g. categories
        without a WOE value) filled with 0.

    Raises
    ------
    KeyError
        If an ``_LE`` variable has no encoder or a ``_WOE`` variable has no
        table.
    """
    num_cols = data_full.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = data_full.select_dtypes(include=['O']).columns
    num_df = data_full[num_cols].fillna(999999)
    cat_df = (data_full[cat_cols].replace(' ', 'NA').fillna('NA')
              .apply(lambda x: x.astype(str).str.upper()))
    data = pd.concat([num_df, cat_df], axis=1)

    # Classify by suffix only: a plain substring test would also catch names
    # that merely contain 'LE' or 'WOE' somewhere in the middle.
    names = list(varb['Var_name'])
    enc_vars = [n[:-len('_LE')] for n in names if n.endswith('_LE')]
    woe_vars = [n[:-len('_WOE')] for n in names if n.endswith('_WOE')]
    raw_vars = [n for n in names if not n.endswith(('_LE', '_WOE'))]

    pieces = [data[raw_vars]]

    for k in enc_vars:
        fitted = enc_map.get(k)
        if fitted is None:
            raise KeyError("no fitted encoder for variable %r" % k)
        # Attach the original index so rows stay aligned even when the input
        # frame does not carry a 0..n-1 index.
        pieces.append(pd.Series(fitted.transform(data[k]),
                                index=data.index, name=str(k) + '_LE'))

    for m in woe_vars:
        table = next((t for t in woe_map if m in t), None)
        if table is None:
            raise KeyError("no WOE table for variable %r" % m)
        # A per-row lookup keeps row order and index intact; categories that
        # are absent from the table become NaN and are filled with 0 below.
        lookup = table.drop_duplicates(subset=[m]).set_index(m)['WOE']
        pieces.append(data[m].map(lookup).rename(str(m) + '_WOE'))

    fin_df = pd.concat(pieces, axis=1).fillna(0)

    return fin_df

<a id='WOE&IV'></a>

In [4]:
def woe_iv(df,var_list,target):
    """Compute weight of evidence (WOE) and information value (IV) per variable.

    For every variable in ``var_list`` the rows of ``df`` are counted by
    (category, target value).  Each target column is converted to a share of
    its own total, and per category

        WOE = log((event share + 0.001) / (non-event share + 0.001))
        IV  = (event share - non-event share) * WOE

    where target value 1 is the event and 0 the non-event.

    Returns
    -------
    tuple ``(tables, iv, woe_maps)``
        tables   : list of per-variable frames (counts, shares, WOE, IV)
        iv       : frame ['Variable_name', 'IV'] sorted by IV, highest first
        woe_maps : list of per-variable frames [<variable>, 'WOE']
    """
    tables = []
    iv_rows = []
    woe_maps = []

    for variable in var_list:
        counts = df.groupby([variable, target]).size().unstack()
        # Share of each target class falling into each category; categories
        # never observed for a class contribute a share of 0.
        shares = counts.apply(lambda col: col / float(col.sum())).fillna(0)
        non_event = shares[0]
        event = shares[1]

        counts['Non event_pct'] = non_event
        counts['Event_pct'] = event
        # The 0.001 offset keeps the ratio finite when a share is 0.
        counts['WOE'] = log((event + 0.001) / (non_event + 0.001))
        counts['IV'] = (event - non_event) * counts['WOE']

        woe_maps.append(pd.DataFrame(counts['WOE']).reset_index())
        tables.append(counts)
        iv_rows.append([variable, counts['IV'].sum()])

    iv = pd.DataFrame(iv_rows, columns=['Variable_name', 'IV'])
    iv = iv.sort_values('IV', ascending=False)

    return (tables, iv, woe_maps)

<a id='catg_corr'></a>

In [5]:
def cor_cat_var(df, woe_df, var_list):
    """Attach the WOE value of every categorical variable to its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the categorical columns named in ``var_list``.
    woe_df : sequence of pandas.DataFrame
        ``woe_df[i]`` is the WOE table of ``var_list[i]``: indexed by category
        and holding a 'WOE' column.
    var_list : sequence of str
        Names of the categorical variables.

    Returns
    -------
    res : pandas.DataFrame
        For each variable the original column followed by ``<var>_WOE``.
    fin_cat : pandas.DataFrame
        Only the ``<var>_WOE`` columns of ``res``.

    Notes
    -----
    Columns are selected by name, so ``df`` may hold its columns in any order
    and may carry a named index.  Rows whose category has no WOE value are
    dropped for that variable (inner-join behaviour).
    """
    res = pd.DataFrame()

    for i, var_name in enumerate(var_list):
        woe_col = var_name + '_WOE'
        # Category -> WOE lookup; the table is indexed by category.
        woe_lookup = woe_df[i]['WOE']
        var = pd.DataFrame({var_name: df[var_name],
                            woe_col: df[var_name].map(woe_lookup)})
        var = var[var[woe_col].notna()]
        res = pd.concat([res, var], axis=1).reindex(var.index)

    # res alternates <var>, <var>_WOE, so the odd positions are the WOE columns.
    fin_cat = res.iloc[:, 1::2].copy()

    return res, fin_cat

<a id='Encoder'></a>

In [6]:
def encoder(df, var_list):
    """Label-encode the columns of ``df`` named in ``var_list``.

    Values are cleaned first (missing and single-space values become 'NA',
    everything is cast to upper-case strings).  One ``LabelEncoder`` is then
    fitted per column.

    Returns
    -------
    enc_res : pandas.DataFrame
        Encoded columns, renamed to ``<column>_LE``.
    encoders : collections.defaultdict
        Fitted encoder per original column name.
    """
    encoders = defaultdict(preprocessing.LabelEncoder)

    cleaned = df[var_list].fillna('NA').replace(' ', 'NA')
    cleaned = cleaned.apply(lambda col: col.astype(str).str.upper())

    # Looking up encoders[col.name] creates the encoder for that column on
    # first access, so the mapping ends up with one fitted encoder per column.
    enc_res = cleaned.apply(lambda col: encoders[col.name].fit_transform(col))
    enc_res.columns = ['{}_LE'.format(name) for name in enc_res.columns]

    return enc_res, encoders

<a id='model_fit'></a>

In [7]:
def model_fit(X, Y, obj, size):
    """Backward feature elimination driven by ``obj.feature_importances_``.

    While more than ``size`` features remain, ``obj`` is refitted on the
    current features and every feature tied for the lowest importance is
    removed.  Because ties are dropped together, the result can hold fewer
    than ``size`` features.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    Y : array-like
        Target values.
    obj : estimator
        Must implement ``fit`` and expose ``feature_importances_``.
    size : int
        Elimination stops once at most this many features remain.

    Returns
    -------
    pandas.DataFrame
        Columns ['Var_name', 'Importance'], sorted by importance (descending),
        taken from the last fit.  If ``X`` already has at most ``size``
        columns the model is fitted once and all features are returned.
    """
    def _ranked_importances(columns):
        # Fit on the given columns and return their importances, best first.
        obj.fit(X[columns], Y)
        return pd.DataFrame(list(zip(X[columns], obj.feature_importances_)),
                            columns=['Var_name', 'Importance'])\
            .sort_values(['Importance'], ascending=[False])

    i = 0
    var_list = X.columns
    redu_feat = None
    while len(var_list) > size:
        var_imp = _ranked_importances(var_list)
        redu_feat = var_imp[var_imp['Importance'] > var_imp['Importance'].min()]
        i = i + 1
        var_list = redu_feat['Var_name']

    if redu_feat is None:
        # Nothing had to be eliminated: still return a ranked table instead of
        # failing on an undefined result.
        redu_feat = _ranked_importances(var_list)

    print ("Iteration "+ str(i) + " : No. of reduced features are: ", len(redu_feat))
    return redu_feat

<a id='metrics'></a>

In [8]:
def metrics(y_true, y_pred, y_proba):
    """Return ROC AUC, accuracy, precision, recall and the confusion matrix.

    ``y_proba`` is used for the ROC curve; ``y_pred`` (hard labels) for the
    remaining metrics.  As a side effect numpy's print precision is set to 2.

    Returns
    -------
    tuple ``(roc_auc, accuracy, precision, recall, confusion_matrix)``
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    # Global numpy setting: affects how arrays are printed from here on.
    set_printoptions(precision=2)

    return area_under_curve, accuracy, precision, recall, matrix

<a id='roc_auc'></a>

In [9]:
def roc_auc(X, Y, obj):
    """Plot the ROC curve of a fitted classifier and return its AUC.

    The curve is drawn into the left panel of a 1x2 subplot grid, together
    with the dashed diagonal of a random classifier.

    Parameters
    ----------
    X : feature matrix passed to ``obj.predict_proba``
    Y : true binary labels
    obj : fitted classifier exposing ``predict_proba``

    Returns
    -------
    float
        Area under the ROC curve.
    """
    # Only the score of the positive class (second column) is needed; the
    # hard-label prediction that used to be computed here was never used.
    y_proba = obj.predict_proba(X)[:,1]
    fpr, tpr, _ = roc_curve(Y, y_proba)
    area = auc(fpr, tpr)

    plt.subplot(1,2,1)
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, label = 'AUC = %0.2f' % area)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.xlabel('False Positive Rate (1 - Specificity)')

    return area

<a id='ks_deciles'></a>

In [10]:
def out_ks(df, Y, obj, targ):
    """Score ``df`` with ``obj`` and build the decile / KS table.

    Rows are ranked into 10 equal-sized score bands with ``pd.qcut``; decile 1
    holds the highest scores and decile 10 the lowest.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to score.
    Y : pandas.Series or DataFrame
        True labels; must provide a column / name equal to ``targ``.
    obj : fitted classifier exposing ``predict`` and ``predict_proba``
    targ : str
        Name of the target column.

    Returns
    -------
    tuple ``(pred, proba, ks, fin_res)``
        pred    : predicted labels
        proba   : predicted probability of class 1
        ks      : maximum of |cumulative % of 1s - cumulative % of 0s|,
                  on a 0-100 scale
        fin_res : one row per decile (1..10) with counts, summed
                  probability, class percentages, cumulative percentages
                  and KS
    """
    outp = pd.DataFrame(Y)
    outp['Pred'] = obj.predict(df)
    outp['Proba'] = obj.predict_proba(df)[:,1]
    outp['Decile'] = pd.qcut(outp['Proba'], 10, labels = arange(10,0,-1))

    deciles = arange(1,11)
    # Group on plain integers and reindex on 1..10 explicitly.  Grouping on
    # the categorical returns rows in category order (10..1), which would not
    # line up with the 1..10 labels of the result table.
    decile_key = outp['Decile'].astype(int)

    def _per_decile(grouped_stat):
        # Align a per-decile statistic on deciles 1..10 (0 for empty deciles).
        return grouped_stat.reindex(deciles, fill_value=0).values

    fin_res = pd.DataFrame(deciles, columns = ['Decile'])
    fin_res['Count'] = _per_decile(outp['Pred'].groupby(decile_key).count())
    fin_res['Target'] = _per_decile((outp[targ] == 1).groupby(decile_key).sum())
    fin_res['Non_Target'] = _per_decile((outp[targ] == 0).groupby(decile_key).sum())
    fin_res['Prob_Target'] = _per_decile(outp['Proba'].groupby(decile_key).sum())
    fin_res['1_pct'] = (fin_res['Target']/fin_res['Target'].sum())*100
    fin_res['0_pct'] = (fin_res['Non_Target']/fin_res['Non_Target'].sum())*100
    # Running totals from the best decile downwards.
    fin_res['1_cum'] = fin_res['1_pct'].cumsum()
    fin_res['0_cum'] = fin_res['0_pct'].cumsum()
    fin_res['KS'] = (fin_res['1_cum'] - fin_res['0_cum']).abs()
    ks = fin_res['KS'].max()

    return (outp['Pred'], outp['Proba'], ks, fin_res)

<a id='plots'></a>

In [11]:
def plots(y_true, y_proba, color):
    """Draw the ROC curve (left) and the precision-recall curve (right).

    Both curves are drawn in ``color`` on a 1x2 subplot grid.  The ROC legend
    shows the AUC; the precision-recall title shows the average precision.
    """
    def _unit_axes(x_label, y_label):
        # Both panels use the [0, 1] x [0, 1] window.
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel(y_label)
        plt.xlabel(x_label)

    # --- left panel: ROC curve ---
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_proba)
    area_under_roc = auc(false_pos_rate, true_pos_rate)
    plt.subplot(1, 2, 1)
    plt.title('ROC Curve')
    plt.plot(false_pos_rate, true_pos_rate, color=color,
             label='AUC = %0.2f' % area_under_roc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # reference line of a random classifier
    _unit_axes('False Positive Rate (1 - Specificity)',
               'True Positive Rate (Sensitivity)')

    # --- right panel: precision-recall curve ---
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    average_precision = average_precision_score(y_true, y_proba)
    plt.subplot(1, 2, 2)
    plt.tight_layout()
    plt.title('Precision-Recall : AUC={0:0.2f}'.format(average_precision))
    plt.plot(recall, precision, color=color)
    plt.legend(loc='lower right')
    _unit_axes('Recall (Sensitivity)', 'Precision')

<a id='score_decile_segment'></a>

In [12]:
def outs(d, fin_vars, obj):
    """Add prediction, score, decile and segment columns to ``d`` in place.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain ``fin_vars`` plus the columns 'decile_segment_flag',
        'division' and 'region'.
    fin_vars : list of str
        Model input columns.
    obj : fitted classifier exposing ``predict`` and ``predict_proba``

    Returns
    -------
    pandas.DataFrame
        The same object ``d`` with added columns:
        'pred', 'score' (probability of class 1),
        'nat_decile' (score decile within decile_segment_flag),
        'div_decile' (within decile_segment_flag x division),
        'reg_decile' (within decile_segment_flag x region) and
        'segment' (nat_decile 1-3 'HIGH', 4-6 'MID', 7-10 'LOW').
        Decile 1 holds the highest scores.
    """
    d['pred'] = obj.predict(d[fin_vars])
    d['score'] = obj.predict_proba(d[fin_vars])[:,1]

    decile_labels = arange(10,0,-1)

    def _decile_within(keys):
        # group_keys=False keeps the original row index on the result, so it
        # can be assigned straight back to ``d``; with group keys added to the
        # index the assignment cannot be aligned.
        return d.groupby(keys, group_keys=False)['score']\
                .apply(lambda x: pd.qcut(x, 10, labels = decile_labels))

    d['nat_decile'] = _decile_within(['decile_segment_flag'])
    d['div_decile'] = _decile_within(['decile_segment_flag','division'])
    d['reg_decile'] = _decile_within(['decile_segment_flag','region'])

    segment_deciles = (('HIGH', [1, 2, 3]),
                       ('MID', [4, 5, 6]),
                       ('LOW', [7, 8, 9, 10]))
    for segment, members in segment_deciles:
        d.loc[d['nat_decile'].isin(members), 'segment'] = segment
    return d


<a id='execution'></a>
### Data pull

In [13]:
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Load the development and out-of-time (OOT) extracts; both are pipe-delimited.
data = pd.read_csv(r'/home/pbojja200/Digital/PML_P2/vid_tiers_dev_data_zip.txt', sep='|')
oot = pd.read_csv(r'/home/pbojja200/Digital/PML_P2/vid_tiers_oot_data_zip.txt', sep='|')

In [14]:
# Keep only the second dot-separated segment of every column name
# (presumably dropping a table prefix -- TODO confirm).
# A column name without a '.' would raise IndexError here.
data.columns = [i[1] for i in data.columns.str.split('.')]
oot.columns = [i[1] for i in oot.columns.str.split('.')]

In [15]:
# Restrict the OOT frame to the development columns, in the same order.
act_vars = data.columns
oot_1 = oot[act_vars]
oot_1.shape

(1530, 323)

In [16]:
# Stack development and OOT rows into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
full_df = pd.concat([data, oot_1], ignore_index=True)
print(full_df.shape)

(4123, 323)


In [17]:
full_df.tail()

Unnamed: 0,s_sum_page_loadtime_last_5sess,s_cnt_localizn_adrs_valid_type_usps_last_10s,d_cnt_phone_visit_l30d,s_cnt_geo_region_ca_last_5sess,s_cnt_geo_loc_mid_west_last_7sess,s_cnt_domain_comcast_last_10sess,d_cnt_comcast_region_mou_last_15days,d_cnt_comcast_region_big_last_3days,d_cnt_comcast_region_hea_last_15days,s_dealsofferspage_flag_y_last_3sess,...,s_cnt_comcast_region_mou_last_10sess,day_id,s_sum_attempt_localisation_l3d,s_geo_dma_602_last_5sess,s_cnt_prev_page_2173979979_last_5sess,s_cnt_page_thru_triple_play_last_3sess,s_cnt_geo_loc_co_last_5sess,s_sub_section_name_my_plan_last_7sess,d_cnt_phone_visit_l3d,division
4118,29.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,2019-02-03,1.0,0.0,0.0,0.0,0.0,0.0,1.0,WEST DIVISION
4119,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,2019-02-04,0.0,0.0,1.0,,0.0,0.0,1.0,WEST DIVISION
4120,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,2019-02-04,0.0,0.0,1.0,,0.0,0.0,2.0,
4121,163.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,2019-02-01,1.0,0.0,0.0,0.0,0.0,0.0,0.0,WEST DIVISION
4122,34.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,2019-02-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [18]:
target = 'vid_tiers_target'
#data[target] = data[target].astype('int')

In [19]:
full_df[target].value_counts()

0    2142
1    1981
Name: vid_tiers_target, dtype: int64

In [20]:
# Work on a copy and cast every column that can be parsed as a number to float;
# columns that cannot be converted keep their original dtype.
df = full_df.copy()
for x in df.columns:
    try:
        df[x] = df[x].astype(float)
    except (ValueError, TypeError):
        # Non-numeric content (e.g. text or dates): leave the column untouched.
        # Only conversion errors are caught, so unrelated failures still surface.
        continue

In [21]:
#df = data.copy()
# Every column except the target is a candidate input at this stage.
x_vars = list(df.columns)
print(len(x_vars))
x_vars.remove(target)
print(len(x_vars))

# Random 80/20 split of the combined frame into development and validation
# sets, with a fixed seed.
X_dev, X_val, Y_dev, Y_val = train_test_split(df[x_vars], df[target], test_size=0.20, 
                                                    random_state=78)

print (X_dev.shape)
print (X_val.shape)

# Re-attach the target so each split is a single frame again.
dev = pd.concat([X_dev, Y_dev], axis = 1)
val = pd.concat([X_val, Y_val], axis = 1)
dev.shape, val.shape

323
322
(3298, 322)
(825, 322)


((3298, 323), (825, 323))

In [22]:
# Share of each target class in every frame (class count / number of rows).
print("Original dev : ", data[target].value_counts()/data.shape[0])
print("OOT : ", oot[target].value_counts()/oot.shape[0])
print("Full data : ", full_df[target].value_counts()/full_df.shape[0])
print("dev : ", dev[target].value_counts()/dev.shape[0]) 
print("val : ", val[target].value_counts()/val.shape[0])

Original dev :  1    0.538758
0    0.461242
Name: vid_tiers_target, dtype: float64
OOT :  0    0.618301
1    0.381699
Name: vid_tiers_target, dtype: float64
Full data :  0    0.519525
1    0.480475
Name: vid_tiers_target, dtype: float64
dev :  0.0    0.522135
1.0    0.477865
Name: vid_tiers_target, dtype: float64
val :  0.0    0.509091
1.0    0.490909
Name: vid_tiers_target, dtype: float64


In [23]:
# Column names of dev by dtype: float64/int64 -> numerical, object -> categorical.
num = dev.select_dtypes(include=['float64','int64']).columns
cat = dev.select_dtypes(include=['O']).columns

In [24]:
cat

Index(['product_category', 'product_tiers', 'geo_zip', 'geo_region',
       'geo_city', 'day_id', 'division'],
      dtype='object')

In [25]:
# Summary statistics for the numerical columns of dev, extended with the
# range (max - min) and the percentage of missing values per column.
# Rows are added with .loc; the .ix indexer was removed in pandas 1.0.
s_num = pd.DataFrame(dev[num].describe())
s_num.loc['max_min'] = s_num.loc['max'] - s_num.loc['min']
s_num.loc['missing'] = ((len(dev[num]) - s_num.loc['count'])/len(dev[num]))*100
st = s_num.transpose()
# Show the columns with more than 50% missing values.
st[st['missing']>50]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,max_min,missing
timespent_seconds_afternoon_l7d,1405.0,1867.886833,2787.470308,0.0,351.0,1072.0,2276.0,31288.0,31288.0,57.398423
num_of_pages_afternoon,1405.0,121.479715,162.107349,1.0,31.0,86.0,152.0,2465.0,2464.0,57.398423
num_of_pages_early_morning_l7d,622.0,91.615756,86.826397,1.0,22.0,77.0,128.0,571.0,570.0,81.140085
timespent_seconds_early_morning,622.0,1338.565916,1500.558079,0.0,290.5,917.5,1872.0,11326.0,11326.0,81.140085
num_of_pages_daytime_l7d,1456.0,126.116071,133.672831,1.0,39.75,93.0,167.0,1259.0,1258.0,55.852032
num_of_pages_afternoon_l7d,1405.0,121.479715,162.107349,1.0,31.0,86.0,152.0,2465.0,2464.0,57.398423
timespent_seconds_primetime_l7d,976.0,1440.064549,1808.77096,0.0,191.5,876.5,2006.0,16660.0,16660.0,70.406307
timespent_seconds_early_fringe_l15d,994.0,1582.167002,2162.91793,0.0,244.0,933.0,2072.0,21525.0,21525.0,69.860522
num_of_pages_morning_l7d,1186.0,117.267285,113.61725,1.0,44.0,91.0,155.75,1009.0,1008.0,64.038811
timespent_seconds_primetime,976.0,1440.064549,1808.77096,0.0,191.5,876.5,2006.0,16660.0,16660.0,70.406307


In [26]:
#num_rmv = list(st[st['missing']>85].index)

In [27]:
# Percentage of missing values per categorical column, highest first; the last
# line shows the columns with more than 50% missing.
# NOTE(review): this is computed on the full frame `df`, whereas the numerical
# summary uses `dev` -- confirm which population is intended.
s_cat = pd.DataFrame(df[cat].isnull().sum()/len(df[cat])*100, columns = ['missing'])
s_cat = s_cat.sort_values(by='missing', ascending=False)
s_cat[s_cat['missing']>50]

Unnamed: 0,missing


In [28]:
# Build the list of columns that must not enter the model.
# Geography-derived columns are matched by substring, one pattern after the
# other, so a column matching several patterns is listed more than once.
geo_patterns = ("comcast_division", "comcast_region", "geo_loc",
                "geo_region", "geo_city", "geo_dma")
geo_drop = [col for pattern in geo_patterns
            for col in df.columns if pattern in col]

# Identifier, date, product, raw geography and target columns.
non_model_cols = ['decile_segment_flag', 'housekey', 'visid_hl', 'day_id',
                  'product_category', 'product_tiers', 'geo_region', 'geo_zip',
                  target, 'hsd_tiers_target', 'division']
var_drop_list = geo_drop + non_model_cols

print ("Dropped variables :", len(var_drop_list))

Dropped variables : 155


In [29]:
# Columns of each dtype group that appear in the drop list ...
cat_var = [x for x in list(cat) if x in var_drop_list]
num_var = [x for x in list(num) if x in var_drop_list]

# ... are removed, leaving the predictor names per dtype group.
num_var = num.drop(num_var)
cat_var = cat.drop(cat_var)

#df[target] = df[target].astype(int)
# The target is appended as the last column of both working frames.
num_dev = pd.concat([dev[num_var], pd.DataFrame(dev[target])], axis=1)
cat_dev = pd.concat([dev[cat_var], pd.DataFrame(dev[target])], axis=1)

print ("Numerical dataframe :", num_dev.shape)
print ("Categorical dataframe :",cat_dev.shape)

Numerical dataframe : (3298, 173)
Categorical dataframe : (3298, 1)


In [30]:
cat_dev.columns, num_dev.columns

(Index(['vid_tiers_target'], dtype='object'),
 Index(['s_sum_page_loadtime_last_5sess',
        's_cnt_localizn_adrs_valid_type_usps_last_10s',
        'd_cnt_phone_visit_l30d', 's_cnt_domain_comcast_last_10sess',
        's_dealsofferspage_flag_y_last_3sess',
        's_authentication_active_authenticated_last_7sess',
        's_auth_sts_actauthen_unrec_last_7sess',
        's_os_cnt_1240087047_last_5sess',
        's_cnt_click_context_offers_last_3sess',
        's_cnt_resol_len_1024_last_7sess',
        ...
        's_cnt_prev_page_0_last_5sess', 's_cnt_prev_page_0_last_7sess',
        's_cnt_click_action_submit_last_10sess',
        's_cnt_user_server_ebzweb_ch2_last_3sess',
        's_sum_attempt_localisation_l3d',
        's_cnt_prev_page_2173979979_last_5sess',
        's_cnt_page_thru_triple_play_last_3sess',
        's_sub_section_name_my_plan_last_7sess', 'd_cnt_phone_visit_l3d',
        'vid_tiers_target'],
       dtype='object', length=173))

## Numerical Imputation

In [31]:
#num_df = pd.DataFrame(KNN(k=3, verbose = False).complete(num_dev), columns = num_dev.columns)
# Missing numerical values are replaced by the sentinel value 999999
# (num_dev also carries the target column).
num_df = num_dev.fillna(999999)

In [32]:
# Re-run the summary on the imputed frame to confirm no missing values remain.
# Rows are added with .loc; the .ix indexer was removed in pandas 1.0.
s_num1 = pd.DataFrame(num_df.describe())
s_num1.loc['max_min'] = s_num1.loc['max'] - s_num1.loc['min']
s_num1.loc['missing'] = ((len(num_df) - s_num1.loc['count'])/len(num_df))*100
st_num1 = s_num1.transpose()
# Expected to be empty after imputation.
st_num1[st_num1['missing']>0]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,max_min,missing


## Categorical Imputation

In [33]:
# Missing categorical values become the literal 'NA'; the remaining lines
# check that no missing values are left.
cat_df = cat_dev.fillna('NA')
s_catt = pd.DataFrame(cat_df.isnull().sum()/len(cat_df)*100, columns = ['missing'])
s_catt[s_catt['missing']>0]

Unnamed: 0,missing


In [34]:
# Categorical columns that contain whitespace-only values.
ws = [col for col in cat_df[cat_var].columns
      if cat_df[col].str.isspace().sum() > 0]
print("# white space cols :",len(ws))
print(ws)
# Replace single-space values by 'NA'.  The result is assigned back to the
# column, because an in-place replace on a column selection is not guaranteed
# to modify the frame.
for col in ws:
    cat_df[col] = cat_df[col].replace(' ', 'NA')

# white space cols : 0
[]


[]

## Data Preparation

[All links](#links)

In [35]:
############## step data preparation ##############
# weight of evidence
#woe_data, iv, woe_map = woe_iv(cat_df, cat_var, target)
#result, final_catg = cor_cat_var(cat_df, woe_data, cat_var)
# label encoding
#enc_res, enc_map = encoder(cat_df, cat_var)
# concatenating numerical data, woe data, encoded data to form final dataset
#final_data = pd.concat([num_df, final_catg, enc_res], axis = 1).reindex(final_catg.index)
# The WOE / label-encoding steps above are disabled, so the modelling frame
# consists of the imputed numerical columns (plus target) only.
final_data = pd.concat([num_df], axis = 1).reindex(num_df.index)
# Model input names: every column except the target.
var_names = final_data.columns.drop(target)

In [36]:
print (" Numerical variables: ", len(num_var))
print (" Categorical variables: ", len(cat_var))
print (" Input variables: ", len(var_names))
print (" Initial input data: ", final_data.shape)

 Numerical variables:  172
 Categorical variables:  0
 Input variables:  172
 Initial input data:  (3298, 173)


#### Development phase

In [37]:
# split the development file into training, test datasets in the ratio 80:20 resp.
# (random split with a fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(final_data[var_names], final_data[target], test_size=0.20, 
                                                    random_state=247)

print (X_train.shape)
print (X_test.shape)

# Re-attach the target so train / test are single frames again.
train = pd.concat([X_train, Y_train], axis = 1)
test = pd.concat([X_test, Y_test], axis = 1)
train.shape, test.shape

(2638, 172)
(660, 172)


((2638, 173), (660, 173))

In [38]:
# Shallow random forest (100 trees, depth 3) used only to rank variables.
rnf = RandomForestClassifier(n_estimators=100, max_features='sqrt', n_jobs=-1, random_state =72, max_depth=3)
# Variable selection based on random forest's variable importance
# (model_fit keeps dropping the least important variables until at most 50 remain)
start_time = datetime.now()
redu = model_fit(X_train, Y_train, rnf, size=50)
print (datetime.now() - start_time)
print ("Final variables of the model are: " + "\n", redu.reset_index(drop=True))

Iteration 45 : No. of reduced features are:  50
0:00:19.004855
Final variables of the model are: 
                                              Var_name  Importance
0              s_cnt_user_server_ebzweb_wc_last_3sess    0.082604
1             s_cnt_user_server_ebzweb_ch2_last_5sess    0.062010
2             s_cnt_user_server_ebzweb_ch2_last_7sess    0.054630
3             s_cnt_user_server_ebzweb_ch2_last_3sess    0.052989
4    s_sum_time_secs_to_complete_pvtoorder_last_7sess    0.045263
5              s_cnt_user_server_ebzweb_wc_last_7sess    0.044037
6            s_cnt_user_server_ebzweb_ch2_last_10sess    0.037104
7   s_avg_time_secs_to_complete_e12toorder_last_7sess    0.036149
8             s_cnt_user_server_ebzweb_wc_last_10sess    0.036020
9   s_sum_time_secs_to_complete_pvtoorder_last_10sess    0.033636
10   s_sum_time_secs_to_complete_pvtoorder_last_5sess    0.032500
11   s_avg_time_secs_to_complete_pvtoorder_last_5sess    0.031954
12  s_sum_time_secs_to_complete_e12toorder_

##### Correlation of input with output

In [39]:
# Correlation matrix of the selected variables plus the target on the training
# set; only the column of correlations with the target is displayed.
v = list(redu['Var_name'])
v.append(target)
w = pd.DataFrame(train[v].corr())
#len(v)
#w.iloc[:,40:]
w['vid_tiers_target']

s_cnt_user_server_ebzweb_wc_last_3sess              -0.132693
s_cnt_user_server_ebzweb_ch2_last_5sess             -0.113721
s_cnt_user_server_ebzweb_ch2_last_7sess             -0.105490
s_cnt_user_server_ebzweb_ch2_last_3sess             -0.128135
s_sum_time_secs_to_complete_pvtoorder_last_7sess     0.015794
s_cnt_user_server_ebzweb_wc_last_7sess              -0.124298
s_cnt_user_server_ebzweb_ch2_last_10sess            -0.097082
s_avg_time_secs_to_complete_e12toorder_last_7sess    0.015631
s_cnt_user_server_ebzweb_wc_last_10sess             -0.119394
s_sum_time_secs_to_complete_pvtoorder_last_10sess    0.015832
s_sum_time_secs_to_complete_pvtoorder_last_5sess     0.015759
s_avg_time_secs_to_complete_pvtoorder_last_5sess     0.015634
s_sum_time_secs_to_complete_e12toorder_last_7sess    0.015777
s_cnt_lnk_ref_page_internet_last_10sess              0.015588
s_cnt_user_server_ebzweb_wc_last_5sess              -0.128292
s_sum_time_secs_to_complete_e12toorder_last_5sess    0.015746
s_afflia

In [40]:
# Remove the listed WOE / label-encoded geography variables from the selection.
# NOTE(review): the modelling frame is built from numerical columns only, so
# none of these names should be present and the filter leaves the list
# unchanged -- confirm whether it is still needed.
cor_var_drop = ['geo_zip_WOE','geo_city_WOE','geo_region_WOE','division_LE','division_WOE']
print(len(cor_var_drop))
unc_var = [x for x in redu['Var_name'] if x not in cor_var_drop]
len(unc_var)

5


50

In [41]:
unc_var

['s_cnt_user_server_ebzweb_wc_last_3sess',
 's_cnt_user_server_ebzweb_ch2_last_5sess',
 's_cnt_user_server_ebzweb_ch2_last_7sess',
 's_cnt_user_server_ebzweb_ch2_last_3sess',
 's_sum_time_secs_to_complete_pvtoorder_last_7sess',
 's_cnt_user_server_ebzweb_wc_last_7sess',
 's_cnt_user_server_ebzweb_ch2_last_10sess',
 's_avg_time_secs_to_complete_e12toorder_last_7sess',
 's_cnt_user_server_ebzweb_wc_last_10sess',
 's_sum_time_secs_to_complete_pvtoorder_last_10sess',
 's_sum_time_secs_to_complete_pvtoorder_last_5sess',
 's_avg_time_secs_to_complete_pvtoorder_last_5sess',
 's_sum_time_secs_to_complete_e12toorder_last_7sess',
 's_cnt_lnk_ref_page_internet_last_10sess',
 's_cnt_user_server_ebzweb_wc_last_5sess',
 's_sum_time_secs_to_complete_e12toorder_last_5sess',
 's_affliate_einstein_last_5sess',
 's_affliate_telesales_last_10sess',
 's_avg_page_loadtime_last_3sess',
 's_sum_page_loadtime_last_7sess',
 's_affliate_telesales_last_7sess',
 's_affliate_telesales_last_5sess',
 's_sum_time_secs

In [42]:
# First-cut logistic regression (default settings) on the selected variables.
lgr = LogisticRegression()
clf_lr = lgr.fit(X_train[unc_var], Y_train)

# Score train and test; out_ks returns predictions, probabilities, the KS
# statistic and the decile table.
y_pred_train, y_proba_train, ks_train, dec_train = out_ks(X_train[unc_var], Y_train, clf_lr, target)
y_pred_test, y_proba_test, ks_test, dec_test = out_ks(X_test[unc_var], Y_test, clf_lr, target)


print ("KS_Train : ", round(ks_train,2))
print ("KS_Test : ", round(ks_test,2))

KS_Train :  12.73
KS_Test :  24.34


In [43]:
# First-cut random forest (50 trees, depth 2) on the same variables.
clf = RandomForestClassifier(n_estimators=50, max_depth=2, max_features='sqrt', n_jobs=-1, random_state =72, verbose=0)
#clf = RandomForestClassifier(n_estimators=60, criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=2,
                            #max_features='auto', max_leaf_nodes=2, random_state=99)

clf_rf = clf.fit(X_train[unc_var], Y_train)

# The variables below overwrite the logistic-regression results of the same name.
y_pred_train, y_proba_train, ks_train, dec_train = out_ks(X_train[unc_var], Y_train, clf_rf, target)
y_pred_test, y_proba_test, ks_test, dec_test = out_ks(X_test[unc_var], Y_test, clf_rf, target)

print ("KS_Train : ", round(ks_train,2))
print ("KS_Test : ", round(ks_test,2))

KS_Train :  22.02
KS_Test :  29.93


##### VIF of first cut 50 variables

In [44]:
# Input for the VIF computation: the selected variables taken from final_data
# (the full development frame, i.e. train and test rows together).
x_f = final_data[redu['Var_name']]
vif_cat_f = pd.DataFrame()
len(x_f.columns)
#x_f.columns

50

In [45]:
# Iterative multicollinearity reduction: repeatedly drop the variable with the
# highest variance inflation factor (VIF) until every remaining VIF is below
# the threshold.
VIF_THRESHOLD = 2


def _vif_table(frame):
    """Return (raw VIF list, table ['Var_name', 'VIF'] sorted by VIF, highest first)."""
    values = [variance_inflation_factor(frame.values, ix) for ix in range(frame.shape[1])]
    table = pd.DataFrame({'Var_name': list(frame.columns), 'VIF': values})
    table = table.sort_values('VIF', ascending=False).reset_index(drop=True)
    return values, table


vif_drop_var_f = []
vif_drop_val_f = []

startTime = datetime.now()
vif_f, vif_cat_f = _vif_table(x_f)
print (datetime.now() - startTime)
print ("Final no. of categorical variables after VIF based reduction : ", len(vif_cat_f))

# Row 0 always holds the current maximum VIF; the length guard stops the loop
# if every variable has been removed.
while len(vif_cat_f) > 0 and vif_cat_f['VIF'][0] >= VIF_THRESHOLD:
    vif_drop_var_f.append(vif_cat_f['Var_name'][0])
    vif_drop_val_f.append(vif_cat_f['VIF'][0])
    # Rebuild from the full selection and drop everything removed so far.
    x_f = final_data[redu['Var_name']].drop(vif_drop_var_f, axis = 1)

    startTime = datetime.now()
    vif_f, vif_cat_f = _vif_table(x_f)
    print (datetime.now() - startTime)
    print ("Final no. of categorical variables after VIF based reduction : ", len(vif_cat_f))

# Built after the loop so it exists (possibly empty) even when nothing had to
# be dropped.
vif_drop_f = pd.DataFrame({'VIF':vif_drop_val_f,'Var_name':vif_drop_var_f})

print(vif_drop_f)
print(vif_cat_f)

0:00:01.461122
Final no. of categorical variables after VIF based reduction :  50
0:00:01.255845
Final no. of categorical variables after VIF based reduction :  49
0:00:01.397229
Final no. of categorical variables after VIF based reduction :  48
0:00:01.414961
Final no. of categorical variables after VIF based reduction :  47
0:00:01.237222
Final no. of categorical variables after VIF based reduction :  46
0:00:01.248684
Final no. of categorical variables after VIF based reduction :  45
0:00:01.241668
Final no. of categorical variables after VIF based reduction :  44
0:00:01.155211
Final no. of categorical variables after VIF based reduction :  43
0:00:01.031411
Final no. of categorical variables after VIF based reduction :  42
0:00:00.699210
Final no. of categorical variables after VIF based reduction :  41
0:00:00.614015
Final no. of categorical variables after VIF based reduction :  40
0:00:00.773773
Final no. of categorical variables after VIF based reduction :  39
0:00:00.790517
F

In [46]:
# statsmodels logit on the VIF-reduced variables, used for the coefficient /
# p-value summary.
# NOTE(review): no constant column is added to the design matrix, so the model
# is fitted without an intercept -- confirm this is intended.
logit_model=sm.Logit(Y_train,X_train[vif_cat_f['Var_name']])
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.669032
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       vid_tiers_target   No. Observations:                 2638
Model:                          Logit   Df Residuals:                     2629
Method:                           MLE   Df Model:                            8
Date:                Thu, 07 Nov 2019   Pseudo R-squ.:                 0.03433
Time:                        19:02:20   Log-Likelihood:                -1764.9
converged:                       True   LL-Null:                       -1827.6
                                        LLR p-value:                 2.444e-23
                                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------
s_cnt_domain_comcast_last_10sess            0.0105

In [47]:
# Logistic regression refitted on the VIF-reduced variable set.
lgr = LogisticRegression()

clf_lr_f = lgr.fit(X_train[vif_cat_f['Var_name']], Y_train)

# Predictions, probabilities, KS and decile tables for train and test.
y_pred_train_f, y_proba_train_f, ks_train_f, dec_train_f = out_ks(X_train[vif_cat_f['Var_name']], Y_train, clf_lr_f, target)
y_pred_test_f, y_proba_test_f, ks_test_f, dec_test_f = out_ks(X_test[vif_cat_f['Var_name']], Y_test, clf_lr_f, target)


print ("KS_Train : ", round(ks_train_f,2))
print ("KS_Test : ", round(ks_test_f,2))

KS_Train :  15.2
KS_Test :  18.19


In [48]:
# Classification metrics for the most recently scored model (the *_f
# predictions), reported for train and test.
roc_train, acu_train, prec_train, rec_train, cm_train = metrics(Y_train, y_pred_train_f, y_proba_train_f)
roc_test, acu_test, prec_test, rec_test, cm_test = metrics(Y_test, y_pred_test_f, y_proba_test_f)
print ("AUC_Train : {0:.2%}".format(roc_train))
print ("AUC_Test : {0:.2%}".format(roc_test))

print ("\n" + "Accuracy_Train : {0:.2%}".format(acu_train))
print ("Precision_Train : {0:.2%}".format(prec_train))
print ("Recall_Train : {0:.2%}".format(rec_train))
print ("Confusion matrix: "+"\n",cm_train)

print ("\n" + "Accuracy_Test : {0:.2%}".format(acu_test))
print ("Precision_Test : {0:.2%}".format(prec_test))
print ("Recall_Test : {0:.2%}".format(rec_test))
print ("Confusion matrix: "+"\n",cm_test)

AUC_Train : 60.95%
AUC_Test : 61.28%

Accuracy_Train : 57.62%
Precision_Train : 57.32%
Recall_Train : 50.89%
Confusion matrix: 
 [[866 487]
 [631 654]]

Accuracy_Test : 59.55%
Precision_Test : 54.29%
Recall_Test : 52.23%
Confusion matrix: 
 [[241 128]
 [139 152]]


In [65]:
vif_cat_f_org = vif_cat_f.copy()
vif_cat_f_org

Unnamed: 0,Var_name,VIF
0,s_cnt_domain_comcast_last_10sess,1.908849
1,s_affliate_telesales_last_3sess,1.797363
2,s_cnt_custom_agent_windows_last_7sess,1.78687
3,s_cnt_click_action_tv_last_5sess,1.657223
4,d_sum_paid_search_last_l60d,1.448976
5,s_cnt_user_server_ebzweb_wc_last_10sess,1.37404
6,s_sum_attempt_localisation_l3d,1.341238
7,s_cnt_lnk_ref_page_internet_last_3sess,1.231239
8,s_sum_page_loadtime_last_3sess,1.010045


In [66]:
# Remove 's_cnt_domain_comcast_last_10sess' from the variable table: the row
# label of its first occurrence is looked up and dropped.
vif_cat_f = vif_cat_f_org.drop(vif_cat_f_org[vif_cat_f_org['Var_name']=='s_cnt_domain_comcast_last_10sess'].index[0],axis=0)

In [98]:
# Random forest (100 trees, depth 3) on the variables left in vif_cat_f.
clf = RandomForestClassifier(n_estimators=100, max_depth=3, max_features='sqrt', n_jobs=-1, random_state =72, verbose=0)
#clf = RandomForestClassifier(n_estimators=60, criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=2,
                            #max_features='auto', max_leaf_nodes=2, random_state=99)
clf_rf_f = clf.fit(X_train[vif_cat_f['Var_name']], Y_train)

# These *_f results overwrite the logistic-regression results of the same name.
y_pred_train_f, y_proba_train_f, ks_train_f, dec_train_f = out_ks(X_train[vif_cat_f['Var_name']], Y_train, clf_rf_f, target)
y_pred_test_f, y_proba_test_f, ks_test_f, dec_test_f = out_ks(X_test[vif_cat_f['Var_name']], Y_test, clf_rf_f, target)

print("*** Training & Test Results ***")
print ("KS_Train : ", round(ks_train_f,2))
print ("KS_Test : ", round(ks_test_f,2))

*** Training & Test Results ***
KS_Train :  22.67
KS_Test :  23.73


In [68]:
# Pair every model input column with the importance reported by `clf_rf_f`
# and list the most important variables first.
_importance_pairs = list(
    zip(X_train[vif_cat_f['Var_name']].columns, clf_rf_f.feature_importances_)
)
var_imp11 = pd.DataFrame(_importance_pairs, columns=['Var_name', 'Importance'])
var_imp11 = var_imp11.sort_values(['Importance'], ascending=[False])
var_imp11

Unnamed: 0,Var_name,Importance
4,s_cnt_user_server_ebzweb_wc_last_10sess,0.224068
7,s_sum_page_loadtime_last_3sess,0.179423
0,s_affliate_telesales_last_3sess,0.175406
6,s_cnt_lnk_ref_page_internet_last_3sess,0.139766
2,s_cnt_click_action_tv_last_5sess,0.096139
5,s_sum_attempt_localisation_l3d,0.084752
1,s_cnt_custom_agent_windows_last_7sess,0.065786
3,d_sum_paid_search_last_l60d,0.03466


In [69]:
# Evaluate whatever predictions currently sit in the `*_f` variables
# (written by the most recently run `out_ks` cell) on both splits.
# The result of `metrics` is unpacked as
# (ROC-AUC, accuracy, precision, recall, confusion matrix).
roc_train, acu_train, prec_train, rec_train, cm_train = metrics(
    Y_train, y_pred_train_f, y_proba_train_f
)
roc_test, acu_test, prec_test, rec_test, cm_test = metrics(
    Y_test, y_pred_test_f, y_proba_test_f
)

# Ranking quality first ...
print(
    "AUC_Train : {0:.2%}".format(roc_train),
    "AUC_Test : {0:.2%}".format(roc_test),
    sep="\n"
)

# ... then the label-based metrics, training split before test split.
print(
    "\n" + "Accuracy_Train : {0:.2%}".format(acu_train),
    "Precision_Train : {0:.2%}".format(prec_train),
    "Recall_Train : {0:.2%}".format(rec_train),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_train)

print(
    "\n" + "Accuracy_Test : {0:.2%}".format(acu_test),
    "Precision_Test : {0:.2%}".format(prec_test),
    "Recall_Test : {0:.2%}".format(rec_test),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_test)

AUC_Train : 66.10%
AUC_Test : 63.87%

Accuracy_Train : 61.45%
Precision_Train : 62.50%
Recall_Train : 52.14%
Confusion matrix: 
 [[951 402]
 [615 670]]

Accuracy_Test : 62.12%
Precision_Test : 58.23%
Recall_Test : 49.83%
Confusion matrix: 
 [[265 104]
 [146 145]]


#### Manual feature removal

In [70]:
# Hand-picked variables to exclude from the candidate list (manual feature removal).
# Presumably these duplicate information carried by a sibling variable over a
# different session window -- confirm with the author.
same_vars = [
    's_cnt_user_server_ebzweb_ch2_last_10sess',
    's_cnt_user_server_ebzweb_wc_last_7sess',
    's_cnt_user_server_ebzweb_wc_last_10sess',
    's_cnt_user_server_ebzweb_ch2_last_5sess',
    's_cnt_user_server_ebzweb_ch2_last_7sess',
    's_cnt_user_server_ebzweb_wc_last_3sess',
    's_affliate_telesales_last_5sess',
    's_affliate_telesales_last_10sess',
    's_affliate_einstein_last_3sess',
    's_affliate_telesales_last_3sess',
    's_cnt_click_action_tv_last_7sess',
    's_sum_time_secs_to_complete_e12toorder_last_5sess',
    's_sum_time_secs_to_complete_pvtoorder_last_7sess',
    's_sum_time_secs_to_complete_pvtoorder_last_5sess',
    's_cnt_click_action_tv_last_5sess',
    's_sum_page_loadtime_last_7sess',
    's_avg_time_secs_to_complete_pvtoorder_last_3sess',
    's_sum_time_secs_to_complete_e12toorder_last_7sess',
    's_sub_section_name_my_plan_last_7sess',
    's_avg_page_loadtime_last_7sess',
    's_sum_page_loadtime_last_3sess',
    's_avg_total_pages_visited_last_10sess',
    's_cnt_prev_page_0_last_7sess',
    's_cnt_lnk_ref_page_internet_last_3sess',
    's_cnt_lnk_ref_page_internet_last_10sess',
]

# Keep the order of `redu['Var_name']`; the set only serves the membership test.
_excluded = set(same_vars)
unq_vars = [name for name in redu['Var_name'] if name not in _excluded]
len(unq_vars)

26

In [71]:
# Rank the manually de-duplicated variables by random-forest importance.
start_time = datetime.now()

_candidates = X_train[unq_vars]
rnf.fit(_candidates, Y_train)

# One (column name, importance) row per input column, highest importance first.
var_imp = pd.DataFrame(
    list(zip(_candidates.columns, rnf.feature_importances_)),
    columns=['Var_name', 'Importance'],
)
var_imp = var_imp.sort_values(['Importance'], ascending=[False])

# `redu_feat` is the same object as `var_imp`; `var_list` holds just the names.
redu_feat = var_imp
var_list = redu_feat['Var_name']

print(datetime.now() - start_time)
print("Final variables of the model are: " + "\n", redu_feat.reset_index(drop=True))

0:00:00.409815
Final variables of the model are: 
                                              Var_name  Importance
0   s_avg_time_secs_to_complete_e12toorder_last_7sess    0.125924
1             s_cnt_user_server_ebzweb_ch2_last_3sess    0.099971
2              s_cnt_user_server_ebzweb_wc_last_5sess    0.087533
3   s_sum_time_secs_to_complete_pvtoorder_last_10sess    0.067855
4    s_avg_time_secs_to_complete_pvtoorder_last_5sess    0.059892
5              s_cnt_lnk_ref_page_internet_last_5sess    0.058880
6   s_sum_time_secs_to_complete_e12toorder_last_3sess    0.058356
7                      s_affliate_einstein_last_5sess    0.057842
8                      s_avg_page_loadtime_last_3sess    0.045472
9        s_cnt_localizn_adrs_valid_type_usps_last_10s    0.040365
10                  s_cnt_click_action_tv_last_10sess    0.032485
11                    s_affliate_telesales_last_7sess    0.032369
12                     s_sum_page_loadtime_last_5sess    0.031924
13               s_avg_to

In [72]:
# Iterative VIF-based pruning of the importance-ranked variables in `redu_feat`.
#
# Each pass computes the variance inflation factor of every remaining column,
# prints the elapsed time and the number of remaining variables, and drops the
# column with the highest VIF as long as that VIF is >= 2.
#
# Results left in the notebook namespace:
#   x_r        -- X_train restricted to the surviving columns
#   vif_cat_r  -- surviving variables with their VIF, highest first
#   vif_drop_r -- dropped variables with the VIF they had when dropped
#                 (an empty frame when nothing had to be dropped)
x_r = X_train[redu_feat['Var_name']]
vif_drop_var_r = []
vif_drop_val_r = []

while True:
    startTime = datetime.now()
    _x_values = x_r.values  # materialise once per pass instead of once per column
    vif_r = [variance_inflation_factor(_x_values, ix) for ix in range(x_r.shape[1])]
    vif_cat_r = pd.concat([pd.DataFrame(x_r.columns, columns=['Var_name']),
                           pd.DataFrame(vif_r, columns=['VIF'])], axis=1)
    vif_cat_r = vif_cat_r.sort_values('VIF', ascending=False).reset_index(drop=True)
    print (datetime.now() - startTime)
    print ("Final no. of categorical variables after VIF based reduction : ", len(vif_cat_r))

    # Stop when no variable is left or the worst VIF is below the cut-off.
    # `not (... >= 2)` (rather than `< 2`) also stops on a NaN VIF, exactly as
    # the original `while ... >= 2` condition did.
    if vif_cat_r.empty or not (vif_cat_r['VIF'][0] >= 2):
        break

    vif_drop_var_r.append(vif_cat_r['Var_name'][0])
    vif_drop_val_r.append(vif_cat_r['VIF'][0])
    x_r = x_r.drop(vif_cat_r['Var_name'][0], axis=1)

# Built after the loop so the name exists even when no variable was dropped
# (previously `print(vif_drop_r)` raised NameError in that case).
vif_drop_r = pd.DataFrame({'VIF':vif_drop_val_r,'Var_name':vif_drop_var_r})

print(vif_drop_r)
print(vif_cat_r)

0:00:00.341655
Final no. of categorical variables after VIF based reduction :  26
0:00:00.329999
Final no. of categorical variables after VIF based reduction :  25
0:00:00.296366
Final no. of categorical variables after VIF based reduction :  24
0:00:00.270840
Final no. of categorical variables after VIF based reduction :  23
0:00:00.230228
Final no. of categorical variables after VIF based reduction :  22
0:00:00.115466
Final no. of categorical variables after VIF based reduction :  21
0:00:00.224001
Final no. of categorical variables after VIF based reduction :  20
0:00:00.182840
Final no. of categorical variables after VIF based reduction :  19
0:00:00.163614
Final no. of categorical variables after VIF based reduction :  18
0:00:00.145681
Final no. of categorical variables after VIF based reduction :  17
0:00:00.131888
Final no. of categorical variables after VIF based reduction :  16
0:00:00.114906
Final no. of categorical variables after VIF based reduction :  15
0:00:00.052895
F

In [73]:
# Random forest fitted on the variables that survived the VIF pruning (`vif_cat_r`).
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=2,
    max_features='sqrt',
    n_jobs=-1,
    random_state=72,
)
#clf = RandomForestClassifier(n_estimators=60, criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=2,
                            #max_features='auto', max_leaf_nodes=2, random_state=99)
#clf_rf_r = clf.fit(X_train[redu_feat['Var_name']], Y_train)
_train_features_r = X_train[vif_cat_r['Var_name']]
clf_rf_r = clf.fit(_train_features_r, Y_train)

# NOTE(review): only the KS values get an `_r` suffix; the predictions,
# probabilities and decile tables are written to the same `*_f` names that the
# `clf_rf_f` cell uses, so whichever cell ran last decides what they hold.
y_pred_train_f, y_proba_train_f, ks_train_r, dec_train_f = out_ks(
    _train_features_r, Y_train, clf_rf_r, target
)
y_pred_test_f, y_proba_test_f, ks_test_r, dec_test_f = out_ks(
    X_test[vif_cat_r['Var_name']], Y_test, clf_rf_r, target
)


print("KS_Train : ", round(ks_train_r, 2))
print("KS_Test : ", round(ks_test_r, 2))

KS_Train :  22.76
KS_Test :  19.42


In [74]:
# Evaluate whatever predictions currently sit in the `*_f` variables on both
# splits (the cell directly above writes the `clf_rf_r` outputs into them).
# The result of `metrics` is unpacked as
# (ROC-AUC, accuracy, precision, recall, confusion matrix).
roc_train, acu_train, prec_train, rec_train, cm_train = metrics(
    Y_train, y_pred_train_f, y_proba_train_f
)
roc_test, acu_test, prec_test, rec_test, cm_test = metrics(
    Y_test, y_pred_test_f, y_proba_test_f
)

# Ranking quality first ...
print(
    "AUC_Train : {0:.2%}".format(roc_train),
    "AUC_Test : {0:.2%}".format(roc_test),
    sep="\n"
)

# ... then the label-based metrics, training split before test split.
print(
    "\n" + "Accuracy_Train : {0:.2%}".format(acu_train),
    "Precision_Train : {0:.2%}".format(prec_train),
    "Recall_Train : {0:.2%}".format(rec_train),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_train)

print(
    "\n" + "Accuracy_Test : {0:.2%}".format(acu_test),
    "Precision_Test : {0:.2%}".format(prec_test),
    "Recall_Test : {0:.2%}".format(rec_test),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_test)

AUC_Train : 65.72%
AUC_Test : 63.87%

Accuracy_Train : 60.31%
Precision_Train : 62.53%
Recall_Train : 46.23%
Confusion matrix: 
 [[997 356]
 [691 594]]

Accuracy_Test : 61.52%
Precision_Test : 58.45%
Recall_Test : 43.99%
Confusion matrix: 
 [[278  91]
 [163 128]]


In [75]:
# Work on a copy of the development sample and cast every column that can be
# represented as float; columns that cannot be converted keep their dtype.
df_dev = dev.copy()
for x in df_dev.columns:
    try:
        df_dev[x] = df_dev[x].astype(float)
    except (ValueError, TypeError):
        # Non-numeric column: leave as is. Only conversion failures are
        # ignored, so interrupts and unrelated errors are no longer swallowed.
        continue
# The target is cast back to integer after the blanket float conversion.
df_dev[target] = df_dev[target].astype('int')

### Validation

In [76]:
# Work on a copy of the validation sample and cast every column that can be
# represented as float; columns that cannot be converted keep their dtype.
df_val = val.copy()
for x in df_val.columns:
    try:
        df_val[x] = df_val[x].astype(float)
    except (ValueError, TypeError):
        # Non-numeric column: leave as is. Only conversion failures are
        # ignored, so interrupts and unrelated errors are no longer swallowed.
        continue
# Unlike the development sample, the target is deliberately left as float here.
#df_val[target] = df_val[target].astype('int')

In [77]:
# NOTE(review): both maps are reset to empty lists right before the validation
# data is mapped -- confirm that `datamap` does not rely on maps fitted on the
# development sample.
enc_map = []
woe_map = []

# Validation inputs restricted to the `vif_cat_f` variables, plus the target column.
inp_val = datamap(df_val, vif_cat_f, enc_map, woe_map)
final_data_val = pd.concat([inp_val, pd.DataFrame(df_val[target])], axis=1)

X_val = final_data_val[vif_cat_f['Var_name']]
Y_val = final_data_val[target]

# Same call with the same arguments, kept under the `_f` names used by later cells.
# NOTE(review): looks redundant with the block above unless `datamap` is stateful.
inp_val_f = datamap(df_val, vif_cat_f, enc_map, woe_map)
final_data_val_f = pd.concat([inp_val_f, pd.DataFrame(df_val[target])], axis=1)

X_val_f = final_data_val_f[vif_cat_f['Var_name']]
Y_val_f = final_data_val_f[target]

In [78]:
# Map the validation sample onto the `vif_cat_r` variables and score it with `clf_rf_r`.
inp_val_r = datamap(df_val, vif_cat_r, enc_map, woe_map)
final_data_val_r = pd.concat([inp_val_r, pd.DataFrame(df_val[target])], axis=1)

X_val_r = final_data_val_r[vif_cat_r['Var_name']]
Y_val_r = final_data_val_r[target]

y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_r, Y_val_r, clf_rf_r, target)

# NOTE(review): the "All 50 variables" label does not obviously match the
# `vif_cat_r` variable set, and KS is rounded to a whole number here while the
# train/test cells round to two decimals -- confirm both are intended.
print("\r\n All 50 variables")
print("KS_VAL: ", round(ks_val))

roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_r, y_pred_val, y_proba_val)
print("AUC_VAL : {0:.2%}".format(roc_val))
print(
    "\n" + "Accuracy_VAL : {0:.2%}".format(acu_val),
    "Precision_VAL : {0:.2%}".format(prec_val),
    "Recall_VAL : {0:.2%}".format(rec_val),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_val)


 All 50 variables
KS_VAL:  20.0
AUC_VAL : 63.43%

Accuracy_VAL : 60.12%
Precision_VAL : 62.75%
Recall_VAL : 46.17%
Confusion matrix: 
 [[309 111]
 [218 187]]


In [81]:
# y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val, Y_val, clf_lr_u, target)
# print ("KS_VAL: ", round(ks_val))

# roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val, y_pred_val, y_proba_val)
# print ("AUC_VAL : {0:.2%}".format(roc_val))
# print ("\n" + "Accuracy_VAL : {0:.2%}".format(acu_val))
# print ("Precision_VAL : {0:.2%}".format(prec_val))
# print ("Recall_VAL : {0:.2%}".format(rec_val))
# print ("Confusion matrix: "+"\n",cm_val)

# y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_f, Y_val_f, clf_lr_f, target)
# print ("KS_VAL: ", round(ks_val))

# roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_f, y_pred_val, y_proba_val)
# print ("AUC_VAL : {0:.2%}".format(roc_val))
# print ("\n" + "Accuracy_VAL : {0:.2%}".format(acu_val))
# print ("Precision_VAL : {0:.2%}".format(prec_val))
# print ("Recall_VAL : {0:.2%}".format(rec_val))
# print ("Confusion matrix: "+"\n",cm_val)

In [99]:
# y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val, Y_val, clf_rf_u, target)
# print ("KS_VAL: ", round(ks_val))

# roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val, y_pred_val, y_proba_val)
# print ("AUC_VAL : {0:.2%}".format(roc_val))
# print ("\n" + "Accuracy_VAL : {0:.2%}".format(acu_val))
# print ("Precision_VAL : {0:.2%}".format(prec_val))
# print ("Recall_VAL : {0:.2%}".format(rec_val))
# print ("Confusion matrix: "+"\n",cm_val)

# Score the validation sample (`X_val_f` / `Y_val_f`) with the `clf_rf_f` forest.
y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_f, Y_val_f, clf_rf_f, target)

print("\r\n *** Validation Results ***")
print("KS_VAL: ", round(ks_val))

roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_f, y_pred_val, y_proba_val)
print("AUC_VAL : {0:.2%}".format(roc_val))
print(
    "\n" + "Accuracy_VAL : {0:.2%}".format(acu_val),
    "Precision_VAL : {0:.2%}".format(prec_val),
    "Recall_VAL : {0:.2%}".format(rec_val),
    sep="\n"
)

# Unlike the other reports, the confusion matrix is shown row-normalised:
# every row is divided by its own total.
_row_totals = cm_val.sum(axis=1)[:, newaxis]
print("Confusion matrix: " + "\n", cm_val / _row_totals)


 *** Validation Results ***
KS_VAL:  22.0
AUC_VAL : 64.00%

Accuracy_VAL : 61.58%
Precision_VAL : 64.10%
Recall_VAL : 49.38%
Confusion matrix: 
 [[ 0.73  0.27]
 [ 0.51  0.49]]


In [84]:
# Largest predicted probability on each of the three samples.
# NOTE(review): `y_proba_train` / `y_proba_test` carry no `_f` suffix and are not
# assigned anywhere in this part of the notebook, while `y_proba_val` is rewritten
# by every validation/division cell -- confirm all three come from the same model.
y_proba_train.max(), y_proba_test.max(), y_proba_val.max()

(0.7559661058183752, 0.74604231817894784, 0.92084133026033843)

In [85]:
### Final ###
# Show the variable/VIF table whose 'Var_name' column selects the inputs of `clf_rf_f`.
vif_cat_f

Unnamed: 0,Var_name,VIF
1,s_affliate_telesales_last_3sess,1.797363
2,s_cnt_custom_agent_windows_last_7sess,1.78687
3,s_cnt_click_action_tv_last_5sess,1.657223
4,d_sum_paid_search_last_l60d,1.448976
5,s_cnt_user_server_ebzweb_wc_last_10sess,1.37404
6,s_sum_attempt_localisation_l3d,1.341238
7,s_cnt_lnk_ref_page_internet_last_3sess,1.231239
8,s_sum_page_loadtime_last_3sess,1.010045


In [86]:
# Share of each class among the training targets.
Y_train.value_counts() / len(Y_train)

0.0    0.512889
1.0    0.487111
Name: vid_tiers_target, dtype: float64

In [87]:
# Share of each predicted class on the training split.
y_pred_train_f.value_counts() / len(y_pred_train_f)

0.0    0.639879
1.0    0.360121
Name: Pred, dtype: float64

In [88]:
# Share of each class among the test targets.
Y_test.value_counts() / len(Y_test)

0.0    0.559091
1.0    0.440909
Name: vid_tiers_target, dtype: float64

In [89]:
# Share of each predicted class on the test split.
y_pred_test_f.value_counts() / len(y_pred_test_f)

0.0    0.668182
1.0    0.331818
Name: Pred, dtype: float64

In [90]:
# Share of each class among the validation targets.
Y_val.value_counts() / len(Y_val)

0.0    0.509091
1.0    0.490909
Name: vid_tiers_target, dtype: float64

In [91]:
# Share of each predicted class in `y_pred_val` (set by the latest validation scoring cell).
y_pred_val.value_counts() / len(y_pred_val)

0.0    0.621818
1.0    0.378182
Name: Pred, dtype: float64

#### Division Results

In [92]:
# Score `clf_rf_f` on the rows of `full_df` that belong to the Central Division.
_central_rows = full_df['division'] == "CENTRAL DIVISION"

inp_val_c = datamap(full_df[_central_rows], vif_cat_f, enc_map, woe_map)
final_data_val_c = pd.concat(
    [inp_val_c, pd.DataFrame(full_df[_central_rows][target])],
    axis=1,
)

X_val_c = final_data_val_c[vif_cat_f['Var_name']]
Y_val_c = final_data_val_c[target]

# NOTE: this reuses (and overwrites) the generic `*_val` result names.
y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_c, Y_val_c, clf_rf_f, target)

print("**Central Division**")
print("KS : ", ks_val)

roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_c, y_pred_val, y_proba_val)
print("AUC_VAL : {0:.2%}".format(roc_val))
print(
    "\n" + "Accuracy_VAL : {0:.2%}".format(acu_val),
    "Precision_VAL : {0:.2%}".format(prec_val),
    "Recall_VAL : {0:.2%}".format(rec_val),
    sep="\n"
)
print("Confusion matrix: " + "\n", cm_val)

**Central Division**
KS :  16.1499418699
AUC_VAL : 59.64%

Accuracy_VAL : 61.73%
Precision_VAL : 48.06%
Recall_VAL : 43.49%
Confusion matrix: 
 [[1090  415]
 [ 499  384]]


In [93]:
# Score `clf_rf_f` on the rows of `full_df` that belong to the West Division.
_west_rows = full_df['division'] == "WEST DIVISION"

inp_val_w = datamap(full_df[_west_rows], vif_cat_f, enc_map, woe_map)
final_data_val_w = pd.concat(
    [inp_val_w, pd.DataFrame(full_df[_west_rows][target])],
    axis=1,
)

X_val_w = final_data_val_w[vif_cat_f['Var_name']]
Y_val_w = final_data_val_w[target]

# NOTE: this reuses (and overwrites) the generic `*_val` result names.
y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_w, Y_val_w, clf_rf_f, target)
print("**West Division**")
print("KS : ", ks_val)

roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_w, y_pred_val, y_proba_val)
print("AUC_VAL : {0:.2%}".format(roc_val))
print(
    "\n" + "Accuracy_VAL : {0:.2%}".format(acu_val),
    "Precision_VAL : {0:.2%}".format(prec_val),
    "Recall_VAL : {0:.2%}".format(rec_val),
    sep="\n"
)
#print ("Confusion matrix: "+"\n",cm_val)

**West Division**
KS :  24.9527383931
AUC_VAL : 65.69%

Accuracy_VAL : 65.41%
Precision_VAL : 90.20%
Recall_VAL : 66.97%


In [94]:
# Score `clf_rf_f` on the rows of `full_df` that belong to the Northeast Division.
_northeast_rows = full_df['division'] == "NORTHEAST DIVISION"

inp_val_n = datamap(full_df[_northeast_rows], vif_cat_f, enc_map, woe_map)
final_data_val_n = pd.concat(
    [inp_val_n, pd.DataFrame(full_df[_northeast_rows][target])],
    axis=1,
)

X_val_n = final_data_val_n[vif_cat_f['Var_name']]
Y_val_n = final_data_val_n[target]

# NOTE: this reuses (and overwrites) the generic `*_val` result names.
y_pred_val, y_proba_val, ks_val, dec_val = out_ks(X_val_n, Y_val_n, clf_rf_f, target)
print("**Northeast Division**")
print("KS : ", ks_val)

roc_val, acu_val, prec_val, rec_val, cm_val = metrics(Y_val_n, y_pred_val, y_proba_val)
print("AUC_VAL : {0:.2%}".format(roc_val))
print(
    "\n" + "Accuracy_VAL : {0:.2%}".format(acu_val),
    "Precision_VAL : {0:.2%}".format(prec_val),
    "Recall_VAL : {0:.2%}".format(rec_val),
    sep="\n"
)
#print ("Confusion matrix: "+"\n",cm_val)


**Northeast Division**
KS :  12.0958751394
AUC_VAL : 52.31%

Accuracy_VAL : 50.41%
Precision_VAL : 38.89%
Recall_VAL : 26.92%


###### Dumping into pickles for scoring

In [96]:
# Read back the two objects stored in the scoring pickle: `fv` is the
# variable/VIF table shown in the next cell; `c` is presumably the fitted
# classifier -- confirm against the cell that wrote the file.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files that
# come from a trusted location.
fv,c = joblib.load('Pickles/vid_tiers_no_domain.pkl')

In [97]:
# Inspect the variable table restored from the pickle.
fv

Unnamed: 0,Var_name,VIF
1,s_affliate_telesales_last_3sess,1.797363
2,s_cnt_custom_agent_windows_last_7sess,1.78687
3,s_cnt_click_action_tv_last_5sess,1.657223
4,d_sum_paid_search_last_l60d,1.448976
5,s_cnt_user_server_ebzweb_wc_last_10sess,1.37404
6,s_sum_attempt_localisation_l3d,1.341238
7,s_cnt_lnk_ref_page_internet_last_3sess,1.231239
8,s_sum_page_loadtime_last_3sess,1.010045


In [100]:
# Decile table for the training split, as returned by the latest `out_ks` call
# that wrote `dec_train_f` (per-decile counts, cumulative percentages and KS).
dec_train_f

Unnamed: 0,Decile,Count,Target,Non_Target,Prob_Target,1_pct,0_pct,1_cum,0_cum,KS
0,1,264,196,68,166.658481,15.252918,5.025868,15.2529,5.02587,10.227
1,2,258,162,96,142.544091,12.607004,7.095344,27.8599,12.1212,15.7387
2,3,267,159,108,141.033536,12.373541,7.982262,40.2335,20.1035,20.13
3,4,266,145,121,135.544337,11.284047,8.943089,51.5175,29.0466,22.4709
4,5,262,127,135,128.253103,9.883268,9.977827,61.4008,39.0244,22.3764
5,6,230,114,116,110.357704,8.871595,8.57354,70.2724,47.5979,22.6744
6,7,297,120,177,139.336873,9.338521,13.08204,79.6109,60.68,18.9309
7,8,266,106,160,117.600954,8.249027,11.825573,87.8599,72.5055,15.3544
8,9,262,93,169,109.585745,7.237354,12.490761,95.0973,84.9963,10.101
9,10,266,63,203,101.40984,4.902724,15.003695,100.0,100.0,1.42109e-14


In [101]:
# Decile table for the test split, as returned by the latest `out_ks` call
# that wrote `dec_test_f` (per-decile counts, cumulative percentages and KS).
dec_test_f

Unnamed: 0,Decile,Count,Target,Non_Target,Prob_Target,1_pct,0_pct,1_cum,0_cum,KS
0,1,66,41,25,40.862879,14.089347,6.775068,14.0893,6.77507,7.31428
1,2,66,41,25,36.05423,14.089347,6.775068,28.1787,13.5501,14.6286
2,3,65,37,28,33.846335,12.714777,7.588076,40.8935,21.1382,19.7553
3,4,67,36,31,33.875313,12.371134,8.401084,53.2646,29.5393,23.7253
4,5,66,19,47,32.001423,6.52921,12.737127,59.7938,42.2764,17.5174
5,6,66,29,37,31.436311,9.965636,10.0271,69.7595,52.3035,17.4559
6,7,66,24,42,30.427457,8.247423,11.382114,78.0069,63.6856,14.3212
7,8,66,31,35,28.846167,10.652921,9.485095,88.6598,73.1707,15.4891
8,9,66,17,49,27.371548,5.841924,13.279133,94.5017,86.4499,8.05185
9,10,66,16,50,24.791539,5.498282,13.550136,100.0,100.0,1.42109e-14


In [102]:
# Decile table from the latest `out_ks` call that wrote `dec_val`.
# NOTE(review): the division cells also assign `dec_val`, so make sure the
# validation scoring cell was the last one run before reading this table.
dec_val

Unnamed: 0,Decile,Count,Target,Non_Target,Prob_Target,1_pct,0_pct,1_cum,0_cum,KS
0,1,82,59,23,52.712005,14.567901,5.47619,14.5679,5.47619,9.09171
1,2,83,53,30,45.708989,13.08642,7.142857,27.6543,12.619,15.0353
2,3,83,53,30,43.18062,13.08642,7.142857,40.7407,19.7619,20.9788
3,4,82,43,39,41.412301,10.617284,9.285714,51.358,29.0476,22.3104
4,5,82,32,50,39.780618,7.901235,11.904762,59.2593,40.9524,18.3069
5,6,83,36,47,39.772186,8.888889,11.190476,68.1481,52.1429,16.0053
6,7,82,38,44,38.28441,9.382716,10.47619,77.5309,62.619,14.9118
7,8,83,40,43,36.968678,9.876543,10.238095,87.4074,72.8571,14.5503
8,9,82,19,63,34.521949,4.691358,15.0,92.0988,87.8571,4.24162
9,10,83,32,51,32.007794,7.901235,12.142857,100.0,100.0,1.42109e-14
