## Importing required packages

In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve

# sklearn.cross_validation was superseded by sklearn.model_selection and later
# removed; prefer the new module and fall back only on old installations.
try:
    from sklearn.model_selection import KFold,cross_val_score,train_test_split
except ImportError:
    from sklearn.cross_validation import KFold,cross_val_score,train_test_split

## Loading dataset

In [2]:
# Location of the modeling dataset. Set the MODELING_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
file_path = os.environ.get('MODELING_DATA_PATH',
                           r'E:\Second Sem\Capstone\New_data\modeling_data.csv')
final_data = pd.read_csv(file_path)

## Data preparation

In [9]:
# Keep only the rows whose underwriting status is 'Quote' or 'Bound'.
# .copy() detaches the filtered frame from the loaded one so the column
# assignments below operate on a real frame (no SettingWithCopyWarning).
final_data = final_data[final_data['MAIN_UW_STATUS_L1_CALC'].isin(['Quote','Bound'])].copy()

# Parse "CREATION_DATE" and "INCEPTION_DATE" as datetimes
final_data['CREATION_DATE'] = pd.to_datetime(final_data['CREATION_DATE'])
final_data['INCEPTION_DATE'] = pd.to_datetime(final_data['INCEPTION_DATE'])

# Whole days between inception and creation, stored as floats. The vectorised
# .dt accessor replaces an element-wise `.astype` call, which raises
# AttributeError when the elements are pandas.Timedelta objects.
# NOTE(review): this column is named "Creation_Inception_Diff", but num_col
# later selects "CREATION_INCEPTION_DIFF" -- confirm which column the models
# are meant to use.
final_data["Creation_Inception_Diff"] = (
    (final_data['INCEPTION_DATE'] - final_data['CREATION_DATE']).dt.days.astype('float64')
)

# Strip the "," thousands separators from "SUM_INSURED_100" and convert to float
final_data["SUM_INSURED_100"] = (
    final_data["SUM_INSURED_100"].astype(str).str.replace(",", "").astype(np.float64)
)

## Deriving year, month and quarter from creation date

In [11]:
# Derive year, month and quarter from the creation date.
# The quarter comes from the datetime accessor rather than (month-1)/3+1:
# under Python 3 that division yields floats such as 1.33, which never match
# the integer keys 1-4 and would leave the quarter column unlabelled.
creation_date = final_data['CREATION_DATE']

quarter_labels = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}

month_labels = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
                5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
                9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

final_data['CREATION_DATE_YEAR'] = creation_date.dt.year

# Month and quarter are stored as text labels so they can be one-hot encoded
# as categorical variables later on.
final_data['CREATION_DATE_MONTH'] = creation_date.dt.month.map(month_labels)
final_data['CREATION_DATE_QUATER'] = creation_date.dt.quarter.map(quarter_labels)


In [12]:
# Shape (rows, columns) of the dataframe after filtering and feature derivation
final_data.shape

(1573, 35)

## Defining column types in dataset

In [13]:
# Column holding the underwriting status the models predict
target_col = ["MAIN_UW_STATUS_L1_CALC"]

# Categorical predictors (one-hot encoded further down)
cat_col = [
    "LCR_ECONOMIC_ACTIVITY_L1",
    "LCR_ECONOMIC_ACTIVITY_L2",
    "REGION_LEVEL_4_INSURED_STATE",
    "INSURED_CITY_UPPER",
    "BROKER_REGION_L4_STATE",
    "BROKER_LEVEL_2_BROKER_GROUP",
    "BROKER_CITY_UPPER",
    "CREATION_DATE_MONTH",
    "CREATION_DATE_QUATER",
]

# Numeric predictors (used as-is)
num_col = [
    "CREATION_INCEPTION_DIFF",
    "BROKER_HIST_WEIGHT",
    "BROKER_SUCCESS",
    "INSURED_SUCCESS",
    "SUM_INSURED_100",
]

In [14]:
# Restrict the frame to the modelling columns: categorical, numeric, then target
selected_columns = cat_col + num_col + target_col
final_data = final_data[selected_columns]

In [15]:
# One-hot encode every categorical column into dummy (indicator) variables
categorical_part = final_data[cat_col]
final_data_cat = pd.get_dummies(categorical_part)

In [17]:
# Put the dummy variables side by side with the numeric and target columns,
# then substitute the sentinel -9999 for every missing value.
numeric_and_target = final_data[num_col+target_col]
final_result = pd.concat([final_data_cat, numeric_and_target], axis=1).fillna(-9999)

In [18]:
# Binary target: 1 where the status is "Bound", 0 otherwise
final_result["Target"] = np.where(final_result["MAIN_UW_STATUS_L1_CALC"] == "Bound",1,0)

# Feature frame: every column except the raw status and the derived target.
# Kept as a DataFrame so the column names stay available for feature selection.
final_result_data_old = final_result.drop(['MAIN_UW_STATUS_L1_CALC','Target'],axis=1)

# NumPy views of the features and target. `.values` replaces the deprecated
# (and since removed) DataFrame.as_matrix() and returns the same array.
final_result_data = final_result_data_old.values

final_result_target = final_result['Target'].values


## Feature selection using Lasso

In [26]:
# Imports used by the feature-selection and logistic-regression cells
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression,Lasso

# Fit a Lasso (L1-penalised linear model) with alpha = 0.002
# NOTE(review): the selector is fitted on the complete dataset, so the
# cross-validation scores computed later on train_features_subset come from
# features chosen with knowledge of the held-out folds. Consider performing
# the selection inside each fold (e.g. with a Pipeline).
Lass = Lasso(alpha = 0.002)
Lass = Lass.fit(final_result_data,final_result_target)

# Wrap the already-fitted model; prefit=True means it is not refitted here
model_selecting = SelectFromModel(Lass, prefit=True)

# Names of the columns the selector keeps
features_selected = final_result_data_old.columns[model_selecting.get_support()]

# Feature matrix reduced to the selected columns
train_features_subset = model_selecting.transform(final_result_data)

# Function-call form of print works on both Python 2 and Python 3
print(str(train_features_subset.shape[1])+" columns selected")

39 columns selected


## Logistic regression with selected features

In [27]:
# Logistic regression estimator; n_jobs=-1 asks for all available cores
LR = LogisticRegression(n_jobs=-1)

# Mean accuracy across 10 cross-validation folds on the selected features
np.mean(cross_val_score(LR, train_features_subset, final_result_target,
                        scoring='accuracy', cv=10))

0.74486207844669505

In [29]:
# Mean area under the ROC curve across 10 cross-validation folds
np.mean(cross_val_score(LR, train_features_subset, final_result_target,
                        scoring='roc_auc', cv=10))

0.78094966815184264

In [30]:
# Mean recall across 10 cross-validation folds
np.mean(cross_val_score(LR, train_features_subset, final_result_target,
                        scoring='recall', cv=10))

0.56477477477477478

## Logistic Model results on training and validation 

In [31]:
# Inputs for the hold-out evaluation: the selected feature subset and the target
train_data, target_data = train_features_subset, final_result_target

In [53]:
# Hold out 30% of the rows for validation and train on the remaining 70%.
# NOTE(review): this split uses random_state=1111 while the other model
# sections use 111, so the validation rows differ between models -- confirm
# whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.3, random_state=1111
)

# Fit the logistic regression on the training portion
Ddt = LR.fit(X_train, y_train)

# Predicted class probabilities for the validation rows
pred_prob = Ddt.predict_proba(X_test)

# Predict 1 whenever the probability in column 1 exceeds the 0.25 cut-off
pred_val = (pred_prob[:, 1] > 0.25).astype(int)

In [54]:
# Confusion matrix with class 1 listed first, then class 0
confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

array([[194,  21],
       [166,  91]])

In [55]:
# Type II error rate: share of actual class-1 rows that were predicted as 0
a = confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

actual_positive_row = a[0]
float(actual_positive_row[1]) / (actual_positive_row[0] + actual_positive_row[1])  # type_2

0.09767441860465116

In [56]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy_score(y_true=y_test, y_pred=pred_val)

0.60381355932203384

## Decision trees

In [57]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. The seed is fixed (same value
# as the gradient boosting model) so that re-running the notebook reproduces
# the same tree and the same scores.
DT = DecisionTreeClassifier(random_state=10)

In [58]:
# Mean accuracy across 10 cross-validation folds on the full feature matrix
np.mean(cross_val_score(DT, final_result_data, final_result_target,
                        scoring='accuracy', cv=10))

0.72712464106078556

In [63]:
# Mean area under the ROC curve across 10 cross-validation folds
np.mean(cross_val_score(DT, final_result_data, final_result_target,
                        scoring='roc_auc', cv=10))

0.71803022770051861

In [62]:
# Mean recall across 10 cross-validation folds
np.mean(cross_val_score(DT, final_result_data, final_result_target,
                        scoring='recall', cv=10))

0.71702702702702692

## Decision Tree Model results on training and validation

In [61]:
# Inputs for the hold-out evaluation: the full feature matrix and the target
train_data, target_data = final_result_data, final_result_target

In [179]:
# Hold out 30% of the rows for validation and train on the remaining 70%
X_train, X_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.3, random_state=111
)

# Fit the decision tree on the training portion
DT_Model = DT.fit(X_train, y_train)

# Predicted class probabilities for the validation rows
pred_prob = DT_Model.predict_proba(X_test)

# Predict 1 whenever the probability in column 1 exceeds the 0.10 cut-off
pred_val = (pred_prob[:, 1] > 0.10).astype(int)

In [180]:
# Confusion matrix with class 1 listed first, then class 0
confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

array([[157,  73],
       [ 82, 160]])

In [181]:
# Type II error rate: share of actual class-1 rows that were predicted as 0
a = confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

actual_positive_row = a[0]
float(actual_positive_row[1]) / (actual_positive_row[0] + actual_positive_row[1])  # type_2

0.3173913043478261

In [182]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy_score(y_true=y_test, y_pred=pred_val)

0.67161016949152541

## Random forest

In [84]:
# RandomForestClassifier is already imported in the notebook's first cell, so
# it is not imported again here.
# 500 trees trained on all available cores. The seed is fixed (same value as
# the gradient boosting model) so that re-running the notebook reproduces the
# same forest and the same scores.
RF = RandomForestClassifier(n_estimators = 500,n_jobs=-1,random_state=10)

In [85]:
# Mean accuracy across 10 cross-validation folds on the full feature matrix
np.mean(cross_val_score(RF, final_result_data, final_result_target,
                        scoring='accuracy', cv=10))

0.77852752745923748

In [86]:
# Mean area under the ROC curve across 10 cross-validation folds
np.mean(cross_val_score(RF, final_result_data, final_result_target,
                        scoring='roc_auc', cv=10))

0.8477430844008863

In [87]:
# Mean recall across 10 cross-validation folds
np.mean(cross_val_score(RF, final_result_data, final_result_target,
                        scoring='recall', cv=10))

0.66886486486486485

## Random Forest on training and validation 

In [183]:
# Splitting dataset into 70% training and 30% validation.
# The split is made explicitly here (same data and seed as the decision-tree
# and GBM sections) so this cell no longer depends on X_train/X_test left
# behind by whichever cell happened to run before it.
X_train,X_test,y_train,y_test = train_test_split(train_data,target_data,test_size = 0.3,random_state = 111)

# Fitting training data
RF_Model = RF.fit(X_train,y_train)

# Predicting probabilities for the validation rows
pred_prob = RF_Model.predict_proba(X_test)

# Predict 1 whenever the probability in column 1 exceeds the 0.10 cut-off
pred_val = np.where(pred_prob[:,1] > 0.10,1,0)

In [184]:
# Confusion matrix with class 1 listed first, then class 0
confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

array([[221,   9],
       [208,  34]])

In [185]:
# Type II error rate: share of actual class-1 rows that were predicted as 0
a = confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

actual_positive_row = a[0]
float(actual_positive_row[1]) / (actual_positive_row[0] + actual_positive_row[1])  # type_2

0.0391304347826087

In [186]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy_score(y_true=y_test, y_pred=pred_val)

0.5402542372881356

## Gradient Boosting Machines

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters settled on after tuning; random_state is fixed at 10.
gbm_params = dict(
    learning_rate=0.1,
    n_estimators=170,
    max_depth=21,
    min_samples_split=100,
    subsample=0.8,
    random_state=10,
    max_features='sqrt',
)

gbm = GradientBoostingClassifier(**gbm_params)


In [93]:
# Mean accuracy across 10 cross-validation folds on the full feature matrix
np.mean(cross_val_score(gbm, final_result_data, final_result_target,
                        scoring='accuracy', cv=10))

0.75311799047790351

In [94]:
# Mean area under the ROC curve across 10 cross-validation folds
np.mean(cross_val_score(gbm, final_result_data, final_result_target,
                        scoring='roc_auc', cv=10))

0.85150413387233836

In [95]:
# Mean recall across 10 cross-validation folds
np.mean(cross_val_score(gbm, final_result_data, final_result_target,
                        scoring='recall', cv=10))

0.70095495495495486

## GBM on training and validation 

In [171]:
# Hold out 30% of the rows for validation and train on the remaining 70%
X_train, X_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.3, random_state=111
)

# Fit the gradient boosting model on the training portion
GBM_Model = gbm.fit(X_train, y_train)

# Predicted class probabilities for the validation rows
pred_prob = GBM_Model.predict_proba(X_test)

# Predict 1 whenever the probability in column 1 exceeds the 0.10 cut-off
pred_val = (pred_prob[:, 1] > 0.10).astype(int)

In [172]:
# Confusion matrix with class 1 listed first, then class 0
confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

array([[215,  15],
       [183,  59]])

In [194]:
# Type II error rate: share of actual class-1 rows that were predicted as 0
a = confusion_matrix(y_true=y_test, y_pred=pred_val, labels=[1, 0])

actual_positive_row = a[0]
float(actual_positive_row[1]) / (actual_positive_row[0] + actual_positive_row[1])  # type_2

0.06521739130434782

In [195]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy_score(y_true=y_test, y_pred=pred_val)

0.58050847457627119