# Health Insurance Lead Prediction

In [1]:
# importing libraries

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
# loading the labelled training data ('train.csv' is resolved relative to the
# notebook's working directory) and previewing the first five rows
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


In [3]:
# loading the unlabelled test data (same columns as train, minus 'Response')
# and previewing the first five rows
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,C1,156,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,50884,C4,7,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,50885,C1,564,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,50886,C3,1177,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,50887,C1,951,Owned,Individual,75,75,No,X3,,,5,22534.0


In [4]:
# performing a stratified 80:20 train / cross-validation split

# target vector and feature frame
y = train_set['Response'].values
X = train_set.drop('Response', axis=1)

# random_state is fixed so that the split (and therefore the saved CSVs, the
# trained model and the reported scores) is reproducible on Restart & Run All;
# stratify=y keeps the class ratio of 'Response' the same in both parts
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)

(40705, 13) (40705,)
(10177, 13) (10177,)


### Part 1 - Featurization

In [5]:
def _vectorizer_feature_names(vectorizer):
    '''
    Return the vocabulary learned by a fitted CountVectorizer as a list of column names.
    get_feature_names() was removed in scikit-learn 1.2, so get_feature_names_out()
    is preferred and the old method is kept only as a fallback for older versions.
    '''
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


def _impute_missing_values(frame):
    '''
    Return a copy of the dataframe with the nan-containing columns filled in.
    The caller's dataframe is left untouched (it may be a slice of another frame).
    '''
    frame = frame.copy()

    # fill values are the constants used throughout this notebook
    # (described by the original author as the most frequently occurring values)
    frame['Health Indicator'] = frame['Health Indicator'].fillna('X1')

    # '14+' is mapped to 15 and the whole column is converted to float, so the
    # column no longer mixes strings and floats; nan is then filled with 1.0
    duration = frame['Holding_Policy_Duration'].replace(to_replace='14+', value='15.0')
    frame['Holding_Policy_Duration'] = pd.to_numeric(duration).fillna(1.0)

    frame['Holding_Policy_Type'] = frame['Holding_Policy_Type'].fillna(3)
    return frame


def featurization(train_set, cv_set, test_set):

    '''
    This function is taking dataframes and performing featurization on them.
    Categorical columns are one hot encoded with a CountVectorizer fitted on the
    train dataframe only; the remaining columns are passed through as single columns.
    The input dataframes are not modified.

    Input : train dataframe, cross validation dataframe, test dataframe
    Output: Featurized train matrix, Featurized cv matrix, Featurized test matrix
            and the column names (list) in the same order as the matrix columns
    '''

    # imputing on copies: index 0 is train, 1 is cv, 2 is test
    frames = [_impute_missing_values(frame) for frame in (train_set, cv_set, test_set)]
    fitting_frame = frames[0]

    # (column name, one hot encode?) in the exact order of the output columns
    feature_plan = [
        ('ID', False),
        ('City_Code', True),
        ('Region_Code', False),
        ('Accomodation_Type', True),
        ('Reco_Insurance_Type', True),
        ('Upper_Age', False),
        ('Lower_Age', False),
        ('Is_Spouse', True),
        ('Health Indicator', True),
        ('Holding_Policy_Duration', False),
        ('Holding_Policy_Type', False),
        ('Reco_Policy_Cat', False),
        ('Reco_Policy_Premium', False),
    ]

    # list to hold all the unique names as columns
    unique_words_bow = []
    # one list of matrices per dataframe, stacked horizontally at the end
    stacked_parts = [[] for _ in frames]

    for column, one_hot in feature_plan:
        if one_hot:
            # fit has to happen only on train data
            vectorizer = CountVectorizer()
            vectorizer.fit(fitting_frame[column].values)
            unique_words_bow.extend(_vectorizer_feature_names(vectorizer))

            # the fitted vectorizer converts the text of every split to vectors
            for parts, frame in zip(stacked_parts, frames):
                parts.append(vectorizer.transform(frame[column].values).todense())
        else:
            unique_words_bow.append(column)

            # reshaping the 1-D values of shape (n,) into a column of shape (n, 1)
            for parts, frame in zip(stacked_parts, frames):
                parts.append(frame[column].values[:, np.newaxis])

    # combining (stacking, horizontally) all the matrices for train, cv and test
    train_stack, cv_stack, test_stack = (np.hstack(tuple(parts)) for parts in stacked_parts)

    # returning results
    return train_stack, cv_stack, test_stack, unique_words_bow

In [6]:
# function call: featurizing the train split, the cv split and the test data
# tr / cv / te are the stacked feature matrices, unq is the list of column names

tr, cv, te, unq = featurization(X_train, X_cv, test_data)

In [7]:
# getting shapes of results: the row counts must match the label vectors and
# all three matrices must have the same number of columns

print(tr.shape, y_train.shape)
print(cv.shape, y_cv.shape)
print(te.shape)

(40705, 59) (40705,)
(10177, 59) (10177,)
(21805, 59)


In [8]:
# wrapping each stacked matrix in a dataframe labelled with the feature names

ohe_train_data = pd.DataFrame(tr, columns=unq)
ohe_cv_data = pd.DataFrame(cv, columns=unq)
ohe_test_data = pd.DataFrame(te, columns=unq)

In [9]:
# True if any cell of the featurized train data is missing

ohe_train_data.isna().to_numpy().any()

False

In [10]:
# True if any cell of the featurized cv data is missing

ohe_cv_data.isna().to_numpy().any()

False

In [11]:
# True if any cell of the featurized test data is missing

ohe_test_data.isna().to_numpy().any()

False

In [12]:
# saving the featurized train / cv / test dataframes as csv files (without the
# dataframe index); the modeling part below reads these files back

ohe_train_data.to_csv('train_data.csv', index=False)
ohe_cv_data.to_csv('cv_data.csv', index=False)
ohe_test_data.to_csv('test_data.csv', index=False)

In [13]:
# saving the label vectors as compressed .npz archives; each archive holds a
# single array under the key 'arr_0', which is the key used when loading

np.savez_compressed("trainlabels.npz", arr_0=y_train)
np.savez_compressed("cvlabels.npz", arr_0=y_cv)

### Part 2 - Modeling (LightGBM)

In [2]:
# loading the featurized train data written by the featurization part
# and previewing the first five rows
X_train = pd.read_csv('train_data.csv')
X_train.head()

Unnamed: 0,ID,c1,c10,c11,c12,c13,c14,c15,c16,c17,...,x4,x5,x6,x7,x8,x9,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,48985,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2.0,4.0,22,9240.0
1,6635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4.0,3.0,22,15384.0
2,25243,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5.0,2.0,18,19936.0
3,42010,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1.0,4.0,22,17342.0
4,19605,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,15.0,3.0,18,14482.0


In [3]:
# loading the featurized cross validation data written by the featurization part
# and previewing the first five rows
X_cv = pd.read_csv('cv_data.csv')
X_cv.head()

Unnamed: 0,ID,c1,c10,c11,c12,c13,c14,c15,c16,c17,...,x4,x5,x6,x7,x8,x9,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,4027,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,15.0,1.0,19,30048.0
1,21417,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1.0,3.0,21,10476.0
2,27830,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,4.0,4.0,17,15990.0
3,34896,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8.0,3.0,18,12080.0
4,37010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,9.0,4.0,22,17080.0


In [4]:
# loading train labels
# the archive is opened in a with-block so that its file handle is closed;
# the array itself is read into memory on access and stays valid afterwards
with np.load('trainlabels.npz') as train_label_archive:
    y_train = train_label_archive['arr_0']
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
# loading cv labels
# the archive is opened in a with-block so that its file handle is closed;
# the array itself is read into memory on access and stays valid afterwards
with np.load('cvlabels.npz') as cv_label_archive:
    y_cv = cv_label_archive['arr_0']
y_cv

array([1, 0, 1, ..., 0, 0, 0])

In [6]:
# 'ID' is only a row identifier, so it is removed from the train and cv features
X_train = X_train.drop(columns='ID')
X_cv = X_cv.drop(columns='ID')

In [7]:
# setting parameters
# the metric is spelled with an underscore ('binary_logloss'); the hyphenated
# 'binary-logloss' is not one of LightGBM's documented metric names
param = {'objective' : 'binary',
         'metric' : 'binary_logloss',
         'num_leaves' : 31,
         'boosting' : 'dart'}

# creating Dataset for train and cross validation data
# (the variable named `test` holds the cross validation split, not the test set)
train = lgb.Dataset(X_train, label=y_train)
test = lgb.Dataset(X_cv, label=y_cv)

In [8]:
# fitting the booster for 100 rounds, with the cross validation Dataset as the validation set
lgb_model = lgb.train(params=param, train_set=train, num_boost_round=100, valid_sets=[test])

[LightGBM] [Info] Number of positive: 9767, number of negative: 30938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 774
[LightGBM] [Info] Number of data points in the train set: 40705, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239946 -> initscore=-1.152976
[LightGBM] [Info] Start training from score -1.152976


In [None]:
# saving lgb model to 'finalized_model_lgb.sav' in the working directory
# NOTE(review): this cell has no execution count in the saved notebook, while the
# next cell loads this file - make sure it runs before the model is loaded
lgb_model.save_model('finalized_model_lgb.sav')

In [9]:
# loading saved model back from disk; this replaces the in-memory lgb_model
# with a Booster built from the saved file
lgb_model = lgb.Booster(model_file='finalized_model_lgb.sav')

In [10]:
# predicting probabilities for X_train
pred_train = lgb_model.predict(X_train)
# calculating roc_auc_score on the data the model was fitted on
print('train roc_auc_score: ', roc_auc_score(y_train, pred_train))

# predicting probabilities for X_cv
pred_cv = lgb_model.predict(X_cv)
# calculating roc_auc_score on the held-out cross validation split
# (printed with the label 'test', but this is the cv split, not the test data)
print('test roc_auc_score: ', roc_auc_score(y_cv, pred_cv))

train roc_auc_score:  0.722053909752942
test roc_auc_score:  0.6679687826746651


In [11]:
# reading the featurized test data and removing the 'ID' column in one step,
# so that the columns match the ones the model was trained on
test_data = pd.read_csv('test_data.csv').drop(columns='ID')
test_data.head()

Unnamed: 0,c1,c10,c11,c12,c13,c14,c15,c16,c17,c18,...,x4,x5,x6,x7,x8,x9,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,3.0,5,11934.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.0,18,32204.8
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,4.0,17,9240.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.0,18,9086.0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.0,3.0,5,22534.0


In [12]:
# calculating probabilities for the test set, one value per row of test_data
test_pred = lgb_model.predict(test_data)
# displaying them as the cell output
test_pred

array([0.28966044, 0.30349629, 0.28169202, ..., 0.01430483, 0.24182427,
       0.14445184])

In [13]:
# reading the sample submission file and discarding its placeholder 'Response'
# column, which leaves the 'ID' column as the submission template
submit = pd.read_csv('sample_submission.csv').drop(columns='Response')
submit.head()

Unnamed: 0,ID
0,50883
1,50884
2,50885
3,50886
4,50887


In [14]:
# putting predicted probabilities to Response field
# NOTE(review): the values are assigned by position, so this assumes the rows of
# sample_submission.csv are in the same order as test_data.csv - TODO confirm
submit['Response'] = test_pred
# getting head(top 5 rows)
submit.head()

Unnamed: 0,ID,Response
0,50883,0.28966
1,50884,0.303496
2,50885,0.281692
3,50886,0.257208
4,50887,0.232433


In [15]:
# saving the submission file (ID and predicted Response) to the disk, without the index
submit.to_csv('submission_lgbm_second.csv', index=False)