In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Notebook-wide display defaults: floats render with thousands separators and
# four decimals, and seaborn draws with its "pastel" palette.
pd.set_option("display.float_format", "{:,.4f}".format)
sns.set_palette("pastel")

<h1> Reading the Train and Test Dataset </h1>

<p>The test dataset does not contain target variables. It will ultimately be the dataset we predict for submission.
   The train dataset will be split into a training and validation set using train_test_split. </p>

<h3> What is the validation set used for? </h3>
    
<p>It is taken from a part of the training dataset (20% in this case).
   It will be used to tune the parameters of the model and to avoid overfitting. </p>


In [None]:
# Load the labelled training frame and the unlabelled test frame from the
# Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/health-insurance-cross-sell-prediction/train.csv',
        '../input/health-insurance-cross-sell-prediction/test.csv',
    )
)

## At first glance

Categorical Data: Gender, Driving_License, Region_Code, Previously_Insured, Vehicle_Age, Vehicle_Damage, Policy_Sales_Channel

Continuous Data: Age,Annual_Premium, Vintage

Target Data: Response



In [None]:
# Preview the first five rows of the raw training data.
train.head(n=5)

## Check if there are any null values

If there are any, we can either:
1. Fill them with a placeholder value that represents missing data, or
2. Remove the affected rows

In [None]:
####################
# Null Data Analysis
####################
# One row per training column; each column of the result holds the number of
# missing values found in the corresponding dataset.
nullDF = train.isna().sum().to_frame(name='Train')
nullDF['Test'] = test.isna().sum()
nullDF

## Encoding Categorical Data

ML/AI algorithms require their inputs to be numerical.



In [None]:
# Non-null row count per column for each Vehicle_Age category.
train.groupby('Vehicle_Age').count()

In [None]:
###########################
# Encoding Categorical Data
###########################

from sklearn.preprocessing import LabelEncoder

# Fit each encoder on the training data ONLY and reuse that mapping on the
# test data. Re-fitting on the test set can assign different integer codes to
# the same label whenever the two sets do not contain the same categories.
# transform() raises if the test set holds a label unseen during fitting.
le=LabelEncoder()
train['Gender']=le.fit_transform(train['Gender'])
test['Gender']=le.transform(test['Gender'])
print("Gender Encoding Classes:", le.classes_)

train['Vehicle_Damage']=le.fit_transform(train['Vehicle_Damage'])
test['Vehicle_Damage']=le.transform(test['Vehicle_Damage'])
print("Vehicle Damage Encoding Classes:", le.classes_)

def ordered_encoding(lst,x):
    """Return the position of ``x`` in ``lst``; raises ValueError if absent."""
    return lst.index(x)

# Vehicle_Age is ordinal, so encode it by its position in this ordered list.
lst = ['< 1 Year','1-2 Year','> 2 Years']
train['Vehicle_Age']=train['Vehicle_Age'].apply(lambda x : ordered_encoding(lst,x))
test['Vehicle_Age']=test['Vehicle_Age'].apply(lambda x : ordered_encoding(lst,x))

# Region and channel codes are cast to integers.
train['Region_Code']=train['Region_Code'].astype(int)
test['Region_Code']=test['Region_Code'].astype(int)

train['Policy_Sales_Channel']=train['Policy_Sales_Channel'].astype(int)
test['Policy_Sales_Channel']=test['Policy_Sales_Channel'].astype(int)

# NOTE: 'id' is intentionally left in both frames here. The original
# `train.drop(columns=['id'])` / `test.drop(columns=['id'])` calls discarded
# their result and therefore did nothing; the column is dropped by the later
# cells that build X and the final test frame.

train.head()

## Defining our training features and target variable

In [None]:
# Target is the Response column; features are everything except the row id
# and the target itself.
y = train['Response']
X = train.drop(columns=['id', 'Response'])

In [None]:
#########################
# Distribution of Target
#########################

# Pass the vector by keyword: seaborn deprecated positional data vectors in
# 0.11 and stopped treating the first positional argument as `x` afterwards.
sns.countplot(x=y)

# Look the counts up by class label. value_counts() is ordered by frequency,
# so unpacking it positionally swaps the classes whenever class 1 is the
# more frequent one (and fails outright if only one class is present).
class_counts=y.value_counts()
count_0=int(class_counts.get(0,0))
count_1=int(class_counts.get(1,0))
total=count_0+count_1

# Report real percentages (0-100) so that the "%" suffix is correct; the
# original printed the raw fraction (e.g. "0.88%").
percent_0=round(100*count_0/total,2)
percent_1=round(100*count_1/total,2)
print("Not Interested: ",count_0,f"{percent_0}%")
print("Interested:     ",count_1,f" {percent_1}%")

## Highly Skewed Dataset
As seen from the countplot above, we have a dataset that recorded 88% of customers not being interested in Vehicle Insurance.

If we train on this dataset as it is, the model will probably be biased towards the majority class and predict "not interested" far more often than it should.

We will be oversampling with Synthetic Minority Oversampling Technique(SMOTE) and cleaning with Tomek links to produce a balanced dataset.

In [None]:
from imblearn.combine import SMOTETomek

# Over-sample the minority class with SMOTE, then remove Tomek links.
smt=SMOTETomek(random_state=42)

# `fit_sample` was a deprecated alias that has been removed from
# imbalanced-learn; `fit_resample` is the supported method name.
# NOTE(review): the resampling is applied before the train/validation split,
# so synthetic rows derived from validation rows can end up in the training
# split. The reported validation metrics are therefore likely optimistic.
X,y=smt.fit_resample(X,y)

In [None]:
###################################
# Distribution of resampled Labels
###################################

# Pass the vector by keyword: seaborn deprecated positional data vectors in
# 0.11 and stopped treating the first positional argument as `x` afterwards.
sns.countplot(x=y)

# Look the counts up by class label. value_counts() is ordered by frequency,
# which is not a reliable class order once the classes are nearly balanced.
class_counts=y.value_counts()
count_0=int(class_counts.get(0,0))
count_1=int(class_counts.get(1,0))
total=count_0+count_1

# Report real percentages (0-100) so that the "%" suffix is correct.
percent_0=round(100*count_0/total,2)
percent_1=round(100*count_1/total,2)
print("Not Interested: ",count_0,f"{percent_0}%")
print("Interested:     ",count_1,f" {percent_1}%")

In [None]:
# Preview the first five rows of the resampled feature matrix.
X.head(n=5)

## Gender (Categorical)

In [None]:
gender=X['Gender']

sns.countplot(x=gender,hue=y)
plt.legend(labels=["not interested","interested"])

# Look the counts up by encoded label instead of unpacking value_counts(),
# which is ordered by frequency and mislabels the groups whenever code 1 is
# the more frequent one.
# Assumes LabelEncoder mapped Female -> 0 and Male -> 1 (sorted label order);
# confirm against the classes printed by the encoding cell.
gender_counts=gender.value_counts()
female=int(gender_counts.get(0,0))
male=int(gender_counts.get(1,0))
print("Number of female:",female)
print("Number of male:",male)

plt.show()

#### Gender - Response #####

# The chi-square test of independence must be run on observed COUNTS.
# Feeding it a normalized table (proportions summing to 1) shrinks the
# statistic towards zero and makes the p-value meaningless.
observed=pd.crosstab(gender, y)

# Named chi2_stat so the imported sklearn.feature_selection.chi2 is not shadowed.
(chi2_stat,p,dof,_expected)=stats.chi2_contingency(observed.values)

gender_=['Gender',chi2_stat,p,dof,gender.var()]

# Form a Contingency Table (normalized, for display only) #
ctb=pd.crosstab(gender, y, normalize=True)
(ctb)

### Distribution:
The number of female and male (by percentage) do not differ by much. 

### Observation:
Female customers tend to be interested, while male customers tend to be uninterested.


## Age (Discrete)

In [None]:
# Split the Age column by target class.
age_interested=X.loc[y[y==1].index,'Age']
age_notinterested=X.loc[y[y==0].index,'Age']

# Overall distribution: box plot on top, histogram/KDE below, shared x-axis.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.boxplot(x=X['Age'],ax=ax_box)
sns.distplot(X['Age'],ax=ax_hist)
ax_box.set(xlabel='')
ax_box.set(title="Distribution of customer's age")
plt.show()

# Per-class distributions overlaid on a single axis.
sns.distplot(age_notinterested, color='salmon')
sns.distplot(age_interested, color='lightblue')
plt.title("Distribution of customer's age and their interest")
plt.legend(labels=["not interested","interested"])
plt.show()

### Distribution:
The customers are generally ranging from young to middle age adults. However, the younger adults are predominantly uninterested in the vehicle insurance.

### Analysis:
It is very likely that the younger adults have just started working and might not have the sufficient purchasing power to own a vehicle.

## Driving License (Categorical)


In [None]:
dl=X['Driving_License']

sns.countplot(x=dl,hue=y)
plt.legend(labels=["not interested","interested"])

# Look the counts up by value (1 = has a license, 0 = does not) rather than
# relying on the frequency ordering of value_counts().
dl_counts=dl.value_counts()
dl1=int(dl_counts.get(1,0))
dl0=int(dl_counts.get(0,0))
print("Number of customers that have a driving license:", dl1)
print("Number of customers that do not have a driving license:", dl0)
print("Variance:", dl.var())
plt.show()

# Form a Contingency Table (observed counts) #
ctb=pd.crosstab(dl, y)

# Named chi2_stat so the imported sklearn.feature_selection.chi2 is not shadowed.
(chi2_stat,p,dof,_expected)=stats.chi2_contingency(ctb.values)

driving_license=['Driving_License',chi2_stat,p,dof,dl.var()]
(ctb)

### Distribution:
The dataset contains a heavily skewed amount of customers that have a driving license as compared to customers that do not. The poor spread in data will most likely result in a low variance.

### Action:
The variance of this feature (0.0019) is extremely low, so the feature will be removed.

## Region Code (Discrete)

In [None]:
print("Variance:", X['Region_Code'].var())

# Overall distribution: box plot on top, histogram/KDE below, shared x-axis.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.boxplot(x=X['Region_Code'],ax=ax_box)
sns.distplot(X['Region_Code'],ax=ax_hist)
ax_box.set(xlabel='')
ax_box.set(title="Distribution of customer's region code")
plt.show()


# Per-class distributions overlaid on a single axis.
rc_interested=X.loc[y[y==1].index,'Region_Code']
rc_notinterested=X.loc[y[y==0].index,'Region_Code']
sns.distplot(rc_notinterested, color='salmon')
sns.distplot(rc_interested, color='lightblue')
plt.title("Distribution of customer's region code and their interest")
plt.legend(labels=["not interested","interested"])
plt.show()

### Distribution
The distribution of customers across regions is fairly consistent, except for region code 28, where there is an enormous spike in the customer data collected.

### Analysis
The spike might be due to a convenient method of collecting data from that region. We can probe this hypothesis further by looking at the relationship between region code and policy sales channel.


In [None]:
# 4x4 grid of Policy_Sales_Channel histograms, one panel per region code from
# 20 to 35 (filled row by row). Region 28 is drawn in black to stand out.
f, ax = plt.subplots(4,4,figsize=(29,29))

for region_code, panel in zip(range(20, 36), ax.flat):
    channels = X.loc[X['Region_Code'] == region_code, 'Policy_Sales_Channel']
    hist_style = {'color': 'black'} if region_code == 28 else {}
    panel.hist(channels, **hist_style)
    panel.title.set_text("Region Code: " + str(region_code))

## Previously Insured (Categorical)

In [None]:
#### Previously_Insured - Response #####
pi=X['Previously_Insured']

print("Variance:",pi.var())

# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.countplot(x=pi,hue=y)
plt.legend(labels=["not interested","interested"])

plt.show()

## Categorical - Categorical ##
# Form a Contingency Table (observed counts) #
ctb=pd.crosstab(pi, y)

# Named chi2_stat so the imported sklearn.feature_selection.chi2 is not shadowed.
(chi2_stat,p,dof,_expected)=stats.chi2_contingency(ctb.values)

previosly_insured=['Previously_Insured',chi2_stat,p,dof,pi.var()]
(ctb)


### Distribution:
There is a larger amount of customers that were not previously insured, relative to customers that were previously insured. Being insured in this instance refers to having their vehicle insured.

### Observation:
For the customers that were not previously insured, they were more likely to be interested in the insurance. Whereas, for the customers that were previously insured, they were predominantly uninterested.

## Vehicle Age (Categorical)

In [None]:
#### Vehicle_Age - Response #####
va=X['Vehicle_Age']

sns.countplot(x=va,hue=y)
plt.legend(labels=["not interested","interested"])
plt.show()

# The chi-square test of independence must be run on observed COUNTS and on
# the whole table. The original passed proportions and only the first two
# category rows, although Vehicle_Age is encoded into three categories, so
# both the statistic and the degrees of freedom were wrong.
observed=pd.crosstab(va, y)

# Named chi2_stat so the imported sklearn.feature_selection.chi2 is not shadowed.
(chi2_stat,p,dof,_expected)=stats.chi2_contingency(observed.values)

vehicle_age=['Vehicle_Age',chi2_stat,p,dof,va.var()]

# Form a Contingency Table (normalized, for display only) #
ctb=pd.crosstab(va, y, normalize=True)
(ctb)

### Distribution
There is an overwhelming amount of customer data collected with a vehicle that is between 1-2 years old. That is followed by customers with a vehicle that is less than a year old and a small amount of customers with a vehicle that is more than 2 years old.

### Analysis
The insurance company is assuming that newer car owners will likely be more interested in getting their vehicles covered.

In [None]:
#### Vehicle_Damage - Response #####
vd=X['Vehicle_Damage']

# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.countplot(x=vd,hue=y)
plt.legend(labels=["not interested","interested"])
plt.show()

# Form a Contingency Table (observed counts) #
ctb=pd.crosstab(vd, y)

# Named chi2_stat so the imported sklearn.feature_selection.chi2 is not shadowed.
(chi2_stat,p_,dof,_expected)=stats.chi2_contingency(ctb.values)
vehicle_damage=['Vehicle_Damage',chi2_stat,p_,dof,vd.var()]
(ctb)

## Vehicle Damage (Categorical)

### Analysis
Customers with past vehicle damage were more prone to be interested in the vehicle insurance while customers with no past vehicle damage are more likely to be uninterested.

## Annual Premium (Continuous)

In [None]:
print("Variance:", X['Annual_Premium'].var())

# Overall distribution: box plot on top, histogram/KDE below, shared x-axis.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.boxplot(x=X['Annual_Premium'],ax=ax_box)
sns.distplot(X['Annual_Premium'],ax=ax_hist)
ax_box.set(xlabel='')
ax_box.set(title="Distribution of Annual Premium paid")
plt.show()


ap_interested=X.loc[y[y==1].index,'Annual_Premium']
ap_notinterested=X.loc[y[y==0].index,'Annual_Premium']

# Each title now names the group that is actually plotted (the original had
# the two titles swapped).
sns.distplot(ap_notinterested, color='salmon')
plt.title("Distribution of Annual Premium paid for customers that were not interested")
plt.show()
sns.distplot(ap_interested, color='lightblue')
plt.title("Distribution of Annual Premium paid for customers that were interested")
plt.show()

## Policy Sales Channel (Discrete)


In [None]:
print("Variance:", X['Policy_Sales_Channel'].var())

# Overall distribution: box plot on top, histogram/KDE below, shared x-axis.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.boxplot(x=X['Policy_Sales_Channel'],ax=ax_box)
sns.distplot(X['Policy_Sales_Channel'],ax=ax_hist)
ax_box.set(xlabel='')
ax_box.set(title="Distribution of Policy Sales Channel")
plt.show()


psc_interested=X.loc[y[y==1].index,'Policy_Sales_Channel']
psc_notinterested=X.loc[y[y==0].index,'Policy_Sales_Channel']

# Each title now names the group that is actually plotted (the original had
# the two titles swapped).
sns.distplot(psc_notinterested, color='salmon')
plt.title("Distribution of Policy Sales Channel for customers that were not interested")
plt.show()
sns.distplot(psc_interested, color='lightblue')
plt.title("Distribution of Policy Sales Channel for customers that were interested")
plt.show()

print("Most used channel:")
# head(3) is explicitly positional; `[:3]` on a Series whose index holds
# integer channel codes is easy to misread as a label slice.
print(X['Policy_Sales_Channel'].value_counts().head(3))

### Analysis
* The 3 most used policy sales channels are channels 26, 124 and 152.
* Customers reached through channels 26 and 124 were predominantly uninterested in the insurance.
* Customers reached through channel 152 were predominantly interested in the insurance.

## Vintage (Discrete)

In [None]:
print("Variance:", X['Vintage'].var())

# Overall distribution: box plot on top, histogram/KDE below, shared x-axis.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
# Pass the vector by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are no longer interpreted as `x` in later releases.
sns.boxplot(x=X['Vintage'],ax=ax_box)
sns.distplot(X['Vintage'],ax=ax_hist)
ax_box.set(xlabel='')
# Titles below were copy-pasted from the Policy Sales Channel cell; they now
# describe Vintage and name the group that is actually plotted.
ax_box.set(title="Distribution of Vintage")
plt.show()


# Dedicated names so the Annual Premium series (ap_*) are not overwritten.
vintage_interested=X.loc[y[y==1].index,'Vintage']
vintage_notinterested=X.loc[y[y==0].index,'Vintage']

sns.distplot(vintage_notinterested, color='salmon')
plt.title("Distribution of Vintage for customers that were not interested")
plt.show()
sns.distplot(vintage_interested, color='lightblue')
plt.title("Distribution of Vintage for customers that were interested")
plt.show()

### Distribution

There is a fairly even distribution of customers with regards to the number of days the customer is associated with the company


In [None]:
# Collect the per-feature chi-square results into one table, one row per
# categorical feature.
chi2summary = (
    gender_,
    driving_license,
    previosly_insured,
    vehicle_age,
    vehicle_damage,
)
chi2DF = pd.DataFrame(
    chi2summary,
    columns=['FeatureName', 'Chi2', 'p-val', 'DegreeOfFreedom', 'Variance'],
)
chi2DF

## Categorical Feature Summary

1. Obtain the variance of the feature
    * Driving_License will be removed as the variance is close to zero
    
2. Evaluate strength of relationship between feature and target (alpha=0.05)
    * Both Gender and Vehicle_Age have a p-val that is >0.05, but not overwhelmingly larger, so I will look at the feature importance to support the decision to remove or keep each feature



In [None]:
# Extra-trees is a randomized ensemble; seed it so the importance ranking is
# the same on every run.
etc=ExtraTreesClassifier(random_state=42)
etc.fit(X,y)

# Impurity-based importance of each feature, labelled by column name.
feat_imp=pd.Series(etc.feature_importances_,
                  index=X.columns)
feat_imp.nlargest(10).plot(kind='barh')
plt.title("ExtraTreesClassifier feature importance")
plt.xlabel("Importance")
plt.show()

## Feature Selection

* With the additional information from the ExtraTreesClassifier, we can see that Driving_License is useless for the classification and hence will be removed.

* Gender has a low score for feature importance and coupled with a p-val > 0.05 we will be removing this feature as well

In [None]:
# Remove the rejected features from both frames; the test frame also loses
# its row identifier so that its columns match X.
rejected_features = ['Driving_License', 'Gender']
X = X.drop(columns=rejected_features)
test = test.drop(columns=['id'] + rejected_features)

## Splitting the dataset

The train dataset will be split into xtrain,xtest,ytrain,ytest where we fit (xtrain,ytrain) into our models for training and validate using (xtest,ytest).

The distribution of Response is fairly random, so no extra shuffling is needed before splitting (train_test_split shuffles the rows by default anyway).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for validation. The seed is fixed so
# that the partition, and every metric computed from it, is reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV, cross_val_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score,roc_curve

## Additional Functions

1. Plotting a roc curve since it is a nice visualization tool to evaluate the model

2. A generic function to fit a classifier and extract the evaluation metrics required

In [None]:
#https://stackabuse.com/understanding-roc-curves-with-python/
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve on the current axes and show the figure.

    ``fpr`` and ``tpr`` are the false/true positive rates to plot. The dashed
    diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()
    
    
def build_model(clf):
    """Fit ``clf`` on the training split and summarise its validation results.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest``, ``ytest``,
    ``X`` and ``y``. As a side effect, plots the ROC curve of the fitted
    model on the validation split.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and
        ``predict_proba``.

    Returns
    -------
    clf : estimator
        The same object, fitted on ``(xtrain, ytrain)``.
    summary : list
        ``[name, precision_0, recall_0, f1_0, support_0, precision_1,
        recall_1, f1_1, support_1, n(pred 0 & true 0), n(pred 0 & true 1),
        n(pred 1 & true 0), n(pred 1 & true 1), auc, mean 10-fold
        cross_val_score, accuracy]``.
    """
    classifier_name=type(clf).__name__

    clf.fit(xtrain,ytrain)

    ypred = clf.predict(xtest)

    accuracy=accuracy_score(ytest,ypred)
    # Probability of the positive class (second column) drives the ROC/AUC.
    probs = (clf.predict_proba(xtest))[:,1]
    auc=roc_auc_score(ytest, probs)
    # cross_val_score clones the estimator, so the fitted ``clf`` is untouched.
    kfold=cross_val_score(clf,X,y,cv=10)
    kfold_acc = kfold.mean()

    # sklearn metrics take (y_true, y_pred) in that order. The original call
    # passed them reversed, which swaps precision with recall and reports the
    # number of predicted (not actual) samples as support.
    cr=(classification_report(ytest,ypred,output_dict=True))
    cr0=cr['0']
    cr1=cr['1']
    # Rows are true labels, columns are predicted labels.
    cm=confusion_matrix(ytest,ypred)

    # The four confusion-matrix entries are emitted in the same order as
    # before (pred 0/true 0, pred 0/true 1, pred 1/true 0, pred 1/true 1), so
    # the column labels applied by the summary table keep their meaning.
    summary=([classifier_name ,cr0['precision'], cr0['recall'], cr0['f1-score'], cr0['support'],cr1['precision'], cr1['recall'], cr1['f1-score'], cr1['support'], cm[0,0], cm[1,0], cm[0,1], cm[1,1], auc, kfold_acc, accuracy])

    fpr, tpr, _ = roc_curve(ytest,probs)
    plot_roc_curve(fpr, tpr)

    return clf,summary

## Hyperparameter Tuning

It takes quite some time to run the grid/random searches on Kaggle, so I ran them in Visual Studio Code on my PC to make use of a better CPU.

In [None]:
# hp={
#     'criterion'         : ['gini','entropy'],
#     'min_samples_split' : [x for x in range(2,25)],
#     'max_depth'         : [x for x in range(90,100)]
# }

# dtc_tune=RandomizedSearchCV(estimator=DecisionTreeClassifier(),
#                            param_distributions = hp, 
#                            scoring='roc_auc',
#                             cv = 5, 
#                             verbose=1,  
#                             n_jobs = -1,
#                            return_train_score=True)

# dtc_tune.fit(X,y)

# dtc_tune.best_params_

In [None]:
# hp={
#     'criterion'         : ['entropy','gini'],
#     'max_features'      : [None, 'sqrt','log2'],
#     'min_samples_split' : [x for x in range(2,11)]
# }

# rfc_tune=RandomizedSearchCV(estimator=RandomForestClassifier(),
#                       param_distributions = hp, 
#                       cv = 5,
#                       scoring='roc_auc',
#                       verbose=1,  
#                       n_jobs = -1,
#                       return_train_score=True)

# rfc_tune.fit(X,y)
# rfc_tune.best_params_

In [None]:
# hp={
#     'n_estimators' : [x for x in range(50,150,10)],
#     'learning_rate': [x for x in range(1,10)]
# }

# abc_tune=RandomizedSearchCV(estimator=XGBClassifier(),
#                            param_distributions = hp, 
#                            scoring='roc_auc',
#                            cv = 5,
#                            verbose=1,
#                            n_jobs = -1,
#                            return_train_score=True)

# abc_tune.fit(X,y)
# abc_tune.best_params_

In [None]:
# hp={
#     'n_estimators' : [x for x in range(50,150,10)],
#     'learning_rate': [x for x in range(1,10)]
# }

# abc_tune=RandomizedSearchCV(estimator=AdaBoostClassifier(),
#                            param_distributions = hp, 
#                            scoring='roc_auc',
#                            cv = 5,
#                            verbose=1,
#                            n_jobs = -1,
#                            return_train_score=True)

# abc_tune.fit(X,y)
# abc_tune.best_params_

In [None]:
# Decision tree with the hyperparameters chosen for this notebook.
dtc = DecisionTreeClassifier(
    min_samples_split=21,
    max_depth=97,
    criterion='gini',
)

dtc_model, dtc_summary = build_model(clf=dtc)

In [None]:
# This forest produces the final submission, so seed it: without a fixed
# random_state every run yields a different model and different predictions.
rfc = RandomForestClassifier(n_estimators=130, random_state=42)

rfc_model, rfc_summary = build_model(rfc)

In [None]:
# XGBoost classifier with the library's default settings.
xgb = XGBClassifier()

xgb_model, xgb_summary = build_model(clf=xgb)

In [None]:
# AdaBoost ensemble of 100 estimators.
abc = AdaBoostClassifier(n_estimators=100)

abc_model, abc_summary = build_model(clf=abc)

In [None]:
# LightGBM classifier with the library's default settings.
lgbm = LGBMClassifier()

lgbm_model, lgbm_summary = build_model(clf=lgbm)

In [None]:
# One row per fitted model, indexed by model name. The column labels follow
# the order of the list returned by build_model.
summary_columns = [
    'ModelName',
    'precision_0', 'recall_0', 'f1_score_0', 'support_0',
    'precision_1', 'recall_1', 'f1_score_1', 'support_1',
    'TP', 'FP', 'FN', 'TN',
    'AUC', 'cross_val_score', 'Accuracy',
]
model_summary = pd.DataFrame(
    [dtc_summary, rfc_summary, xgb_summary, abc_summary, lgbm_summary],
    columns=summary_columns,
).set_index('ModelName')
model_summary

In [None]:
# Predict the test set with the fitted random forest and write the labels
# into the sample-submission frame.
submission = pd.read_csv('../input/health-insurance-cross-sell-prediction/sample_submission.csv')
prediction = rfc.predict(test)
submission['Response'] = prediction

In [None]:
###################################
# Distribution of predicted Target
###################################

# Pass the vector by keyword: seaborn deprecated positional data vectors in
# 0.11 and stopped treating the first positional argument as `x` afterwards.
sns.countplot(x=submission['Response'])

# Look the counts up by class label. value_counts() is ordered by frequency,
# and unpacking it into two names crashes when only one class was predicted.
class_counts=submission['Response'].value_counts()
count_0=int(class_counts.get(0,0))
count_1=int(class_counts.get(1,0))
total=count_0+count_1

# Report real percentages (0-100) so that the "%" suffix is correct.
percent_0=round(100*count_0/total,2)
percent_1=round(100*count_1/total,2)
print("Not Interested: ",count_0,f"{percent_0}%")
print("Interested:     ",count_1,f" {percent_1}%")