# Health Insurance Cross Sell Prediction - Classification
This notebook builds a classification engine that predicts, from a customer's health insurance information, whether that customer is also interested in vehicle insurance. It is a binary classification problem on a dataset that contains around 391109 data points. Basically, the engine works as follows: after the user has provided the customers' health insurance information, the engine cleans the data and tries to predict whether each customer is interested in vehicle insurance or not.

The Notebook is organised as follows.

1.Exploratory Data Analysis(EDA)


2.Data Preprocessing
* Label Encoding 
* One Hot Encoding 
* Response Coding
* Normalization of Data


3.Modeling
* MultinomialNB
* Logistic Regression
* Random Forest


4.Conclusion

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve,auc
from sklearn import metrics

In [None]:
# Load the competition's train and test CSV files into DataFrames (train first, then test).
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/health-insurance-cross-sell-prediction/train.csv",
        "/kaggle/input/health-insurance-cross-sell-prediction/test.csv",
    )
)

In [None]:
# Structural overview of the training set, followed by a preview of its first rows.
n_train_rows, n_train_columns = train_dataset.shape
print('Number of data points : ', n_train_rows)
print('Number of features : ', n_train_columns)
print('Features : ', train_dataset.columns.values)
train_dataset.head()

In [None]:
# Structural overview of the test set, followed by a preview of its first rows.
n_test_rows, n_test_columns = test_dataset.shape
print('Number of data points : ', n_test_rows)
print('Number of features : ', n_test_columns)
print('Features : ', test_dataset.columns.values)
test_dataset.head()

In [None]:
# Number of missing values per column of the training set (isna is the alias of isnull).
train_dataset.isna().sum()

In [None]:
# Relative frequency of each target class; shows how imbalanced the problem is.
train_dataset.Response.value_counts(normalize=True)

# **Exploratory Data Analysis**

In [None]:
import seaborn as sns

# Bar chart of how many customers fall into each Response class.
sns.countplot(data=train_dataset, x='Response')

In [None]:
# Customer counts per gender, split by Response class.
sns.countplot(data=train_dataset, x='Gender', hue='Response')

In [None]:
# Age distribution for each Response class, drawn on a single facet.
# FacetGrid's `size` argument was renamed to `height`; current seaborn releases
# reject `size`, so `height` is used here.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; it is kept
# so the figure looks the same, but consider migrating to `histplot`/`kdeplot`.
age_grid = sns.FacetGrid(train_dataset, hue="Response", height=8)
age_grid.map(sns.distplot, "Age")
age_grid.add_legend()
plt.show()

People aged between 30 and 60 are more likely to be interested in vehicle insurance, and they are more numerous among the positive responses than people aged between 18 and 30.

In [None]:
# Response counts, split by whether the customer already had vehicle insurance.
sns.countplot(data=train_dataset, x='Response', hue='Previously_Insured')

From the chart you can see that about **175000** people were **previously insured** and their current response is **NO**.

In [None]:
# Two side-by-side count plots of Vehicle_Damage sharing one y axis:
# left panel split by Previously_Insured, right panel split by Response.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for panel, hue_column in zip(axes, ('Previously_Insured', 'Response')):
    sns.countplot(data=train_dataset, x='Vehicle_Damage', hue=hue_column, ax=panel)

In [None]:
# Two side-by-side count plots of Vehicle_Age sharing one y axis:
# left panel split by Previously_Insured, right panel split by Response.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
for panel, hue_column in zip(axes, ('Previously_Insured', 'Response')):
    sns.countplot(data=train_dataset, x='Vehicle_Age', hue=hue_column, ax=panel)

In [None]:
# Mean annual premium per age, compared with the overall mean premium.
# The mean is restricted to the Annual_Premium column: at this point the frame still
# holds string columns (Gender, Vehicle_Age, Vehicle_Damage), and averaging the whole
# frame raises a TypeError on pandas >= 2.0.
mean_premium_by_age = (
    train_dataset.groupby("Age")["Annual_Premium"].mean().reset_index()
)
ax = sns.lineplot(
    data=mean_premium_by_age, x="Age", y="Annual_Premium", label="Mean premium by age"
)
# The overall mean is a single number, so it is drawn as a horizontal reference line
# instead of being passed as a scalar `y` to lineplot.
ax.axhline(
    train_dataset["Annual_Premium"].mean(),
    color="tab:orange",
    linestyle="--",
    label="Overall mean premium",
)
ax.legend()


# Preprocessing

# Label Encoding

In [None]:

# Replace the string categories by integer codes. For every column the encoder is
# fitted on the training values and the same mapping is then applied to the test values.
labelencoder = LabelEncoder()
for categorical_column in ("Gender", "Vehicle_Damage", "Vehicle_Age"):
    train_dataset[categorical_column] = labelencoder.fit_transform(
        train_dataset[categorical_column].values
    )
    test_dataset[categorical_column] = labelencoder.transform(
        test_dataset[categorical_column].values
    )

# One Hot Encoding

In [None]:
# Low-cardinality categorical columns that are expanded into indicator columns.
One_Hot_Categorical_features = ["Gender","Driving_License","Previously_Insured","Vehicle_Age","Vehicle_Damage"]
train_one_hot_encoding_features = train_dataset[One_Hot_Categorical_features].values
test_one_hot_encoding_features = test_dataset[One_Hot_Categorical_features].values

# The encoder is fitted on the training data ONLY and then applied to the test data.
# Re-fitting on the test set (as was done before) can yield a different column
# layout whenever the two sets do not contain exactly the same categories.
# handle_unknown="ignore" encodes a category unseen during fit as all zeros instead
# of raising. The default sparse output is densified with .toarray(), which avoids
# the `sparse` / `sparse_output` keyword that differs between scikit-learn versions.
onehotencoder = OneHotEncoder(handle_unknown="ignore")
train_one_hot_encoded_features = onehotencoder.fit_transform(train_one_hot_encoding_features).toarray()
test_one_hot_encoded_features = onehotencoder.transform(test_one_hot_encoding_features).toarray()

In [None]:
# Number of distinct values of the two high-cardinality categorical columns.
for high_cardinality_column in ("Region_Code", "Policy_Sales_Channel"):
    print(f"{high_cardinality_column}:", train_dataset[high_cardinality_column].nunique())

# Response Encoding

There are 53 unique values in region code and 155 unique values in policy sales channel. If we apply one hot encoding to these features, it creates 208 dimensions, which means that two dimensions are transformed into 208 dimensions. If we apply any ensemble technique like random forest or XGBoost, the base models are decision trees. If the first split is on either region code or policy sales channel, 53 or 155 branches will be created, which will increase computational latency. To minimize computational latency, we use the response encoding technique.

In [None]:
# code for response coding with Laplace smoothing.
# alpha : used for laplace smoothing
# feature: ['Region_Code', 'Policy_Sales_Channel']

def get_response_coded_feature_dict(alpha, feature, df, target="Response", classes=(0, 1)):
    """Build the response-coding lookup table for one categorical feature.

    For every distinct value ``v`` of ``feature`` the table holds the
    Laplace-smoothed class probabilities, one entry per class in ``classes``::

        P(target = k | feature = v) = (count(v, k) + alpha) / (count(v) + K * alpha)

    where ``K = len(classes)``. Smoothing with the number of classes (rather than
    the number of distinct feature values) makes each vector sum to 1 when every
    row's target is one of ``classes``.

    Parameters
    ----------
    alpha : float
        Laplace smoothing strength.
    feature : str
        Name of the categorical column to encode.
    df : pandas.DataFrame
        Data holding both ``feature`` and ``target``.
    target : str, default "Response"
        Name of the class-label column.
    classes : sequence, default (0, 1)
        Class labels, in the order of the returned probability vectors.

    Returns
    -------
    dict
        Maps each feature value to a list of ``len(classes)`` probabilities.
    """
    value_count = df[feature].value_counts()
    n_classes = len(classes)
    # One pass per class instead of one full-frame filter per (value, class) pair.
    per_class_count = {
        cls: df.loc[df[target] == cls, feature].value_counts().to_dict()
        for cls in classes
    }
    feature_dict = dict()
    for value, denominator in value_count.items():
        feature_dict[value] = [
            (per_class_count[cls].get(value, 0) + alpha) / (denominator + n_classes * alpha)
            for cls in classes
        ]
    return feature_dict


def get_response_coded_feature(alpha, feature, train_df, test_df):
    """Response-code ``feature`` for the train and the test frame.

    The lookup table is learned from ``train_df`` only. A value that is not in
    the table (unseen category or missing value) is encoded with a uniform
    distribution over the classes.

    Parameters
    ----------
    alpha : float
        Laplace smoothing strength.
    feature : str
        Name of the categorical column to encode.
    train_df : pandas.DataFrame
        Training data; must contain ``feature`` and the ``Response`` column.
    test_df : pandas.DataFrame
        Data to encode with the table learned on ``train_df``; only ``feature``
        is read.

    Returns
    -------
    tuple of (list, list)
        Encoded rows for ``train_df`` and ``test_df``; each row is a list of
        class probabilities.
    """
    response_coded_feature_dict = get_response_coded_feature_dict(alpha, feature, train_df)
    # Vector length = number of classes (2 when the table is empty).
    n_classes = len(next(iter(response_coded_feature_dict.values()), [0, 0]))
    uniform_fallback = [1 / n_classes] * n_classes

    def _encode(values):
        # Plain dict lookups over the column; far cheaper than DataFrame.iterrows().
        return [response_coded_feature_dict.get(value, uniform_fallback) for value in values]

    train_response_coded_feature = _encode(train_df[feature].tolist())
    test_response_coded_feature = _encode(test_df[feature].tolist())
    return train_response_coded_feature, test_response_coded_feature



# Response-code the two high-cardinality features (Laplace smoothing alpha = 1).
# The train and test encodings have different numbers of rows, so each one is
# converted to its own array: wrapping the (train, test) pair in a single
# np.array() builds a ragged array, which raises a ValueError on NumPy >= 1.24.
_train_region_rows, _test_region_rows = get_response_coded_feature(
    1, "Region_Code", train_dataset, test_dataset)
train_region_code_feature_responseCoding = np.array(_train_region_rows)
test_region_code_feature_responseCoding = np.array(_test_region_rows)

_train_channel_rows, _test_channel_rows = get_response_coded_feature(
    1, "Policy_Sales_Channel", train_dataset, test_dataset)
train_Policy_Sales_Channel_feature_responseCoding = np.array(_train_channel_rows)
test_Policy_Sales_Channel_feature_responseCoding = np.array(_test_channel_rows)


In [None]:
# Assemble the training design matrix column-wise: one-hot block, the two
# response-coded blocks, then the raw numeric columns. The target goes into `y`.
numeric_training_block = train_dataset[["Age","Annual_Premium","Vintage"]].values
X = np.hstack((
    train_one_hot_encoded_features,
    train_region_code_feature_responseCoding,
    train_Policy_Sales_Channel_feature_responseCoding,
    numeric_training_block,
))
y = train_dataset["Response"].values

In [None]:
# Rescale every feature to [0, 1]. The fitted scaler is kept because it is applied
# to the competition test features later on.
# NOTE(review): the scaler is fitted on all rows before the train/validation split,
# and re-running this cell re-fits it on already scaled data.
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(X)
X = minmaxscaler.transform(X)

In [None]:
# Hold out 30% of the labelled rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Modeling

In [None]:
def batch_predict(clf, data, batch_size=1000):
    """Return positive-class probabilities for ``data``, computed in batches.

    ROC/AUC computations need probability estimates of the positive class, not
    the predicted labels, hence ``predict_proba(...)[:, 1]``.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict_proba``.
    data : array-like with ``shape`` and row slicing
        Feature matrix, one sample per row.
    batch_size : int, default 1000
        Number of rows sent to ``predict_proba`` per call.

    Returns
    -------
    list
        One probability per row of ``data`` (empty list for empty input).

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    y_data_pred = []
    # range() with a step already covers the final, shorter batch, so no separate
    # handling of the remainder is needed.
    for start in range(0, data.shape[0], batch_size):
        y_data_pred.extend(clf.predict_proba(data[start:start + batch_size])[:, 1])
    return y_data_pred

In [None]:
def find_best_threshold(threshould, fpr, tpr):
    """Pick the ROC threshold that maximises ``tpr * (1 - fpr)``.

    The product is largest where the true-positive rate is high and the
    false-positive rate is low. The maximum and the chosen threshold are printed,
    and the threshold is returned.
    """
    trade_off = tpr * (1 - fpr)
    t = threshould[np.argmax(trade_off)]
    print("The maximum value of tpr*(1-fpr)", max(trade_off), "for threshold", np.round(t,3))
    return t
 
def predict_with_best_t(proba, threshould):
    """Turn probabilities into 0/1 labels: 1 when the probability is >= ``threshould``."""
    return [1 if probability >= threshould else 0 for probability in proba]

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Candidate smoothing strengths 1e-4 ... 1e3 and their log10 values for the x axis.
grid_params = {'alpha': [10**x for x in range(-4, 4)]}
alpha_log = [math.log(alpha_value, 10) for alpha_value in grid_params["alpha"]]

# 10-fold cross-validated search scored by ROC AUC; train scores are kept for the plot.
MultinomialNB_model = GridSearchCV(
    MultinomialNB(),
    grid_params,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)
MultinomialNB_model.fit(X_train, y_train)

results = pd.DataFrame.from_dict(MultinomialNB_model.cv_results_).sort_values(['param_alpha'])

# Lines first, then the individual points, in the order train -> CV.
score_curves = (("mean_train_score", 'Train AUC'), ("mean_test_score", 'CV AUC'))
for score_column, curve_label in score_curves:
    plt.plot(alpha_log, results[score_column].values, label=curve_label)
for score_column, curve_label in score_curves:
    plt.scatter(alpha_log, results[score_column].values, label=curve_label + ' points')

plt.legend()
plt.xlabel("Alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("AUC PLOT")
plt.grid()
plt.show()
print(MultinomialNB_model.best_estimator_)

In [None]:
# Refit Multinomial Naive Bayes with the chosen smoothing value and draw its ROC curves.
# NOTE(review): alpha is hard-coded; confirm it matches the best estimator printed by
# the grid search above.
MultinomialNB_model = MultinomialNB(alpha=0.0001)
MultinomialNB_model.fit(X_train, y_train)

# Positive-class probabilities for the ROC curves, hard labels for the reports below.
y_train_pred = batch_predict(MultinomialNB_model, X_train)
y_test_pred = batch_predict(MultinomialNB_model, X_test)
y_pred = MultinomialNB_model.predict(X_test)

train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)

plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# The axes of a ROC curve are the false/true positive rates, not a hyper-parameter
# and an AUC value.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve - Multinomial Naive Bayes")
plt.grid()
plt.show()

In [None]:
# Accuracy and confusion-matrix heat map of the hold-out predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print("Accuracy:", accuracy_score(y_test, y_pred))

sns.set(font_scale=1.4, color_codes=True, palette="deep")
sns.heatmap(cm_df, annot=True, annot_kws={"size": 16}, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("True Value")

In [None]:
# Per-class precision / recall / F1 on the hold-out split.
# Labels and their display names are given explicitly and in the same order:
# `Series.unique()` returns values in order of first appearance, which is not
# guaranteed to match the sorted label order used for the report rows and can
# therefore swap the class names.
print(metrics.classification_report(y_test, y_pred,
                                    labels=[0, 1],
                                    target_names=["0", "1"]))

# Logistic Regression

In [None]:
# Candidate inverse regularisation strengths 1e-4 ... 1e3 and their log10 values.
grid_params = {'C': [10**x for x in range(-4, 4)]}
c_log = [math.log(c_value, 10) for c_value in grid_params['C']]

# 5-fold cross-validated search scored by ROC AUC; class weights offset the imbalance.
LogisticRegression_model = GridSearchCV(
    LogisticRegression(class_weight='balanced'),
    grid_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
LogisticRegression_model.fit(X_train, y_train)

results = pd.DataFrame.from_dict(LogisticRegression_model.cv_results_).sort_values(['param_C'])

# Lines first, then the individual points, in the order train -> CV.
score_curves = (("mean_train_score", 'Train AUC'), ("mean_test_score", 'CV AUC'))
for score_column, curve_label in score_curves:
    plt.plot(c_log, results[score_column].values, label=curve_label)
for score_column, curve_label in score_curves:
    plt.scatter(c_log, results[score_column].values, label=curve_label + ' points')

plt.legend()
plt.xlabel("C: hyperparameter")
plt.ylabel("AUC")
plt.title("AUC PLOT")
plt.grid()
plt.show()
print(LogisticRegression_model.best_estimator_)

In [None]:
# Refit Logistic Regression with the chosen C and draw its ROC curves.
# NOTE(review): C is hard-coded; confirm it matches the best estimator printed by
# the grid search above.
LogisticRegression_model = LogisticRegression(C=100, class_weight='balanced')
LogisticRegression_model.fit(X_train, y_train)

# Positive-class probabilities for the ROC curves, hard labels for the reports below.
y_train_pred = batch_predict(LogisticRegression_model, X_train)
y_test_pred = batch_predict(LogisticRegression_model, X_test)
y_pred = LogisticRegression_model.predict(X_test)

train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)

plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# The axes of a ROC curve are the false/true positive rates, not a hyper-parameter
# and an AUC value.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve - Logistic Regression")
plt.grid()
plt.show()


In [None]:
# Accuracy and confusion-matrix heat map of the hold-out predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print("Accuracy:", accuracy_score(y_test, y_pred))

sns.set(font_scale=1.4, color_codes=True, palette="deep")
sns.heatmap(cm_df, annot=True, annot_kws={"size": 16}, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("True Value")

In [None]:
# Per-class precision / recall / F1 on the hold-out split.
# Labels and their display names are given explicitly and in the same order:
# `Series.unique()` returns values in order of first appearance, which is not
# guaranteed to match the sorted label order used for the report rows and can
# therefore swap the class names.
print(metrics.classification_report(y_test, y_pred,
                                    labels=[0, 1],
                                    target_names=["0", "1"]))

# Random Forest Classifier

In [None]:
# Hyper-parameter grid for the Random Forest: tree depth x number of trees.
max_depth = [1,5,10,50]
n_estimators = [5,10,100,500]
grid_params = {'max_depth': max_depth, 'n_estimators': n_estimators}

# 10-fold cross-validated search. It is scored by ROC AUC, like the Naive Bayes and
# Logistic Regression searches, so the models are compared on one metric; scoring by
# accuracy contradicted the AUC-based plot and conclusion.
# random_state makes the forests repeatable across runs.
RandomFoest_model = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42),
                                 grid_params,
                                 scoring='roc_auc', cv=10, n_jobs=-1, return_train_score=True)
RandomFoest_model.fit(X_train, y_train)

# One row per (max_depth, n_estimators) combination with its mean train / CV scores.
results = pd.DataFrame.from_dict(RandomFoest_model.cv_results_)
print(RandomFoest_model.best_estimator_)

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # kept: registers the '3d' projection on older matplotlib
import matplotlib

# Read the evaluated hyper-parameter combinations from the grid-search results
# instead of re-typing them, so the coordinates always line up with the scores.
max_depth = results["param_max_depth"].astype(int).tolist()
n_estimators = results["param_n_estimators"].astype(int).tolist()
mean_train_score = list(results["mean_train_score"].values)
mean_test_score = list(results["mean_test_score"].values)

fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111, projection='3d')

# Red: mean training score, blue: mean cross-validation score.
ax.scatter(max_depth, n_estimators, mean_train_score, c='r', marker='o', label='Train score')
ax.scatter(max_depth, n_estimators, mean_test_score, c='b', marker='o', label='CV score')

ax.set_xlabel('max_depth ')
ax.set_ylabel('n_estimators')
# Label the z axis with the metric the search was actually scored with.
ax.set_zlabel(str(RandomFoest_model.scoring))
ax.legend()
print(RandomFoest_model.best_estimator_)

In [None]:
# Refit the Random Forest with the chosen hyper-parameters and draw its ROC curves.
# random_state makes the fitted forest (and everything derived from it) repeatable.
# NOTE(review): max_depth / n_estimators are hard-coded; confirm they match the best
# estimator printed by the grid search above.
RandomFoest_model = RandomForestClassifier(class_weight='balanced', max_depth=10,
                                           n_estimators=500, random_state=42)
RandomFoest_model.fit(X_train, y_train)

# Positive-class probabilities for the ROC curves, hard labels for the reports below.
y_train_pred = batch_predict(RandomFoest_model, X_train)
y_test_pred = batch_predict(RandomFoest_model, X_test)
y_pred = RandomFoest_model.predict(X_test)

train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)

plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# The axes of a ROC curve are the false/true positive rates, not a hyper-parameter
# and an AUC value.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve - Random Forest")
plt.grid()
plt.show()

In [None]:
# Accuracy and confusion-matrix heat map of the hold-out predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print("Accuracy:", accuracy_score(y_test, y_pred))

sns.set(font_scale=1.4, color_codes=True, palette="deep")
sns.heatmap(cm_df, annot=True, annot_kws={"size": 16}, fmt="d", cmap="YlGnBu")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("True Value")

In [None]:
# Per-class precision / recall / F1 on the hold-out split.
# Labels and their display names are given explicitly and in the same order:
# `Series.unique()` returns values in order of first appearance, which is not
# guaranteed to match the sorted label order used for the report rows and can
# therefore swap the class names.
print(metrics.classification_report(y_test, y_pred,
                                    labels=[0, 1],
                                    target_names=["0", "1"]))

# Conclusion

The AUC score for Random Forest is better than that of Naive Bayes and Logistic Regression.

# Applying Random Forest to test data

In [None]:
# Assemble the competition test matrix with the same column layout as the training
# matrix: one-hot block, the two response-coded blocks, then the raw numeric columns.
numeric_test_block = test_dataset[["Age","Annual_Premium","Vintage"]].values
test_data = np.hstack((
    test_one_hot_encoded_features,
    test_region_code_feature_responseCoding,
    test_Policy_Sales_Channel_feature_responseCoding,
    numeric_test_block,
))

In [None]:
# Apply the scaler fitted on the training features to the test features.
# NOTE(review): re-running this cell scales the already scaled matrix a second time.
test_data = minmaxscaler.transform(X=test_data)

In [None]:
# Predict the Response label for every test customer with the fitted Random Forest,
# store it on the test frame and keep the (id, Response) pair as the submission table.
prediction = RandomFoest_model.predict(test_data)
test_dataset["Response"] = prediction.tolist()
sample_submission = test_dataset.loc[:, ["id", "Response"]]