# Health Insurance EDA and prediction

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.utils import resample,shuffle
from sklearn.mixture import GaussianMixture
from sklearn import mixture

### Importing Data

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv')
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

## Exploratory Data Analysis

In [None]:
# Print column dtypes and non-null counts to check for missing values.
data.info()

There are no missing values. Gender, Vehicle_Age and Vehicle_Damage need to be converted to numerical data. The id column is useful neither for EDA nor for prediction, so we will drop it.

In [None]:
# The row identifier is not used for EDA or modelling.
data = data.drop(columns='id')

# Dummy-encode Gender (drop_first removes the first level) and swap the
# encoded column(s) in for the original text column.
gender = pd.get_dummies(data['Gender'], drop_first=True)
data = pd.concat([data, gender], axis=1).drop(columns='Gender')

# Same treatment for Vehicle_Damage.
damage = pd.get_dummies(data['Vehicle_Damage'], drop_first=True)
data = pd.concat([data, damage], axis=1).drop(columns='Vehicle_Damage')

In [None]:
# Give the dummy columns ('Male', 'Yes') the names of the features they encode.
data = data.rename(columns={'Male': 'Gender', 'Yes': 'Vehicle_Damage'})

In [None]:
# Frequency of each Vehicle_Age category (still text at this point).
data['Vehicle_Age'].value_counts()

In [None]:
# Ordinal encoding of vehicle age: each bracket maps to its position in the list.
age_dict = {
    label: rank
    for rank, label in enumerate(['< 1 Year', '1-2 Year', '> 2 Years'])
}
data = data.assign(Vehicle_Age=data['Vehicle_Age'].map(age_dict))

### Data Correlation

In [None]:
# Pairwise correlation matrix, drawn as a heatmap labelled by column name.
corr = data.corr()
tick_labels = corr.columns
plot = sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)

In [None]:
def _corr_cell_css(v):
    """Return the background CSS for a single correlation value.

    Values with 0.2 < |v| < 0.5 are yellow, values with |v| > 0.5 are red,
    and everything else (including v == 1) is left unstyled.
    """
    if v == 1:
        return ""
    strength = abs(v)
    if 0.2 < strength < 0.5:
        return "background:yellow"
    if strength > 0.5:
        return "background:red"
    return ""


# Colour the correlation table row by row.
corr.style.apply(lambda row: [_corr_cell_css(v) for v in row], axis=1)

In [None]:
# Correlation of each column with the target, largest first; the target's
# own entry is removed from the listing.
response_corr = corr['Response'].sort_values(ascending=False)
response_corr.drop('Response')

## Visual Data Analysis

### Response

In [None]:
# Class balance of the target.
plot = sns.countplot(
    data=data,
    x='Response',
    palette='rocket',
)

In [None]:
# Number of rows per Response class.
data['Response'].value_counts()

In [None]:
# Share of positive responses. Response is coded 0/1, so its mean is the
# fraction of rows equal to 1; computing it from the data avoids hand-copied
# class counts that go stale if the input changes.
data['Response'].mean()

The data is very imbalanced: only about 12% of the customers responded positively.

### Age

In [None]:
# Age distribution, split by response, on a wide canvas.
figs, axes = plt.subplots(figsize=(18, 5))
plot = sns.histplot(
    data=data,
    x='Age',
    hue='Response',
    palette='rocket',
)

### Vehicle Age

In [None]:
# Response counts per encoded vehicle-age bracket.
plot = sns.countplot(
    data=data,
    x='Vehicle_Age',
    hue='Response',
    palette='rocket',
)

### Previously Insured

In [None]:
# Response counts for customers with and without previous insurance.
plot = sns.countplot(
    data=data,
    x='Previously_Insured',
    hue='Response',
    palette='rocket',
)

### Policy Sales Channel

In [None]:
# Response counts for the ten most frequent sales channels, most frequent first.
figs, axes = plt.subplots(figsize=(18, 6))
channel_counts = data['Policy_Sales_Channel'].value_counts()
top_channels = channel_counts.head(10).index
plot = sns.countplot(
    data=data,
    x=data['Policy_Sales_Channel'],
    hue='Response',
    palette='rocket',
    order=top_channels,
)

## Modeling

### Random Forest without balancing the data

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns=['Response'])
y = data['Response']

In [None]:
# Hold out 20% of the rows, fit a default random forest on the rest and
# report accuracy on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the baseline predictions on the held-out rows.
Conf_Mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
Conf_Mat

In [None]:
# Per-class precision / recall / F1 for the baseline predictions.
Class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(Class_rep)

In [None]:
# ROC curve of the baseline random forest.
# roc_curve sweeps a threshold over continuous scores; feeding it the hard
# 0/1 labels in y_pred collapses the curve to a single operating point, so
# the positive-class probability is used instead.
y_score = rf.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# ROC AUC of the baseline random forest, computed from the positive-class
# probability. Passing thresholded labels would only measure one operating
# point rather than the model's ranking quality.
roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

Even though the model has pretty high accuracy of 0.87, it does not have a satisfying ROC score. The data has to be balanced. We will try both downsampling and upsampling. 

### Balancing Data - Downsampling

In [None]:
# Collects the ROC AUC of each balanced model for the comparison table.
score_list = []

# Fresh 80/20 split; features and target of the training part are joined
# back together so rows can be resampled per class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)
train_data = pd.concat(objs=[X_train, y_train], axis=1)

In [None]:
# Balance the training split by shrinking the majority class (Response == 0)
# to the size of the minority class (Response == 1).
train_data_0 = train_data[train_data['Response'] == 0]
train_data_1 = train_data[train_data['Response'] == 1]

# The target size comes from the training split itself: a count copied from
# the full dataset is larger than the minority class that is actually
# present after the 80/20 split and would leave the classes unbalanced.
# Sampling without replacement keeps every retained majority row unique.
train_data_0_downsampled = resample(
    train_data_0,
    replace=False,
    n_samples=len(train_data_1),
    random_state=22,
)
train_data_balanced = pd.concat([train_data_0_downsampled, train_data_1])
# Seeded so the row order (and hence the fitted models) is reproducible.
train_data_balanced = shuffle(train_data_balanced, random_state=22)
# Sanity check: both classes should now have the same number of rows.
print(train_data_balanced.Response.value_counts())

X_train = train_data_balanced.drop(['Response'], axis=1)
y_train = train_data_balanced['Response']

### Random Forest 

In [None]:
# Default random forest trained on the downsampled data, scored on the
# untouched hold-out rows.
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the current predictions on the held-out rows.
Conf_Mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
Conf_Mat

In [None]:
# Per-class precision / recall / F1 for the current predictions.
Class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(Class_rep)

In [None]:
# ROC curve of the random forest trained on downsampled data.
# Continuous positive-class probabilities are required here; hard 0/1
# predictions would reduce the curve to a single operating point.
y_score = rf.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Record the ROC AUC of the downsampled random forest, computed from
# positive-class probabilities rather than thresholded labels.
score_list.append(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

### XGBoost

In [None]:
# XGBoost classifier with default parameters; later cells call fit() on this
# same instance again for each balancing strategy.
xgb = XGBClassifier()

In [None]:
# Train XGBoost on the downsampled data and predict the held-out rows.
y_pred = xgb.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix of the current predictions on the held-out rows.
Conf_Mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
Conf_Mat

In [None]:
# Per-class precision / recall / F1 for the current predictions.
Class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(Class_rep)

In [None]:
# ROC curve of XGBoost trained on downsampled data.
# Continuous positive-class probabilities are required here; hard 0/1
# predictions would reduce the curve to a single operating point.
y_score = xgb.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Record the ROC AUC of the downsampled XGBoost model, computed from
# positive-class probabilities rather than thresholded labels.
score_list.append(roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1]))

### Balancing Data - Upsampling

In [None]:
# Balance the training split by growing the minority class (Response == 1)
# to the size of the majority class (Response == 0).
train_data_0 = train_data[train_data['Response'] == 0]
train_data_1 = train_data[train_data['Response'] == 1]

# The target size comes from the training split itself: a count copied from
# the full dataset exceeds the majority class that is actually present after
# the 80/20 split and would over-represent the positive class.
train_data_1_upsampled = resample(
    train_data_1,
    replace=True,
    n_samples=len(train_data_0),
    random_state=22,
)
train_data_balanced = pd.concat([train_data_1_upsampled, train_data_0])
# Seeded so the row order (and hence the fitted models) is reproducible.
train_data_balanced = shuffle(train_data_balanced, random_state=22)
# Sanity check: both classes should now have the same number of rows.
print(train_data_balanced.Response.value_counts())

X_train = train_data_balanced.drop(['Response'], axis=1)
y_train = train_data_balanced['Response']

### Random Forest

In [None]:
# Default random forest trained on the upsampled data, scored on the
# untouched hold-out rows.
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the current predictions on the held-out rows.
Conf_Mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
Conf_Mat

In [None]:
# Per-class precision / recall / F1 for the current predictions.
Class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(Class_rep)

In [None]:
# ROC curve of the random forest trained on upsampled data.
# Continuous positive-class probabilities are required here; hard 0/1
# predictions would reduce the curve to a single operating point.
y_score = rf.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Record the ROC AUC of the upsampled random forest, computed from
# positive-class probabilities rather than thresholded labels.
score_list.append(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

### XGBoost

In [None]:
# Re-train the XGBoost instance on the upsampled data and predict the
# held-out rows.
y_pred = xgb.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix of the current predictions on the held-out rows.
Conf_Mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
Conf_Mat

In [None]:
# Per-class precision / recall / F1 for the current predictions.
Class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(Class_rep)

In [None]:
# ROC curve of XGBoost trained on upsampled data.
# Continuous positive-class probabilities are required here; hard 0/1
# predictions would reduce the curve to a single operating point.
y_score = xgb.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Record the ROC AUC of the upsampled XGBoost model, computed from
# positive-class probabilities rather than thresholded labels.
score_list.append(roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1]))

## RoC scores

In [None]:
# One-row table of the collected ROC AUC scores, one column per model.
score_table = pd.DataFrame(score_list).T.rename(
    columns={
        0: 'RFC downsampled',
        1: 'XGB downsampled',
        2: 'RFC upsampled',
        3: 'XGB upsampled',
    }
)
score_styler = score_table.style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# prefer Styler.hide(axis='index') and fall back only on older pandas.
if hasattr(score_styler, 'hide'):
    score_styler = score_styler.hide(axis='index')
else:
    score_styler = score_styler.hide_index()
score_styler

XGBoost outperformed RandomForestClassifier in both cases. Hyperparameter tuning should be done to improve the ROC score. I will not do that in this notebook, as it requires a lot of time and computing power.

## Gaussian Mixture clusters

We will use a Gaussian Mixture Model to cluster the data. Then we will model the upsampled data, with the cluster label added as a feature, using XGBoost and hopefully get a higher ROC score.

In [None]:
# Columns fed to the Gaussian mixture clustering.
cluster_columns = [
    'Age',
    'Region_Code',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
]
data_cluster = data[cluster_columns]

In [None]:
# Fit one full-covariance Gaussian mixture per candidate component count
# (1 through 7), all with the same seed.
GM_components = np.arange(1, 8)
GM_models = []
for n_clusters in GM_components:
    candidate = mixture.GaussianMixture(
        n_clusters, covariance_type='full', random_state=22
    )
    GM_models.append(candidate.fit(data_cluster.values))

In [None]:
# AIC of each candidate mixture against its number of components.
plt.figure(num=None, figsize=(15, 5))
# Score on the same plain array the models were fitted on; passing the
# DataFrame makes scikit-learn warn about feature names it never saw at fit.
plt.plot(GM_components, [m.aic(data_cluster.values) for m in GM_models])
plt.xlabel('n components')
plt.ylabel('AIC');

In [None]:
# Final 5-component mixture used to assign a cluster label to every row.
# Seeded like the candidate models: the initialisation is random, so an
# unseeded fit can produce different cluster labels on every run, which in
# turn changes the 'cluster' feature built from them.
GM = GaussianMixture(n_components=5, random_state=22)
GM.fit(data_cluster)
labels = GM.predict(data_cluster)

In [None]:
# Single-column frame holding the predicted cluster label of every row.
frame = pd.DataFrame({'cluster': labels})

In [None]:
# Attach the cluster label as an extra column, then split features / target.
train_data = pd.concat(objs=[data, frame], axis=1)
X = train_data.drop(columns=['Response'])
y = train_data['Response']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Balance ONLY the training split. train_data holds every row of the dataset,
# so resampling it would copy the held-out test rows into the training set
# and inflate every score measured on X_test (data leakage).
train_split = pd.concat([X_train, y_train], axis=1)
train_data_0 = train_split[train_split['Response'] == 0]
train_data_1 = train_split[train_split['Response'] == 1]

# Upsample the minority class to the majority size found in this split
# instead of a count copied from the full dataset.
train_data_1_upsampled = resample(
    train_data_1,
    replace=True,
    n_samples=len(train_data_0),
    random_state=22,
)
train_data_balanced = pd.concat([train_data_1_upsampled, train_data_0])
# Seeded so the row order (and hence the fitted model) is reproducible.
train_data_balanced = shuffle(train_data_balanced, random_state=22)
# Sanity check: both classes should now have the same number of rows.
print(train_data_balanced.Response.value_counts())

X_train = train_data_balanced.drop(['Response'], axis=1)
y_train = train_data_balanced['Response']

In [None]:
# Re-train XGBoost on the upsampled data that includes the cluster feature
# and evaluate it on the held-out rows.
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(accuracy_score(y_test, y_pred))

Conf_Mat = confusion_matrix(y_test, y_pred)
print(Conf_Mat)

Class_rep = classification_report(y_test, y_pred)
print(Class_rep)

# The ROC curve needs continuous positive-class probabilities; hard 0/1
# predictions would reduce it to a single operating point.
y_score = xgb.predict_proba(X_test)[:, 1]
FPR, TPR, Threshold = roc_curve(y_test, y_score)
plt.plot(FPR, TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# ROC AUC of the cluster-augmented XGBoost model, computed from
# positive-class probabilities rather than thresholded labels.
print(roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1]))

The Gaussian Mixture Model improved the ROC score from 0.7965 to 0.8138. As said before, this can be further improved with hyperparameter tuning.