# Cross-sell Prediction: Predict Health Insurance Owners who will be interested in Vehicle Insurance

In [None]:
# importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

## Importing dataset

In [None]:

# Read the training split from the Kaggle input directory and preview it.
train_csv_path = '../input/health-insurance-cross-sell-prediction/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Summarise the columns of the training frame.
data.info()

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# Descriptive statistics of the training frame.
data.describe()

In [None]:
# Number of missing values per column.
data.isnull().sum()

# EDA

In [None]:
# Count the rows per Response value and write each count above its bar.
# The column is passed as `x=` because newer seaborn releases reserve the
# first positional argument for `data`.
ax = sns.countplot(x=data['Response'])
for p in ax.patches:
    # Bar heights can come back as floats, which the 'd' format code rejects.
    ax.annotate('{:d}'.format(int(p.get_height())), (p.get_x()+0.25, p.get_height()+1))


### Customer age distribution
#### Most of the customers are in the 20-30 age group

In [None]:
# Age distribution as a density histogram with a KDE overlay.
# `distplot` is deprecated in seaborn; `histplot` with kde=True and
# stat='density' is the replacement that keeps the same kind of figure.
sns.histplot(x=data['Age'], kde=True, stat='density')

#### But positive responses come from varied age groups; the 35-50 age group shows a high conversion rate

In [None]:
# Row counts per age, split by Response.
plt.figure(figsize=(20,8))
# Keyword arguments: newer seaborn treats the first positional argument as `data`.
ax = sns.countplot(x=data['Age'], hue=data['Response'])
# The y-axis holds row counts, so label it as such.
plt.ylabel('Count')
plt.show()

In [None]:
# Annual_Premium plotted against Age, one point per row.
sns.scatterplot(data=data, x='Age', y='Annual_Premium')

In [None]:
# Count the rows per Gender and write each count above its bar.
# `x=` is required because newer seaborn treats the first positional
# argument as `data`.
ax = sns.countplot(x=data['Gender'])
for p in ax.patches:
    # Bar heights can come back as floats, which the 'd' format code rejects.
    ax.annotate('{:d}'.format(int(p.get_height())), (p.get_x()+0.25, p.get_height()+1))

In [None]:
# Number of licence holders per gender.
# `.count()` only counted the non-null rows of each gender, which repeats the
# plain Gender count; summing counts the holders instead.
# NOTE(review): assumes Driving_License is a 0/1 flag — confirm against the data.
df = data.groupby(['Gender'])['Driving_License'].sum().to_frame().reset_index()
df

In [None]:
# Bar chart of the per-gender Driving_License aggregate held in `df`.
sns.catplot(data=df, kind="bar", x="Gender", y="Driving_License")

#### Most positive responses for vehicle insurance come from customers who were not previously insured

In [None]:
# Row counts per Previously_Insured value, split by Response, with the
# count written above every bar.
plt.figure(figsize=(10,8))
# Keyword arguments: newer seaborn treats the first positional argument as `data`.
ax = sns.countplot(x=data['Previously_Insured'], hue=data['Response'])
# The y-axis holds row counts, so label it as such.
plt.ylabel('Count')
for p in ax.patches:
    # Bar heights can come back as floats, which the 'd' format code rejects.
    ax.annotate('{:d}'.format(int(p.get_height())), (p.get_x()+0.15, p.get_height()+1))

In [None]:
# Count the rows per Vehicle_Age category and write each count above its bar.
# `x=` is required because newer seaborn treats the first positional
# argument as `data`.
ax = sns.countplot(x=data['Vehicle_Age'])
for p in ax.patches:
    # Bar heights can come back as floats, which the 'd' format code rejects.
    ax.annotate('{:d}'.format(int(p.get_height())), (p.get_x()+0.25, p.get_height()+1))

In [None]:
# Row counts per Vehicle_Age category, split by Response.
# Keyword arguments: newer seaborn treats the first positional argument as `data`.
sns.countplot(x=data['Vehicle_Age'], hue=data['Response'])

#### Customers with a vehicle age of 1-2 years responded positively to vehicle insurance

In [None]:
# Frequency of each Vehicle_Damage value.
data['Vehicle_Damage'].value_counts()

In [None]:
# Row counts per Vehicle_Damage value.
# `x=` is required because newer seaborn treats the first positional
# argument as `data`.
sns.countplot(x=data['Vehicle_Damage'])

#### Customers whose vehicle was damaged responded positively

In [None]:
# Row counts per Vehicle_Damage value, split by Response.
# Keyword arguments: newer seaborn treats the first positional argument as `data`.
sns.countplot(x=data['Vehicle_Damage'], hue=data['Response'])

#### Customers with a driving license responded positively more often than those who don't own one

In [None]:
# Row counts per Driving_License value, split by Response, with the count
# written above every bar.
plt.figure(figsize=(10,8))
# Keyword arguments: newer seaborn treats the first positional argument as `data`.
ax = sns.countplot(x=data['Driving_License'], hue=data['Response'])
# The y-axis holds row counts, so label it as such.
plt.ylabel('Count')
for p in ax.patches:
    # Bar heights can come back as floats, which the 'd' format code rejects.
    ax.annotate('{:d}'.format(int(p.get_height())), (p.get_x()+0.15, p.get_height()+1))

In [None]:
# Box plot of Annual_Premium to expose outliers.
# `x=` keeps the horizontal single-variable plot; newer seaborn treats the
# first positional argument as `data`.
sns.boxplot(x=data['Annual_Premium'])

In [None]:
# Vintage distribution as a density histogram with a KDE overlay.
# `distplot` is deprecated in seaborn; `histplot` with kde=True and
# stat='density' is the replacement that keeps the same kind of figure.
sns.histplot(x=data['Vintage'], kde=True, stat='density')

In [None]:
# Column labels of the training frame.
data.columns

In [None]:
# Frequency of each Response label, kept in `c` for the share computed next.
response_column = data['Response']
c = response_column.value_counts()

In [None]:
# Percentage of rows whose Response label is 1.
positives, negatives = c[1], c[0]
(positives / (positives + negatives)) * 100

#### The data is imbalanced: only 12% of responses are positive

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Text columns are excluded explicitly: pandas >= 2.0 no longer drops them
# silently in DataFrame.corr() and raises instead.
corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True)

## Data preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features in place.
Scale = StandardScaler()

Numfeature = [
    'Annual_Premium',
    'Vintage',
]

# StandardScaler standardises each column independently, so one call over all
# numeric features yields the same values as a per-column loop. Unlike the
# loop, it leaves `Scale` fitted on every feature instead of only the last one.
data[Numfeature] = Scale.fit_transform(data[Numfeature])

In [None]:
# Preview the frame after scaling.
data.head()

In [None]:
# `traindata` starts as another name for the same frame object (no copy is made).
traindata=data

In [None]:
# Preview the training frame.
traindata.head()

In [None]:
# Give the already-binary column a name in the same style as the dummy columns.
column_renames = {"Previously_Insured": "Previously_Insured_Yes"}
traindata = traindata.rename(columns=column_renames)

In [None]:
# One-hot encode the categorical columns (first level dropped) and append
# the indicator columns to the training frame.
categorical_columns = ['Gender', 'Vehicle_Damage', 'Vehicle_Age']
train_dummies = pd.get_dummies(data[categorical_columns], drop_first=True)

traindata = pd.concat([traindata, train_dummies], axis=1)
traindata.head()

In [None]:
# Remove the identifier and the raw categorical columns now that their
# dummy-encoded versions are present.
traindata = traindata.drop(columns=['id', 'Gender', 'Vehicle_Age', 'Vehicle_Damage'])
traindata.head()

## Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors.
split = traindata
X = split.drop(columns=['Response'])
y = split['Response'].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.20
)

for title, frame in (
    ('Shape of training dataset ', X_train),
    ('Shape of test dataset ', X_test),
):
    print(title, frame.shape)

## Model Building- Random Forest( hyperparameter tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored by the grid search.
params = dict(
    max_depth=[1, 2, 5, 10],
    max_features=[2, 3, 4, 5],
    n_estimators=[10, 30, 50, 100, 200],
)

In [None]:
# Base random forest; the seed is fixed and all cores are used.
classifier_rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)

In [None]:
# 4-fold cross-validated search over `params`, ranked by ROC AUC.
grid_search = GridSearchCV(
    estimator=classifier_rf,
    param_grid=params,
    scoring="roc_auc",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [None]:
%%time
# Run the cross-validated grid search on the training split.
grid_search.fit(X_train,y_train)

In [None]:
# Keep the best estimator found by the grid search.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# Feature importances of the selected forest.
rf_best.feature_importances_

In [None]:
from sklearn import metrics 

In [None]:
# Accuracy of the tuned forest on the rows it was trained on.
train_predictions = rf_best.predict(X_train)
metrics.accuracy_score(y_train, train_predictions)

In [None]:
# Confusion matrix on the training split.
c = metrics.confusion_matrix(y_true=y_train, y_pred=rf_best.predict(X_train))
c

In [None]:
# Inputs for the ROC analysis on the training split.
actuals = y_train
# ROC/AUC need a continuous score, so use the positive-class probability
# (column 1 of predict_proba). The hard 0/1 labels from predict() collapse
# the curve to a single operating point.
probs = rf_best.predict_proba(X_train)[:, 1]

In [None]:
# ROC operating points for every threshold (no intermediate points dropped).
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=actuals,
    y_score=probs,
    drop_intermediate=False,
)

In [None]:
# ROC AUC on the training split, scored with the positive-class probability;
# scoring the hard labels from predict() understates the AUC.
metrics.roc_auc_score(y_train, rf_best.predict_proba(X_train)[:, 1])

In [None]:
    # Plot the ROC curve from `fpr`/`tpr`, with the AUC in the legend and the
    # chance diagonal for reference.
    auc_score = metrics.roc_auc_score(actuals, probs)
    curve_label = 'ROC curve (area = %0.2f)' % auc_score
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Accuracy of the tuned forest on the held-out split.
holdout_predictions = rf_best.predict(X_test)
metrics.accuracy_score(y_test, holdout_predictions)

In [None]:
# ROC AUC on the held-out split, scored with the positive-class probability;
# scoring the hard labels from predict() understates the AUC.
metrics.roc_auc_score(y_test, rf_best.predict_proba(X_test)[:, 1])

In [None]:
# Confusion matrix on the held-out split.
c = metrics.confusion_matrix(y_true=y_test, y_pred=rf_best.predict(X_test))
c

### As we see, the ROC AUC is 0.50 and the accuracy scores of train and test vary by a large value, so the model is not learning properly because of the class imbalance. To balance the classes we use SMOTE.

#     

### Using SMOTE to balance class and make model learn using random forest grid search CV

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:

# Oversample the minority class of the training split only.
# A fixed random_state makes the synthetic samples, and therefore every
# score computed below, reproducible between runs.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Re-run the same grid search on the SMOTE-balanced training data.
# NOTE(review): the data was oversampled before cross-validation, so a
# validation fold may contain synthetic points derived from rows in the
# training folds — consider an imblearn Pipeline so SMOTE runs inside each fold.
grid_search.fit(X_resampled, y_resampled)

In [None]:
# Best estimator from the grid search fitted on the resampled data.
rf = grid_search.best_estimator_
rf

In [None]:
# Accuracy on the resampled training data.
resampled_predictions = rf.predict(X_resampled)
metrics.accuracy_score(y_resampled, resampled_predictions)

In [None]:
# Confusion matrix on the resampled training data.
c = metrics.confusion_matrix(y_true=y_resampled, y_pred=rf.predict(X_resampled))
c

In [None]:
# Inputs for the ROC analysis on the resampled training data.
actuals = y_resampled
# ROC/AUC need a continuous score, so use the positive-class probability
# (column 1 of predict_proba) rather than the hard 0/1 labels from predict().
probs = rf.predict_proba(X_resampled)[:, 1]

In [None]:
# ROC operating points and AUC for `actuals` vs `probs`, then the curve with
# the chance diagonal for reference.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=actuals,
    y_score=probs,
    drop_intermediate=False,
)
auc_score = metrics.roc_auc_score(actuals, probs)
curve_label = 'ROC curve (area = %0.2f)' % auc_score

plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr, label=curve_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Accuracy of the SMOTE-trained forest on the held-out split.
smote_holdout_predictions = rf.predict(X_test)
metrics.accuracy_score(y_test, smote_holdout_predictions)

In [None]:
# ROC AUC of the SMOTE-trained forest on the held-out split, scored with the
# positive-class probability; hard labels from predict() understate the AUC.
metrics.roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

In [None]:
# Apply the same preprocessing to test.csv so the model can be applied to it

In [None]:
# Read the competition test file and preview it.
test_csv_path = '../input/health-insurance-cross-sell-prediction/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Scale the test features with statistics learned from the training data.
# Calling fit_transform on test.csv would re-fit the scaler and put the test
# set on a different scale from the one the model was trained on.
# The raw training columns are re-read because `data` already holds scaled values.
train_numeric = pd.read_csv(
    '../input/health-insurance-cross-sell-prediction/train.csv',
    usecols=Numfeature,
)
Scale.fit(train_numeric[Numfeature])
test[Numfeature] = Scale.transform(test[Numfeature])

In [None]:
# Rename the column exactly as was done for the training frame.
test_column_renames = {"Previously_Insured": "Previously_Insured_Yes"}
test = test.rename(columns=test_column_renames)

In [None]:
# One-hot encode the categorical columns of the test frame (first level
# dropped) and append the indicator columns.
test_categorical_columns = ['Gender', 'Vehicle_Damage', 'Vehicle_Age']
test_dummies = pd.get_dummies(test[test_categorical_columns], drop_first=True)

test = pd.concat([test, test_dummies], axis=1)
test.head()

In [None]:
# Keep the test ids aside for the submission frame.
submission=test['id']
submission.head()

In [None]:
# Start the submission frame with the id column only.
submit = pd.DataFrame(data={'id': submission})
submit

In [None]:
# Remove the identifier and the raw categorical columns from the test frame.
test = test.drop(columns=['id', 'Gender', 'Vehicle_Age', 'Vehicle_Damage'])
test.head()

In [None]:
# Predict class labels for the preprocessed test frame with the model held in `rf`.
prediction=rf.predict(test)

In [None]:
# Display the predicted labels.
prediction

In [None]:
# Wrap the predicted labels in a one-column frame named 'Response'.
prediction = pd.DataFrame(data={'Response': prediction})
prediction.head()

In [None]:
# Attach the predicted labels to the submission frame (pandas aligns them on the index).
submit['Response']=prediction['Response']

In [None]:
# Preview the final submission frame.
submit.head()