## This notebook solves the problem Health Insurance Cross Sell Prediction:
https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib inline
matplotlib.style.use('ggplot')

### Read the dataset

In [None]:
# Load the two CSV files from the Kaggle input directory into DataFrames.
train=pd.read_csv("../input/health-insurance-cross-sell-prediction/train.csv")
test=pd.read_csv("../input/health-insurance-cross-sell-prediction/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column names, dtypes and non-null counts for the test frame.
test.info()

In [None]:
# NOTE(review): this repeats the preceding test.info() cell verbatim; one of the two can be removed.
test.info()

### We can see here that Test Data has all null values in Gender Column.

## Checking the details of rows, columns 

In [None]:
# Column names, dtypes and non-null counts for the training frame.
train.info()

In [None]:
# NOTE(review): test.info() is already displayed earlier in the notebook; kept here for side-by-side comparison with train.info().
test.info()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

### Check for null values

In [None]:
# Number of missing values per training column.
train.isnull().sum()

#### We can see that there are no null values

In [None]:
# Number of missing values per test column.
test.isnull().sum()

### Checking the descriptive Statistics

In [None]:
# Summary statistics for the numeric training columns.
train.describe()

In [None]:
# Summary statistics for the numeric test columns.
test.describe()

## Exploratory Data Analysis

In [None]:
# Row count per target class, used to judge class balance.
train['Response'].value_counts()

In [None]:
# Bar chart of the target classes.
# The column is passed as `x=`: from seaborn 0.12 only `data` may be given
# positionally, so a positional Series is misread or rejected.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Response'])
plt.title('Response Count for Training Data')
plt.xlabel('Response')
plt.ylabel('Count')

### From the above plot and count, we can see that the data is imbalanced

In [None]:
# Bar chart of Gender in the training data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Gender'])
plt.title('Gender Count for Training Data')
# The x-axis shows Gender categories (the label previously said 'Response').
plt.xlabel('Gender')
plt.ylabel('Count')

In [None]:
# Row count per Gender value in the training data.
train['Gender'].value_counts()

There are more Males in the data than Females

In [None]:
# Gender on the x-axis, split by Response, so the plot matches its 'Gender'
# title and x-label. The following cell shows the complementary view
# (Response split by Gender); previously both cells drew the same chart.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Gender'],hue=train['Response'])
plt.title('Gender Count for Training Data')
plt.xlabel('Gender')
plt.ylabel('Count')

In [None]:
# Response on the x-axis, bars split by Gender.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally, so the
# original positional call fails once `hue` is also supplied.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Response'],hue=train['Gender'])
plt.title('Gender Count for Training Data')
plt.xlabel('Response')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Age in the training data.
# histplot replaces the deprecated distplot and plots counts, which is what
# the y-label claims (distplot drew a density).
plt.figure(figsize=(8,6))
sns.histplot(x=train['Age'],kde=True)
plt.title('Age distribution Training Data')
plt.xlabel('Age')
plt.ylabel('Count')

We can see that Age is not normally distributed. Also, the most frequent age is around 25.

In [None]:
# Histogram (with KDE overlay) of Age in the test data.
# histplot replaces the deprecated distplot and plots counts, matching the
# 'Count' y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=test['Age'],kde=True)
plt.title('Age distribution Test Data')
plt.xlabel('Age')
plt.ylabel('Count')

In [None]:
# Driving-licence flag counts in the training data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Driving_License'])
plt.title('Driving License Count for Training Data')
plt.xlabel('Driving License')
plt.ylabel('Count')

In [None]:
# Driving-licence flag counts in the test data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=test['Driving_License'])
plt.title('Driving License Count for Test Data')
plt.xlabel('Driving License')
plt.ylabel('Count')

So most of the data points have driving license for both training and test data

In [None]:
# Previously-insured flag counts in the training data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Previously_Insured'])
plt.title('Previously Insured Count for Training Data')
plt.xlabel('Previously Insured')
plt.ylabel('Count')

In [None]:
# Previously-insured flag counts in the test data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=test['Previously_Insured'])
plt.title('Previously Insured Count for Test Data')
plt.xlabel('Previously Insured')
plt.ylabel('Count')

We can see that more than 50% of the customers were not previously insured, in both the Training and Test Data

In [None]:
# Vehicle-age category counts in the training data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Vehicle_Age'])
plt.title('Vehicle Age Count for Train Data')
plt.xlabel('Vehicle Age')
plt.ylabel('Count')

In [None]:
# Vehicle-age category counts in the training data, split by Response.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Vehicle_Age'],hue=train['Response'])
plt.title('Vehicle Age Count for Train Data')
plt.xlabel('Vehicle Age')
plt.ylabel('Count')

In [None]:
# Vehicle-age category counts in the test data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=test['Vehicle_Age'])
plt.title('Vehicle Age Count for Test Data')
plt.xlabel('Vehicle Age')
plt.ylabel('Count')

### So we can see that most of the vehicles are pretty new with maximum of 1-2 years and less than a year's age

In [None]:
# Vehicle-damage category counts in the training data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Vehicle_Damage'])
plt.title('Vehicle Damage Count for Train Data')
plt.xlabel('Vehicle Damage')
plt.ylabel('Count')

In [None]:
# Vehicle-damage category counts in the test data.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=test['Vehicle_Damage'])
plt.title('Vehicle Damage Count for Test Data')
plt.xlabel('Vehicle Damage')
plt.ylabel('Count')

#### The ratio of yes:no is very close to 50:50

In [None]:
# Vehicle-damage category counts in the training data, split by Response.
# `x=` keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(x=train['Vehicle_Damage'],hue=train['Response'])
plt.title('Vehicle Damage Count for Train Data')
plt.xlabel('Vehicle Damage')
plt.ylabel('Count')

### We can see that customers whose vehicles were damaged gave more positive responses than those whose vehicles were not

In [None]:
# Histogram (with KDE overlay) of Annual_Premium in the training data.
# histplot replaces the deprecated distplot and plots counts, matching the
# 'Count' y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=train['Annual_Premium'],kde=True)
plt.title('Annual Premium for Train Data')
plt.xlabel('Annual Premium')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Annual_Premium in the test data.
# histplot replaces the deprecated distplot and plots counts, matching the
# 'Count' y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=test['Annual_Premium'],kde=True)
plt.title('Annual Premium for Test Data')
plt.xlabel('Annual Premium')
plt.ylabel('Count')

In [None]:
# Number of distinct Policy_Sales_Channel values in the training data.
train['Policy_Sales_Channel'].nunique()


### 155 unique Channels for outreaching to the customer

In [None]:
# Histogram (with KDE overlay) of Policy_Sales_Channel codes in the training data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=train['Policy_Sales_Channel'],kde=True)
plt.title('Policy_Sales_Channel Count for Train Data')
plt.xlabel('Policy Sales Channel ')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Policy_Sales_Channel codes in the test data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=test['Policy_Sales_Channel'],kde=True)
plt.title('Policy_Sales_Channel Count for Test Data')
plt.xlabel('Policy Sales Channel ')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Vintage in the training data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=train['Vintage'],kde=True)
plt.title('Vintage Count for Train Data')
plt.xlabel('Vintage ')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Vintage in the test data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=test['Vintage'],kde=True)
plt.title('Vintage Count for Test Data')
plt.xlabel('Vintage ')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Region_Code in the training data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=train['Region_Code'],kde=True)
plt.title('Region Code Count for Train Data')
plt.xlabel('Region Code ')
plt.ylabel('Count')

In [None]:
# Histogram (with KDE overlay) of Region_Code in the test data.
# histplot replaces the deprecated distplot and plots counts, matching the
# title and y-label.
plt.figure(figsize=(8,6))
sns.histplot(x=test['Region_Code'],kde=True)
plt.title('Region Code Count for Test Data')
plt.xlabel('Region Code ')
plt.ylabel('Count')

### Encoding the categorical Columns for Training Data

In [None]:
# Integer codes for the three string-valued training columns.
# Values absent from a mapping become NaN (pandas Series.map semantics).
category_codes = {
    'Gender': {'Male':1,'Female':0},
    'Vehicle_Age': {'< 1 Year':0,'1-2 Year':1,'> 2 Years':2},
    'Vehicle_Damage': {'Yes':1,'No':0},
}
for column, codes in category_codes.items():
    train[column] = train[column].map(codes)

### Encoding the categorical Columns for Test Data

In [None]:
# Apply to the test columns the same integer codes used for the training data.
# Values absent from a mapping become NaN (pandas Series.map semantics).
category_codes = {
    'Gender': {'Male':1,'Female':0},
    'Vehicle_Age': {'< 1 Year':0,'1-2 Year':1,'> 2 Years':2},
    'Vehicle_Damage': {'Yes':1,'No':0},
}
for column, codes in category_codes.items():
    test[column] = test[column].map(codes)

In [None]:
# Re-inspect dtypes and non-null counts now that the string columns have been mapped to numbers.
train.info()

### HeatMap

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the training columns.
plt.figure(figsize=(10,10))
sns.heatmap(train.corr(),annot=True)

In [None]:
# Plain-text dump of the same correlation matrix, for reading exact coefficients.
print(train.corr())

### We can see that the following columns are correlated:
Vehicle Age and Age: 0.765790

Vehicle Damage and Previously Insured :  -0.824143  

# Feature Engineering

## Normalizing the Numerical Data

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Min-max scaler instance; the name `scalar` is reused for a StandardScaler further down.
scalar=MinMaxScaler()

#### From the above data, we can see that the columns Age and Vintage have skewed data, so they are scaled with a MinMaxScaler

In [None]:
# Learn the Age range from the training data only, then apply that same
# range to the test data. Re-fitting on test would put the two splits on
# different scales, so a model trained on train would misread test values.
train['Age']=scalar.fit_transform(train[['Age']])
test['Age']=scalar.transform(test[['Age']])

In [None]:
# Learn the Vintage range from the training data only, then apply that same
# range to the test data, so both splits share one scale.
train['Vintage']=scalar.fit_transform(train[['Vintage']])
test['Vintage']=scalar.transform(test[['Vintage']])

In [None]:
# Spot-check the training rows after scaling.
train.head()

In [None]:
# Spot-check the test rows after scaling.
test.head()

#### We will apply a StandardScaler to Annual Premium, as it is approximately normally distributed

In [None]:
# Rebind `scalar` to a StandardScaler (it previously held the MinMaxScaler).
scalar=StandardScaler()

In [None]:
# Estimate mean/std of Annual_Premium on the training data only and reuse
# them for the test data, so both splits are standardised identically.
train['Annual_Premium']=scalar.fit_transform(train[['Annual_Premium']])
test['Annual_Premium']=scalar.transform(test[['Annual_Premium']])

#### Remove id column

In [None]:
# Drop the row identifier from the training features. errors='ignore' keeps
# this cell re-runnable after `id` is already gone. `test` keeps its `id`
# column because the submission cell reads test['id'].
train=train.drop(columns=['id'],errors='ignore')

## Modelling 

### As this is a classification problem, we will train models with the Random Forest, K-Nearest Neighbors and CatBoost algorithms

In [None]:
# Target vector and feature matrix (every training column except the target).
y = train['Response']
X = train.drop(columns=['Response'])

In [None]:
# Preview the target values.
y.head()

## Handling Imbalanced Data with SMOTE 

In [None]:
# Class sizes prior to resampling.
positives_before = (y == 1).sum()
negatives_before = (y == 0).sum()
print("Before OverSampling, counts of label '1': {}".format(positives_before))
print("Before OverSampling, counts of label '0': {} \n".format(negatives_before))

In [None]:
# import SMOTE module from imblearn library 
# pip install imblearn (if you don't have imblearn in your system) 
from imblearn.over_sampling import SMOTE 

In [None]:
# Oversample the minority class with SMOTE.
# fit_resample is the supported API (the old fit_sample alias has been
# removed from imbalanced-learn), and it accepts the pandas target directly,
# so no .ravel() is needed.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic points interpolated from rows that end up in the hold-out set can
# land in the training set and inflate the reported scores. Consider
# splitting first and resampling only the training portion.
sm = SMOTE(random_state = 2)
X, y = sm.fit_resample(X, y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(y.shape))

In [None]:
# Class sizes after resampling.
positives_after = (y == 1).sum()
negatives_after = (y == 0).sum()
print("After OverSampling, counts of label '1': {}".format(positives_after))
print("After OverSampling, counts of label '0': {}".format(negatives_after))

#### Split the data into Train and Test 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

### Random Forest with HyperParameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix,roc_auc_score

import scikitplot as skplt

In [None]:
def model_error(model,X_test,y_test):
    """Print hold-out metrics for a fitted binary classifier and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and ideally ``predict_proba``).
    X_test : features to score.
    y_test : true labels aligned with ``X_test``.

    Prints RMSE, accuracy, the classification report, the confusion matrix and
    ROC AUC, then shows a confusion-matrix figure normalised over all samples.
    Returns ``None``.
    """
    predict = model.predict(X_test)

    # ROC AUC is a ranking metric: fed hard 0/1 labels it collapses to a single
    # operating point. Score with the positive-class probability when the
    # model offers one, and fall back to the labels otherwise.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = predict

    print("RMSE of model: ",np.sqrt(mean_squared_error(y_test, predict)))
    print("\nAccuracy: ",accuracy_score(y_test,predict))
    # Trailing newline keeps the report's header row aligned with its columns.
    print("\nClassification Report: \n",classification_report(y_test,predict))
    print("\nConfusion Matrix: \n",confusion_matrix(y_test,predict))
    print("\nROC_AUC_Score: ",roc_auc_score(y_test,scores))

    fig, ax = plt.subplots(figsize=(10, 10))
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # on newer versions switch to ConfusionMatrixDisplay.from_estimator.
    plot_confusion_matrix(model, X_test, y_test,ax=ax,cmap='YlOrBr',normalize='all')
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
# Unfitted forest with default settings; passed as the estimator to the randomized search below.
rf=RandomForestClassifier()

In [None]:
# Baseline: a 10-tree forest with otherwise default settings, fitted on the training split.
# NOTE(review): no random_state is set, so results vary between runs.
base_model = RandomForestClassifier(n_estimators = 10)
base_model.fit(X_train, y_train)

In [None]:
# Report hold-out metrics for the baseline forest.
model_error(base_model,X_test,y_test)

In [None]:
#HyperParameter Tuning

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 50, num = 5)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so the old list held the
# same candidate twice) and has been removed from scikit-learn; 'log2' gives
# the search a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

pprint(random_grid)

In [None]:
# Randomized search: 10 sampled settings from random_grid, each scored with 10-fold CV on all cores.
# NOTE(review): 'neg_mean_squared_error' is a regression scorer; on 0/1 labels the MSE equals the
# misclassification rate, so it ranks candidates the same way accuracy would. A classification
# scorer such as 'accuracy' or 'roc_auc' would state the intent more clearly.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 10, verbose=2, random_state=42, n_jobs = -1, scoring='neg_mean_squared_error')

In [None]:
# Run the search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Show the best-scoring forest found by the search.
rf_random.best_estimator_

In [None]:
# Report hold-out metrics for the tuned forest.
model_error(rf_random.best_estimator_,X_test,y_test)

### It seems the base model fits better than this Random Forest model with tuned hyperparameters

In [None]:
#knn Model
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Misclassification rate on the hold-out split for K = 1..19.
error_rate = []

# Will take some time
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append((pred_i != y_test).mean())

In [None]:
# Error rate against K, to pick the neighbour count visually.
plt.figure(figsize=(10,6))
plt.plot(
    range(1,20),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title("Error Rate vs. K Value")
plt.xlabel("K")
plt.ylabel("Error Rate")

In [None]:
# Refit KNN with the chosen K and report hold-out metrics.
# NOTE(review): this comment previously read "NOW WITH K=14", but the code uses
# n_neighbors=1 — confirm which value the error-rate plot actually supports.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
model_error(knn,X_test,y_test)

### Catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier with library-default hyperparameters.
classifier=CatBoostClassifier()

In [None]:
# Fit CatBoost on the training split.
classifier.fit(X_train,y_train)

In [None]:
# Report hold-out metrics for the CatBoost model.
model_error(classifier,X_test,y_test)

**Prediction for Submission**

In [None]:
# Positive-class probability for every test row.
# `id` was dropped from the training features, so it must be excluded here
# as well; otherwise the model receives one extra, misaligned column.
# Assumes the remaining test columns are in the same order as the training
# features — TODO confirm.
Prediction = classifier.predict_proba(test.drop(columns=['id']).values)[:, 1]


In [None]:
# Pair each test id with its predicted probability, write the CSV without the index column, and preview it.
submission = pd.DataFrame(data = {'id': test['id'] ,'Response': Prediction})
submission.to_csv('./health-insurance-cross-sell-prediction_v1.csv', index = False)
submission.head()