In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing other necessary libraries.


In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import metrics   #Additional scklearn functions
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from xgboost.sklearn import XGBClassifier

## Loading the datasets.

In [None]:
# Both CSVs live in the same competition folder, so build each path from one base directory.
DATA_DIR = '../input/health-insurance-cross-sell-prediction'
train, test = (pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test'))

Now lets take a look at the data.

In [None]:
train.head()

Before we go further and perform EDA, data wrangling lets make a copy of the original data and then look at the stats of the data.

In [None]:
df = train.copy()

In [None]:
df.info()

In [None]:
df.describe()

Now lets check if there are missing values.

In [None]:
df.isnull().sum()

## EDA and Feature Engineering.


In [None]:
# Switch to the grid theme before the axes are created so it applies to this figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))
# Customer counts per gender, split by whether the vehicle was damaged.
sns.countplot(data=df, x='Gender', hue='Vehicle_Damage', ax=ax)

Interesting: vehicles owned by `Male` customers appear to have more damage cases than those owned by `Female` customers.

In [None]:
# Customer counts per vehicle-age class, split by whether the vehicle was damaged.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Vehicle_Age', hue='Vehicle_Damage', ax=ax)

1-2 years old vehicles are prone to damage more than vehicles that are <1 year and >2 years old.

Now lets take a look at the `Age` column.

In [None]:
# Print the five most frequent ages, then draw the overall age distribution.
print(df['Age'].value_counts().head(5))
fig, ax = plt.subplots(figsize=(12, 8))
# `sns.distplot` is deprecated; `histplot` on a density scale with a KDE overlay draws the
# same kind of figure. `bins=50` keeps the histogram resolution bounded.
sns.histplot(df['Age'], bins=50, stat='density', kde=True, color='darkred', ax=ax)

It looks like most common age is 24, with around 25,960 customers lying in that age.

Lets also look at the age range.

In [None]:
# Report both ends of the age range.
age = df['Age']
youngest, oldest = age.min(), age.max()
print(f"Youngest Customer's age : {youngest}")
print(f"Oldest Customer's age : {oldest}")

Lets see how many customers are the oldest (85 years old) and youngest(20 years old)

In [None]:
# Tabulate how many customers there are for each age.
age_range = pd.DataFrame(df['Age'].value_counts())
# Lift the row limit for this one table only: a global `pd.set_option` would stay in force
# for every later cell and make them print frames in full as well.
# `display` is provided by the IPython/Jupyter kernel.
with pd.option_context('display.max_rows', None):
    display(age_range)

There are 6232 customers who are 20 years old, whereas there are only 11 customers who are 85 years old.

Lets now create a new column which is binned version of the `Age` column, which helps us understand the data more.

In [None]:
# Bucket `Age` into bands for the plots below.
# The edges are treated as left-closed (`right=False`) so that every label matches its
# interval: [18, 30) -> '18-29', [30, 40) -> '30-39', ... With the default right-closed
# bins, a 30-year-old would be labelled '18-29', a 40-year-old '30-39', and so on.
bins = [18, 30, 40, 50, 60, 70, 120]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
df['Age_Range'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

Now lets compare `Age_Range` and `Vehicle_Damage` and see how they correlate.

In [None]:
# Customer counts per age band, split by whether the vehicle was damaged.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Age_Range', hue='Vehicle_Damage', ax=ax)

Interesting, looks like the probability of `Vehicle_Damage` was more with the customers of 40-49 age group.

Now lets compare `Gender` and `Response` variable.

In [None]:
# Customer counts per gender, split by the target `Response`.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Gender', hue='Response', ax=ax)

It seems like males are more interested in insurance than females.

In [None]:
# Customer counts per gender, split by whether they were previously insured.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Gender', hue='Previously_Insured', ax=ax)

It looks like more men don't have insurance.

Lets now take a look at `Driving_License` column

In [None]:
df['Driving_License'].value_counts()

While 380297 customers have driving license, 812 don't.

Looking at the `Region_Code`...

In [None]:
# Extra-wide canvas because every region code gets its own pair of bars.
fig, ax = plt.subplots(figsize=(24, 8))
sns.countplot(data=df, x='Region_Code', hue='Vehicle_Damage', ax=ax)

It looks like area with region code 28.0 has the most vehicle damage cases.

Now lets compare `Region_Code` with `Response`.

In [None]:
# Extra-wide canvas again: customer counts per region code, split by the target `Response`.
fig, ax = plt.subplots(figsize=(24, 8))
sns.countplot(data=df, x='Region_Code', hue='Response', ax=ax)

And obviously, that is the region where the most customers are interested in getting the insurance.

In [None]:
# Customer counts by prior insurance status, split by the target `Response`.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Previously_Insured', hue='Response', ax=ax)

If you look carefully, some customers who are not insured, are still not interested in getting the insurance.

In [None]:
# Customer counts per age band, split by the target `Response`.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Age_Range', hue='Response', ax=ax)

It seems that, customers in the age group 40-49 are the ones that are most interested in getting the insurance.

In [None]:
# Print the 15 most frequent premium amounts, then draw the premium distribution.
print(df['Annual_Premium'].value_counts().head(15))
fig, ax = plt.subplots(figsize=(12, 8))
# `sns.distplot` is deprecated; `histplot` on a density scale with a KDE overlay draws the
# same kind of figure. `bins=50` keeps the histogram resolution bounded.
sns.histplot(df['Annual_Premium'], bins=50, stat='density', kde=True, ax=ax)

It seems that the most common annual premium is 2630.0, with 64877 customers opting for it.

In [None]:
# Number of customers in each vehicle-age class.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Vehicle_Age', ax=ax)

Most vehicles are 1-2 years old.

Lets now look at correlation of features.

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# `df` still holds text columns (`Gender`, `Vehicle_Age`, `Vehicle_Damage`) and the
# categorical `Age_Range` at this point; recent pandas versions raise when `corr()` meets
# them, so restrict the frame to numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, ax=ax)

It doesn't seem like any features are highly correlated.

Lets look for columns that correlated to the target variable.

In [None]:
# Correlation of every numeric column with the target. Non-numeric columns are excluded
# up front because recent pandas versions raise when `corr()` meets them.
df.select_dtypes(include='number').corr()['Response']

`Age` is somewhat correlated to the target feature, but the correlation is not much.

Now lets look at different data types we're dealing with in this case..

In [None]:
df.info()

In [None]:
df['Vehicle_Age'].value_counts()

There are 3 `object` columns, which have to be converted into a numerical format.
So let's do that.
But before that, let's take a look at those 3 features.

`Gender` - Has 2 classes, `Male, Female`. Here we can use `pd.get_dummies()`.

`Vehicle_Age` - Has 3 classes, `1-2 Year, < 1 Year, > 2 Years`. This is ordinal data, so the best way to convert this feature into numbers is to use `Label Encoding`.

`Vehicle_Damage` - Has 2 classes, `Yes, No`. Similar to `Gender`, we can use `pd.get_dummies()`.

Lets do that.

Note - We only use `Label Encoding` when the feature is ordinal. We can also use Label Encoder. But in case the feature isn't ordinal and is instead nominal, go ahead and use either `pd.get_dummies()` or `OneHotEncoder`.

In [None]:
# One-hot encode `Vehicle_Damage` and swap the original column for its indicator columns.
vehicle_damage_dummies = pd.get_dummies(df['Vehicle_Damage'], prefix='Vehicle_Damage')
df = pd.concat([df, vehicle_damage_dummies], axis=1).drop(columns=['Vehicle_Damage'])

In [None]:
df.head()

Now that we've created dummy variable for `Vehicle_Damage` feature, lets do the same for `Gender` feature.

In [None]:
# One-hot encode `Gender` and swap the original column for its indicator columns.
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, gender_dummies], axis=1).drop(columns=['Gender'])

In [None]:
df.head()

Can you notice what's wrong here?.

Yes!, while creating dummy variables, we created extra features which represent the same thing. This is called dummy variable trap.

So let's drop one dummy variable from each pair, i.e. 2 of the 4 dummy variables created (one for `Vehicle_Damage` and one for `Gender`).

In [None]:
# Keep a single indicator per binary feature; the dropped column is fully implied by the kept one.
redundant_dummies = ['Vehicle_Damage_No', 'Gender_Female']
df = df.drop(columns=redundant_dummies)
df.head()

Now we still have to convert the `Vehicle_Age` feature, and since it's an ordinal feature, let's use `pd.Categorical()`.

In [None]:
# Replace the two remaining non-numeric columns with their integer category codes.
# NOTE(review): for `Vehicle_Age` the categories are inferred from the data in sorted order,
# so the codes presumably follow string order ('1-2 Year', '< 1 Year', '> 2 Years') rather
# than actual vehicle age. Confirm this is intended before treating the codes as ordinal.
for categorical_col in ('Vehicle_Age', 'Age_Range'):
    df[categorical_col] = df[categorical_col].astype('category').cat.codes
df.head()

In [None]:
df.info()

## Modelling

Now that all the features are numerical, lets build some ML models.

In [None]:
df.shape

In [None]:
# Target column, and the feature matrix without the target, the row identifier and the
# `Age_Range` band that was derived from `Age`.
target_col = 'Response'
y = df[target_col]
X = df.drop(columns=[target_col, 'id', 'Age_Range'])

Before we build models, lets first normalize the data.

In [None]:
# Min-max scale every feature and wrap the result back into a frame carrying the original
# column labels. `scaler` is kept so the test data can be transformed with it later.
scaler = MinMaxScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
scaled_X.head()

Before start building models, lets check if the target variable is balanced.

In [None]:
# Class frequencies of the target, printed and drawn as a bar chart.
target_counts = y.value_counts()
print(target_counts)
target_counts.plot(kind='bar')

Target variable is highly imbalanced. This will definitely result in poor results in the class with lower value counts.
Lets balance it using `SMOTE`.

In [None]:
# Oversample the minority class with SMOTE and print the class counts before and after.
# (`Counter` and `SMOTE` are imported with the other dependencies at the top of the notebook,
# so this cell no longer carries its own imports.)
print('Original dataset shape %s' % Counter(y))
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(scaled_X, y)
print('Resampled dataset shape %s' % Counter(y_res))

Now implementing SMOTE results in significant increase in the data points.
Lets check the shape of the data.

In [None]:
# Make sure the resampled data are pandas containers and carry the feature names.
X_res, y_res = pd.DataFrame(X_res), pd.Series(y_res)
X_res.columns = scaled_X.columns
X_res.head()

In [None]:
# Class frequencies of the resampled target, printed and drawn as a bar chart.
resampled_counts = y_res.value_counts()
print(resampled_counts)
resampled_counts.plot(kind='bar')

Our target variable is now balanced.

In [None]:
X_res.shape

As we can see that now the datapoints are 668798, almost double the size of our original number of datapoints.

Anyway, lets now build models on this data.

We also don't need the feature `Age_Range`, as it gives the same information as `Age`, so lets drop it.

In [None]:
# Hold out 30% of the *original* (scaled) rows first, stratified on the target, and only then
# oversample the training portion. Splitting the already-resampled `X_res` / `y_res` would put
# SMOTE-generated points, interpolated from rows that end up in training, into the test set,
# so every score computed on `X_test` would be inflated. `X_res` / `y_res` above remain
# available for the exploratory plots but are not used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.3, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
# `fit_resample` may hand back plain arrays depending on the imbalanced-learn version;
# restore labelled pandas containers either way.
X_train = pd.DataFrame(X_train, columns=scaled_X.columns)
y_train = pd.Series(y_train, name=y.name)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
np.random.seed(42)
# Gradient-boosted trees. `n_jobs` and `random_state` are the scikit-learn-style names for
# what was previously passed through the legacy `nthread` / `seed` keywords.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,
)
xgb1.fit(X_train, y_train)

In [None]:
# Fit an extra-trees ensemble, then print its accuracy and per-class report on the hold-out set.
np.random.seed(42)
etc = ExtraTreesClassifier(n_estimators=200).fit(X_train, y_train)
print(etc.score(X_test, y_test))
etc_pred = etc.predict(X_test)
# `classification_report` takes (y_true, y_pred); with the arguments reversed, precision and
# recall are swapped and the support column counts predictions instead of true labels.
print(classification_report(y_test, etc_pred))

In [None]:
# Fit a random forest, then print its accuracy and per-class report on the hold-out set.
np.random.seed(42)
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
print(rf.score(X_test, y_test))
rf_pred = rf.predict(X_test)
# `classification_report` takes (y_true, y_pred); with the arguments reversed, precision and
# recall are swapped and the support column counts predictions instead of true labels.
print(classification_report(y_test, rf_pred))

You can play around with different parameters, or tune them as you wish.
I've used them because they seemed to work for me.

In [None]:
# ROC AUC measures how well the scores rank positives above negatives, so it needs the true
# labels as the first argument and a continuous score as the second. Use the predicted
# probability of the positive class (column 1) rather than hard 0/1 predictions.
fitted_models = (
    ('XGBoost Classifier', xgb1),
    ('RandomForestClassifier', rf),
    ('ExtraTreesClassifier', etc),
)
for model_label, fitted_model in fitted_models:
    positive_proba = fitted_model.predict_proba(X_test)[:, 1]
    print(f"ROC_AUC Score of {model_label} is : {metrics.roc_auc_score(y_test, positive_proba)}")

Of all the models, ExtraTreesClassifier has better `roc_auc_score`. So lets consider that as our final model.

But before making predictions on the test set, we must remember that we created an additional feature called `Age_Range` using `Age` column.

We have to drop one of them, so that the model is more robust.

This may result in the drop of ROC_AUC_score, but it makes our model robust.

## Predictions
Now lets make predictions on the test data.

In [None]:
test.head()

Lets first deal with the categorical data..

In [None]:
t_copy = test.copy()

In [None]:
t_copy.head()

Creating dummy variables.

In [None]:
# One-hot encode the two binary text columns of the test frame, `Gender` first and then
# `Vehicle_Damage`, replacing each original column with its indicator columns.
for categorical_col in ('Gender', 'Vehicle_Damage'):
    indicator_cols = pd.get_dummies(t_copy[categorical_col], prefix=categorical_col)
    t_copy = pd.concat([t_copy, indicator_cols], axis=1).drop(columns=[categorical_col])

In [None]:
t_copy.head()

In [None]:
# Encode `Vehicle_Age` with an explicit category list instead of letting pandas infer it from
# the test data: inferred codes depend on which values happen to be present, so a missing or
# extra value here would silently shift the codes away from the ones the models were trained on.
# The order reproduces the sorted order inferred for the three training classes
# (0 = '1-2 Year', 1 = '< 1 Year', 2 = '> 2 Years').
VEHICLE_AGE_CATEGORIES = ['1-2 Year', '< 1 Year', '> 2 Years']
# Read from the untouched `test` frame (same rows as `t_copy`) so re-running the cell is safe.
t_copy['Vehicle_Age'] = pd.Categorical(
    test['Vehicle_Age'].values, categories=VEHICLE_AGE_CATEGORIES
).codes
# A code of -1 marks a value outside the known categories; fail loudly rather than predict on it.
assert (t_copy['Vehicle_Age'] >= 0).all(), "unexpected Vehicle_Age value in test data"
t_copy.head()

In [None]:
X_train.columns

In [None]:
t_copy.columns

Dropping `id` (not a feature), and `Vehicle_Damage_No` and `Gender_Female` to avoid the dummy variable trap.

In [None]:
# Remove the identifier and the redundant indicator of each dummy pair from the test frame.
unused_test_cols = ['id', 'Vehicle_Damage_No', 'Gender_Female']
t_copy = t_copy.drop(columns=unused_test_cols)

Scaling the test data..

In [None]:
# The training features end with [..., 'Vehicle_Damage_Yes', 'Gender_Male'], whereas the test
# dummies were created in the opposite order ([..., 'Gender_Male', 'Vehicle_Damage_Yes']).
# Re-select the columns in the exact order the scaler was fitted on. Otherwise recent
# scikit-learn versions reject the frame, and older ones silently swap the two features.
t_copy_aligned = t_copy[X.columns]
scaled_test = pd.DataFrame(scaler.transform(t_copy_aligned), columns=X.columns)
scaled_test.head()

In [None]:
# Predict with the chosen model, feeding the features in the column order it was trained on.
pred = etc.predict(scaled_test[X_train.columns])
# Pair each test id with its predicted response.
f_pred = pd.DataFrame({'id': test['id'], 'Response': pred})
# `index=False` keeps the row index out of the file, so it holds exactly the `id` and
# `Response` columns.
f_pred.to_csv('Submission.csv', index=False)
f_pred.head()