In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train=pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/train.csv')
train.head()

In [None]:
test=pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/test.csv')
test.head()

In [None]:
train.shape,test.shape

In [None]:
train.info()

In [None]:
test.info()

The categorical features(**'Gender','Vehicle_Age','Vehicle_Damage'**) in both train and test sets should be converted from object type to categorical type.

In [None]:
train.describe()

In [None]:
test.describe()

* No missing values in both train and test datasets.
* Presence of outliers in id and Annual_Premium features due to large difference between 75% quartile and maximum value.

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

Let's examine the categorical features one by one in the training dataset.

In [None]:
px.pie(train,'Gender')

Approx 54% of the people who have claimed health insurance are males.

In [None]:
px.pie(train,'Driving_License')

99.8 % of the people who have claimed health insurance have a driving license.

In [None]:
px.pie(train,'Previously_Insured')

54.2 % of the people who already had health insurance doesn't have vehicle insurance.

In [None]:
#correcting vehicle_age values
train['Vehicle_Age'].replace({'< 1 Year':'less_than_one_year','> 2 Years':'more_than_two_years'},inplace=True)
test['Vehicle_Age'].replace({'< 1 Year':'less_than_one_year','> 2 Years':'more_than_two_years'},inplace=True)

In [None]:
px.pie(train,'Vehicle_Age')

53% of the people have vehicles aged 1-2 years followed by 43% having vehicle age less than 1 year and the remaining people have vehcle age of more than 2 years.

In [None]:
px.pie(train,'Vehicle_Damage')

Approx 50% of the customers  didn't have their vehicle damaged in the past.

In [None]:
px.pie(train,'Response')

Around 87.7% of the customers having an health insurance aren't interested in buying vehicle insurance.

In [None]:
c_columns=['Gender','Vehicle_Age','Vehicle_Damage']
for i in c_columns:
    train[i]=train[i].astype('category')
    test[i]=test[i].astype('category')

In [None]:
train.info()

In [None]:
test.info()

## UNIVARIATE ANALYSIS

In [None]:
plt.figure(figsize=(10,10))
sns.distplot(train['Age'])
plt.axvline(x=train['Age'].mean(),linestyle='--',c='r',label='Mean Age')
plt.xticks(range(0,100,10))
plt.legend()

* Most of the customers having health insurance are between 20 to 30 years of age.
* The curve shows right skewed distribution.
* Mean age is around 40.

In [None]:
sns.boxplot(train['Age'])

No outliers in the age feature.

In [None]:
sns.boxplot(train['Vintage'])

* Most of the customers having health insurance are associated with the company on an average of 150 days.
* No outliers in the vintage feature.

In [None]:
sns.boxplot(train['Policy_Sales_Channel'])

* Mean channel code is between around 130.
* No outliers in the policy_sales_channel feature.

In [None]:
plt.figure(figsize=(10,10))
sns.distplot(train['Annual_Premium'])
plt.axvline(x=train['Annual_Premium'].mean(),linestyle='--',c='r',label='Avg annual premium')
plt.xticks(range(0,600000,30000),rotation=90)
plt.legend()

* Annual_Premium column has high number of outliers starting near 1,00,000.
* It shows right skewed distribution.
* Mean premium/year is around 31000.

In [None]:
sns.boxplot(train['Region_Code'])

* No outliers in Region_Code features.
* On an average,the customers come from the region_code around 30.

In [None]:
plt.figure(figsize=(20,5))
sns.boxplot(train['Annual_Premium'])

Outliers are having premium/year of more than 60000.

 ## BIVARIATE/MULTIVARIATE ANALYSIS

In [None]:
sns.stripplot(data=train,y='Gender',x='Age',jitter=True)

Both male and female have same age distributions.

In [None]:
sns.countplot('Response',hue='Gender',data=train)

Most of the customers who whether want to take vehicle insurance or not are males.

In [None]:
plt.figure(figsize=(10,5))
sns.countplot('Driving_License',hue='Gender',data=train)

Males with DL are more in proportion than females in DL.

In [None]:
sns.stripplot(data=train,x='Annual_Premium',y='Vehicle_Age',jitter=True)

Customer using their cars for 1-2 years generally pay more premium for vehicle insurance per year.

In [None]:
sns.countplot('Vehicle_Age',hue='Vehicle_Damage',data=train)

* Cars aged 1-2 Year has probably been more damaged and car aged less than one year is less damaged among the other categories.
* Customers having cars aged more than two years are surely damaged.

In [None]:
sns.countplot('Vehicle_Age',hue='Previously_Insured',data=train)

Vehicles aged less than one year are more previously insured than other categories.


In [None]:
sns.countplot('Vehicle_Age',hue='Gender',data=train)

* Female have more vehicles aged less than one year than males.
* Males have more vehicles aged around 1-2 Year or more than two years than females.

In [None]:
sns.lmplot(y='Vintage',x='Annual_Premium',col='Response',row='Gender',fit_reg=True,data=train)

Above figures show that when annual_premium is low then both gender's interest in buying the vehicle insurance will increase irrespective of the vintage.

## VARIABLE ENCODING

In [None]:
from sklearn.preprocessing import OrdinalEncoder
o=OrdinalEncoder()
for i in c_columns:
    train[i]=o.fit_transform(train[[i]])
    test[i]=o.transform(test[[i]])

In [None]:
train.head()

In [None]:
test.head()

## FEATURE SELECTION

In [None]:
#printing correlation matrix
corr=train.iloc[:,:-1].corr()
top_features=corr.index
plt.figure(figsize=(20,20))
sns.heatmap(train[top_features].corr(),annot=True)

In [None]:
# function to remove those independent features which are correlated
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [None]:
correlation(train.iloc[:,:-1],0.5)

In [None]:
c=train.drop(columns={'id','Policy_Sales_Channel', 'Vehicle_Age','Vehicle_Damage'})
c.head()

## FEATURE SCALING AND SPLITTING

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV,train_test_split

In [None]:
X=c.drop(columns='Response',axis=1)
y=c['Response']

In [None]:
#splitting the data
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.4,stratify=y,random_state=0)

In [None]:
#scaling the data
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

## MODEL FITTING

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [None]:
#function defined to fit the model
def model(model):
    model.fit(x_train,y_train)
    y_t=model.predict(x_train)
    y_pred=model.predict(x_test)
    tr=str('Training score:') + str(roc_auc_score(y_train,y_t))
    te=str('Test score:') + str(roc_auc_score(y_test,y_pred))
    return tr,te

In [None]:
model(RandomForestClassifier(random_state=0))

In [None]:
model(DecisionTreeClassifier(random_state=0))

In [None]:
model(XGBClassifier(random_state=0))

In [None]:
model(LGBMClassifier(random_state=0))

In [None]:
model(KNeighborsClassifier())

Random Forest and Decision Tree are overfitting the training set whereas XGB and LGBM have benchmark roc of around 0.5 respectively on the training set and 0.50 and 0.49 in the test set respectively.

The KNN Classifier has the best benchmark score of 0.56 in training set and 0.50 in the test set.
We will use KNN Classifier for model prediction and tune the model using tpot.

## HYPERPARAMETER TUNING

In [None]:
r_params={
    'n_neighbors': np.arange(1,15),
    'leaf_size':np.arange(20,100,10)
}

In [None]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 3, population_size= 12, offspring_size= 6,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.neighbors.KNeighborsClassifier': r_params}, 
                                 cv = 4, scoring = 'roc_auc')
tpot_classifier.fit(x_train,y_train)

In [None]:
roc_auc = tpot_classifier.score(x_test, y_test)
print(roc_auc)

We can see that previously test set ROC was 0.50 which was increased to 0.58 using effective hyperparameters.