## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool
from catboost.utils import get_confusion_matrix
# Use seaborn's "darkgrid" look for the figures drawn in this notebook.
sns.set(style="darkgrid")

## Loading the data

In [None]:
# Read the labelled training set and the unlabelled test set in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/health-insurance-cross-sell-prediction/train.csv",
        "../input/health-insurance-cross-sell-prediction/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Print column dtypes and non-null counts to check for missing values.
train.info()

* There are no missing values in the dataset.

In [None]:
# Summary statistics, transposed so each feature is shown as a row.
train.describe().transpose()

In [None]:
# Remove the row identifier from the training frame.
# errors='ignore' keeps this cell re-runnable: a second execution (when 'id'
# is already gone) is a no-op instead of raising KeyError.
train = train.drop(columns=['id'], errors='ignore')

* Drop the `id` column: it is only a row identifier and carries no predictive information.

## EDA (Exploratory Data Analysis) 

### Correlation among Features

In [None]:
# Columns shown in the pairwise scatter-plot grid.
numerical = ['Age', 'Region_Code', 'Annual_Premium', 'Vintage']
sns.pairplot(data=train.loc[:, numerical])

### Age Distribution of Customers

In [None]:
# Histogram + KDE of customer age.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot when the environment is upgraded.
sns.distplot(a=train['Age'])

* Customers aged between 20 and 50 are the most likely to go for insurance.

### Response Distribution

In [None]:
# Class balance of the target.
# Pass the Series as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a positional Series no longer means "x".
sns.countplot(x=train.Response)

In [None]:
# Number of customers per gender.
# Pass the Series as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a positional Series no longer means "x".
sns.countplot(x=train.Gender)

### Response by Gender

In [None]:
# Number of customers for every (Gender, Response) combination.
# 'Age' is only the column being counted; the result column is named 'Count'.
df_response = (
    train.groupby(['Gender', 'Response'])['Age']
    .count()
    .reset_index(name='Count')
)
df_response

In [None]:
# One bar chart per Response value, with gender on the x-axis.
sns.catplot(
    data=df_response,
    kind='bar',
    x='Gender',
    y='Count',
    col='Response',
)

### Driving License by Gender

In [None]:
# Number of licence holders per gender.
# Driving_License is presumably a 0/1 indicator (confirm against the data
# dictionary), so summing it counts the holders. The previous .count() only
# counted rows per gender, i.e. all customers regardless of licence status.
df_DL = (
    train.groupby(['Gender'])['Driving_License']
    .sum()
    .reset_index()
    .rename(columns={'Driving_License': 'Driving License'})
)
df_DL

In [None]:
# Bar chart of the per-gender 'Driving License' figure computed above.
sns.catplot(
    data=df_DL,
    kind='bar',
    x='Gender',
    y='Driving License',
)

### Previously Insured Customers

In [None]:
# Number of customers by Previously_Insured value.
# Pass the Series as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a positional Series no longer means "x".
sns.countplot(x=train.Previously_Insured)

In [None]:
# Customer counts for every (Gender, Previously_Insured) combination,
# with a display-friendly column name for plotting.
df_PDL = (
    train.groupby(['Gender', 'Previously_Insured'])['Age']
    .count()
    .reset_index(name='count')
    .rename(columns={'Previously_Insured': 'Previously Insured'})
)
df_PDL

In [None]:
# One bar chart per 'Previously Insured' value, with gender on the x-axis.
sns.catplot(
    data=df_PDL,
    kind='bar',
    x='Gender',
    y='count',
    col='Previously Insured',
)

In [None]:
# Customer counts for every (Vehicle_Age, Response) combination,
# with a display-friendly column name for plotting.
df_vehicleage = (
    train.groupby(['Vehicle_Age', 'Response'])['Age']
    .count()
    .reset_index(name='count')
    .rename(columns={'Vehicle_Age': 'Vehicle Age'})
)
df_vehicleage

In [None]:
# One bar chart per Response value, with vehicle age on the x-axis.
sns.catplot(
    data=df_vehicleage,
    kind='bar',
    x='Vehicle Age',
    y='count',
    col='Response',
)

### Customers with Damaged Vehicle

In [None]:
# Customer counts for every (Vehicle_Damage, Response) combination,
# with a display-friendly column name for plotting.
df_damaged = (
    train.groupby(['Vehicle_Damage', 'Response'])['Age']
    .count()
    .reset_index(name='count')
    .rename(columns={'Vehicle_Damage': 'Vehicle Damage'})
)
df_damaged

In [None]:
# One bar chart per Response value, with vehicle damage on the x-axis.
sns.catplot(
    data=df_damaged,
    kind='bar',
    x='Vehicle Damage',
    y='count',
    col='Response',
)

### Vintage - Number of Days customer is associated with the company

In [None]:
# Histogram + KDE of Vintage.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot when the environment is upgraded.
sns.distplot(a=train['Vintage'])

### Data PreProcessing

In [None]:
# Work on a copy so the raw `train` frame keeps its original string columns.
train_copy = train.copy()

# Replace each string-valued categorical column with integer codes.
# The single encoder is re-fitted per column, so after the loop it only
# remembers the classes of the last column.
lb_make = LabelEncoder()
for categorical_column in ('Gender', 'Vehicle_Age', 'Vehicle_Damage'):
    train_copy[categorical_column] = lb_make.fit_transform(train_copy[categorical_column])
train_copy.head()

### Building Model

In [None]:
# Split predictors from the target by column name rather than by position,
# so the split stays correct if the column order ever changes.
# `labels` is kept as a one-column DataFrame, as before.
features = train_copy.drop(columns=['Response'])
labels = train_copy[['Response']]

In [None]:
# 70/30 hold-out split.
# random_state makes the split (and every metric below) reproducible across
# kernel restarts; stratify keeps the Response class ratio identical in both
# partitions.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [None]:
# Wrap the hold-out split in a Pool so it can be used as the evaluation set.
eval_dataset = Pool(data=x_test, label=y_test)

# Classifier that reports AUC as its evaluation metric.
# NOTE(review): learning_rate=0.0001 is very small given that the number of
# iterations is left at its default — confirm this is intentional.
model = CatBoostClassifier(
    eval_metric='AUC',
    learning_rate=0.0001,
)

In [None]:
# Train on the training split and score on the hold-out Pool.
# verbose=100 logs every 100th iteration instead of every one, which keeps
# the cell output readable; it does not affect the fitted model.
model.fit(
    x_train,
    y_train,
    eval_set=eval_dataset,
    verbose=100,
)

In [None]:
# Best metric values recorded during training.
best_scores = model.get_best_score()
print(best_scores)

In [None]:
# Confusion matrix of the trained model on the hold-out Pool.
cm = get_confusion_matrix(model, eval_dataset)

# Accuracy = correctly classified samples (the diagonal) / all samples.
predict_accuracy_on_test_set = np.trace(cm) / cm.sum()

# Explicit fig/ax interface; annotate each cell with its count and add a
# title so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='.0f',
    linewidths=0.5,
    linecolor='white',
    square=True,
    ax=ax,
)
ax.set_title('CatBoost confusion matrix (hold-out set)')
plt.show()

print("catboost Accuracy : ", predict_accuracy_on_test_set*100)

### Making Predictions on Test Data

In [None]:
# Preview the first five rows of the unlabelled test data.
test.head(n=5)

In [None]:
# Encode the test categoricals with the mapping learned from the TRAINING
# data. Re-fitting an encoder on the test set only produces matching codes
# when both sets happen to contain exactly the same categories. Fitting on
# `train` (still string-valued) guarantees consistency, and `transform`
# raises if the test set contains an unseen category.
lb_make = LabelEncoder()
for categorical_column in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
    if pd.api.types.is_numeric_dtype(test[categorical_column]):
        # Already encoded by an earlier run of this cell; keep it re-runnable.
        continue
    test[categorical_column] = lb_make.fit(train[categorical_column]).transform(test[categorical_column])
test.head()

In [None]:
# Build the prediction Pool from exactly the columns the model was trained on,
# in the same order. `test` still carries the 'id' column (needed later for
# the submission), which was dropped from the training data and must not be
# fed to the model as a feature.
eval_test = Pool(test[x_train.columns])
eval_test

In [None]:
# Predict with the trained model for every row of the test Pool,
# then show the shape of the result as a sanity check.
pred = model.predict(data=eval_test)
pred.shape

In [None]:
# Submission frame: the test ids alongside the predicted Response.
# The former `submit.set_index('id').reset_index(inplace=True)` line was
# removed: it modified a temporary copy and never changed `submit`.
submit = test[['id']].copy()
submit['Response'] = pred
submit.head()

In [None]:
# index=False: write only the 'id' and 'Response' columns; otherwise the
# DataFrame index is emitted as an extra unnamed first column.
submit.to_csv("Submission.csv", index=False)