In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        file_path = os.path.join(root, name)
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test CSV files into two DataFrames in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/health-insurance-cross-sell-prediction/train.csv",
        "/kaggle/input/health-insurance-cross-sell-prediction/test.csv",
    )
)

### Looking at the data

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Report how many rows each frame holds.
n_train_rows, n_test_rows = len(train_data), len(test_data)
print(f"Training set has {n_train_rows} examples.")
print(f"Test set has {n_test_rows} examples.")

| Feature | Description | Category |
|---------|----------|-------------|
| id | Unique ID for the customer | Unique Identifier |
| Gender | Gender of the customer | Categorical |
| Age | Age of the customer | Numeric-Discrete |
| Driving_License | 0 : Customer does not have DL, 1 : Customer already has DL |  Categorical |
| Region_Code | Unique code for the region of the customer |  Categorical |
| Previously_Insured | 1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance |  Categorical |
| Vehicle_Age | Age of the Vehicle | Categorical |
| Vehicle_Damage | Yes : Customer got his/her vehicle damaged in the past. No : Customer didn't get his/her vehicle damaged in the past. |  Categorical |
| Annual_Premium | The amount customer needs to pay as premium in the year | Numeric |
| Policy_Sales_Channel | Anonymized code for the channel of outreach to the customer, i.e. different agents, over mail, over phone, in person, etc. | Categorical |
| Vintage | Number of Days, Customer has been associated with the company |  Numeric-Discrete |
| Response | 1 : Customer is interested, 0 : Customer is not interested | Target Variable |

Identifying Null Values 

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the old name, so use whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Heatmap of the boolean null mask of the training frame; row tick labels are
# hidden because there is one row per record.
plt.figure(figsize=(10,5))
sns.heatmap(train_data.isnull(), yticklabels = False, cmap='plasma')
plt.title('Null Values in Training Set');

In [None]:
# Count the missing values in each column of the training frame.
train_data.isnull().sum(axis=0)

In [None]:
# Count the missing values in each column of the test frame.
test_data.isnull().sum(axis=0)

There are no Null Values in the dataset

In [None]:
# Class balance of the target variable.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
# Name the column via `x=`: recent seaborn releases accept only `data` positionally,
# so a bare Series would be interpreted as the dataset rather than the x variable.
sns.countplot(x="Response", data=train_data)
plt.title('Number of customers Insured');

### Categorical features 

Looking at the distribution of categorical features and their relation to the target variable

In [None]:
# Categorical columns to inspect, in the order they appear in the 3x3 grid.
categorical_columns = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]

plt.figure(figsize=(15,15))
for grid_slot, column in enumerate(categorical_columns, start=1):
    plt.subplot(3,3,grid_slot)
    # Name the column via `x=`: recent seaborn releases accept only `data`
    # positionally, so a bare Series would be interpreted as the dataset.
    sns.countplot(x=column, data=train_data)

In [None]:
# Same seven categorical columns, with bars split by the target class.
# Each panel is created on the figure directly and handed to seaborn via `ax=`.
hue_fig = plt.figure(figsize=(15,15))
sns.countplot(x="Gender", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,1))
sns.countplot(x="Driving_License", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,2))
sns.countplot(x="Region_Code", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,3))
sns.countplot(x="Previously_Insured", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,4))
sns.countplot(x="Vehicle_Age", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,5))
sns.countplot(x="Vehicle_Damage", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,6))
sns.countplot(x="Policy_Sales_Channel", hue="Response", data=train_data, ax=hue_fig.add_subplot(3,3,7))

**Take aways** 
1. **Gender**: Male customers outnumber female customers. Although male customers have responded positively to the car insurance proposal more frequently than female customers, this is proportional to their total number. Therefore, gender doesn't tell us a lot about the response.
2. **Driving License**: All the customers have a driving license. Therefore, driving license is not a good predictor of the customer's response. We might not even use Driving License as a feature in our prediction.
3. **Region Code**: Although Region Code is a categorical feature, it has a lot of categories, so it will be analyzed as a discrete numeric feature later.
4. **Previously Insured**: The majority of the customers are not previously insured (vehicle insurance). Almost all of the customers who have responded positively were not insured previously. This feature can be a great predictor of the customer's response.
5. **Vehicle Age**: The majority of the customers have a vehicle age of <= 2 years. Customers with a vehicle of age 1-2 years are more likely to get insurance. This feature will be useful in our predictions.
6. **Vehicle Damage**: The numbers of customers with and without past vehicle damage are almost identical. Customers who have damaged their vehicles in the past are more likely to buy vehicle insurance.
7. **Policy_Sales_Channel**: Just like Region Code, Policy Sales Channel has a lot of categories, so it will be analyzed as a discrete numeric feature later.

### Numeric Features 



#### Discrete Numeric 

In [None]:
# One row of four histograms (pandas' default binning), each drawn on its own
# explicitly addressed axes.
fig, (age_ax, vintage_ax, region_ax, channel_ax) = plt.subplots(1, 4, figsize=(24,5))
train_data['Age'].plot(kind='hist', ax=age_ax)
age_ax.set_title("Age Distribution")
train_data['Vintage'].plot(kind='hist', ax=vintage_ax)
vintage_ax.set_title("Vintage Distribution")
train_data['Region_Code'].plot(kind='hist', ax=region_ax)
region_ax.set_title("Region Code Distribution")
train_data['Policy_Sales_Channel'].plot(kind='hist', ax=channel_ax)
channel_ax.set_title("Policy Sales Channel Distribution")

Using buckets in histogram for better visualization

In [None]:
# Same four histograms, this time with a hand-picked bin count per feature.
binned_fig = plt.figure(figsize=(24,5))

age_ax = binned_fig.add_subplot(1,4,1)
train_data['Age'].hist(bins=80, ax=age_ax)
age_ax.set_title("Age Distribution")

vintage_ax = binned_fig.add_subplot(1,4,2)
train_data['Vintage'].hist(bins=30, ax=vintage_ax)
vintage_ax.set_title("Vintage Distribution")

region_ax = binned_fig.add_subplot(1,4,3)
train_data['Region_Code'].hist(bins=50, ax=region_ax)
region_ax.set_title("Region Code Distribution")

channel_ax = binned_fig.add_subplot(1,4,4)
train_data['Policy_Sales_Channel'].hist(bins=80, ax=channel_ax)
channel_ax.set_title("Policy Sales Channel Distribution")

In [None]:
# Histograms of each discrete feature, drawn once per Response group.
# The groupby object is built once and reused for all four panels.
by_response = train_data.groupby('Response')

plt.figure(figsize=(24,5))
plt.subplot(1,4,1)
by_response['Age'].hist(bins=80)
plt.title("Age Distribution")
plt.subplot(1,4,2)
by_response['Vintage'].hist(bins=30)
plt.title("Vintage Distribution")
plt.subplot(1,4,3)
by_response['Region_Code'].hist(bins=50)
plt.title("Region Code Distribution")
plt.subplot(1,4,4)
by_response['Policy_Sales_Channel'].hist(bins=80)
plt.title("Policy Sales Channel Distribution")

In [None]:
# Number of training rows for each Region_Code value.
train_data.Region_Code.value_counts()

In [None]:
# Number of training rows for each Policy_Sales_Channel value.
train_data.Policy_Sales_Channel.value_counts()

| Feature | Distribution | Description | Take Away |
|---------|--------------|-------------|-----------|
| Age | Vaguely similar to Log-Normal | Majority of the customers are either young or close to retirement  | Although young customers are more in number, pre-retirement age customers are more likely to purchase car insurance  |
| Vintage | Very close to Uniform | The customers are distributed uniformly | The number of days of association with the company has almost no effect on the likelihood of purchasing car insurance. |
| Region Code | Not a standard distribution | Regions 8, 28, 41, 46 have most of the customers | The likelihood of purchasing insurance is directly correlated to the number of customers in the region. |
| Policy Sales Channel | Not a standard distribution | Sales channels 26, 124 and 152-160 show spikes and are the most effective | Channels 26, 124 are relatively more successful in selling car insurance |

In [None]:
# Horizontal box plots of each discrete feature, one box per Response class.
# Outlier markers are hidden (fliersize=0); the shared arguments are factored out.
box_fig = plt.figure(figsize=(24,15))
box_kwargs = dict(y='Response', data=train_data, fliersize=0, orient='h')
sns.boxplot(x='Age', ax=box_fig.add_subplot(2,2,1), **box_kwargs)
sns.boxplot(x='Vintage', ax=box_fig.add_subplot(2,2,2), **box_kwargs)
sns.boxplot(x='Region_Code', ax=box_fig.add_subplot(2,2,3), **box_kwargs)
sns.boxplot(x='Policy_Sales_Channel', ax=box_fig.add_subplot(2,2,4), **box_kwargs)

The box plot verifies our key take-aways. The 25th percentile, median and 75th percentile marks help us make our hypothesis more concrete. 

Identify outliers

In [None]:
# Jittered, almost transparent strip plots per Response class: dense regions
# appear dark, isolated points would stand out as candidate outliers.
strip_fig = plt.figure(figsize=(24,15))
strip_kwargs = dict(x='Response', data=train_data, alpha=0.01, jitter=True)

age_ax = strip_fig.add_subplot(3,4,1)
sns.stripplot(y='Age', ax=age_ax, **strip_kwargs)
age_ax.set_title("Age Distribution")

vintage_ax = strip_fig.add_subplot(3,4,2)
sns.stripplot(y='Vintage', ax=vintage_ax, **strip_kwargs)
vintage_ax.set_title("Vintage Distribution")

region_ax = strip_fig.add_subplot(3,4,3)
sns.stripplot(y='Region_Code', ax=region_ax, **strip_kwargs)
region_ax.set_title("Region Code Distribution")

channel_ax = strip_fig.add_subplot(3,4,4)
sns.stripplot(y='Policy_Sales_Channel', ax=channel_ax, **strip_kwargs)
channel_ax.set_title("Policy Sales Channel Distribution")

No visible outliers 

#### Continuous numeric 

In [None]:
# Histogram of Annual_Premium using pandas' default binning.
train_data.Annual_Premium.plot(kind='hist')

In [None]:
# Same histogram with 100 bins for a finer view of the distribution.
train_data.Annual_Premium.hist(bins=100)

In [None]:
# Histogram of Annual_Premium for each Response group (100 bins each).
train_data.groupby('Response').Annual_Premium.hist(bins=100)

**Take Away**
The likelihood of purchasing insurance is directly correlated to the number of customers in the particular annual premium group.
Annual Premium is most likely to have outliers.

In [None]:
# Box plot of Annual_Premium per Response class, with the individual points
# overlaid on the same axes as a strip plot.
premium_ax = sns.boxplot(y = 'Response', x = 'Annual_Premium', data = train_data, fliersize = 0, orient = 'h')
sns.stripplot(y = 'Response', x = 'Annual_Premium', data = train_data, linewidth = 0.6, orient = 'h', ax = premium_ax)

Annual Premium has a lot of outliers. These outliers will affect the values our model will learn and will lead to skewed predictions. Let's keep this in mind; we might drop the outliers (customers with an annual premium of more than 200000) if needed.

#### Correlation between the features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the correlation-only encoding never touches train_data.
corr_check = train_data.copy()

# The two-valued columns can be label-encoded directly; the order of the two
# codes only affects the sign of the correlation.
col_ls = ['Gender', 'Vehicle_Damage']

for col in col_ls:
    corr_check[col] = LabelEncoder().fit_transform(corr_check[col])

# Vehicle_Age is ordinal. LabelEncoder sorts labels lexicographically, which puts
# '1-2 Year' before '< 1 Year' and makes the codes non-monotonic in vehicle age,
# so assign the codes in the real age order instead.
vehicle_age_order = ['< 1 Year', '1-2 Year', '> 2 Years']
for age_code, age_label in enumerate(vehicle_age_order):
    corr_check.loc[corr_check['Vehicle_Age'] == age_label, 'Vehicle_Age'] = age_code
# Convert from object to integer dtype so the column is treated as numeric by .corr().
corr_check['Vehicle_Age'] = corr_check['Vehicle_Age'].astype(int)

In [None]:
# Pairwise correlation of the encoded features, shown as an annotated heatmap.
correlation_matrix = corr_check.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('Corelation Matrix');

Features in order of correlation with response(Target variable)

| Feature | correlation type |
|---------|------------------|
| Vehicle Damage | Positive |
| Previously Insured | Negative |
| Vehicle Age | Positive |
| Policy Sales Channel | Negative |
| Age | Positive |
| Annual Premium | Positive |

### Feature Engineering 

Encoding Categorical features 

In [None]:
# Integer code for every label of the three string-valued columns.
# Vehicle_Age codes follow the natural age order.
train_category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'No': 0, 'Yes': 1},
}

for col, codes in train_category_codes.items():
    for label, code in codes.items():
        # Single .loc assignment instead of chained indexing (`df[col][mask] = v`):
        # the chained form writes to an intermediate object, raises
        # SettingWithCopyWarning and has no effect under pandas copy-on-write.
        train_data.loc[train_data[col] == label, col] = code
    # The column is still object dtype after the replacement; make it numeric.
    # This raises if an unmapped label is left over, instead of passing it on.
    train_data[col] = train_data[col].astype(int)

In [None]:
# Preview the training frame again to check the encoded columns.
train_data.head()

In [None]:
# Integer code for every label of the three string-valued columns
# (same codes as applied to the training frame).
test_category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'No': 0, 'Yes': 1},
}

for col, codes in test_category_codes.items():
    for label, code in codes.items():
        # Single .loc assignment instead of chained indexing (`df[col][mask] = v`):
        # the chained form writes to an intermediate object, raises
        # SettingWithCopyWarning and has no effect under pandas copy-on-write.
        test_data.loc[test_data[col] == label, col] = code
    # The column is still object dtype after the replacement; make it numeric.
    # This raises if an unmapped label is left over, instead of passing it on.
    test_data[col] = test_data[col].astype(int)

Set outliers to mean in Annual Premium

In [None]:
# Premiums above this value are treated as outliers.
premium_outlier_threshold = 200000

# Compute the fill value once, from the untouched training column, so that both
# frames receive exactly the same number (previously the test frame used a mean
# recomputed after the training column had already been modified).
premium_fill_value = train_data['Annual_Premium'].mean()

# .loc assignment instead of chained indexing, which raises SettingWithCopyWarning
# and has no effect under pandas copy-on-write.
train_data.loc[train_data['Annual_Premium'] > premium_outlier_threshold, 'Annual_Premium'] = premium_fill_value
test_data.loc[test_data['Annual_Premium'] > premium_outlier_threshold, 'Annual_Premium'] = premium_fill_value

Since the training set is about 3 times as large as the test set, we use the training-data mean for both the test and the training data. This is not a good practice, and a mean of the test and train data combined should have been considered.

Drop features that are not useful to us

In [None]:
# List the column names before choosing which features to drop.
train_data.columns

In [None]:
# Remove Driving_License from both frames. The 'id' column is kept here and is
# dropped only at the point where the feature matrices are built.
unused_features = ['Driving_License']
train_data = train_data.drop(columns=unused_features)
test_data = test_data.drop(columns=unused_features)
train_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then hold out 30% of the labelled rows
# for evaluation (fixed random_state for a repeatable split).
features = train_data.drop(['id','Response'], axis=1)
target = train_data['Response']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.3,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split, then
# score the held-out split: class probabilities first, hard labels second.
logreg = LogisticRegression().fit(X_train, y_train)
test_pred_proba = logreg.predict_proba(X_test)
test_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
# Accuracy uses the hard labels; ROC AUC uses column 1 of the predicted probabilities.
holdout_accuracy = accuracy_score(y_test, test_pred)
holdout_auc = roc_auc_score(y_test, test_pred_proba[:,1])
print(holdout_accuracy)
print(holdout_auc)

In [None]:
# Predict on the real test set; 'id' is an identifier, not a feature, so it is
# removed from the model input and used only to label the output rows.
predictions = logreg.predict(test_data.drop(['id'], axis=1))
print(test_data.shape, predictions.shape)

# Use this dataset's own column names. The previous 'PassengerId' / 'Survived'
# names do not exist in this data, so the cell raised AttributeError.
output = pd.DataFrame({'id': test_data['id'], 'Response': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")