In [None]:
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline

import imblearn
from imblearn import over_sampling

# pre-processing
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split 

# install: conda install -c conda-forge category_encoders
import category_encoders
from category_encoders import TargetEncoder

import datetime
import statsmodels.api as sm

In [None]:
# Report the version of every key library so the run can be reproduced.
library_versions = {
    'Pandas': pd,
    'Numpy': np,
    'Matplotlib': matplotlib,
    'Seaborn': sns,
    'Imblearn': imblearn,
    'Scikit-Learn': sklearn,
    'Category Encoders': category_encoders,
    'Statsmodels': sm,
}
for library_name, library_module in library_versions.items():
    print(library_name + ':', library_module.__version__)

In [None]:
# Global plot styling. Order matters: each call overwrites the rcParams it shares
# with the calls before it.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
# sns.color_palette() only *returns* a palette and changes nothing; set_palette()
# is the call that applies it. It comes last because plt.style.use() installs its
# own colour cycle and would otherwise replace the palette.
sns.set_palette("bright")

# Pre-Processing

## Data Exploration

### Load Data

In [None]:
data = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv')
data

### Describe Data

In [None]:
data.info()

#### Numerical Data

In [None]:
print(data.select_dtypes(include = np.number).shape)
data.select_dtypes(include = np.number)

#### Non-Numerical Data (Categorical)

In [None]:
print(data.select_dtypes(include = object).shape)
data.select_dtypes(include = object)

### Features & Target

#### Features

In [None]:
# Numerical feature names: every numeric column except the row id and the target.
features_num = list(
    data.select_dtypes(include = np.number)
        .columns
        .drop(['id', 'Response'])
)
print('Lenghth: ', len(features_num))
features_num

**The dataset has 7 numerical features.**

- Region_Code and Policy_Sales_Channel are not truly numeric: they are **categorical data represented by numbers**. We need to do feature engineering to transform these features.
- Driving_License & Previously_Insured are **categorical data that have been label encoded**.
- Age, Annual_Premium, Vintage are numerical-discrete data types, we need to **check the distribution** of these features.

In [None]:
features_cat = list(data.select_dtypes(include = object).columns)
print('Lenghth: ', len(features_cat))
features_cat

**The dataset has 3 categorical features.**

- Vehicle_Age has natural, ordered quality, we can use an ordinal encoding.
- Gender has 2 unique values, each of them is independent, we can use one-hot encoding.
- Vehicle_Damage values are boolean, we can use label encoding (1 for yes, 0 for no).

In [None]:
features_list = features_num + features_cat
print('Lenghth: ', len(features_list))
features_list

**The data has a total of 10 features.**

#### Target

Target Variable: **'Response'** Column

In [None]:
print('Unique Values:', data['Response'].nunique(), '\nType:', data['Response'].dtypes)
data.groupby(['Response']).count()[['id']]

In [None]:
# Share of customers with Response == 1, as a percentage rounded to one decimal.
interested_count = int((data['Response'] == 1).sum())
print('Conversion rate:', round(interested_count * 100 / len(data), 1), '%')

**Target variable of the dataset consists of 2 categorical unique values, which means we will do binary classification.**

## Data Cleansing

### Missing Values

In [None]:
print('Row counts:', data.shape[0], '\nColumn counts:', data.shape[1], '\n')
data.head()

In [None]:
data.info()

In [None]:
data.isnull().sum()

**Dataset is clean from missing values.**

### Invalid Values

In [None]:
data.nunique()

In [None]:
print('Unique Values:', data['Driving_License'].nunique(), '\nType:', data['Driving_License'].dtypes)
data.groupby(['Driving_License']).count()[['id']]

In [None]:
print('Unique Values:', data['Previously_Insured'].nunique(), '\nType:', data['Previously_Insured'].dtypes)
data.groupby(['Previously_Insured']).count()[['id']]

In [None]:
print('Unique Values:', data['Vehicle_Age'].nunique(), '\nType:', data['Vehicle_Age'].dtypes)
data.groupby(['Vehicle_Age']).count()[['id']]

### Duplicate Rows

In [None]:
# Report whether any row is an exact duplicate of an earlier row.
print('Any duplicated rows?')
has_duplicates = data.duplicated().any()
print('Yes' if has_duplicates else 'No')

## Feature Engineering

**Check Features Distribution**

**Numerical Data**

In [None]:
# One vertical boxplot per numerical feature, laid out side by side in a single row.
plt.figure(figsize = (20.7, 7))
for panel, column in enumerate(features_num, start = 1):
    plt.subplot(1, 7, panel)
    sns.boxplot(y = data[column], color = 'blue')

plt.tight_layout()

Annual_Premium has a lot of outliers in the upper range; this can be handled using the IQR method or a log transformation (if possible).

**Non-Numerical Data (Categorical)**

In [None]:
# One count plot per categorical feature, laid out side by side in a single row.
plt.figure(figsize = (20.7, 5))

for panel, column in enumerate(features_cat, start = 1):
    plt.subplot(1, 3, panel)
    sns.countplot(x = data[column], color = 'blue')

plt.tight_layout()

In [None]:
data_preprocess = data.copy()
data_preprocess

### We will use dataframe: data_preprocess for next stage.

### Gender

In [None]:
data_preprocess['Gender'].value_counts()

In [None]:
dummies_Gender = pd.get_dummies(data_preprocess['Gender'], prefix = 'Gender')
data_preprocess = pd.concat([data_preprocess, dummies_Gender], axis = 1)

In [None]:
data_preprocess[['Gender', 'Gender_Male', 'Gender_Female']].head()

### Age

In [None]:
data_preprocess['Age'].describe()

### Driving_License

In [None]:
data_preprocess['Driving_License'].value_counts()

### Region_Code

- https://contrib.scikit-learn.org/category_encoders/targetencoder.html#
- We will use Target encoding for Region_Code and Policy_Sales Channel

In [None]:
# Target-encode Region_Code: the codes are cast to str so they are treated as
# categories, and each one is replaced by a value derived from Response.
# NOTE(review): the encoder is fit on the full dataset, before any train/test split,
# so every encoded value has seen the target of every row. This looks like target
# leakage into the later evaluation; consider fitting on the training split only.
# TODO confirm against the modelling stage.
te = TargetEncoder()
data_preprocess['Region_Code_Encoding'] = te.fit_transform(data_preprocess['Region_Code'].astype(str), data_preprocess['Response'])

In [None]:
data_preprocess[['Region_Code', 'Response', 'Region_Code_Encoding']].head()

#### Later for predicting new test data

In [None]:
sort_region = data_preprocess.sort_values(['Region_Code', 'Region_Code_Encoding'], ascending = True)
sort_region

In [None]:
region_unique = sort_region.groupby('Region_Code').first().reset_index()[['Region_Code', 'Region_Code_Encoding']]
region_unique

In [None]:
region_unique.to_csv('./region_target_encoding.csv', index = False)

### Previously_Insured

In [None]:
data_preprocess['Previously_Insured'].value_counts()

### Vehicle_Age

In [None]:
data_preprocess['Vehicle_Age'].value_counts()

In [None]:
oe_va = OrdinalEncoder(categories = [['< 1 Year', '1-2 Year', '> 2 Years']], dtype = np.int64)
data_preprocess['Vehicle_Age_Encoding'] = oe_va.fit_transform(data_preprocess.loc[:, ['Vehicle_Age']])

In [None]:
data_preprocess['Vehicle_Age_Encoding'].value_counts()

In [None]:
data_preprocess['Vehicle_Age_Encoding'].skew()

In [None]:
plt.figure(figsize = (20.7, 7))

sns.boxplot(
    x = data_preprocess['Vehicle_Age_Encoding']
)

plt.tight_layout()

### Vehicle_Damage

In [None]:
data_preprocess['Vehicle_Damage'].value_counts()

In [None]:
le = LabelEncoder()
data_preprocess['Vehicle_Damage_Encoding'] = le.fit_transform(data_preprocess['Vehicle_Damage'])

In [None]:
data_preprocess['Vehicle_Damage_Encoding'].value_counts()

### Annual_Premium

In [None]:
data_preprocess['Annual_Premium'].describe()

### Policy_Sales_Channel

In [None]:
# Target-encode Policy_Sales_Channel: the codes are cast to str so they are treated
# as categories, and each one is replaced by a value derived from Response.
# NOTE(review): the encoder is fit on the full dataset, before any train/test split,
# so every encoded value has seen the target of every row. This looks like target
# leakage into the later evaluation; consider fitting on the training split only.
# TODO confirm against the modelling stage.
te = TargetEncoder()
data_preprocess['Policy_Sales_Channel_Encoding'] = te.fit_transform(data_preprocess['Policy_Sales_Channel'].astype(str), data_preprocess['Response'])

In [None]:
data_preprocess[['Policy_Sales_Channel', 'Response', 'Policy_Sales_Channel_Encoding']].head()

#### Later for predicting new test data

In [None]:
sort_channel = data_preprocess.sort_values(['Policy_Sales_Channel', 'Policy_Sales_Channel_Encoding'], ascending = True)
sort_channel

In [None]:
sales_channel_unique = sort_channel.groupby('Policy_Sales_Channel').first().reset_index()[['Policy_Sales_Channel', 'Policy_Sales_Channel_Encoding']]
sales_channel_unique

In [None]:
sales_channel_unique.to_csv('./sales_channel_target_encoding.csv', index = False)

### Vintage

In [None]:
# Summary statistics for Vintage (this cell previously described Annual_Premium
# again, a copy-paste slip under the "Vintage" heading).
data_preprocess['Vintage'].describe()

## Outliers

In [None]:
# Horizontal boxplots of the three continuous-looking features, one panel each.
list_2 = ['Age', 'Annual_Premium', 'Vintage']

plt.figure(figsize = (20.7, 5))

for panel, column in enumerate(list_2, start = 1):
    plt.subplot(1, 3, panel)
    sns.boxplot(
        x = data_preprocess[column],
        hue = data_preprocess['Response']
    )

plt.tight_layout()

In [None]:
data_preprocess[['Age', 'Annual_Premium', 'Vintage']].skew()

- Since the Region_Code and Policy_Sales_Channel have been encoded based on their respective positive class (Probability of Response = 1), we also need to check their distribution.

In [None]:
# Horizontal boxplots of the two target-encoded features, one panel each.
list_2 = ['Region_Code_Encoding', 'Policy_Sales_Channel_Encoding']

plt.figure(figsize = (20.7, 5))

for panel, column in enumerate(list_2, start = 1):
    plt.subplot(1, 2, panel)
    sns.boxplot(
        x = data_preprocess[column],
        hue = data_preprocess['Response']
    )

plt.tight_layout()

In [None]:
data_preprocess[['Region_Code_Encoding', 'Policy_Sales_Channel_Encoding']].skew()

https://medium.com/@atanudan/kurtosis-skew-function-in-pandas-aa63d72e20de

- If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
- If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
- If the skewness is less than -1 or greater than 1, the data are highly skewed

- Based on the distribution plot and skewness, we need to **treat the outliers for Age and Annual_Premium features**.

### Age

#### Log Transformation

In [None]:
data_preprocess['Age_log'] = np.log(data_preprocess['Age'])
data_preprocess[['Age', 'Age_log']].describe()

In [None]:
data_preprocess[['Age', 'Age_log']].skew()

Skewness decreases from 0.68 to 0.20, which means the data are now fairly symmetrical.

In [None]:
age_viz = ['Age', 'Age_log']

plt.figure(figsize = (20.7, 10))

for i in range(0, len(age_viz)):
    plt.subplot(2, 1, i + 1)
    sns.histplot(
        x = data_preprocess[age_viz[i]],
        kde = True
    )

plt.tight_layout()

In [None]:
plt.figure(figsize = (20.7, 10))

plt.subplot(2, 1, 1)
sns.boxplot(
    x = data_preprocess['Age_log']
)

plt.subplot(2, 1, 2)
sns.boxplot(
    x = data_preprocess['Age']
)

plt.tight_layout()

In [None]:
# IQR fences for Age_log: the limits sit 1.5 * IQR below Q1 and above Q3.
Q1, Q3 = (data_preprocess['Age_log'].quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1
low_limit, high_limit = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

print(low_limit)
print(high_limit)

In [None]:
filtered_entries = ((data_preprocess['Age_log'] < low_limit) | (data_preprocess['Age_log'] > high_limit))
data_preprocess[filtered_entries].shape

### Annual Premium

In [None]:
data_preprocess[['Annual_Premium']].skew()

In [None]:
before_remove = data_preprocess.shape[0]
before_remove 

In [None]:
# IQR fences for Annual_Premium: the limits sit 1.5 * IQR below Q1 and above Q3.
Q1, Q3 = (data_preprocess['Annual_Premium'].quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1
low_limit, high_limit = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

print(low_limit)
print(high_limit)

In [None]:
# Keep only the rows whose Annual_Premium lies inside the IQR fences (both bounds
# inclusive), then renumber the index from 0.
filtered_entries = data_preprocess['Annual_Premium'].between(low_limit, high_limit)
data_preprocess = data_preprocess[filtered_entries].reset_index(drop = True)

In [None]:
after_remove = data_preprocess.shape[0]
after_remove

In [None]:
print('Outlier removed:', round(100 - (after_remove * 100 / before_remove), 2), '%')

In [None]:
data_preprocess[['Annual_Premium']].skew()

In [None]:
data_preprocess['Annual_Premium_log'] = np.log(data_preprocess['Annual_Premium'])
data_preprocess[['Annual_Premium', 'Annual_Premium_log']].describe()

In [None]:
data_preprocess[['Annual_Premium', 'Annual_Premium_log']].skew()

In [None]:
age_viz = ['Annual_Premium', 'Annual_Premium_log']

plt.figure(figsize = (20.7, 10))

for i in range(0, len(age_viz)):
    plt.subplot(2, 1, i + 1)
    sns.histplot(
        x = data_preprocess[age_viz[i]],
        kde = True
    )

plt.tight_layout()

In [None]:
plt.figure(figsize = (20.7, 10))

plt.subplot(2, 1, 1)
sns.boxplot(
    x = data_preprocess['Annual_Premium_log']
)

plt.subplot(2, 1, 2)
sns.boxplot(
    x = data_preprocess['Annual_Premium']
)

plt.tight_layout()

We can see that 'Annual_Premium' distribution is **better without log transformation**, so we can ignore the log transformation ('Annual_Premium_log') and proceed with previous 'Annual_Premium'.

## Imbalanced Class

In [None]:
plt.figure(figsize = (20.7, 8))

sns.countplot(
    x = data_preprocess['Response'],
    palette = 'PuBu'
)

plt.title('Target Variable Distribution', fontsize = 20, fontweight = 'bold')

plt.tight_layout()

In [None]:
data_preprocess['Response'].value_counts()

### Oversampling

In [None]:
data_preprocess_oversampling = data_preprocess[[
    'Gender_Male', 'Gender_Female',
    'Age_log',
    'Driving_License',
    'Region_Code_Encoding',
    'Previously_Insured',
    'Vehicle_Age_Encoding',
    'Vehicle_Damage_Encoding',
    'Annual_Premium',
    'Policy_Sales_Channel_Encoding',
    'Vintage',
    'Response'
]]
data_preprocess_oversampling

In [None]:
# Split the selected columns into the feature matrix and the Response target.
X_imbalanced = data_preprocess_oversampling.drop(['Response'], axis = 1)
y_imbalanced = data_preprocess_oversampling['Response']

# Fixed random_state so the resampling is reproducible.
oversampling = over_sampling.SMOTE(random_state = 42)

# NOTE(review): resampling is applied to the whole dataset, while the notebook says
# the train/test split happens later on data_oversampling.csv. Synthetic rows built
# from rows that end up in the test set can then land in the training set; consider
# oversampling the training split only. Presumably the 0/1 columns also receive
# interpolated, non-binary values from SMOTE — TODO confirm.
X_over_smote, y_over_smote = oversampling.fit_resample(X_imbalanced, y_imbalanced)

In [None]:
data_oversampling = pd.concat([X_over_smote, y_over_smote], axis = 1)
data_oversampling

In [None]:
data_preprocess_oversampling['Response'].value_counts()

In [None]:
data_preprocess_oversampling.shape

In [None]:
data_oversampling['Response'].value_counts()

In [None]:
data_oversampling.shape

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (20.7, 8))

sns.countplot(
    x = 'Response',
    data = data_preprocess_oversampling,
    palette = 'PuBu',
    ax = ax[0]
)

sns.countplot(
    x = 'Response',
    data = data_oversampling,
    palette = 'YlOrBr',
    ax = ax[1]
)

ax[0].set_title('Target Variable before Oversampling', fontsize = 20, fontweight = 'bold')
ax[1].set_title('Target Variable after Oversampling', fontsize = 20, fontweight = 'bold')

plt.tight_layout()

In [None]:
data_preprocess.to_csv('./data_preprocess.csv', index = False)

In [None]:
data_oversampling.to_csv('./data_oversampling.csv', index = False)

## DATAFRAME & CSV
- data_preprocess -> cleaned from missing value, outliers, and duplication.
- data_oversampling -> result of oversampling process using SMOTE.

In [None]:
print(data_preprocess.shape)
data_preprocess.head()

In [None]:
print(data_oversampling.shape)
data_oversampling.head()

## Dataset for Modeling

- We will use csv: data_oversampling.csv
- We will do train-test split and standardization scaling
- We will split the dataset before scaling to avoid data leakage
- Both of the stages will be done in stage 3

# EDA & Insights

In [None]:
data_eda_viz = data.copy()

In [None]:
print(data_eda_viz.shape)
data_eda_viz.head()

- Encode categorical Data for correlation matrix

## Categorical Feature Encoding
-  For Better Visualization

### Gender

In [None]:
data_eda_viz['Gender'].value_counts()

In [None]:
dummies_Gender = pd.get_dummies(data_eda_viz['Gender'], prefix = 'Gender')
data_eda_viz = pd.concat([data_eda_viz, dummies_Gender], axis = 1)

In [None]:
data_eda_viz[['Gender', 'Gender_Male', 'Gender_Female']].head()

### Driving_License

In [None]:
data_eda_viz['Driving_License'].value_counts()

In [None]:
def driving_license(x):
    """Return 'Yes' when the row's Driving_License value equals 1, otherwise 'No'."""
    return 'Yes' if x['Driving_License'] == 1 else 'No'

In [None]:
# Recode the 0/1 flag as 'No'/'Yes'. The values are read from the untouched raw
# frame `data` (data_eda_viz started as a copy of it), so re-running this cell gives
# the same result instead of turning every already-recoded 'Yes' into 'No'.
data_eda_viz['Driving_License'] = data.apply(driving_license, axis = 1)

In [None]:
data_eda_viz['Driving_License'].value_counts()

In [None]:
dummies_Driving_License = pd.get_dummies(data_eda_viz['Driving_License'], prefix = 'Driving_License')
data_eda_viz = pd.concat([data_eda_viz, dummies_Driving_License], axis = 1)

In [None]:
data_eda_viz[['Driving_License', 'Driving_License_Yes', 'Driving_License_No']].head()

### Region_Code

In [None]:
data['Region_Code'].value_counts().head()

In [None]:
def segment_region_code(x):
    """Return the region label for the row's Region_Code.

    Five specific codes get their own label; every other value maps to 'others'.
    """
    region_labels = {
        28.0: 'West Bengal',
        8.0: 'Haryana',
        46.0: 'Goa',
        41.0: 'Andra Pradesh',
        15.0: 'Maharashtra',
    }
    return region_labels.get(x['Region_Code'], 'others')

In [None]:
# Label every row with its region group (row-wise apply; the function reads x['Region_Code']).
data_eda_viz['Region_Code_Group'] = data_eda_viz.apply(segment_region_code, axis = 1)

In [None]:
data_eda_viz['Region_Code_Group'].value_counts()

In [None]:
dummies_Region_Code = pd.get_dummies(data_eda_viz['Region_Code_Group'], prefix = 'Region_Code')
data_eda_viz = pd.concat([data_eda_viz, dummies_Region_Code], axis = 1)

data_eda_viz[['Region_Code_Group', 'Region_Code_West Bengal', 'Region_Code_Haryana', 'Region_Code_Goa', 'Region_Code_Andra Pradesh', 'Region_Code_Maharashtra', 'Region_Code_others']].head()

### Previously_Insured

In [None]:
data_eda_viz['Previously_Insured'].value_counts()

In [None]:
def previously_insured(x):
    """Return 'Yes' when the row's Previously_Insured value equals 1, otherwise 'No'."""
    return 'Yes' if x['Previously_Insured'] == 1 else 'No'

In [None]:
# Recode the 0/1 flag as 'No'/'Yes'. The values are read from the untouched raw
# frame `data` (data_eda_viz started as a copy of it), so re-running this cell gives
# the same result instead of turning every already-recoded 'Yes' into 'No'.
data_eda_viz['Previously_Insured'] = data.apply(previously_insured, axis = 1)

In [None]:
data_eda_viz['Previously_Insured'].value_counts()

In [None]:
dummies_Previously_Insured = pd.get_dummies(data_eda_viz['Previously_Insured'], prefix = 'Previously_Insured')
data_eda_viz = pd.concat([data_eda_viz, dummies_Previously_Insured], axis = 1)

data_eda_viz[['Previously_Insured', 'Previously_Insured_Yes', 'Previously_Insured_No']].head()

### Vehicle_Age

In [None]:
data_eda_viz['Vehicle_Age'].value_counts()

In [None]:
oe_va = OrdinalEncoder(categories = [['< 1 Year', '1-2 Year', '> 2 Years']])
data_eda_viz['Vehicle_Age_Num'] = oe_va.fit_transform(data_eda_viz.loc[:, ['Vehicle_Age']])

In [None]:
data_eda_viz['Vehicle_Age_Num'].value_counts()

### Vehicle_Damage

In [None]:
data_eda_viz['Vehicle_Damage'].value_counts()

In [None]:
dummies_Vehicle_Damage = pd.get_dummies(data_eda_viz['Vehicle_Damage'], prefix = 'Vehicle_Damage')
data_eda_viz = pd.concat([data_eda_viz, dummies_Vehicle_Damage], axis = 1)

In [None]:
data_eda_viz[['Vehicle_Damage', 'Vehicle_Damage_Yes', 'Vehicle_Damage_No']].head()

### Policy_Sales_Channel

In [None]:
data_eda_viz['Policy_Sales_Channel'].value_counts().head()

In [None]:
def segment_policy_sales_channel(x):
    """Return the channel label for the row's Policy_Sales_Channel.

    Five specific codes get their own label; every other value maps to 'others'.
    """
    channel_labels = {
        152.0: 'Internet',
        26.0: 'Direct Response',
        124.0: 'Independent Agencies',
        160.0: 'Affinity Group',
        156.0: 'Exclusive/Captive Agents',
    }
    return channel_labels.get(x['Policy_Sales_Channel'], 'others')

In [None]:
# Label every row with its sales-channel group (row-wise apply; the function reads x['Policy_Sales_Channel']).
data_eda_viz['Policy_Sales_Channel_Group'] = data_eda_viz.apply(segment_policy_sales_channel, axis = 1)

In [None]:
data_eda_viz['Policy_Sales_Channel_Group'].value_counts()

In [None]:
dummies_Policy_Sales_Channel = pd.get_dummies(data_eda_viz['Policy_Sales_Channel_Group'], prefix = 'Policy_Sales_Channel')
data_eda_viz = pd.concat([data_eda_viz, dummies_Policy_Sales_Channel], axis = 1)

data_eda_viz[['Policy_Sales_Channel_Group', 'Policy_Sales_Channel_Internet', 'Policy_Sales_Channel_Direct Response', 'Policy_Sales_Channel_Independent Agencies', 'Policy_Sales_Channel_Affinity Group', 'Policy_Sales_Channel_Exclusive/Captive Agents', 'Policy_Sales_Channel_others']].head()

## Numerical Feature Binning
- For Better Visualization

### Age

In [None]:
data_eda_viz['Age'].describe()

In [None]:
data_eda_viz['Age'].value_counts()

In [None]:
def segment_age(x):
    """Bucket the row's Age into an age-group label.

    Returns
    -------
    str
        '17y - 30y' for 17 < Age <= 30, '31y - 45y' for 30 < Age <= 45,
        '46y - 55y' for 45 < Age <= 55, '> 55y' for Age > 55,
        '<= 17y' for Age <= 17, and 'unknown' when Age is missing (NaN).

    The last two labels are fallbacks: previously such rows matched no branch and
    the function raised UnboundLocalError.
    """
    age = x['Age']

    # Checked from the top bucket down, so each test only needs the lower bound.
    if age > 55:
        return '> 55y'
    if age > 45:
        return '46y - 55y'
    if age > 30:
        return '31y - 45y'
    if age > 17:
        return '17y - 30y'
    if age <= 17:
        return '<= 17y'
    # Only reached when every comparison is False, i.e. Age is NaN.
    return 'unknown'

In [None]:
# Add the age-group label for every row (row-wise apply; the function reads x['Age']).
data_eda_viz['Age_Group'] = data_eda_viz.apply(segment_age, axis = 1)
data_eda_viz.head()

In [None]:
data_eda_viz['Age_Group'].value_counts()

### Annual_Premium

In [None]:
data_eda_viz['Annual_Premium'].describe()

In [None]:
data_eda_viz['Annual_Premium'].value_counts()

In [None]:
def segment_annual_premium(x):
    """Bucket the row's Annual_Premium into a premium-range label.

    Returns
    -------
    str
        '< 5K' below 5000, '5K - 30K' for 5000..30000, '30K - 40K' for
        30000 < p <= 40000, '40K - 50K' for 40000 < p <= 50000, '> 50K' above
        50000, and 'unknown' when the premium is missing (NaN).

    A premium of exactly 5000 now belongs to '5K - 30K'; previously it matched no
    branch (the tests were '< 5000' and '> 5000') and raised UnboundLocalError.
    """
    premium = x['Annual_Premium']

    # Checked in ascending order, so each test only needs the upper bound.
    if premium < 5000:
        return '< 5K'
    if premium <= 30000:
        return '5K - 30K'
    if premium <= 40000:
        return '30K - 40K'
    if premium <= 50000:
        return '40K - 50K'
    if premium > 50000:
        return '> 50K'
    # Only reached when every comparison is False, i.e. the premium is NaN.
    return 'unknown'

In [None]:
# Add the premium-range label for every row, then preview the frame that actually
# received the new column (the cell previously showed the raw `data` frame, which
# does not contain Annual_Premium_Group).
data_eda_viz['Annual_Premium_Group'] = data_eda_viz.apply(lambda x: segment_annual_premium(x), axis = 1)
data_eda_viz.head()

In [None]:
data_eda_viz['Annual_Premium_Group'].value_counts()

### Vintage

In [None]:
data_eda_viz['Vintage'].describe()

In [None]:
data_eda_viz['Vintage'].value_counts()

In [None]:
def vintage_month(x):
    """Convert the row's Vintage to whole months, dividing by 30 and rounding."""
    months = round(x['Vintage'] / 30, 0)
    return int(months)

In [None]:
# Add Vintage expressed in months for every row (row-wise apply; the function reads x['Vintage']).
data_eda_viz['Vintage_Month'] = data_eda_viz.apply(vintage_month, axis = 1)
data_eda_viz.head()

In [None]:
data_eda_viz['Vintage_Month'].value_counts()

In [None]:
def vintage_month_group(x):
    """Classify the row's Vintage_Month as '< 4 Month', '4-7 Month' or '> 7 Month'."""
    months = x['Vintage_Month']

    # The three ranges are mutually exclusive, so the order of the tests is free.
    if months > 7:
        month_group = '> 7 Month'
    elif 4 <= months <= 7:
        month_group = '4-7 Month'
    elif months < 4:
        month_group = '< 4 Month'
    return month_group

In [None]:
# Add the month-range label for every row (row-wise apply; the function reads x['Vintage_Month']).
data_eda_viz['Vintage_Month_Group'] = data_eda_viz.apply(vintage_month_group, axis = 1)
data_eda_viz.head()

In [None]:
data_eda_viz['Vintage_Month_Group'].value_counts()

## EDA

### Matrix Correlation

In [None]:
feature = [
    'Gender_Male', 'Gender_Female',
    'Age',
    'Vehicle_Age_Num',
    'Annual_Premium',
    'Vintage',
    'Driving_License_Yes',
    'Previously_Insured_Yes',
    'Vehicle_Damage_Yes',
    'Region_Code_West Bengal', 'Region_Code_Haryana', 'Region_Code_Goa', 'Region_Code_Andra Pradesh', 'Region_Code_Maharashtra', 'Region_Code_others',
    'Policy_Sales_Channel_Internet', 'Policy_Sales_Channel_Direct Response', 'Policy_Sales_Channel_Independent Agencies', 'Policy_Sales_Channel_Affinity Group', 'Policy_Sales_Channel_Exclusive/Captive Agents', 'Policy_Sales_Channel_others'
]

target = ['Response']
corr_ = feature + target

check_corr = data_eda_viz[corr_].corr()

In [None]:
plt.figure(figsize = (20.7, 16))
sns.heatmap(
    check_corr,
    annot = True,
    fmt = '.1f',
    cmap = 'RdBu'
)

plt.tight_layout()

### Numerical

In [None]:
features_num = ['Age', 'Annual_Premium', 'Vintage']

In [None]:
plt.figure(figsize = (10, 21))
for i in range(0, len(features_num)):
    plt.subplot(3, 1, i + 1)
    ax = sns.histplot(
        x = data_eda_viz[features_num[i]],
        kde = True
    )
    
plt.tight_layout()

In [None]:
plt.figure(figsize = (12, 21))
for i in range(0, len(features_num)):
    plt.subplot(3, 1, i + 1)
    ax = sns.histplot(
        x = data_eda_viz[features_num[i]],
        hue = data_eda_viz['Response'],
        kde = True,
        hue_order = [0, 1],
        multiple = 'stack',
        palette = ['darkgray', '#293286']
    )

    L = ax.legend(['Interested', 'Not Interested'], fontsize = 14, frameon = True)
    L.set_title('Response', prop = {'size' : 14})
    L.get_frame().set_alpha(1)
    L.get_frame().set_facecolor((1, 1, 1, 1))
    
plt.tight_layout()

### Categorical

In [None]:
features_cat = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code_Group', 'Policy_Sales_Channel_Group']

In [None]:
plt.figure(figsize = (12, 49))
for i in range(0, len(features_cat)):
    plt.subplot(7, 1, i + 1)
    ax = sns.countplot(
        x = data_eda_viz[features_cat[i]],
        order = data_eda_viz[features_cat[i]].value_counts().index,
        palette = ['#3f88c5', '#ffba08', '#032b43', '#d00000', '#136f63', '#383d3b']
        # ['#30bced', '#fc5130', '#737382', '#fffaff', '#e0777d', '#050401']
        # ['#235789', '#f1d302', '#ed1c24', '#fdfffc', '#020100', '#3d0c11']
    )
    
plt.tight_layout()

In [None]:
plt.figure(figsize = (12, 49))
for i in range(0, len(features_cat)):
    plt.subplot(7, 1, i + 1)
    ax = sns.countplot(
        x = data_eda_viz[features_cat[i]],
        hue = data_eda_viz['Response'],
        hue_order = [1, 0],
        palette = ['#293286', 'darkgray'],
        order = pd.crosstab(data_eda_viz[features_cat[i]], data_eda_viz['Response']).sort_values(1, ascending = False).index,
    )
    
    L = ax.legend(['Interested', 'Not Interested'], fontsize = 14, frameon = True)
    L.set_title('Response', prop = {'size' : 14})
    L.get_frame().set_alpha(1)
    L.get_frame().set_facecolor((1, 1, 1, 1))
    
plt.tight_layout()

## Insights & Visualization

In [None]:
data.info()

### Gender

In [None]:
gender_distribution = data_eda_viz.groupby(['Gender', 'Response']).count()[['id']]

gender = data_eda_viz.groupby(['Gender']).count()[['id']]

gender_distribution['percentage'] = gender_distribution.div(gender, level = 'Gender') * 100
gender_distribution = gender_distribution.reset_index()
gender_distribution.columns = ['Gender', 'Response', '#of customers', 'percentage']
gender_distribution

In [None]:
# palette color to highlight Response = 'Interested'/1
color_focus_gender = ['#293286' if (x == 1) else 'darkgray' for x in gender_distribution['Response']]

In [None]:
plt.figure(figsize = (10, 8))

# plot proportion
ax = sns.barplot(
    x = 'Gender',
    y = 'percentage',
    hue = 'Response',
    order = gender_distribution.iloc[gender_distribution[gender_distribution['Response'] == 1]['percentage'].sort_values(ascending = False).index]['Gender'],
    palette = color_focus_gender,
    data = gender_distribution
)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)

L = plt.legend(fontsize = 13, frameon = True)
L.get_texts()[0].set_text('Not interested')
L.get_texts()[1].set_text('Interested')
L.set_title('Response', prop = {'size' : 13})
L.get_frame().set_alpha(1)
L.get_frame().set_facecolor((1, 1, 1, 1))

y_ = [x for x in range(0, 120, 20)]
plt.yticks(y_)

plt.xlabel('Gender', fontsize = 14)
plt.ylabel('Proportion', fontsize = 14)

ylabels = [format(y) + '%' for y in ax.get_yticks()]
ax.set_yticklabels(labels = ylabels)

plt.xticks(fontsize = 14) 
plt.yticks(fontsize = 14) 

for p in ax.patches:
    ax.annotate(
        format(p.get_height(), '.1f') + '%',
        (p.get_x() + p.get_width() / 2., p.get_height()), 
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        xytext = (0, 10), 
        textcoords = 'offset points',
        color = 'grey'
    )

plt.title('Customer Response Based on Gender', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

plt.tight_layout()

### Age

In [None]:
plt.figure(figsize = (20.7, 8))

# plot proportion
ax = sns.histplot(
    x = data_eda_viz['Age'],
    hue = data_eda_viz['Response'],
    stat = 'probability',
    multiple = 'fill',
    data = data_eda_viz,
    palette = ['darkgray', '#293286']
)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)

L = ax.legend(['Interested', 'Not Interested'], fontsize = 14, frameon = True)
L.set_title('Response', prop = {'size' : 14})
L.get_frame().set_alpha(1)
L.get_frame().set_facecolor((1, 1, 1, 1))

plt.xlabel('Age', fontsize = 14)
plt.ylabel('Proportion', fontsize = 14)

y_ = [y for y in np.arange(0, 1.2, 0.2)]
plt.yticks(y_)

x_ = [x for x in range(20, 90, 5)]
plt.xticks(x_)

ylabels = [format(int(round(y * 100, 0))) + '%' for y in ax.get_yticks()]
ax.set_yticklabels(labels = ylabels)

plt.xticks(fontsize = 14) 
plt.yticks(fontsize = 14) 

plt.title('Customer Response Based on Age', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

plt.tight_layout()

In [None]:
age_group_distribution = data_eda_viz.groupby(['Age_Group', 'Response']).count()[['id']]

age_group = data_eda_viz.groupby(['Age_Group']).count()[['id']]

age_group_distribution['percentage'] = age_group_distribution.div(age_group, level = 'Age_Group') * 100
age_group_distribution = age_group_distribution.reset_index()
age_group_distribution.columns = ['Age_Group', 'Response', '#of customers', 'percentage']
age_group_distribution

In [None]:
# palette color to highlight Response = 'Interested'/1
color_focus_age_group = ['#293286' if (x == 1) else 'darkgray' for x in age_group_distribution['Response']]

In [None]:
plt.figure(figsize = (10, 8))

# plot proportion
ax = sns.barplot(
    x = 'Age_Group',
    y = 'percentage',
    hue = 'Response',
    order = age_group_distribution.iloc[age_group_distribution[age_group_distribution['Response'] == 1]['percentage'].sort_values(ascending = False).index]['Age_Group'],
    palette = color_focus_age_group,
    data = age_group_distribution
)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)

L = plt.legend(fontsize = 13, frameon = True, bbox_to_anchor = (0, 1.03), loc = 'upper left')
L.get_texts()[0].set_text('Not interested')
L.get_texts()[1].set_text('Interested')
L.set_title('Response', prop = {'size' : 13})
L.get_frame().set_alpha(1)
L.get_frame().set_facecolor((1, 1, 1, 1))

y_ = [x for x in range(0, 120, 20)]
plt.yticks(y_)

plt.xlabel('Age Group', fontsize = 14)
plt.ylabel('Proportion', fontsize = 14)

ylabels = [format(y) + '%' for y in ax.get_yticks()]
ax.set_yticklabels(labels = ylabels)

plt.xticks(fontsize = 14) 
plt.yticks(fontsize = 14) 

for p in ax.patches:
    ax.annotate(
        format(p.get_height(), '.1f') + '%',
        (p.get_x() + p.get_width() / 2., p.get_height()), 
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        xytext = (5, 8), 
        textcoords = 'offset points',
        color = 'grey'
    )

plt.title('Customer Response Based on Age Group', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

plt.tight_layout()

### Driving_License

In [None]:
driving_license_distribution = data_eda_viz.groupby(['Driving_License', 'Response']).count()[['id']]

driving_license = data_eda_viz.groupby(['Driving_License']).count()[['id']]

driving_license_distribution['percentage'] = driving_license_distribution.div(driving_license, level = 'Driving_License') * 100
driving_license_distribution = driving_license_distribution.reset_index()
driving_license_distribution.columns = ['Driving_License', 'Response', '#of customers', 'percentage']
driving_license_distribution

In [None]:
# palette color to highlight Response = 'Interested'/1
color_focus_driving_license = ['#293286' if (x == 1) else 'darkgray' for x in driving_license_distribution['Response']]

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = driving_license_distribution[driving_license_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Driving_License']

# grouped bars: percentage of each response per Driving_License value
ax = sns.barplot(
    data = driving_license_distribution,
    x = 'Driving_License',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_driving_license,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True)
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Driving License', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (0, 10),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Driving License', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

### Region_Code

In [None]:
# Number of customers per (Region_Code_Group, Response) pair and per Region_Code_Group value.
region_code_group_distribution = data_eda_viz.groupby(['Region_Code_Group', 'Response'])[['id']].count()
region_code_group = data_eda_viz.groupby(['Region_Code_Group'])[['id']].count()

# Share of each response inside its region group, expressed in percent.
region_code_group_distribution['percentage'] = (
    region_code_group_distribution
    .div(region_code_group, level = 'Region_Code_Group')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
region_code_group_distribution = (
    region_code_group_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
region_code_group_distribution

In [None]:
# Drop the catch-all 'others' bucket so only the named region groups remain.
# The index is rebuilt afterwards: the plotting cell feeds this frame's index labels to
# `.iloc`, which is only valid while labels and row positions coincide.
region_code_group_distribution = (
    region_code_group_distribution[region_code_group_distribution['Region_Code_Group'] != 'others']
    .reset_index(drop = True)
)
region_code_group_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_region_code_group = np.where(
    region_code_group_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# Bar groups are ordered by the share of Response == 1 rows, highest first.
# The order is taken straight from the sorted rows instead of passing index labels to
# `.iloc`: this frame had rows filtered out, so its labels need not match row positions.
interested_rows = region_code_group_distribution[region_code_group_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Region_Code_Group']

# grouped bars: percentage of each response per region group
ax = sns.barplot(
    data = region_code_group_distribution,
    x = 'Region_Code_Group',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_region_code_group,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True, bbox_to_anchor = (1, 0.78), loc = 'upper right')
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Region Code Group', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (5, 10),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Top 5 Region by Frequency', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

- https://en.wikipedia.org/wiki/Administrative_divisions_of_India
 - 28 -> West Bengal
 - 8 -> Haryana
 - 46 -> ? (6: Goa)
 - 41 -> ? (1: Andhra Pradesh)
 - 15 -> Maharashtra

In [None]:
# Rows whose region group is 'West Bengal'.
west_bengal = data_eda_viz.loc[data_eda_viz['Region_Code_Group'].eq('West Bengal')]
west_bengal

In [None]:
# West Bengal rows with Response == 1.
res_west_bengal = west_bengal.loc[west_bengal['Response'].eq(1)]
res_west_bengal

In [None]:
# Total annual premium over the West Bengal rows with Response == 1.
res_west_bengal.loc[:, 'Annual_Premium'].sum()

In [None]:
# Total annual premium per region group, largest first.
# Annual_Premium is selected before aggregating so that only this column is summed;
# summing the whole frame first is wasteful and can fail on columns that cannot be added.
(
    data_eda_viz
    .groupby(['Region_Code_Group'])[['Annual_Premium']]
    .sum()
    .sort_values('Annual_Premium', ascending = False)
)

In [None]:
# Total annual premium per (region group, response): Response == 1 rows first, and
# within each response the largest premium first.
# A single multi-key sort is used because two chained sort_values calls only give this
# order when the second sort is stable, which the default sort does not promise.
# Annual_Premium is selected before aggregating so that only this column is summed.
(
    data_eda_viz
    .groupby(['Region_Code_Group', 'Response'])[['Annual_Premium']]
    .sum()
    .sort_values(['Response', 'Annual_Premium'], ascending = [False, False])
)

Compared with the other regions, customers from West Bengal who responded "yes" have the highest total annual premium.

In [None]:
# Total annual premium per (region code, response), largest premium first; equal
# premiums are ordered with the higher Response value first.
# One multi-key sort replaces two chained sorts, where the first was discarded by the second.
# Annual_Premium is selected before aggregating so that only this column is summed.
(
    data_eda_viz
    .groupby(['Region_Code', 'Response'])[['Annual_Premium']]
    .sum()
    .sort_values(['Annual_Premium', 'Response'], ascending = [False, False])
)

### Previously_Insured

In [None]:
# Number of customers per (Previously_Insured, Response) pair and per Previously_Insured value.
previously_insured_distribution = data_eda_viz.groupby(['Previously_Insured', 'Response'])[['id']].count()
previously_insured = data_eda_viz.groupby(['Previously_Insured'])[['id']].count()

# Share of each response inside its Previously_Insured value, expressed in percent.
previously_insured_distribution['percentage'] = (
    previously_insured_distribution
    .div(previously_insured, level = 'Previously_Insured')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
previously_insured_distribution = (
    previously_insured_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
previously_insured_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_previously_insured = np.where(
    previously_insured_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = previously_insured_distribution[previously_insured_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Previously_Insured']

# grouped bars: percentage of each response per Previously_Insured value
ax = sns.barplot(
    data = previously_insured_distribution,
    x = 'Previously_Insured',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_previously_insured,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True)
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Previously Insured', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (0, 10),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Previously Insured', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

### Vehicle_Age

In [None]:
# Number of customers per (Vehicle_Age, Response) pair and per Vehicle_Age value.
vehicle_age_distribution = data_eda_viz.groupby(['Vehicle_Age', 'Response'])[['id']].count()
vehicle_age = data_eda_viz.groupby(['Vehicle_Age'])[['id']].count()

# Share of each response inside its Vehicle_Age value, expressed in percent.
vehicle_age_distribution['percentage'] = (
    vehicle_age_distribution
    .div(vehicle_age, level = 'Vehicle_Age')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
vehicle_age_distribution = (
    vehicle_age_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
vehicle_age_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_vehicle_age = np.where(
    vehicle_age_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = vehicle_age_distribution[vehicle_age_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Vehicle_Age']

# grouped bars: percentage of each response per Vehicle_Age value
ax = sns.barplot(
    data = vehicle_age_distribution,
    x = 'Vehicle_Age',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_vehicle_age,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True)
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Vehicle Age', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (0, 10),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Vehicle Age', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

### Vehicle_Damage

In [None]:
# Number of customers per (Vehicle_Damage, Response) pair and per Vehicle_Damage value.
vehicle_damage_distribution = data_eda_viz.groupby(['Vehicle_Damage', 'Response'])[['id']].count()
vehicle_damage = data_eda_viz.groupby(['Vehicle_Damage'])[['id']].count()

# Share of each response inside its Vehicle_Damage value, expressed in percent.
vehicle_damage_distribution['percentage'] = (
    vehicle_damage_distribution
    .div(vehicle_damage, level = 'Vehicle_Damage')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
vehicle_damage_distribution = (
    vehicle_damage_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
vehicle_damage_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_vehicle_damage = np.where(
    vehicle_damage_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = vehicle_damage_distribution[vehicle_damage_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Vehicle_Damage']

# grouped bars: percentage of each response per Vehicle_Damage value
ax = sns.barplot(
    data = vehicle_damage_distribution,
    x = 'Vehicle_Damage',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_vehicle_damage,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True)
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Vehicle Damage', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (0, 10),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Vehicle Damage', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

### Annual_Premium

In [None]:
fig, ax = plt.subplots(figsize = (20.7, 8))

# filled histogram: within every Annual_Premium bin the two responses stack to 100%
ax = sns.histplot(
    data = data_eda_viz,
    x = 'Annual_Premium',
    hue = 'Response',
    stat = 'probability',
    multiple = 'fill',
    palette = ['darkgray', '#293286'],
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend; labels are supplied explicitly, in this order
legend_box = ax.legend(['Interested', 'Not Interested'], fontsize = 14, frameon = True)
legend_box.set_title('Response', prop = {'size' : 14})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

ax.set_xlabel('Annual Premium', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

# y ticks every 0.2 from 0 to 1, x ticks every 50000 starting at 2500
ax.set_yticks(list(np.arange(0, 1.2, 0.2)))
ax.set_xticks(list(range(2500, 550000, 50000)))

# show the 0-1 proportions as whole percentages
ax.set_yticklabels(labels = [f'{int(round(tick * 100, 0))}%' for tick in ax.get_yticks()])

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

ax.set_title('Customer Response Based on Annual Premium', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

In [None]:
# Number of customers per (Annual_Premium_Group, Response) pair and per Annual_Premium_Group value.
annual_premium_group_distribution = data_eda_viz.groupby(['Annual_Premium_Group', 'Response'])[['id']].count()
annual_premium_group = data_eda_viz.groupby(['Annual_Premium_Group'])[['id']].count()

# Share of each response inside its premium group, expressed in percent.
annual_premium_group_distribution['percentage'] = (
    annual_premium_group_distribution
    .div(annual_premium_group, level = 'Annual_Premium_Group')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
annual_premium_group_distribution = (
    annual_premium_group_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
annual_premium_group_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_annual_premium_group = np.where(
    annual_premium_group_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = annual_premium_group_distribution[annual_premium_group_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Annual_Premium_Group']

# grouped bars: percentage of each response per premium group
ax = sns.barplot(
    data = annual_premium_group_distribution,
    x = 'Annual_Premium_Group',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_annual_premium_group,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True, bbox_to_anchor = (1, 0.78), loc = 'upper right')
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Annual Premium Group', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (5, 8),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Annual Premium Group', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

### Policy_Sales_Channel

In [None]:
# Number of customers per (Policy_Sales_Channel_Group, Response) pair and per channel group.
policy_sales_channel_group_distribution = data_eda_viz.groupby(['Policy_Sales_Channel_Group', 'Response'])[['id']].count()
policy_sales_channel_group = data_eda_viz.groupby(['Policy_Sales_Channel_Group'])[['id']].count()

# Share of each response inside its channel group, expressed in percent.
policy_sales_channel_group_distribution['percentage'] = (
    policy_sales_channel_group_distribution
    .div(policy_sales_channel_group, level = 'Policy_Sales_Channel_Group')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
policy_sales_channel_group_distribution = (
    policy_sales_channel_group_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
policy_sales_channel_group_distribution

In [None]:
# Drop the catch-all 'others' bucket so only the named channel groups remain.
policy_sales_channel_group_distribution = policy_sales_channel_group_distribution.loc[
    policy_sales_channel_group_distribution['Policy_Sales_Channel_Group'].ne('others')
]
policy_sales_channel_group_distribution

In [None]:
# Rows for the channel groups shown in the 'Direct Writers' panel, re-indexed from 0.
direct_writers = (
    policy_sales_channel_group_distribution
    .loc[
        policy_sales_channel_group_distribution['Policy_Sales_Channel_Group'].isin(
            ['Affinity Group', 'Direct Response', 'Exclusive/Captive Agents', 'Internet']
        )
    ]
    .reset_index(drop = True)
)
direct_writers

In [None]:
# Rows for the channel group shown in the 'Agency Writers' panel, re-indexed from 0.
agency_writers = (
    policy_sales_channel_group_distribution
    .loc[
        policy_sales_channel_group_distribution['Policy_Sales_Channel_Group'].isin(
            ['Independent Agencies']
        )
    ]
    .reset_index(drop = True)
)
agency_writers

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_policy_sales_channel_group = np.where(
    policy_sales_channel_group_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
# Distinct channel groups present in the direct-writers table.
pd.unique(direct_writers['Policy_Sales_Channel_Group'])

In [None]:
# Two side-by-side panels sharing the y axis: direct writers (wide) and agency writers (narrow).
fig, ax = plt.subplots(
    nrows = 1, ncols = 2, figsize = (12, 8), sharey = True,
    gridspec_kw = {'width_ratios': [3.5, 1]}
)

# (axes, table, x label, y label, draw the full legend?, extra y tick label options)
panel_specs = (
    (ax[0], direct_writers, 'Direct Writers', 'Proportion', True, {'fontsize': 14}),
    (ax[1], agency_writers, 'Agency Writers', ' ', False, {}),
)

for panel_ax, channel_table, panel_xlabel, panel_ylabel, with_legend, ytick_options in panel_specs:
    # bar groups are ordered by the share of Response == 1 rows, highest first
    interested_rows = channel_table[channel_table['Response'] == 1]
    bar_order = interested_rows.sort_values('percentage', ascending = False)['Policy_Sales_Channel_Group']

    sns.barplot(
        data = channel_table,
        x = 'Policy_Sales_Channel_Group',
        y = 'percentage',
        hue = 'Response',
        order = bar_order,
        palette = color_focus_policy_sales_channel_group,
        ax = panel_ax
    )

    # hide every spine except the bottom one
    for side in ('right', 'top', 'left'):
        panel_ax.spines[side].set_visible(False)

    if with_legend:
        # opaque white legend with descriptive labels in place of the raw hue values
        L = panel_ax.legend(fontsize = 13, frameon = True)
        for position, label in enumerate(('Not interested', 'Interested')):
            L.get_texts()[position].set_text(label)
        L.set_title('Response', prop = {'size' : 13})
        L.get_frame().set_alpha(1)
        L.get_frame().set_facecolor((1, 1, 1, 1))
    else:
        # replace the automatic legend with an empty, frameless one
        panel_ax.legend([], [], frameon = False)

    # y axis: ticks every 20 from 0 to 100, shown with a percent sign
    panel_ax.set_yticks(list(range(0, 120, 20)))
    panel_ax.set_yticklabels(labels = [f'{tick}%' for tick in panel_ax.get_yticks()], **ytick_options)

    panel_ax.set_xlabel(panel_xlabel, fontsize = 16, fontweight = 'semibold')
    panel_ax.set_ylabel(panel_ylabel, fontsize = 16)

    # write each bar's height just above it
    for bar in panel_ax.patches:
        panel_ax.annotate(
            f'{bar.get_height():.1f}%',
            xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
            xytext = (5, 10),
            textcoords = 'offset points',
            ha = 'center',
            fontweight = 'semibold',
            size = 20,
            color = 'grey'
        )

fig.suptitle('Customer Response Based on Top 5 Sales Channel by Frequency', fontsize = 24, fontweight = 'semibold', ha = 'center', y = 1.02)

- https://www.iii.org/fact-statistic/facts-statistics-distribution-channels
- A.M. Best organizes insurance into two main distribution channels: agency writers and direct writers.
 - Its agency writers category includes insurers that distribute through independent agencies, brokers, general agents and managing general agents.
 - Its direct writers category includes insurers that distribute through the Internet, exclusive/captive agents, direct response and affinity groups.
- In 2018 direct writers accounted for 51.4 percent of P/C insurance net premiums written and agency writers accounted for 47.7 percent, according to A.M. Best.

- https://en.wikipedia.org/wiki/Independent_insurance_agent
 - Independent insurance agents, also known as insurance sales agents or "producers", typically sell a variety of insurance and financial products, including property insurance and casualty insurance, life insurance, health insurance, disability insurance, and long-term care insurance.
- https://www.thebalance.com/what-is-a-captive-insurance-agent-527298
 - Captive insurance agents work directly for a single insurance carrier. A lot of well known, highly rated insurance carriers employ captive agents. 
- https://www.nerdwallet.com/blog/insurance/auto/drivers-can-find-cheap-car-insurance-through-a-group/
 - Affinity groups give insurers access to a pool of potential customers, who often are part of a demographic that the insurer judges to be “less risky” and therefore eligible for better rates.

### Vintage

In [None]:
fig, ax = plt.subplots(figsize = (20.7, 8))

# filled histogram: within every Vintage bin the two responses stack to 100%
ax = sns.histplot(
    data = data_eda_viz,
    x = 'Vintage',
    hue = 'Response',
    stat = 'probability',
    multiple = 'fill',
    palette = ['darkgray', '#293286'],
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend; labels are supplied explicitly, in this order
legend_box = ax.legend(['Interested', 'Not Interested'], fontsize = 14, frameon = True)
legend_box.set_title('Response', prop = {'size' : 14})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

ax.set_xlabel('Vintage in day count', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

# y ticks every 0.2 from 0 to 1, x ticks every 10 from 10 to 300
ax.set_yticks(list(np.arange(0, 1.2, 0.2)))
ax.set_xticks(list(range(10, 310, 10)))

# show the 0-1 proportions as whole percentages
ax.set_yticklabels(labels = [f'{int(round(tick * 100, 0))}%' for tick in ax.get_yticks()])

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

ax.set_title('Customer Response Based on Length of Relationship', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

In [None]:
# Number of customers per (Vintage_Month, Response) pair and per Vintage_Month value.
vintage_month_distribution = data_eda_viz.groupby(['Vintage_Month', 'Response'])[['id']].count()
vintage_month = data_eda_viz.groupby(['Vintage_Month'])[['id']].count()

# Share of each response inside its Vintage_Month value, expressed in percent.
vintage_month_distribution['percentage'] = (
    vintage_month_distribution
    .div(vintage_month, level = 'Vintage_Month')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
vintage_month_distribution = (
    vintage_month_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
vintage_month_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_vintage_month = np.where(
    vintage_month_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (20.7, 8))

# Bar groups are ordered by the share of Response == 1 rows, highest first.
# The order is kept as a plain list because it is reused below to label the x axis.
interested_rows = vintage_month_distribution[vintage_month_distribution['Response'] == 1]
month_order = interested_rows.sort_values('percentage', ascending = False)['Vintage_Month'].tolist()

# grouped bars: percentage of each response per Vintage_Month value
ax = sns.barplot(
    data = vintage_month_distribution,
    x = 'Vintage_Month',
    y = 'percentage',
    hue = 'Response',
    order = month_order,
    palette = color_focus_vintage_month,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True, bbox_to_anchor = (1, 0.7), loc = 'upper right')
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Vintage in month count', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

# Label every bar group with the month it actually shows. The tick *positions* are just
# 0..n-1 in plotting order, so building labels from them would name each group after its
# rank in the sorted order rather than after its Vintage_Month value.
ax.set_xticks(list(range(len(month_order))))
ax.set_xticklabels(labels = [format(month) + 'th' for month in month_order])

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (5, 8),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Length of Relationship', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()

In [None]:
# Number of customers per (Vintage_Month_Group, Response) pair and per Vintage_Month_Group value.
vintage_month_group_distribution = data_eda_viz.groupby(['Vintage_Month_Group', 'Response'])[['id']].count()
vintage_month_group = data_eda_viz.groupby(['Vintage_Month_Group'])[['id']].count()

# Share of each response inside its month group, expressed in percent.
vintage_month_group_distribution['percentage'] = (
    vintage_month_group_distribution
    .div(vintage_month_group, level = 'Vintage_Month_Group')['id']
    .mul(100)
)

# Flatten the group keys into columns and give the count column a readable name.
vintage_month_group_distribution = (
    vintage_month_group_distribution
    .reset_index()
    .rename(columns = {'id': '#of customers'})
)
vintage_month_group_distribution

In [None]:
# One colour per row of the distribution table: dark blue where Response == 1, gray otherwise.
color_focus_vintage_month_group = np.where(
    vintage_month_group_distribution['Response'] == 1, '#293286', 'darkgray'
).tolist()

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# bar groups are ordered by the share of Response == 1 rows, highest first
interested_rows = vintage_month_group_distribution[vintage_month_group_distribution['Response'] == 1]
bar_order = interested_rows.sort_values('percentage', ascending = False)['Vintage_Month_Group']

# grouped bars: percentage of each response per month group
ax = sns.barplot(
    data = vintage_month_group_distribution,
    x = 'Vintage_Month_Group',
    y = 'percentage',
    hue = 'Response',
    order = bar_order,
    palette = color_focus_vintage_month_group,
    ax = ax
)

# hide every spine except the bottom one
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# opaque white legend with descriptive labels in place of the raw hue values
legend_box = ax.legend(fontsize = 13, frameon = True, bbox_to_anchor = (1, 0.7), loc = 'upper right')
for position, label in enumerate(('Not interested', 'Interested')):
    legend_box.get_texts()[position].set_text(label)
legend_box.set_title('Response', prop = {'size' : 13})
legend_frame = legend_box.get_frame()
legend_frame.set_alpha(1)
legend_frame.set_facecolor((1, 1, 1, 1))

# y axis: ticks every 20 from 0 to 100, shown with a percent sign
ax.set_yticks(list(range(0, 120, 20)))
ax.set_yticklabels(labels = [f'{tick}%' for tick in ax.get_yticks()])

ax.set_xlabel('Vintage in group of month', fontsize = 14)
ax.set_ylabel('Proportion', fontsize = 14)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# write each bar's height just above it
for bar in ax.patches:
    ax.annotate(
        f'{bar.get_height():.1f}%',
        xy = (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        xytext = (5, 8),
        textcoords = 'offset points',
        ha = 'center',
        fontweight = 'semibold',
        size = 20,
        color = 'grey'
    )

ax.set_title('Customer Response Based on Length of Relationship', fontsize = 24, fontweight = 'semibold', ha = 'center', pad = 20)

fig.tight_layout()