In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk() order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Install a blanket 'ignore' filter: with no category given, every warning raised
# after this point is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the plotting libraries.
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the training CSV from the Kaggle input directory and preview the first rows
train_csv_path = '/kaggle/input/health-insurance-cross-sell-prediction/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Shape of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summary statistics from describe() with its default arguments
# (the same call is repeated two cells further down)
df.describe()

In [None]:
# Column names, non-null counts and data types of every feature
df.info()

In [None]:
# Summary statistics of the numerical columns
# (describe() is called with its defaults; no explicit column selection is made here)
df.describe()

In [None]:
# Summary of the categorical features: keep only the non-numeric columns, then describe them
categorical_part = df.select_dtypes(exclude='number')
categorical_part.describe()

In [None]:
# Number of missing values in each column (isna is the same check as isnull)

df.isna().sum()
# Author's observation: there are no null values

In [None]:
# Tally of fully duplicated rows (True marks a repeat of an earlier row)

is_duplicate = df.duplicated()
is_duplicate.value_counts()

# Exploratory Data Analysis
### Univariate and Multivariate Analysis

### Response feature

In [None]:
# Class balance of the Response column: raw counts (left) and percentage share (right).
plt.figure(figsize=(16,7))

plt.subplot(1,2,1)
plt.title('Count of Responses')
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Response')

plt.subplot(1,2,2)
# value_counts() orders the slices by frequency, so each label is looked up from the
# slice's own index value instead of assuming that 0 ('No') always comes first.
response_names = {0: 'No', 1: 'Yes'}
response_counts = df['Response'].value_counts()
response_labels = [response_names.get(value, str(value)) for value in response_counts.index]
plt.pie(response_counts, explode=[0.05,0], colors=['lightskyblue', 'orange'], autopct='%.1f%%',
        labels=response_labels, labeldistance=1.1)
plt.title('Percentage of Response class')

plt.show()

### Gender feature

In [None]:
# Gender: raw counts, percentage share, and counts split by Response.
plt.figure(figsize=(16,7))

plt.subplot(1,3,1)
plt.title('Count of Male and Female')
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Gender')

plt.subplot(1,3,2)
# Take the slice labels from the value_counts() index so they always match the slice
# order (most frequent first) instead of hard-coding ['Male', 'Female'].
gender_counts = df['Gender'].value_counts()
plt.pie(gender_counts, explode=[0.025,0], autopct='%.1f%%',
        labels=list(gender_counts.index), labeldistance=1.1)
plt.title('Percentage of Male and Female')

plt.subplot(1,3,3)
sns.countplot(data=df, x='Gender', hue='Response', palette='Paired')
plt.title('Gender with respect to Response')

plt.show()

In [None]:
# Split the customers by gender; both subsets are reused by the pie charts that follow
male = df.loc[df['Gender'] == 'Male']
female = df.loc[df['Gender'] == 'Female']

# Slice sizes for the overall pie chart, in this fixed order:
#   1. customers (any gender) who have not purchased/responded to vehicle insurance
#   2. Male customers who have purchased/responded
#   3. Female customers who have purchased/responded
count_response = [
    (df['Response'] == 0).sum(),
    (male['Response'] == 1).sum(),
    (female['Response'] == 1).sum(),
]

count_response

In [None]:
# Share of interested customers within each gender, and both genders against the whole base.
plt.figure(figsize=(16,7))

# value_counts() sorts by frequency, so every slice label is looked up from the slice's
# own index value rather than assuming 'No' is always the larger group.
response_names = {0: 'No', 1: 'Yes'}

plt.subplot(1,3,1)
male_counts = male['Response'].value_counts()
male_labels = [response_names.get(value, str(value)) for value in male_counts.index]
plt.pie(male_counts, explode=[0.025,0], autopct='%.1f%%', labels=male_labels, labeldistance=1.1)
plt.title('Percentage of Males interested')

plt.subplot(1,3,2)
female_counts = female['Response'].value_counts()
female_labels = [response_names.get(value, str(value)) for value in female_counts.index]
plt.pie(female_counts, explode=[0.025,0], autopct='%.1f%%', labels=female_labels, labeldistance=1.1)
plt.title('Percentage of Females interested')

plt.subplot(1,3,3)
# count_response is a plain list built in the order [No, Male yes, Female yes],
# so the static labels below line up with it.
plt.pie(count_response, explode=[0.025,0.05, 0], colors=['blue', 'green', 'orange'], autopct='%.1f%%',
        labels=['No', 'Male_Yes', 'Female_Yes'], labeldistance=1.1)
plt.title('Overall Percentage of Males and Females interested')

plt.show()

### Age feature

In [None]:
# Split the data by Response; res_yes / res_no are reused by later cells
res_yes = df[df['Response']==1]
res_no = df[df['Response']==0]

plt.figure(figsize=(15,4))

# sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented
# replacement and is already used elsewhere in this notebook. bins=50 mirrors the cap
# distplot applied to its automatic bin count.
plt.subplot(1,2,1)
sns.histplot(df['Age'], kde=True, stat='density', bins=50)
plt.title('Distribution of Age variable')

plt.subplot(1,2,2)
# Explicit colours and transparency keep the two overlaid distributions distinguishable
sns.histplot(res_yes['Age'], kde=True, stat='density', bins=50, color='C0', alpha=0.5, label='Interested')
sns.histplot(res_no['Age'], kde=True, stat='density', bins=50, color='C1', alpha=0.5, label='Not-Interested')
plt.title('Distribution of Age variable w.r.t Response')
plt.legend()

plt.show()

In [None]:
# Age counts in 5-year bins, overall (left) and split by Response (right).
# The former bins=10 argument is dropped: seaborn ignores `bins` whenever `binwidth`
# is given, so it only suggested a bin count that was never used.
plt.figure(figsize=(15,4))

plt.subplot(1,2,1)
sns.histplot(data=df, x='Age', binwidth=5)
plt.title('Count of Age group')

plt.subplot(1,2,2)
sns.histplot(data=df, x='Age', hue='Response', binwidth=5)
plt.title('Count of Age group w.r.t Response')

plt.show()

In [None]:
# Box plot (top) and violin plot (bottom) of Age on a horizontal axis.
plt.figure(figsize=(10,8))

plt.subplot(2,1,1)
# Pass the column as x=...: from seaborn 0.12 the only positional argument is `data`,
# so a bare Series is no longer read as the x variable.
sns.boxplot(x=df['Age'], color='m')
plt.title('Box-Plot and Violin-Plot ')

plt.subplot(2,1,2)
sns.violinplot(x=df['Age'], color='m')

plt.show()

### Driving License feature

In [None]:
# Driving_License: percentage share (left) and counts split by Response (right).
plt.figure(figsize=(14,7))

plt.subplot(1,2,1)
# value_counts() orders slices by frequency; look the label up from each slice's own
# 0/1 index value so 'Yes'/'No' cannot end up on the wrong slice.
flag_names = {1: 'Yes', 0: 'No'}
license_counts = df['Driving_License'].value_counts()
license_labels = [flag_names.get(value, str(value)) for value in license_counts.index]
plt.pie(license_counts, explode=[0.025,0], autopct='%.1f%%', labels=license_labels, labeldistance=1.1)
plt.title('Percentage of customers with Driving License')

plt.subplot(1,2,2)
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Driving_License', hue='Response', palette='Paired')
plt.title('Customers having Driving License w.r.t Response')

plt.show()

### Region_Code feature

In [None]:
# Region_Code: density histogram with KDE (left) and box plot (right).
plt.figure(figsize=(15,4))

plt.subplot(1,2,1)
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented
# replacement. bins=50 mirrors the cap distplot applied to its automatic bin count.
sns.histplot(df['Region_Code'], kde=True, stat='density', bins=50)
plt.title('Distribution of Region Code variable')

plt.subplot(1,2,2)
# Pass the column as x=...: from seaborn 0.12 the only positional argument is `data`.
sns.boxplot(x=df['Region_Code'], color='c')
plt.title('Box-Plot of Region Code')

plt.show()

In [None]:
# Region_Code counts in bins of width 5: overall (left) and split by Response (right)
fig, (ax_overall, ax_by_response) = plt.subplots(1, 2, figsize=(15,4))

sns.histplot(df['Region_Code'], binwidth=5, ax=ax_overall)
ax_overall.set_title('Count of Region Code group')

sns.histplot(data=df, x='Region_Code', hue='Response', binwidth=5, ax=ax_by_response)
ax_by_response.set_title('Count of Region Code group w.r.t Response')

plt.show()

### Previously Insured Feature

In [None]:
# Previously_Insured: percentage share (left) and counts split by Response (right).
plt.figure(figsize=(14,7))

plt.subplot(1,2,1)
# value_counts() orders slices by frequency, not by value. A fixed ['Yes', 'No'] list
# therefore mislabels the chart whenever 0 is the more common value, so each label is
# derived from the slice's own 0/1 index value.
flag_names = {1: 'Yes', 0: 'No'}
insured_counts = df['Previously_Insured'].value_counts()
insured_labels = [flag_names.get(value, str(value)) for value in insured_counts.index]
plt.pie(insured_counts, explode=[0.025,0], autopct='%.1f%%', labels=insured_labels, labeldistance=1.1)
plt.title('Percentage of customers who were Previously Insured')

plt.subplot(1,2,2)
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Previously_Insured', hue='Response', palette='Paired')
plt.title('Customers who were Previously Insured w.r.t Response')

plt.show()

In [None]:
# Vehicle_Age: percentage share (left) and counts split by Response (right).
plt.figure(figsize=(14,7))

plt.subplots_adjust(wspace=0.3)

plt.subplot(1,2,1)
# Compute the counts once; the index supplies labels in the same order as the slices
vehicle_age_counts = df['Vehicle_Age'].value_counts()
plt.pie(vehicle_age_counts, explode=[0.025,0.05, 0], colors=['blue', 'green', 'orange'], autopct='%.1f%%',
        labels=list(vehicle_age_counts.index), labeldistance=1.1)
plt.title('Overall Percentage of Vehicle age')

plt.subplot(1,2,2)
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Vehicle_Age', hue='Response', palette='Paired')
plt.title('Vehicle age w.r.t Response')


plt.show()

### Vehicle Damage feature

In [None]:
# Vehicle_Damage: raw counts (left) and counts split by Response (right).
plt.figure(figsize=(16,7))

plt.subplots_adjust(wspace=0.35)

plt.subplot(1,2,1)
plt.title('Count of Vehicles which are damaged')
# Name the column by keyword: from seaborn 0.12 only `data` may be passed positionally.
sns.countplot(data=df, x='Vehicle_Damage')

plt.subplot(1,2,2)
sns.countplot(data=df, x='Vehicle_Damage', hue='Response', palette='Paired')
plt.title('Total count of vehicles damaged w.r.t Response')

plt.show()

### Annual Premium feature

In [None]:
# Annual_Premium: density histogram with KDE, then box plot and violin plot.
plt.figure(figsize=(15,4))

# sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented
# replacement. bins=50 mirrors the cap distplot applied to its automatic bin count.
sns.histplot(df['Annual_Premium'], kde=True, stat='density', bins=50)
# The title previously read 'Distribution of Age variable' (copied from the Age cell)
plt.title('Distribution of Annual Premium variable')

plt.show()

plt.figure(figsize=(14,10))

plt.subplot(2,1,1)
# Pass the column as x=...: from seaborn 0.12 the only positional argument is `data`.
sns.boxplot(x=df['Annual_Premium'], color='m')
plt.title('Box-Plot and Violin-Plot ')

plt.subplot(2,1,2)
sns.violinplot(x=df['Annual_Premium'], color='m')

plt.show()

### Policy Sales Channel feature

In [None]:
# Policy_Sales_Channel: density histogram with KDE (left) and box plot (right).
plt.figure(figsize=(15,4))

plt.subplot(1,2,1)
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented
# replacement. bins=50 mirrors the cap distplot applied to its automatic bin count.
sns.histplot(df['Policy_Sales_Channel'], kde=True, stat='density', bins=50)
plt.title('Distribution of Policy Sales Channel variable')

plt.subplot(1,2,2)
# Pass the column as x=...: from seaborn 0.12 the only positional argument is `data`.
sns.boxplot(x=df['Policy_Sales_Channel'], color='c')
plt.title('Box-Plot of Policy Sales Channel')

plt.show()

In [None]:
# Policy_Sales_Channel counts in bins of width 5: overall (left) and split by Response (right)
fig, (ax_overall, ax_by_response) = plt.subplots(1, 2, figsize=(15,4))

sns.histplot(df['Policy_Sales_Channel'], binwidth=5, ax=ax_overall)
ax_overall.set_title('Count of Policy Sales Channel')

sns.histplot(data=df, x='Policy_Sales_Channel', hue='Response', binwidth=5, ax=ax_by_response)
ax_by_response.set_title('Count of Policy Sales Channel w.r.t Response')

plt.show()

### Vintage feature

In [None]:
# Vintage: density histogram with KDE (top) and box plot (bottom).
plt.figure(figsize=(14,10))

plt.subplot(2,1,1)
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented
# replacement. bins=50 mirrors the cap distplot applied to its automatic bin count.
sns.histplot(df['Vintage'], kde=True, stat='density', bins=50)
# The title previously read 'Distribution of Age variable' (copied from the Age cell)
plt.title('Distribution of Vintage variable')

plt.subplot(2,1,2)
# Pass the column as x=...: from seaborn 0.12 the only positional argument is `data`.
sns.boxplot(x=df['Vintage'], color='m')
plt.title('Box-Plot')

plt.show()

### Age vs Gender with respect to Response

In [None]:
# Age in 5-year bins, coloured by Gender: responders (left) vs non-responders (right)
fig, (ax_responded, ax_not_responded) = plt.subplots(1, 2, figsize=(12,6))
fig.subplots_adjust(wspace=0.5)

sns.histplot(data=res_yes, x='Age', hue='Gender', binwidth=5, ax=ax_responded)
ax_responded.set_title('Count of Male and Female who have responded')

sns.histplot(data=res_no, x='Age', hue='Gender', binwidth=5, ax=ax_not_responded)
ax_not_responded.set_title('Count of Male and Female who have not responded')
plt.show()

### Region Code vs Gender with respect to Response

In [None]:
# Region_Code in bins of width 5, coloured by Gender: responders (left) vs non-responders (right)
fig, (ax_responded, ax_not_responded) = plt.subplots(1, 2, figsize=(12,6))
fig.subplots_adjust(wspace=0.5)

sns.histplot(data=res_yes, x='Region_Code', hue='Gender', binwidth=5, ax=ax_responded)
ax_responded.set_title('Count of Male and Female who have responded')

sns.histplot(data=res_no, x='Region_Code', hue='Gender', binwidth=5, ax=ax_not_responded)
ax_not_responded.set_title('Count of Male and Female who have not responded')
plt.show()

### Count of Previously Insured vs Gender with respect to Response

In [None]:
# Cross-tabulate Gender (rows) against Previously_Insured (columns) and draw grouped bars
gender_by_insured = pd.crosstab(index=df['Gender'], columns=df['Previously_Insured'])
ax = gender_by_insured.plot.bar()
ax.set_title('Male and Female who have previously insured')
plt.show()

### Count of Vehicle Age vs Gender with respect to Response

In [None]:
# Cross-tabulate Gender (rows) against Vehicle_Age (columns); order the columns by the
# 'Female' row, largest first, before drawing grouped bars
gender_by_vehicle_age = pd.crosstab(index=df['Gender'], columns=df['Vehicle_Age'])
ordered_table = gender_by_vehicle_age.sort_values(by='Female', axis=1, ascending=False)
ax = ordered_table.plot.bar()
ax.legend(loc='upper left')
ax.set_title('Male and Female customers with Vehicle Age')
plt.show()

### Correlation plot

# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
# - 'id' is excluded by name (errors='ignore' keeps the cell working once 'id' has
#   already been dropped), instead of by column position.
# - Only numeric columns are kept: pandas >= 2.0 raises when corr() meets string
#   columns, where older versions silently skipped them.
numeric_features = df.drop(columns='id', errors='ignore').select_dtypes(include='number')
cor = numeric_features.corr()

plt.figure(figsize=(10,8))
sns.heatmap(cor,linewidths=2,square=True, annot=True, cmap='YlGnBu')
plt.title('Corelation Heatmap')
plt.show()

# Data Pre-processing

### Removing insignificant variables that are not useful

In [None]:
# 'id' is only a row identifier and carries no signal, so the column is removed.
# Rebinding df (rather than dropping in place) together with errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError for the missing column.

df = df.drop(columns='id', errors='ignore')
df.head(2)

### Duplicate records

In [None]:
# Count duplicated rows; every repeat after the first occurrence is flagged True
df.duplicated(keep='first').value_counts()

# Author's observation: 269 records are duplicated. The first occurrence is kept and
# the remaining duplicates are deleted.
# The duplicates only became visible after the insignificant 'id' variable was removed.

# Drop the duplicate records, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')
df.head()

# Shape of the dataset after duplicate deletion
df.shape

### Null checking and imputation

In [None]:
# Number of missing values in each column
df.isna().sum()

# Author's observation: there are no null values

# Numerical columns cannot hold a 'blank' or '?' placeholder, so only the categorical
# columns are inspected: their unique values are printed to reveal any such placeholder.

col = df.select_dtypes(exclude='number')

for feature, values in col.items():
    print(f'\nUnique values in {feature} feature :')
    print(values.unique())

# Author's observation: there are no null values

### Outlier Analysis (IQR Method)

In [None]:
# From the EDA: only Annual Premium showed outliers; those in Age were negligible

# IQR fences for Annual Premium: values beyond 1.5 * IQR from the quartiles are outliers
premium = df['Annual_Premium']
q1, q3 = premium.quantile(0.25), premium.quantile(0.75)
iqr = q3 - q1
lower_limit, upper_limit = q1 - 1.5*iqr, q3 + 1.5*iqr

# Rows lying strictly below the lower fence or strictly above the upper fence
is_outlier = premium.lt(lower_limit) | premium.gt(upper_limit)
outlier_ap = df[is_outlier].shape
print('The shape of outliers in Annual Premium:', outlier_ap)

# Author's observation: there are 10331 outliers
percent = round(outlier_ap[0]/len(df) * 100,2)
print(f'Percentage of outliers : {percent}%')

# Author's observation: outliers are 2.71% of all records, so they are to be removed
# (this cell only measures them; no rows are dropped here)

In [None]:
# Check whether the 'Age' feature has any outliers.
# The column is passed as x=...: from seaborn 0.12 the only positional argument is `data`,
# so a bare Series is no longer read as the x variable.
sns.boxplot(x=df['Age'])
plt.title('Box-plot of AGE')
plt.show()

# Author's observation: Age has no significant outliers, so it is not considered for outlier treatment

## Further Scope
>#### Data processing
>#### Transformation
>#### Modelling
>#### Hyper-parameter tuning
>#### Recommendation

# I will upload one more notebook with all the ML models

   # ----------------------------Thank You-------------------------------