In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings  
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the plotting calls below.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

* ***READ BOTH TRAIN AND TEST DATA***

In [None]:
# Locations of the two CSV files, relative to the notebook's working directory.
train_csv = '../input/health-insurance-cross-sell-prediction/train.csv'
test_csv = '../input/health-insurance-cross-sell-prediction/test.csv'

# Load both splits into DataFrames, then display the training frame.
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)
train_data


In [None]:
# Display the test DataFrame loaded in the previous cell.
test_data

> *LET'S CHECK WHETHER ANY INDEPENDENT VARIABLE CONTAINS NULL VALUES*

In [None]:
# Count the missing values in each column of the training data.
missing_mask = train_data.isna()
missing_mask.sum()

In [None]:
# Print (rows, columns) for the training frame first, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Look up the Vehicle_Age value stored under index label 0 and print its Python type.
first_vehicle_age = train_data.loc[0, 'Vehicle_Age']
print(type(first_vehicle_age))

# **UNIVARIATE ANALYSIS OF CATEGORICAL VARIABLES** 

In [None]:
# Number of vehicles in each age range.
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Vehicle_Age', data=train_data)

**THE PLOT SHOWS THAT MOST OF THE VEHICLES IN THIS STUDY ARE 1-2 YEARS OLD**

In [None]:
# Count the ids for every (Vehicle_Age, Response) pair and return the result as
# a flat table whose count column is named 'count'.
df = (
    train_data
    .groupby(['Vehicle_Age', 'Response'])['id']
    .count()
    .reset_index(name='count')
)
df

In [None]:
# Number of customers per Gender value.
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Gender', data=train_data)

In [None]:
# Number of customers per Driving_License value.
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Driving_License', data=train_data)

**ALMOST ALL PEOPLE HAVE A DRIVING LICENSE. THAT'S GOOD.**

In [None]:
# Number of customers per Previously_Insured value.
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Previously_Insured', data=train_data)

**THERE ARE MORE PEOPLE WITHOUT A PREVIOUS INSURANCE POLICY THAN WITH ONE**

In [None]:
# Number of customers per Vehicle_Damage value.
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Vehicle_Damage', data=train_data)

In [None]:
# Number of customers per Response value (the target column).
# The column is passed by keyword: newer seaborn releases take `data` as the
# first positional argument, so a bare Series is no longer read as `x`.
sns.countplot(x='Response', data=train_data)

**HERE WE GO... THIS DATASET IS HIGHLY IMBALANCED. WE WILL NEED TO TREAT IT BEFORE APPLYING A MACHINE LEARNING ALGORITHM
BECAUSE THIS IMBALANCE WILL LEAD TO POOR PERFORMANCE OF THE MODELS**

# UNIVARIATE ANALYSIS OF CONTINUOUS VARIABLES

In [None]:
# Histogram of Age with a KDE overlay on a density scale.
# `sns.distplot` is deprecated in seaborn (its warning is hidden by the blanket
# warnings filter near the top of the notebook), so `histplot` is used instead.
# `stat='density'` keeps the histogram and the KDE curve on the same scale.
sns.histplot(data=train_data, x='Age', kde=True, stat='density')

In [None]:
# Histogram of Region_Code with a KDE overlay on a density scale.
# `sns.distplot` is deprecated in seaborn (its warning is hidden by the blanket
# warnings filter near the top of the notebook), so `histplot` is used instead.
# `stat='density'` keeps the histogram and the KDE curve on the same scale.
sns.histplot(data=train_data, x='Region_Code', kde=True, stat='density')

In [None]:
# Filled KDE of Annual_Premium, drawn separately for each Response value.
premium_kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Annual_Premium", hue="Response", **premium_kde_style)

In [None]:
# Filled KDE of Policy_Sales_Channel, drawn separately for each Response value.
channel_kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Policy_Sales_Channel", hue="Response", **channel_kde_style)

In [None]:
# Filled KDE of Vintage, drawn separately for each Response value.
vintage_kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Vintage", hue="Response", **vintage_kde_style)

# CHECK FOR SKEWNESS OF THE CONTINUOUS VARIABLE


IF A COLUMN IN THE DATASET IS SKEWED, IT CAN AFFECT THE MODEL'S PERFORMANCE AND WE MIGHT END UP WITH WRONG PREDICTIONS. SKEWNESS IS A MEASURE OF THE ASYMMETRY OF THE PROBABILITY DISTRIBUTION OF A RANDOM VARIABLE ABOUT ITS MEAN. WE TAKE INTO ACCOUNT THE FOLLOWING POINTS:
1. If skewness is 0, the data are perfectly symmetrical.
2. If skewness is less than -1 or greater than 1, the distribution is highly skewed.
3. If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.
4. If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.

In [None]:
from scipy.stats import skew

# Print the skewness of each continuous column, one value per line, in this order.
continuous_columns = ('Age', 'Annual_Premium', 'Vintage', 'Policy_Sales_Channel', 'Region_Code')
for column in continuous_columns:
    print(skew(train_data[column]))

> SO HERE WE SEE THAT "AGE" IS MODERATELY SKEWED AND "ANNUAL PREMIUM" IS HIGHLY SKEWED, SO WE NEED TO TREAT THEM AS WELL. THE REST OF THE COLUMNS ARE NOT SKEWED.

**TREATMENT OF SKEWED COLUMNS**

**SKEWNESS CAN BE REMOVED USING VARIOUS METHODS. I HAVE APPLIED SQUARE ROOT TRANSFORM METHOD TO REMOVE IT**

In [None]:
# Replace Age with its square root, then print the skewness of the transformed values.
# NOTE: this overwrites the column, so running the cell again applies the
# square root a second time.
age_sqrt = np.sqrt(train_data["Age"])
train_data["Age"] = age_sqrt
print(skew(age_sqrt))

In [None]:
# Replace Annual_Premium with its square root, then print the skewness of the
# transformed values.
# NOTE: this overwrites the column, so running the cell again applies the
# square root a second time.
premium_sqrt = np.sqrt(train_data["Annual_Premium"])
train_data["Annual_Premium"] = premium_sqrt
print(skew(premium_sqrt))

In [None]:
# Apply the square-root transform to the same two columns of the test set.
# NOTE: the columns are overwritten, so this cell is not safe to run twice.
for column in ("Age", "Annual_Premium"):
    test_data[column] = np.sqrt(test_data[column])

# RELATIONSHIP BETWEEN VARIABLES

In [None]:
# Bar chart of the grouped counts held in `df`: Vehicle_Age on the x-axis,
# with one panel per Response value.
g = sns.catplot(
    data=df,
    kind="bar",
    x="Vehicle_Age",
    y="count",
    col="Response",
    height=4,
    aspect=.7,
);


From the graph we see that people with vehicles that are 1-2 years old have taken insurance, and that overall very few people have shown interest in taking insurance; most of them have not taken it.

**Now it is important to change the categorical variable names into numeric before proceeding to further analysis**

In [None]:
# Map the categorical text columns to numeric codes, using the same codes for
# the training and test frames.
# NOTE: for Vehicle_Damage, 'Yes' is coded as 0 and 'No' as 1.
vehicle_damage_codes = {'Yes': 0, 'No': 1}
gender_codes = {'Male': 0, 'Female': 1}

for frame in (train_data, test_data):
    for column, codes in (('Vehicle_Damage', vehicle_damage_codes), ('Gender', gender_codes)):
        # Skip columns that are already numeric. Mapping them again would turn
        # every value into NaN, because the numeric codes are not keys of the
        # mapping dictionaries.
        if not pd.api.types.is_numeric_dtype(frame[column]):
            frame[column] = frame[column].map(codes)

In [None]:
# Display the Gender column of the training data.
train_data['Gender']

In [None]:
# Bar plot of Age by Gender, with bars split by Response.
# Age was overwritten with its square root in the skewness-treatment cells.
sns.catplot(
    data=train_data,
    kind='bar',
    x='Gender',
    y='Age',
    hue='Response',
)

More females than males have taken insurance.

In [None]:
# Bar plot of Previously_Insured by Driving_License, with bars split by Gender.
sns.catplot(
    data=train_data,
    kind='bar',
    x='Driving_License',
    y='Previously_Insured',
    hue='Gender',
)

More females than males have a driving license.

In [None]:
# Bar plot of Previously_Insured by Driving_License, with bars split by Response.
sns.catplot(
    data=train_data,
    kind='bar',
    x='Driving_License',
    y='Previously_Insured',
    hue='Response',
)

People who were previously insured have shown interest in taking new insurance.

In [None]:
# Bar plot of Annual_Premium by Vehicle_Age, with bars split by Response.
# Annual_Premium was overwritten with its square root in the skewness-treatment cells.
sns.catplot(
    data=train_data,
    kind='bar',
    x='Vehicle_Age',
    y='Annual_Premium',
    hue='Response',
)

In [None]:
# Bar plot of Annual_Premium by Vehicle_Age, with bars split by Vehicle_Damage.
# Vehicle_Damage holds the numeric codes from the mapping cell (0 = 'Yes', 1 = 'No').
sns.catplot(
    data=train_data,
    kind='bar',
    x='Vehicle_Age',
    y='Annual_Premium',
    hue='Vehicle_Damage',
)

In [None]:
# Bar plot of Annual_Premium by Vehicle_Damage, with bars split by Response.
# Vehicle_Damage holds the numeric codes from the mapping cell (0 = 'Yes', 1 = 'No').
sns.catplot(
    data=train_data,
    kind='bar',
    x='Vehicle_Damage',
    y='Annual_Premium',
    hue='Response',
)

In [None]:
# Bar plot of Vehicle_Damage by Vehicle_Age, with bars split by Response.
# The y values are the numeric codes from the mapping cell (0 = 'Yes', 1 = 'No').
sns.catplot(
    data=train_data,
    kind='bar',
    x='Vehicle_Age',
    y='Vehicle_Damage',
    hue='Response',
)

In [None]:
# Bar plot of Annual_Premium by Vehicle_Damage, with bars split by Previously_Insured.
sns.catplot(
    data=train_data,
    kind='bar',
    x='Vehicle_Damage',
    y='Annual_Premium',
    hue='Previously_Insured',
)

In [None]:
# Box plot of Vintage for each Response value.
sns.catplot(
    data=train_data,
    kind="box",
    x='Response',
    y="Vintage",
)

In [None]:
# Box plot of Vintage by Vehicle_Damage, with boxes split by Response.
sns.catplot(
    data=train_data,
    kind='box',
    x='Vehicle_Damage',
    y='Vintage',
    hue='Response',
)

In [None]:
# Box plot of Vintage by Vehicle_Damage, with boxes split by Response.
# NOTE(review): this call has the same arguments as the one in the preceding cell.
sns.catplot(
    data=train_data,
    kind='box',
    x='Vehicle_Damage',
    y='Vintage',
    hue='Response',
)

In [None]:
# Line plot of Annual_Premium against Vintage.
# Annual_Premium was overwritten with its square root in the skewness-treatment cells.
sns.lineplot(
    data=train_data,
    x='Vintage',
    y='Annual_Premium',
    color='goldenrod',
)


**SO WHAT DOES THE DATA TELL US?**

**From the above analysis some important conclusions can be drawn:**
1. People having vehicles older than 2 years have to pay a higher annual premium, and that has led to a higher number of people from that category not taking insurance. We need to adjust the amount a little so that people from that category do not skip taking insurance.
2. People whose vehicles have been damaged tend to buy insurance more often than those whose vehicles have no damage.
3. Annual premium does not depend on how many days people have been associated with the company, so we can modify the premium policy so that the insurance company can attract more customers.