In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Font sizes (in points) shared by every figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 12, 14

# Apply all defaults in one update of matplotlib's global rcParams.
plt.rcParams.update({
    'font.size': SMALL_SIZE,          # default text size
    'axes.titlesize': SMALL_SIZE,     # axes title
    'axes.labelsize': MEDIUM_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend text
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})

In [None]:
# Load the raw vehicle-insurance table from the Kaggle input directory.
DATA_PATH = "/kaggle/input/vehicle-insurance-data/VehicleInsuranceData.csv"
dataset = pd.read_csv(DATA_PATH)

# **Summary review**

In [None]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

### *Policyholder statistics*

Let's begin with general data about policyholders; it might be interesting.

In [None]:
# Columns describing the policyholder, plus customer lifetime value (clv).
policyholder_columns = [
    'Education',
    'EmploymentStatus',
    'Income',
    'Location.Code',
    'Marital.Status',
    'Number.of.Policies',
    'Gender',
    'clv',
]
policy = dataset[policyholder_columns]

How are a policyholder's *EmploymentStatus* and *Marital.Status* associated with *Number.of.Policies*?

In [None]:
# Mean number of policies per marital status, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=policy,
    x='Marital.Status',
    y='Number.of.Policies',
    hue='Gender',
    palette='YlGnBu',
    ax=ax,
)
ax.set_xlabel('Marital status')
ax.set_ylabel('Number of policies');

In [None]:
# Mean number of policies per employment status, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=policy,
    x='EmploymentStatus',
    y='Number.of.Policies',
    hue='Gender',
    palette='YlGnBu',
    ax=ax,
)
ax.set_xlabel('Employment status')
ax.set_ylabel('Number of policies');

Let's see how the *Income* variable depends on *Employment Status* in the insurance data.

In [None]:
# Mean income per employment status, split by gender.
# The title previously read "Income(Education)", a copy-paste leftover: this
# chart groups by EmploymentStatus, so the title now matches the x-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Income by employment status")
sns.barplot(
    data=policy,
    x='EmploymentStatus',
    y='Income',
    hue='Gender',
    palette='YlGnBu',
    ax=ax,
)
ax.set_xlabel('Employment status')
ax.set_ylabel('Income');

As you can see, *Income* hardly depends on the policyholder's *Education* for either *gender*, whereas for retired females the *Number of policies* is about 1.5 times lower than for retired males.

How are the *Income* and *Location* features related to *Education*? Let's figure it out for both genders.

In [None]:
# Mean income per education level, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=policy,
    x='Education',
    y='Income',
    hue='Gender',
    palette='YlGnBu',
    ax=ax,
)
ax.set_xlabel('Education')
ax.set_ylabel('Income ($ / year)');

## And how is the *Income* feature distributed for each *Employment Status* of policyholders?

In [None]:
# Joint KDE of clv vs. Income, restricted to rows with EmploymentStatus 'Employed'.
sns.jointplot(
    data=policy.query("EmploymentStatus == 'Employed'"),
    x='clv',
    y='Income',
    kind='kde',
);

In [None]:
# Joint KDE of clv vs. Income, restricted to rows with EmploymentStatus 'Retired'.
sns.jointplot(
    data=policy.query("EmploymentStatus == 'Retired'"),
    x='clv',
    y='Income',
    kind='kde',
);

In [None]:
# Joint KDE of clv vs. Income, restricted to rows with EmploymentStatus 'Medical Leave'.
sns.jointplot(
    data=policy.query("EmploymentStatus == 'Medical Leave'"),
    x='clv',
    y='Income',
    kind='kde',
);

In [None]:
# Joint KDE of clv vs. Income, restricted to rows with EmploymentStatus 'Disabled'.
sns.jointplot(
    data=policy.query("EmploymentStatus == 'Disabled'"),
    x='clv',
    y='Income',
    kind='kde',
);

In [None]:
# Joint KDE of clv vs. Income over all policyholders (no employment-status filter).
sns.jointplot(
    x='clv',
    y='Income',
    data=policy,
    kind='kde',
);

So we can assume that there are a few outliers in the *customer lifetime value* feature.  
 Values with *clv* over 15000 look suspicious owing to the long thin "tail".
 They might be outliers due to their low frequency for any *Employment Status* whatsoever. 
  Therefore, we are going to check it.

We will try a few different approaches in order to be more objective in defining outliers, because if we simply marked or dropped these suspicious values we could miss some crucial insights.

### 1. [Tukey's fences](https://en.wikipedia.org/wiki/Outlier#Tukey's_fences) approach

In [None]:
# Enlarge x tick labels and axis labels for this wide figure.
# These are global rcParams changes, so they stay in effect for later figures too.
plt.rcParams['xtick.labelsize'] = 15
plt.rcParams['axes.labelsize'] = 15

# Horizontal box plot of customer lifetime value (clv).
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(data=policy, x='clv', ax=ax);

In [None]:
# Tukey's upper fence: Q3 + 1.5 * IQR. Only values above the upper fence are
# counted here; the lower fence is not checked.
clv = policy['clv']
q1 = clv.quantile(.25)
q3 = clv.quantile(.75)
IQR = q3 - q1
upper_fence = q3 + IQR * 1.5
policy.loc[clv > upper_fence].shape[0]

There are at least 418 outlier *clv* points according to the Tukey's fences estimate.

### 2. [Median Absolute Deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) approach

In [None]:
# Median of clv; later cells measure absolute deviations from this value.
clv_values = policy['clv']
med = clv_values.median()

In [None]:
# Absolute deviation of every clv value from the median.
residuals = (policy['clv'] - med).abs()

In [None]:
# Median absolute deviation: the median of |clv - median(clv)|.
MAD = policy['clv'].sub(med).abs().median()

In [None]:
# Display the median absolute deviation of clv computed in the previous cell.
MAD

Wow! That was surprising. So MAD doesn't work at all for this data, owing to the long thin tail and skewness of the distribution.

### *Vehicle statistics*

In [None]:
# Keep only the two vehicle descriptors used by the charts that follow.
vehicle_columns = ['Vehicle.Size', 'Vehicle.Class']
vehicle = dataset.loc[:, vehicle_columns]

In [None]:
# Count of vehicles per size category, with bars in order of first appearance.
unique_sizes = vehicle['Vehicle.Size'].unique()
size_counts = vehicle['Vehicle.Size'].value_counts()[unique_sizes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Bar chart of vehicle sizes")
sns.barplot(x=unique_sizes, y=size_counts, ax=ax)
ax.set_xlabel('Vehicle size')
ax.set_ylabel('Count (in units)');

In [None]:
# Switch seaborn to the 'dark' style; set before the figure is created.
sns.set_style('dark')

# Count of vehicles per class, with bars in order of first appearance.
# The name `unique_sizes` is reused here, although it now holds class labels.
unique_sizes = vehicle['Vehicle.Class'].unique()
class_counts = vehicle['Vehicle.Class'].value_counts()[unique_sizes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Bar chart of vehicle classes")
sns.barplot(x=unique_sizes, y=class_counts, ax=ax)
ax.set_xlabel('Vehicle Class')
ax.set_ylabel('Count (in units)');

As you can see, most cars are ***Four-Door*** and ***Medsized***.

*Luxury SUV*, *Luxury Car* and *Sports Car* are not as popular as the other classes. Maybe it's due to their cost?

Now let's look at policy information.

### *Policy statistics*

In [None]:
# NOTE(review): the column list is empty, so this selects zero columns from `dataset`.
# It also rebinds `policy`, replacing the policyholder frame built earlier in the
# notebook; re-running earlier `policy`-based cells after this one would see this
# empty selection instead.
# TODO: fill in the intended policy columns (and consider a distinct variable name).
policy=dataset[[]]