# Purpose and Observation

## Purpose:
    - Explore the distributions of the continuous variables





# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv


# Read Clean and Missing Values Treated Data

#### Moving ahead with merged dataset.
**Dataset used: Merged_clean_and_dropped — merged and cleaned, continuous variables imputed with the mean, remaining records with NaN dropped.**

In [None]:
# Load the modelling dataset and convert its RCD column to datetime values
# in a single chained expression.
df_merged = pd.read_csv('Dataset_model.csv').assign(
    RCD=lambda frame: pd.to_datetime(frame["RCD"])
)



# Descriptive stats for reference

In [None]:
# Show the first rows, the summary statistics and the column overview.
# Both display options are handed to option_context so that they are restored
# when the block exits; assigning pd.options.display.float_format inside the
# block would leave the float format changed for the rest of the session.
with pd.option_context('display.max_columns', None,
                       'display.float_format', "{:.2f}".format):
    display(df_merged.head())
    display(df_merged.describe())
    # info() prints its report itself and returns None, so wrapping it in
    # display() would only add a stray "None" to the output.
    df_merged.info()

# EDA

### Variables and count

In [None]:
# Print every column label of the merged frame, then the number of columns.
print('All variables:', df_merged.columns)
print('Total number of variables: ', df_merged.shape[1])


### Variables explored to check impact on target variable
- Variables to explore (32)
    - Amounts (4) : Owner_salary, afyp, premium, sum_assured
    - Gender (2): Own_gender, LA_gender
    - Education (2): Own_Edu, Own_Education
    - Occupation (4):  Occ_Profile, Occupation_Group, own_occupation, Occupation
    - Internal categorization (2): risk_status, contract_type 
    - Product (5): Product_Description, Par_NonPar, Product_brief_Category, Product_Club_Manual, CUST_prod_cat
    - Location (5): city, DSTNAME, STATNAME, Focus_region, City_classification
    - Time (4): age, PPT, Policy_term, billing_frequency
    - Flags (3): channel_flag, Med_Flag, ECS_flag
    - Marital Status (1): Martial_status 
- Variables not suitable to be explored on:
    - Identifiers (2): policy_number, policy_owner_number
    - Date (1): RCD
    - Target variable itself and frequency (2): Freq, target

### Binning Continuous Variables

## Percentage of 1s and 0s in target variable

In [None]:
# Count how often each target class occurs and keep the counts in a
# one-column DataFrame whose column is explicitly named 'target'.
# The name is set by hand because the default name of the value_counts()
# result is not the same across pandas versions (newer releases call it
# 'count'), and the pie-chart cell looks the column up as 'target'.
target_vals = df_merged['target'].value_counts().rename('target').to_frame()

In [None]:
# Pie chart of the share of each target class, saved to Targetpie.png.
fig1, ax1 = plt.subplots()
# Take the counts by position instead of by column label: the label of the
# value_counts() column depends on the pandas version, the position does not.
class_counts = target_vals.iloc[:, 0]
# explode has two entries, so this call expects exactly two target classes;
# the second wedge is pulled slightly out of the pie.
ax1.pie(class_counts, labels=class_counts.index, explode=(0, 0.1),
        autopct='%1.1f%%', startangle=90)
ax1.axis('equal')
plt.savefig('Targetpie.png', bbox_inches='tight')
plt.show()

# Distributions

In [None]:
# Apply seaborn's 'whitegrid' style to the plots drawn after this cell.
sns.set_style(style='whitegrid')

### 1. age

In [None]:
# Count plot of the 'age' column on a 20x10 figure; the figure is written to
# age_distribution1.png before it is shown.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_merged, x='age')
plt.savefig('age_distribution1.png', bbox_inches='tight')
plt.show()

In [None]:
# Distribution plot of 'age' with explicit styling for the histogram bars
# (hist_kws) and the density curve (kde_kws); saved to age_distribution2.png.
sns.distplot(
    df_merged['age'],
    color='red',
    hist_kws=dict(linewidth=3, alpha=1, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('age_distribution2.png', bbox_inches='tight')
plt.show()

### 2. Policy_term

In [None]:
# Count plot of the 'Policy_term' column on a 20x10 figure; written to
# Policy_termdistribution1.png before it is shown.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_merged, x='Policy_term')
plt.savefig('Policy_termdistribution1.png', bbox_inches='tight')
plt.show()

In [None]:
# Distribution plot of 'Policy_term' with semi-transparent bars and a black
# density curve; saved to Policy_term_histogram.png.
sns.distplot(
    df_merged['Policy_term'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('Policy_term_histogram.png', bbox_inches='tight')
plt.show()

### 3. PPT

In [None]:
# Count plot of the 'PPT' column on a 20x10 figure; written to
# PPT_distribution1.png before it is shown.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_merged, x='PPT')
plt.savefig('PPT_distribution1.png', bbox_inches='tight')
plt.show()

In [None]:
# Distribution plot of 'PPT' with semi-transparent bars and a black density
# curve; saved to PPT_histogram.png.
sns.distplot(
    df_merged['PPT'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('PPT_histogram.png', bbox_inches='tight')
plt.show()

### 4. billing_frequency

In [None]:
# Count plot of the 'billing_frequency' column on a 20x10 figure; written to
# billing_frequency_distribution.png before it is shown.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_merged, x='billing_frequency')
plt.savefig('billing_frequency_distribution.png', bbox_inches='tight')
plt.show()

In [None]:
# Distribution plot of 'billing_frequency' with semi-transparent bars and a
# black density curve; saved to billing_frequency_histogram.png.
sns.distplot(
    df_merged['billing_frequency'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('billing_frequency_histogram.png', bbox_inches='tight')
plt.show()

## 5. Frequency

In [None]:
# Count plot of the 'Freq' column on a 20x10 figure; written to
# Freq_distribution.png before it is shown.
plt.figure(figsize=(20, 10))
sns.countplot(data=df_merged, x='Freq')
plt.savefig('Freq_distribution.png', bbox_inches='tight')
plt.show()

## 6. Amount distributions

Owner_salary, premium, afyp, sum_assured

In [None]:
# Prepare to plot the amount columns.
# For each amount column, collect the rows whose value lies above the 99th or
# below the 1st percentile, report how many there are, and build df_graph
# without any of those rows.


def _tail_labels(frame, column, lower_q=0.01, upper_q=0.99):
    """Return index labels of rows whose `column` value is outside the quantile band.

    Parameters:
        frame: DataFrame holding `column`.
        column: name of the column to inspect.
        lower_q: rows strictly below this quantile of the column are selected.
        upper_q: rows strictly above this quantile of the column are selected.

    Returns:
        A list of the distinct index labels of the selected rows.
    """
    values = frame[column]
    above = values > values.quantile(upper_q)
    below = values < values.quantile(lower_q)
    return frame.index[above | below].unique().tolist()


def _report_excluded(label, labels, total):
    """Print the number of `labels` and the percentage they are of `total`."""
    print('No of records to be excluded considering ' + label + ':', len(labels), '\n',
          'percentage:', 100*len(labels)/total)


l1 = _tail_labels(df_merged, 'Owner_salary')
_report_excluded('Owner_Salary', l1, len(df_merged))

l2 = _tail_labels(df_merged, 'afyp')
_report_excluded('afyp', l2, len(df_merged))

l3 = _tail_labels(df_merged, 'sum_assured')
_report_excluded('sum_assured', l3, len(df_merged))

l4 = _tail_labels(df_merged, 'premium')
_report_excluded('premium', l4, len(df_merged))

# A row is excluded as soon as it is an outlier in any one of the four columns.
lexclude = list(set(l1).union(l2, l3, l4))
_report_excluded('all (union)', lexclude, len(df_merged))

# Create a dataframe excluding these records.
# It is created only to plot the histograms of the amounts and the pair plot.
df_graph = df_merged.drop(lexclude, axis='index')

In [None]:
# Owner_salary: distribution plot on the trimmed frame (df_graph), with
# semi-transparent bars and a thinner (lw=2) black density curve; saved to
# Owner_salary_histogram.png.
sns.distplot(
    df_graph['Owner_salary'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=2),
)
plt.savefig('Owner_salary_histogram.png', bbox_inches='tight')
plt.show()

In [None]:
# sum_assured: distribution plot on the trimmed frame (df_graph), with
# semi-transparent bars and a black density curve; saved to
# sum_assured_histogram.png.
sns.distplot(
    df_graph['sum_assured'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('sum_assured_histogram.png', bbox_inches='tight')
plt.show()

In [None]:
# premium: distribution plot on the trimmed frame (df_graph), with
# semi-transparent bars and a black density curve; saved to
# premium_histogram.png.
sns.distplot(
    df_graph['premium'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('premium_histogram.png', bbox_inches='tight')
plt.show()

In [None]:
# afyp: distribution plot on the trimmed frame (df_graph), with
# semi-transparent bars and a black density curve; saved to afyp_histogram.png.
sns.distplot(
    df_graph['afyp'],
    hist_kws=dict(linewidth=3, alpha=0.5, color="c"),
    kde_kws=dict(color="k", lw=3),
)
plt.savefig('afyp_histogram.png', bbox_inches='tight')
plt.show()

## Pairplot

In [None]:
# Pairwise scatter plots of the numeric columns on the trimmed frame
# (df_graph), coloured by 'target' with one marker per target value and
# half-transparent points.
sns.pairplot(
    df_graph,
    hue='target',
    vars=[
        'premium', 'afyp', 'sum_assured', 'Owner_salary', 'age',
        'Policy_term', 'PPT', 'billing_frequency', 'Freq',
    ],
    kind="scatter",
    markers=['+', 'o'],
    plot_kws=dict(alpha=0.5),
)