# 1. About dataset

Porto Seguro is a Brazilian auto insurance company.     
The purpose of this contest is to predict the probability that a car owner will file an insurance claim next year.      
The training set has about 595 thousand rows (≈0.59 million) and the test set has about 893 thousand rows (≈0.89 million).

In [None]:
# import library
import matplotlib.pyplot as plt
import missingno as msno  # missing-value visualization
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer  # imputer used to fill in missing values
from sklearn.utils import shuffle


In [None]:
# Read the competition's train and test CSV files from the input directory.
data_path = "../input/porto-seguro-safe-driver-prediction/"
df_train, df_test = (
    pd.read_csv(data_path + csv_name) for csv_name in ("train.csv", "test.csv")
)

## dataset information
Features are grouped                      
**ind** - integer value mapping                      
**reg** - region                     
**car** - car                   
**calc** - real value                  
* **bin** is binary feature, **cat** is categorical feature.
* **Target value** is 1 if there has been a previous insurance claim, otherwise 0
* Missing (null) values are encoded as -1
* Binary variables
* Categorical variables are encoded as integers

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Print the (rows, columns) shape of the training data.
print(df_train.shape)

In [None]:
# List every column name of the training data.
df_train.columns.tolist()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Print the (rows, columns) shape of the test data.
print(df_test.shape)

In [None]:
# List every column name of the test data.
df_test.columns.tolist()

Only train data has target value

# 2. Data preprocessing

In [None]:
# Count NaN entries per column of the training data.
df_train.isnull().sum()

`isnull()` reports no missing values in the train data.  
That is because every missing value is encoded as -1, which `isnull()` does not detect.

In [None]:
# Missing values are encoded as -1 in this dataset, so convert them to NaN
# before plotting: missingno only treats NaN/None as missing and would
# otherwise draw every cell as present.
# Only the first 40 columns are plotted to limit memory use.
msno.matrix(df=df_train.iloc[:, :40].replace(-1, np.nan), figsize=(14, 10))

(Only the first 40 columns are visualized, to save memory.)

# 3. Metadata

Create a metadata DataFrame to make it easy to select groups of variables.
* role: input, id, or target
* level: nominal, interval, ordinal, or binary
* keep: whether the variable is kept (True) or discarded (False); initially only `id` is discarded
* dtype: the variable's data type (int, float, str)

In [None]:
# Collect one metadata record per column; the lookup table is assembled once,
# after the loop, instead of being rebuilt on every iteration.
data = []

for f in df_train.columns:

    # Role of the column in the modelling task.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Storage type of the column.
    dtype = df_train[f].dtype

    # Measurement level: the column name decides first, then the dtype.
    # The pandas dtype predicates match any float/integer width, so the result
    # does not depend on what the platform's default `int` happens to be.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Fail loudly rather than silently reusing the previous column's level.
        raise ValueError(
            "Cannot infer a measurement level for column {!r} with dtype {}".format(f, dtype)
        )

    # Every column is kept except the row identifier.
    keep = f != 'id'

    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)

# Metadata table indexed by variable name.
meta = pd.DataFrame(
    data, columns=["varname", "role", "level", "keep", "dtype"]
).set_index("varname")

In [None]:
# Display the metadata table.
meta

## How to use metadata
We can use this metadata for data indexing.

In [None]:
# Example 1: names of the nominal variables that are still kept.
meta.loc[meta["keep"] & (meta["level"] == "nominal")].index

In [None]:
# ex 2. Number of variables for each (role, level) combination.
meta.groupby(["role", "level"])["role"].size()

# 4. EDA 

In [None]:
# Names of the kept interval-level variables.
Interval = meta.loc[meta["keep"] & (meta["level"] == "interval")].index

In [None]:
# Summary statistics of the columns selected in `Interval`.
df_train[Interval].describe()

In [None]:
# Names of the kept ordinal-level variables.
Ordinal = meta.loc[meta["keep"] & (meta["level"] == "ordinal")].index

In [None]:
# Summary statistics of the columns selected in `Ordinal`.
df_train[Ordinal].describe()

In [None]:
# Names of the kept binary-level variables.
Binary = meta.loc[meta["keep"] & (meta["level"] == "binary")].index

In [None]:
# Summary statistics of the columns selected in `Binary`.
df_train[Binary].describe()

# Imbalanced Class 
* Not claiming insurance (0) is much more common than claiming insurance (1).
* Imbalanced data is usually handled with undersampling or oversampling.
* Undersampling: since there are far more 0s than 1s, we balance the data by reducing the number of 0s.
* Oversampling: since there are far more 0s than 1s, we balance the data by increasing the number of 1s.

Because the dataset is large, processing it costs a lot of time and memory, so let's use undersampling.

In [None]:
# Pie chart of the target class counts, labelled with percentages.
target_counts = df_train['target'].value_counts()
target_counts.plot.pie(autopct = '%1.1f%%')
plt.title("Target PiePlot", size=20)

In [None]:
# Desired share of target=1 rows after undersampling.
desired_apriori = 0.10

# Row labels of each target class.
idx_0 = df_train[df_train["target"] == 0].index
idx_1 = df_train[df_train["target"] == 1].index

# Class sizes. Counting the index directly avoids building two throw-away
# copies of the frame just to take their length.
nb_0 = len(idx_0)
nb_1 = len(idx_1)

# Fraction of target=0 rows to keep so that target=1 rows make up
# `desired_apriori` of the result. It is capped at 1.0 because sampling without
# replacement cannot return more rows than exist; an uncapped rate above 1
# would make `shuffle` raise.
undersampling_rate = min(1.0, ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori))
undersampled_nb_0 = int(undersampling_rate*nb_0)
print("target=0에 대한 언더샘플링 비율: {}".format(undersampling_rate))
print("언더샘플링 전 target=0 레코드의 개수: {}".format(nb_0))
print("언더샘플링 후 target=0 레코드의 개수: {}".format(undersampled_nb_0))

# Draw the target=0 rows to keep; the fixed seed makes the draw reproducible.
undersampled_idx = shuffle(idx_0, random_state = 37, n_samples=undersampled_nb_0)

# Sampled target=0 rows followed by every target=1 row.
idx_list = list(undersampled_idx) + list(idx_1)

df_train = df_train.loc[idx_list].reset_index(drop=True)


## NULL value
It is important to understand what a null value means in this dataset.  
* Is it a truly missing value?
* Missing values can influence the target.
* If a value is 0, is it a null value or a true 0?
* Here, null values are actually encoded as -1.


In [None]:
# Report every column that contains the -1 missing-value marker.
vars_with_missing = []
n_rows = df_train.shape[0]

for f in df_train.columns:
    # Number of rows in this column that hold the marker.
    missings = int((df_train[f] == -1).sum())
    if missings <= 0:
        continue
    vars_with_missing.append(f)
    missings_perc = missings / n_rows
    print('Variable {}\t has {:>10} records\t ({:.2%})\t with missing values'.format(f, missings, missings_perc))

print()
print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))



* ps_car_03_cat and ps_car_05_cat have many null values, so they are dropped.
* ps_reg_03, ps_car_12, ps_car_14 - replace with the mean
* ps_car_11 - there is only one null value; replace with the mode

In [None]:
# Columns that are dropped rather than imputed.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and the drop is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=vars_to_drop, errors="ignore")

# metadata update
meta.loc[vars_to_drop, 'keep'] = False

# Imputers that treat -1 as the missing-value marker.
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')

# fit_transform returns an (n_rows, 1) array; ravel() flattens it to the
# one-dimensional shape a single column expects.
for col in ['ps_reg_03', 'ps_car_12', 'ps_car_14']:
    df_train[col] = mean_imp.fit_transform(df_train[[col]]).ravel()
df_train['ps_car_11'] = mode_imp.fit_transform(df_train[['ps_car_11']]).ravel()




# 5. visualization

## Categorical variables

In [None]:
# Names of the kept nominal variables.
Nominal = meta[(meta["level"] == 'nominal') & (meta["keep"])].index

for f in Nominal:
    # plt.subplots creates the figure and its axes in one call, so no separate
    # plt.figure() is needed (it would only leave an extra empty figure behind).
    fig, ax = plt.subplots(figsize=(20,10))
    ax.grid(axis="y", linestyle='--')

    # Mean of `target` per category value, highest first.
    cat_perc = (
        df_train[[f, 'target']]
        .groupby([f], as_index=False)
        .mean()
        .sort_values(by='target', ascending=False)
    )

    sns.barplot(ax=ax, x=f, y='target', palette = "Pastel1", edgecolor='black', linewidth=0.8, data = cat_perc, order= cat_perc[f])
    ax.set_ylabel('% target', fontsize=18)
    ax.set_xlabel(f, fontsize=18)
    ax.tick_params(axis='both', which = 'major', labelsize=18)
    plt.show()
    # Release the figure so that looping over many variables does not pile up open figures.
    plt.close(fig)

## Interval variables

In [None]:
def corr_heatmap(Interval):
    """Plot an annotated correlation heatmap for columns of `df_train`.

    Parameters
    ----------
    Interval : list-like of column labels
        Columns of the notebook-level `df_train` frame whose pairwise
        correlations are drawn. Any column selection may be passed.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Pairwise correlation matrix of the selected columns.
    corr_matrix = df_train[Interval].corr()

    # Diverging palette centred on zero correlation.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cmap=palette,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=5,
        annot=True,
        cbar_kws={"shrink": .75},
    )
    plt.show()
    
# Target plus every kept interval variable. `&` binds tighter than `|`, so the
# extra parentheses only spell out the grouping that already applies.
Interval = meta[
    (meta["role"] == "target")
    | ((meta["level"] == 'interval') & meta["keep"])
].index
corr_heatmap(Interval)

## Some variables are strongly correlated
* ps_reg_02 & ps_reg_03 (0.7)
* ps_car_12 & ps_car_13 (0.67)
* ps_car_12 & ps_car_14 (0.58)
* ps_car_13 & ps_car_15 (0.67)


## 1) ps_reg_02 & ps_reg_03

In [None]:
# Scatter plot with one regression line per target class.
sns.lmplot(
    data=df_train,
    x='ps_reg_02',
    y='ps_reg_03',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

## 2) ps_car_12 & ps_car_13

In [None]:
# Scatter plot with one regression line per target class.
sns.lmplot(
    data=df_train,
    x='ps_car_12',
    y='ps_car_13',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

## 3) ps_car_12 & ps_car_14

In [None]:
# Scatter plot with one regression line per target class.
sns.lmplot(
    data=df_train,
    x='ps_car_12',
    y='ps_car_14',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

## 4) ps_car_15 & ps_car_13

In [None]:
# Scatter plot with one regression line per target class.
sns.lmplot(
    data=df_train,
    x='ps_car_15',
    y='ps_car_13',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

## Ordinal variables

In [None]:
# Target plus every kept ordinal variable. `&` binds tighter than `|`, so the
# extra parentheses only spell out the grouping that already applies.
Ordinal = meta[
    (meta["role"] == "target")
    | ((meta["level"] == 'ordinal') & meta["keep"])
].index
corr_heatmap(Ordinal)