### Loading packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Let pandas display up to 100 columns so that wide frames are not truncated
pd.set_option('display.max_columns', 100)

### Loading data

In [4]:
# Read the training data from a CSV file (relative path)
train = pd.read_csv('../PortoSeguro/train_reduce.csv')
# Loading of the test set is currently disabled
#test = pd.read_csv('../PortoSeguro/test.csv')

### Data at first sight

* Features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc).  
* Feature names include the postfix bin to indicate binary features and cat to indicate categorical features.  
* Features without these designations are either continuous or ordinal.  
* Values of -1 indicate that the feature was missing from the observation.  
* The target column signifies whether or not a claim was filed for that policy holder.   

* Binary features == 여부([0,1], [true, false])  
* Categorical(범주형) features   
  * 명목형 자료(Nominal data) == 성별(남/여), 혈액형(A/B/O/AB)  # 순서의 의미가 없다.
  * 순서형 자료(Ordinal data) == 효과(없음/조금있음/매우있음)   # 순서의 의미가 있다.
  
* Numerical(수치형) features  
  * 이산형 자료(Discrete data)   == 일정기간 동안의 발생횟수, 출산횟수  # 이산적인 값을 가짐 
  * 연속형 자료(Continuous data) == 신장, 체중, 혈압                    # 연속적인 값을 가짐

### We indeed see the following ###   
* binary variables
* categorical variables of which the category values are integers
* other variables with integer or float values
* variables with -1 representing missing values
* the target variable and an ID variable

In [29]:
# (number of rows, number of columns) of the training frame
train.shape

(1950, 59)

In [30]:
# DataFrame.drop_duplicates() returns a NEW frame and leaves `train` untouched,
# so the shape has to be read from its result rather than from `train` itself.
# If this matches the shape of `train` shown above, there are no fully
# duplicated rows. `train` is not modified, so the cell is safe to re-run.
train.drop_duplicates().shape

(1950, 59)

No duplicate rows, so that's fine

In [None]:
# Print a per-column summary (dtype and non-null count) of the training frame
train.info()

### Metadata ###  
To facilitate the data management, we'll store meta-information about the variables in a DataFrame.   
This will be helpful when we want to select specific variables for analysis, visualization, modeling, ...


Concretely we will store:

* role: input, ID, target
* level: nominal, interval, ordinal, binary
* keep: True or False
* dtype: int, float, str

In [45]:
# Build one metadata record per column of `train` (role, measurement level,
# keep flag, dtype) and collect the records in the `meta` DataFrame, indexed
# by variable name.
data = []
for f in train.columns:

    # Defining the role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Defining the level.
    # Name-based rules come first; the remaining columns are classified by dtype.
    # pandas' dtype predicates are used instead of `dtype == float` / `dtype == int`
    # so that every float / integer width is recognised, independent of what the
    # builtin `int` maps to on the current platform.
    col_dtype = train[f].dtype
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(col_dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(col_dtype):
        level = 'ordinal'
    else:
        # Without this branch `level` would be undefined for the first column,
        # or would silently keep the value assigned to the previous column.
        raise ValueError(
            "Cannot infer the level of column %r with dtype %s" % (f, col_dtype)
        )

    # Initialize keep to True for all variables except for id
    keep = f != 'id'

    # Defining the data type
    dtype = col_dtype

    # Creating a Dict that contains all the metadata for the variable
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)

# Assemble the records and index them by variable name (no in-place mutation,
# so re-running the cell always yields the same frame).
meta = pd.DataFrame(
    data, columns=['varname', 'role', 'level', 'keep', 'dtype']
).set_index('varname')

In [None]:
# Display the metadata table
meta

In [None]:
# Example lookup: names of all nominal variables that are flagged to be kept
meta.loc[(meta['level'] == 'nominal') & meta['keep']].index

In [49]:
# Number of variables per (role, level) combination, as a flat table
meta.groupby(['role', 'level']).size().reset_index(name='count')

Unnamed: 0,role,level,count
0,id,nominal,1
1,input,binary,17
2,input,interval,10
3,input,nominal,14
4,input,ordinal,16
5,target,binary,1


### Descriptive Statistics

#### Interval variables

In [54]:
# Names of the interval variables that are flagged to be kept
v = meta.loc[(meta['level'] == 'interval') & meta['keep']].index

In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) of the selected columns
train[v].describe()

Unnamed: 0,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03
count,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0
mean,0.611744,0.428359,0.539683,0.378948,0.811761,0.28015,3.064025,0.458564,0.454154,0.456359
std,0.285012,0.386391,0.791087,0.061781,0.236304,0.350502,0.739527,0.292116,0.285462,0.288444
min,0.0,0.0,-1.0,0.315278,0.339968,-1.0,0.0,0.0,0.0,0.0
25%,0.4,0.2,0.521416,0.316228,0.670928,0.337639,2.828427,0.2,0.2,0.2
50%,0.7,0.3,0.709753,0.374166,0.75922,0.368782,3.316625,0.5,0.5,0.5
75%,0.9,0.6,0.982662,0.4,0.897062,0.394335,3.605551,0.7,0.7,0.7
max,0.9,1.8,2.332649,1.264911,2.707275,0.565685,3.741657,0.9,0.9,0.9


##### reg variables  

* only ps_reg_03 has missing values
* the range (min to max) differs between the variables. We could apply scaling   
  (e.g. StandardScaler), but it depends on the classifier we will want to use.

##### car variables  
* only ps_car_14 has missing values (its minimum is -1); ps_car_12 and ps_car_15 have none in this sample  
* again, the range differs and we could apply scaling.  

##### calc variables  
* no missing values  
* this seems to be some kind of ratio as the maximum is 0.9
* all three calc variables have very similar distributions  

**Overall**, we can see that the range of the interval variables is rather small.   
Perhaps some transformation (e.g. log) has already been applied in order to anonymize the data?

#### Ordinal variables

In [59]:
# Names of the ordinal variables that are flagged to be kept
v = meta.loc[(meta['level'] == 'ordinal') & meta['keep']].index
# Summary statistics of those columns
train.loc[:, v].describe()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_car_11,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
count,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0
mean,1.822564,4.354359,0.010769,7.25641,2.368205,2.368718,1.895385,7.760513,3.002051,9.233846,2.346154,8.373333,5.393846,1.486154,2.852308,7.68
std,1.918371,2.705688,0.112743,3.553577,0.816589,1.132543,1.143536,1.31166,1.429728,1.445675,1.240944,2.871653,2.344773,1.202257,1.648337,2.805689
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,2.0,0.0,5.0,2.0,2.0,1.0,7.0,2.0,8.0,1.0,6.0,4.0,1.0,2.0,6.0
50%,1.0,4.0,0.0,7.0,3.0,2.0,2.0,8.0,3.0,9.0,2.0,8.0,5.0,1.0,3.0,7.0
75%,3.0,6.0,0.0,10.0,3.0,3.0,3.0,9.0,4.0,10.0,3.0,10.0,7.0,2.0,4.0,10.0
max,7.0,11.0,2.0,13.0,3.0,5.0,6.0,10.0,8.0,12.0,6.0,18.0,14.0,7.0,10.0,17.0


#### Binary variables

In [60]:
# Names of the binary variables that are flagged to be kept
v = meta.loc[(meta['level'] == 'binary') & meta['keep']].index
# Summary statistics of those columns; the mean of a 0/1 column is its share of ones
train.loc[:, v].describe()

Unnamed: 0,target,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0
mean,0.037949,0.41641,0.251282,0.146667,0.185641,0.0,0.002051,0.008718,0.0,0.666154,0.114872,0.15641,0.124615,0.614872,0.566667,0.292308,0.357436,0.153846
std,0.191121,0.49309,0.433862,0.353864,0.388916,0.0,0.045256,0.092986,0.0,0.471706,0.318949,0.363337,0.330367,0.48675,0.495663,0.45494,0.479368,0.360894
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


* The a priori claim rate (mean of `target`) in this train sample is 3.795%, which is **strongly imbalanced.**
* From the means we can conclude that for most variables the value is zero in most cases.

### Handling imbalanced classes

In [61]:
# Preview the first five rows of the training frame
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0
