In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Introduction
이 노트북은 아래의 메인 섹션에 따라 구성되었습니다.  
1. Visual inspection of your data  
2. Defining the metadata  
3. Descriptive statistics  
4. Handling imbalanced classes  
5. Data quality checks  
6. Exploratory data visualization  
7. Feature engineering  
8. Feature selection  
9. Feature scaling

# Loading packages

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Raise the display limit to 100 columns so wide DataFrames are shown without column truncation.
pd.set_option('display.max_columns', 100)

Loading data

In [11]:
# Both CSVs live in the same Drive folder; keeping the folder in one constant
# means the location only has to be edited once if the data is moved.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/kaggle study/Porto Seguro’s Safe Driver Prediction/data'

# Training data (includes the `target` column) and test data.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

Data at first sight

다음은 대회에 대한 데이터 설명의 발췌입니다.  
  
* 비슷한 그룹에 속한 피처들은 피처 이름에 다음과 같은 태그가 지정됩니다. (e.g: ind, reg, car, calc).  
  
* **bin** 접미사가 포함된 피처 이름은 binary feature임을 뜻하며 **cat**은 categorical feature를 뜻합니다.  
  
* 이러한 명칭이 없는 피처들은 continuous나  ordinal입니다.  
  
* -1 값은 관측에서 누락된 피처임을 의미합니다.  
  
* **target** 컬럼은 해당 보험 계약자(policy holder)에 대해 보험금 청구(claim)가 접수되었는지 여부를 나타냅니다.

In [12]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0


In [13]:
# Preview the last five rows of the training data.
train.tail()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
595207,1488013,0,3,1,10,0,0,0,0,0,1,0,0,0,0,0,13,1,0,0,0.5,0.3,0.69282,10,1,-1,0,1,1,1,1,0,1,31,3,0.374166,0.684631,0.385487,2.645751,0.4,0.5,0.3,3,0,9,0,9,1,12,4,1,9,6,0,1,1,0,1,1
595208,1488016,0,5,1,3,0,0,0,0,0,1,0,0,0,0,0,6,1,0,0,0.9,0.7,1.382027,9,1,-1,0,-1,15,0,0,2,1,63,2,0.387298,0.972145,-1.0,3.605551,0.2,0.2,0.0,2,4,8,6,8,2,12,4,1,3,8,1,0,1,0,1,1
595209,1488017,0,1,1,10,0,0,1,0,0,0,0,0,0,0,0,12,1,0,0,0.9,0.2,0.659071,7,1,-1,0,-1,1,1,1,2,1,31,3,0.397492,0.596373,0.398748,1.732051,0.4,0.0,0.3,3,2,7,4,8,0,10,3,2,2,6,0,0,1,0,0,0
595210,1488021,0,5,2,3,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.9,0.4,0.698212,11,1,-1,0,-1,11,1,1,2,1,101,3,0.374166,0.764434,0.384968,3.162278,0.0,0.7,0.0,4,0,9,4,9,2,11,4,1,4,2,0,1,1,1,0,0
595211,1488027,0,0,1,8,0,0,1,0,0,0,0,0,0,0,0,7,1,0,0,0.1,0.2,-1.0,7,0,-1,0,-1,0,1,0,2,1,34,2,0.4,0.932649,0.378021,3.741657,0.4,0.0,0.5,2,3,10,4,10,2,5,4,4,3,8,0,1,0,0,0,0


우리는 다음을 확인할 수 있습니다.  
* binary variables  
* 범주 값이 정수인 categorical 변수
* 다른 변수들은 정수나 소수 값임
* -1을 가진 변수는 결측치를 나타냄
* target 변수 및 ID 변수

train data의 행과 컬럼 개수를 알아보겠습니다.

In [14]:
# (rows, columns) of the training data.
train.shape

(595212, 59)

59개의 변수와 595,212개의 행이 있습니다. test data와 같은 개수의 변수가 있는지 확인하겠습니다.  
training data에 중복된 행이 있는지 살펴보겠습니다.

In [15]:
# drop_duplicates() returns a new DataFrame and leaves `train` untouched, so the
# shape has to be read from its result. If this matches train.shape, the
# training data contains no duplicated rows.
train.drop_duplicates().shape

(595212, 59)

중복된 행이 없습니다. 좋습니다.

In [16]:
# (rows, columns) of the test data, for comparison with train.shape.
test.shape

(892816, 58)

test set에 하나의 변수가 없습니다. 하지만 이는 target 변수입니다. 따라서 양호합니다.  
얼마나 많은 변수 타입이 있는지 살펴보겠습니다.

나중에 14개의 categorical 변수에 대해 더미 변수를 만들 수 있습니다. bin 변수는 이미 binary이니 더미화를 할 필요가 없습니다.

In [17]:
# Print each column's dtype and non-null count for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              595212 non-null  int64  
 1   target          595212 non-null  int64  
 2   ps_ind_01       595212 non-null  int64  
 3   ps_ind_02_cat   595212 non-null  int64  
 4   ps_ind_03       595212 non-null  int64  
 5   ps_ind_04_cat   595212 non-null  int64  
 6   ps_ind_05_cat   595212 non-null  int64  
 7   ps_ind_06_bin   595212 non-null  int64  
 8   ps_ind_07_bin   595212 non-null  int64  
 9   ps_ind_08_bin   595212 non-null  int64  
 10  ps_ind_09_bin   595212 non-null  int64  
 11  ps_ind_10_bin   595212 non-null  int64  
 12  ps_ind_11_bin   595212 non-null  int64  
 13  ps_ind_12_bin   595212 non-null  int64  
 14  ps_ind_13_bin   595212 non-null  int64  
 15  ps_ind_14       595212 non-null  int64  
 16  ps_ind_15       595212 non-null  int64  
 17  ps_ind_16_

info() 메서드를 통해 데이터 타입이 정수나 소수인지 확인할 수 있습니다. info() 출력에는 null 값이 나타나지 않는데, 이는 결측치가 -1로 대체되어 있기 때문입니다. 결측치는 나중에 살펴볼 것입니다.

### Metadata
데이터 관리를 쉽게 하기 위해서 변수에 대한 메타 정보를 데이터프레임에 저장할 것입니다. 이는 분석이나 시각화, 모델링을 할 때 특정한 변수를 선택하는 데 도움이 될 것입니다.  
  
구체적으로 저장할 것들:  
  
* role: input, ID, target
* level: nominal, interval, ordinal, binary
* keep: True or False
* dtype: int, float, str

In [30]:
# Build one metadata record per column of `train`. The records are collected in
# `data` as dicts with the keys: varname, role, level, keep, dtype.
data = []

for f in train.columns:
  # Role: how the column is used (target, id, or model input).
  if f == 'target':
    role = 'target'
  elif f == 'id':
    role = 'id'
  else:
    role = 'input'

  # Level: measurement level of the column.
  # The `_bin` / `_cat` tags are suffixes of the column names, so match them as
  # suffixes rather than as substrings anywhere in the name.
  # Use the pandas dtype predicates instead of `dtype == float` / `dtype == int`,
  # which only match one specific width and depend on the platform's default int.
  col_dtype = train[f].dtype
  if f.endswith('_bin') or f == 'target':
    level = 'binary'
  elif f.endswith('_cat') or f == 'id':
    level = 'nominal'
  elif pd.api.types.is_float_dtype(col_dtype):
    level = 'interval'
  elif pd.api.types.is_integer_dtype(col_dtype):
    level = 'ordinal'
  else:
    # Fail loudly instead of silently reusing the level of the previous column.
    raise ValueError(f'Cannot derive a level for column {f!r} with dtype {col_dtype}')

  # Keep every variable except the id column.
  keep = f != 'id'

  # Defining the data type
  dtype = col_dtype

  # Creating a dict that contains all the metadata for the variable
  f_dict = {
      'varname': f,
      'role': role,
      'level': level,
      'keep': keep,
      'dtype': dtype
  }
  data.append(f_dict)

In [31]:
# Turn the per-column metadata records into a table indexed by variable name.
meta = pd.DataFrame(
    data,
    columns=['varname', 'role', 'level', 'keep', 'dtype'],
).set_index('varname')

In [32]:
# Display the metadata table (indexed by varname).
meta

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id,id,nominal,False,int64
target,target,binary,True,int64
ps_ind_01,input,ordinal,True,int64
ps_ind_02_cat,input,nominal,True,int64
ps_ind_03,input,ordinal,True,int64
ps_ind_04_cat,input,nominal,True,int64
ps_ind_05_cat,input,nominal,True,int64
ps_ind_06_bin,input,binary,True,int64
ps_ind_07_bin,input,binary,True,int64
ps_ind_08_bin,input,binary,True,int64


연습삼아 삭제되지 않은 명목 변수를 추출하겠습니다.

In [33]:
# Names of the nominal variables that are still flagged as kept.
meta.loc[(meta['level'] == 'nominal') & meta['keep']].index

Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object', name='varname')

아래는 역할(role)과 레벨(level)별 변수의 수를 보여줍니다.

In [44]:
 # Per (role, level) group, count the non-null entries of the remaining columns (keep, dtype).
 meta.groupby(['role', 'level']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,keep,dtype
role,level,Unnamed: 2_level_1,Unnamed: 3_level_1
id,nominal,1,1
input,binary,17,17
input,interval,10,10
input,nominal,14,14
input,ordinal,16,16
target,binary,1,1


In [43]:
 # Number of variables in each (role, level) group, returned as a Series.
 meta.groupby(['role', 'level']).size()

role    level   
id      nominal      1
input   binary      17
        interval    10
        nominal     14
        ordinal     16
target  binary       1
dtype: int64

In [38]:
# Number of variables per (role, level) pair as a flat table with a `count` column.
meta.groupby(['role', 'level']).size().reset_index(name='count')

Unnamed: 0,role,level,count
0,id,nominal,1
1,input,binary,17
2,input,interval,10
3,input,nominal,14
4,input,ordinal,16
5,target,binary,1


### 서술통계
데이터프레임에 describe 메서드를 적용할 수도 있습니다. 하지만 범주형 변수나 id에 대해서 mean, std 등을 계산하는 것은 의미가 없습니다. 범주형 변수는 나중에 시각적으로 탐색할 것입니다.  
  
meta 데이터프레임 덕분에 서술 통계를 계산하려는 변수를 쉽게 선택할 수 있습니다. 명확하게 하기 위해서 data type별로 나누어 수행하겠습니다.

### Interval variables

In [49]:
# Select the interval-level variables that are still flagged as kept, then summarise them.
# Missing values are encoded as -1 in this dataset, so they are included in these statistics.
v = meta.loc[(meta['level'] == 'interval') & meta['keep']].index
train.loc[:, v].describe()

Unnamed: 0,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,0.610991,0.439184,0.551102,0.379945,0.813265,0.276256,3.065899,0.449756,0.449589,0.449849
std,0.287643,0.404264,0.793506,0.058327,0.224588,0.357154,0.731366,0.287198,0.286893,0.287153
min,0.0,0.0,-1.0,-1.0,0.250619,-1.0,0.0,0.0,0.0,0.0
25%,0.4,0.2,0.525,0.316228,0.670867,0.333167,2.828427,0.2,0.2,0.2
50%,0.7,0.3,0.720677,0.374166,0.765811,0.368782,3.316625,0.5,0.4,0.5
75%,0.9,0.6,1.0,0.4,0.90619,0.396485,3.605551,0.7,0.7,0.7
max,0.9,1.8,4.037945,1.264911,3.720626,0.636396,3.741657,0.9,0.9,0.9
