# 08 안전 운전자 예측
## 8.4 성능 개선 I: LightGBM 모델

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import early_stopping, log_evaluation

In [2]:
# Directory holding the competition CSV files. It keeps its trailing slash
# because file names are appended to it as plain strings.
data_path = '../../data/08_safe_driver/'

# Load every CSV with its `id` column as the index.
train_df, test_df, submission_df = (
    pd.read_csv(f'{data_path}{file_name}', index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

### 8.4.1 피처 엔지니어링

In [3]:
# Stack the train and test rows into one frame (fresh 0..n-1 index) so the
# feature transformations below see both sets; the label column is dropped
# so that only features remain.
all_data = pd.concat([train_df, test_df], ignore_index=True).drop(columns='target')
# Snapshot of the original feature names, taken before derived columns are added.
all_features = all_data.columns

In [4]:
# One-hot encode the nominal features
from sklearn.preprocessing import OneHotEncoder

# Nominal features are the columns whose name contains 'cat'.
cat_features = [name for name in all_features if 'cat' in name]

onehot_encoder = OneHotEncoder()
nominal_frame = all_data[cat_features]
encoded_cat_matrix = onehot_encoder.fit_transform(nominal_frame)

In [5]:
# Derived feature: number of missing values per row.
# A cell equal to -1 is counted as missing.
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

In [6]:
# Keep the original features that are neither nominal ('cat') nor 'calc'
# features, then add the derived missing-value count.
remaining_features = [
    name for name in all_features
    if not ('cat' in name or 'calc' in name)
]
remaining_features.append('num_missing')

In [7]:
## Combine every 'ind' feature into a single string key per row
ind_features = [feature for feature in all_features if 'ind' in feature]

# Seed the column with an empty string so each feature is appended the same
# way (no special case for the first one). Every value is followed by '_',
# which yields keys such as '2_2_5_1_..._0_'.
all_data['mix_ind'] = ''
for ind_feature in ind_features:
    all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [8]:
# Preview the combined `mix_ind` key for the first ten rows
all_data.head(10)['mix_ind']

0    2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1     1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2    5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3     0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4     0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
5     5_1_4_0_0_0_0_0_1_0_0_0_0_0_6_1_0_0_
6     2_1_3_1_0_0_1_0_0_0_0_0_0_0_8_1_0_0_
7    5_1_4_0_0_1_0_0_0_0_0_0_0_0_13_1_0_0_
8     5_1_3_1_0_0_0_1_0_0_0_0_0_0_6_1_0_0_
9     1_1_2_0_0_0_1_0_0_0_0_0_0_0_4_0_0_1_
Name: mix_ind, dtype: object

In [14]:
# Number of rows for each distinct `mix_ind` combination
all_data['mix_ind'].value_counts()

mix_ind
0_2_1_0_0_1_0_0_0_0_0_0_0_0_7_1_0_0_     2992
0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_     2784
0_1_2_0_0_1_0_0_0_0_0_0_0_0_7_1_0_0_     2568
0_1_1_0_0_1_0_0_0_0_0_0_0_0_7_1_0_0_     2174
0_2_0_0_0_1_0_0_0_0_0_0_0_0_7_1_0_0_     2131
                                         ... 
0_2_0_0_6_0_0_0_1_0_0_0_0_0_4_0_1_0_        1
6_4_7_0_0_0_0_1_0_0_0_0_0_0_7_1_0_0_        1
3_4_8_1_2_0_0_0_1_0_0_0_0_0_12_1_0_0_       1
5_4_6_1_2_0_1_0_0_0_0_0_0_0_4_1_0_0_        1
5_1_8_0_0_0_1_0_0_0_1_0_0_1_3_0_0_1_        1
Name: count, Length: 143769, dtype: int64

In [9]:
## Add the per-value row count of each nominal feature as a new feature
# Example on one nominal feature: number of rows per distinct value
all_data['ps_ind_02_cat'].value_counts()

ps_ind_02_cat
 1    1079327
 2     309747
 3      70172
 4      28259
-1        523
Name: count, dtype: int64

In [10]:
# The same counts as a {value: count} dictionary
all_data['ps_ind_02_cat'].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [11]:
# For every nominal feature (and the combined `mix_ind` key), add a
# '<feature>_count' column holding how many rows share that row's value.
cat_count_features = []
for feature in cat_features + ['mix_ind']:
    count_feature = f'{feature}_count'
    # value_counts() is indexed by the distinct values, so Series.map looks
    # every row up in one pass instead of calling a Python lambda per row.
    all_data[count_feature] = all_data[feature].map(all_data[feature].value_counts())
    cat_count_features.append(count_feature)

## 지금까지 만든 피처
* encoded_cat_matrix
    * 원핫 인코딩된 명목형 피처
* remaining_features
    * 명목형 피처와 calc 분류의 피처를 제외한 피처들 + num_missing
* cat_count_features
    * mix_ind를 포함한 명목형 피처의 고유값별 개수 파생 피처


In [12]:
# Inspect the first rows, including the derived count columns
all_data.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,mix_ind_count
0,2,2,5,1,0,0,1,0,0,0,...,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6
1,1,1,7,0,0,0,0,1,0,0,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36
2,5,4,9,1,0,0,0,1,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24
3,0,1,2,0,0,1,0,0,0,0,...,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784
4,0,2,0,1,0,1,0,0,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258


In [15]:
# Remove the features that are not needed
drop_features = [
    'ps_ind_14',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_car_14',
]

# Select the retained and count features first, then drop the unwanted ones.
selected_columns = remaining_features + cat_count_features
all_data_remaining = all_data[selected_columns].drop(columns=drop_features)
all_data_remaining.head()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,...,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,mix_ind_count
0,2,5,0,1,0,0,11,0,1,0,...,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6
1,1,7,0,0,1,0,3,0,0,1,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36
2,5,9,0,0,1,0,12,1,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24
3,0,2,1,0,0,0,8,1,0,0,...,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784
4,0,0,1,0,0,0,9,1,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258


In [16]:
from scipy import sparse

# Place the dense numeric/count features next to the one-hot matrix and
# keep the combined matrix in CSR format.
remaining_sparse = sparse.csr_matrix(all_data_remaining)
all_data_sprs = sparse.hstack([remaining_sparse, encoded_cat_matrix], format='csr')

In [17]:
# Split the combined matrix back into its train and test parts:
# the first `num_train` rows are the training rows.
num_train = train_df.shape[0]

X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]
y = train_df['target'].values

In [23]:
def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of `y_pred` against `y_true`.

    Both arguments are NumPy arrays of identical shape (checked with an
    assertion). The score is the Gini coefficient obtained when the targets
    are ordered by the predictions, divided by the Gini coefficient obtained
    when they are ordered by the targets themselves (a perfect prediction).
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    # Values of the diagonal line: 1/n, 2/n, ..., 1
    diagonal = np.linspace(1/n_samples, 1, n_samples)

    def lorenz_gap(ordered_targets):
        # Lorenz curve of the ordered targets (array / scalar = array),
        # then the summed gap between the diagonal and that curve.
        lorenz = np.cumsum(ordered_targets) / np.sum(ordered_targets)
        return np.sum(diagonal - lorenz)

    # Gini coefficient for the targets ordered by predicted value
    gini_pred = lorenz_gap(y_true[y_pred.argsort()])
    # Gini coefficient for a perfect prediction
    gini_true = lorenz_gap(y_true[y_true.argsort()])

    # Normalized Gini coefficient
    return gini_pred / gini_true

### 8.4.2 하이퍼파라미터 최적화

In [18]:
# Prepare the datasets used during the hyperparameter search
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a fixed validation split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

bayes_dtrain = lgb.Dataset(data=X_train, label=y_train)
bayes_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

### 하이퍼파라미터 범위 설정
* 하이퍼파라미터 범위를 점점 좁히는 방법
* 다른 상위권 캐글러가 설정한 하이퍼파라미터를 참고하는 방법

In [19]:
# Search range (lower, upper) of every hyperparameter tuned by the optimizer
param_bounds = dict(
    num_leaves=(30, 40),
    lambda_l1=(0.7, 0.9),
    lambda_l2=(0.9, 1),
    feature_fraction=(0.6, 0.7),
    bagging_fraction=(0.6, 0.9),
    min_child_samples=(6, 10),
    min_child_weight=(10, 40),
)

# Parameters that keep the same value in every trial
fixed_params = dict(
    objective='binary',
    learning_rate=0.005,
    bagging_freq=1,
    force_row_wise=True,
    random_state=1991,
)

In [24]:
# Gini metric in the form expected by LightGBM's `feval` argument
def gini(preds, dtrain):
    """Return ``(metric name, score, higher-is-better flag)`` for LightGBM.

    `dtrain` is the dataset being evaluated; its target values are read
    with `get_label()` and scored against `preds` using `eval_gini`.
    """
    score = eval_gini(dtrain.get_label(), preds)
    # The final True says that a larger score is better.
    return 'gini', score, True

In [20]:
# Objective function evaluated by the Bayesian optimizer
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, bagging_fraction, min_child_samples, min_child_weight):
    """Train LightGBM with the given hyperparameters and return the validation Gini.

    The optimizer passes every value as a float, so `num_leaves` and
    `min_child_samples` are rounded to integers here. The model is trained
    on `bayes_dtrain`, monitored on `bayes_dvalid` with early stopping after
    300 rounds, and scored on the module-level `X_valid` / `y_valid` split.
    """
    tuned_params = {
        'num_leaves': int(round(num_leaves)),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': min_child_weight,
        # NOTE(review): presumably needed because the same Dataset objects are
        # reused while min_child_samples changes between trials; confirm.
        'feature_pre_filter': False,
    }
    # Fixed parameters come last and win over any tuned key of the same name.
    params = {**tuned_params, **fixed_params}

    print(f'하이퍼파라미터: {params}')

    lgb_model = lgb.train(
        params=params,
        train_set=bayes_dtrain,
        num_boost_round=2500,
        valid_sets=bayes_dvalid,
        feval=gini,
        callbacks=[early_stopping(300)],
    )

    gini_score = eval_gini(y_valid, lgb_model.predict(X_valid))
    print(f'지니계수 : {gini_score}\n')

    return gini_score

In [21]:
# Set up the Bayesian optimizer
from bayes_opt import BayesianOptimization

# Maximizes eval_function over the ranges in param_bounds, with a fixed seed.
optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=0)

In [25]:
# Run the search: 3 initial points (init_points), then 6 optimization iterations (n_iter)
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 39, 'lambda_l1': 0.7766883037651555, 'lambda_l2': 0.9791725038082665, 'feature_fraction': 0.6963662760501029, 'bagging_fraction': 0.867531900234624, 'min_child_samples': 8, 'min_child_weight': 27.04133683281797, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}




[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2837380537005777

| [0m1        [0m | [0m0.2837   [0m | [0m0.8675   [0m | [0m0.6964   [0m | [0m0.7767   [0m | [0m0.9792   [0m | [0m8.116    [0m | [0m27.04    [0m | [0m39.26    [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7040436794880651, 'lambda_l2': 0.9832619845547939, 'feature_fraction': 0.608712929970154, 'bagging_fraction': 0.6213108174593661, 'min_child_samples': 9, 'min_child_weight': 36.10036444740457, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightG



[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2828993761731121

| [0m6        [0m | [0m0.2829   [0m | [0m0.8978   [0m | [0m0.6594   [0m | [0m0.8445   [0m | [0m0.9234   [0m | [0m8.619    [0m | [0m10.55    [0m | [0m30.09    [0m |
하이퍼파라미터: {'num_leaves': 37, 'lambda_l1': 0.7738449330497988, 'lambda_l2': 0.9032695189818599, 'feature_fraction': 0.6606341064409726, 'bagging_fraction': 0.7666713964943057, 'min_child_samples': 9, 'min_child_weight': 29.306172421380474, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in 



지니계수 : 0.28513273331754563

| [0m7        [0m | [0m0.2851   [0m | [0m0.7667   [0m | [0m0.6606   [0m | [0m0.7738   [0m | [0m0.9033   [0m | [0m8.769    [0m | [0m29.31    [0m | [0m36.6     [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7113567244294035, 'lambda_l2': 0.9992148463611682, 'feature_fraction': 0.6823972673568225, 'bagging_fraction': 0.6452323984860321, 'min_child_samples': 9, 'min_child_weight': 36.23198396337493, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091




지니계수 : 0.28549714593923864

| [0m8        [0m | [0m0.2855   [0m | [0m0.6452   [0m | [0m0.6824   [0m | [0m0.7114   [0m | [0m0.9992   [0m | [0m9.083    [0m | [0m36.23    [0m | [0m39.59    [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7, 'lambda_l2': 0.9, 'feature_fraction': 0.6, 'bagging_fraction': 0.6374163087819378, 'min_child_samples': 9, 'min_child_weight': 35.07917770932417, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2858822213198542

| [95m9        [0m | [95m0.2859   [0m | [95m0.6374   [0m | [95m0.6      [0m | [95m0.7      [0m | [95m0.9      [0m | [95m9.496    [0m | [95m35.08    [0m | [95m40.0     [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7, 'lambda_l2': 0.9, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'min_child_samples': 8, 'min_child_weight': 35.037970582965684, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217



지니계수 : 0.2848427308895299

| [0m10       [0m | [0m0.2848   [0m | [0m0.6      [0m | [0m0.6      [0m | [0m0.7      [0m | [0m0.9      [0m | [0m7.843    [0m | [0m35.04    [0m | [0m40.0     [0m |
하이퍼파라미터: {'num_leaves': 38, 'lambda_l1': 0.8397572814771797, 'lambda_l2': 0.9008140279489394, 'feature_fraction': 0.62583667466163, 'bagging_fraction': 0.8751015355558421, 'min_child_samples': 10, 'min_child_weight': 34.5260863487709, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.28435840552500996

| [0m11       [0m | [0m0.2844   [0m | [0m0.8751   [0m | [0m0.6258   [0m | [0m0.8398   [0m | [0m0.9008   [0m | [0m9.824    [0m | [0m34.53    [0m | [0m38.42    [0m |


In [26]:
# Check the result: hyperparameters of the best-scoring trial
max_params = optimizer.max['params']
max_params

{'bagging_fraction': 0.6374163087819378,
 'feature_fraction': 0.6,
 'lambda_l1': 0.7,
 'lambda_l2': 0.9,
 'min_child_samples': 9.4959504898169,
 'min_child_weight': 35.07917770932417,
 'num_leaves': 40.0}

In [27]:
# Convert the integer-valued hyperparameters, which the optimizer reports as floats
for int_param in ('num_leaves', 'min_child_samples'):
    max_params[int_param] = int(round(max_params[int_param]))

In [28]:
# Add the parameters whose values are fixed (not tuned)
max_params.update(fixed_params)

In [29]:
# Show the merged parameter set
max_params

{'bagging_fraction': 0.6374163087819378,
 'feature_fraction': 0.6,
 'lambda_l1': 0.7,
 'lambda_l2': 0.9,
 'min_child_samples': 9,
 'min_child_weight': 35.07917770932417,
 'num_leaves': 40,
 'objective': 'binary',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'force_row_wise': True,
 'random_state': 1991}

### 8.4.3 모델 훈련 및 성능 검증

In [30]:
# Train LightGBM in an out-of-fold (OOF) fashion
from sklearn.model_selection import StratifiedKFold

# Five stratified, shuffled folds with a fixed seed
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

In [31]:
# Out-of-fold predictions for the training rows, and test predictions
# averaged over the models of all folds.
oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)

    # Fold-specific names on purpose: eval_function reads the module-level
    # X_valid / y_valid hold-out split, so rebinding those names here would
    # make a later re-run of the optimizer score against the last fold.
    X_fold_train, y_fold_train = X[train_idx], y[train_idx]
    X_fold_valid, y_fold_valid = X[valid_idx], y[valid_idx]

    dtrain = lgb.Dataset(X_fold_train, y_fold_train)
    dvalid = lgb.Dataset(X_fold_valid, y_fold_valid)

    lgb_model = lgb.train(
        params=max_params,          # hyperparameters used for training
        train_set=dtrain,           # training dataset of this fold
        num_boost_round=2500,       # maximum number of boosting rounds
        valid_sets=dvalid,          # validation dataset used for evaluation
        feval=gini,                 # custom evaluation metric
        # early stopping after 300 rounds; log the scores every 100 rounds
        callbacks=[early_stopping(300), log_evaluation(100)],
    )

    # Each fold contributes 1/n_splits of the final test prediction.
    oof_test_preds += lgb_model.predict(X_test)/folds.n_splits
    # Every training row is predicted once, by the model that did not see it.
    oof_val_preds[valid_idx] += lgb_model.predict(X_fold_valid)

    gini_score = eval_gini(y_fold_valid, oof_val_preds[valid_idx])
    print(f'폴드 {idx+1} 지니계수 : {gini_score}\n')

######################################## 폴드 1 / 폴드 5 ########################################




[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.154243	valid_0's gini: 0.271218
[200]	valid_0's binary_logloss: 0.153179	valid_0's gini: 0.276022
[300]	valid_0's binary_logloss: 0.152584	valid_0's gini: 0.279766
[400]	valid_0's binary_logloss: 0.152219	valid_0's gini: 0.283476
[500]	valid_0's binary_logloss: 0.151986	valid_0's gini: 0.286633
[600]	valid_0's binary_logloss: 0.151829	valid_0's gini: 0.289077
[700]	valid_0's binary_logloss: 0.151717	valid_0's gini: 0.290887
[800]	valid_0's binary_logloss: 0.15163	valid_0's gini: 0.292622
[900]	valid_0's binary_logloss: 0.151566	valid_0's gini: 0.293937
[1000]	val

In [32]:
# Gini coefficient over the out-of-fold predictions of all training rows
print(f'OOF 검증 데이터 지니계수: {eval_gini(y, oof_val_preds)}')

OOF 검증 데이터 지니계수: 0.2889965811339316


In [33]:
# Store the fold-averaged test predictions and write the submission file
submission_df['target'] = oof_test_preds
submission_path = f'{data_path}submission_enhanced1.csv'
submission_df.to_csv(submission_path)