# 1. 데이터 수집

In [1]:
import pandas as pd

# 데이터 경로
data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

train = pd.read_csv(data_path + 'train.csv', index_col='id')
test = pd.read_csv(data_path + 'test.csv', index_col='id')
submission = pd.read_csv(data_path + 'sample_submission.csv', index_col='id')

# 3. 데이터 전처리

데이터 합치기

In [2]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis=1) # 타깃값 제거

In [3]:
all_features = all_data.columns # 전체 피처
all_features

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

명목형 피처 원-핫 인코딩

In [4]:
from sklearn.preprocessing import OneHotEncoder

# 명목형 피처 추출
cat_features = [feature for feature in all_features if 'cat' in feature] 

onehot_encoder = OneHotEncoder() # 원-핫 인코더 객체 생성
# 인코딩
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features]) 

encoded_cat_matrix

<1488028x184 sparse matrix of type '<class 'numpy.float64'>'
	with 20832392 stored elements in Compressed Sparse Row format>

필요 없는 피처 제거

In [5]:
# 추가로 제거할 피처
drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin', 
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

# '1) 명목형 피처, 2) calc 분류의 피처, 3) 추가 제거할 피처'를 제외한 피처
remaining_features = [feature for feature in all_features 
                      if ('cat' not in feature and 
                          'calc' not in feature and 
                          feature not in drop_features)]

In [6]:
from scipy import sparse

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data[remaining_features]),
                               encoded_cat_matrix],
                              format='csr')

데이터 나누기

In [7]:
num_train = len(train) # 훈련 데이터 개수

# 훈련 데이터와 테스트 데이터 나누기
X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values

# 4. 모델링

정규화 지니계수 계산 함수

In [8]:
import numpy as np

def eval_gini(y_true, y_pred):
    # 실제값과 예측값의 크기가 같은지 확인 (값이 다르면 오류 발생)
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]                      # 데이터 개수
    L_mid = np.linspace(1 / n_samples, 1, n_samples) # 대각선 값

    # 1) 예측값에 대한 지니계수
    pred_order = y_true[y_pred.argsort()] # y_pred 크기순으로 y_true 값 정렬
    L_pred = np.cumsum(pred_order) / np.sum(pred_order) # 로렌츠 곡선
    G_pred = np.sum(L_mid - L_pred)       # 예측 값에 대한 지니계수

    # 2) 예측이 완벽할 때 지니계수
    true_order = y_true[y_true.argsort()] # y_true 크기순으로 y_true 값 정렬
    L_true = np.cumsum(true_order) / np.sum(true_order) # 로렌츠 곡선
    G_true = np.sum(L_mid - L_true)       # 예측이 완벽할 때 지니계수

    # 정규화된 지니계수
    return G_pred / G_true

In [9]:
# XGBoost용 지니함수 계산함수는 반환값이 2개(평가지표명, 평가점수)
# XGBoost용 gini() 함수
def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds)

xgb.DMatrix()로 베이지안 최적화용 데이터셋을 만들어 준다.

In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 8:2 비율로 훈련 데이터, 검증 데이터 분리 (베이지안 최적화 수행용)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2, 
                                                      random_state=0)
# 베이지안 최적화용 데이터셋
bayes_dtrain = xgb.DMatrix(X_train, y_train)
bayes_dvalid = xgb.DMatrix(X_valid, y_valid)

하이퍼파라미터 범위 설정

In [11]:
# 정석
# # 베이지안 최적화를 위한 하이퍼파라미터 범위
# param_bounds = {'max_depth': (4, 8),
#                 'subsample': (0.6, 0.9),
#                 'colsample_bytree': (0.7, 1.0),
#                 'min_child_weight': (5, 7),
#                 'gamma': (8, 11),
#                 'reg_alpha': (7, 9),
#                 'reg_lambda': (1.1, 1.5),
#                 'scale_pos_weight': (1.4, 1.6)}

# # 값이 고정된 하이퍼파라미터
# fixed_params = {'objective': 'binary:logistic',
#                 'learning_rate': 0.02,
#                 'random_state': 1991}

In [12]:
# 수정
# 베이지안 최적화를 위한 하이퍼파라미터 범위
param_bounds = {'max_depth': (5, 7),
                'subsample': (0.6, 0.8),
                'colsample_bytree': (0.8, 0.9),
                'min_child_weight': (6, 7),
                'gamma': (10, 11),
                'reg_alpha': (8, 9),
                'reg_lambda': (1.3, 1.4),
                'scale_pos_weight': (1.4, 1.5)}

# 값이 고정된 하이퍼파라미터
fixed_params = {'objective': 'binary:logistic',
                'learning_rate': 0.02,                                  .,;;0k0kkk0 kkkkk000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 , ,hkkh 00 
                'random_state': 1991}

베이지안 최적화용 평가지표 계산 함수 작성

In [13]:
import xgboost as xgb



def eval_function(max_depth, subsample, colsample_bytree, min_child_weight,
                 reg_alpha, gamma, reg_lambda, scale_pos_weight):
    '''최적화하려는 평가지표(지니계수) 계산 함수'''
    # 베이지안 최적화를 수행할 하이퍼파라미터
    params = {'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'min_child_weight': min_child_weight,
              'gamma': gamma,
              'reg_alpha':reg_alpha,
              'reg_lambda': reg_lambda,
              'scale_pos_weight': scale_pos_weight}
    # 값이 고정된 하이퍼파라미터도 추가
    params.update(fixed_params)

    print('하이퍼파라미터 :', params)    

    # XGBoost 모델 훈련
    
    xgb_model = xgb.train(params=params, 
                          dtrain=bayes_dtrain,
                          num_boost_round=2000,
                          evals=[(bayes_dvalid, 'bayes_dvalid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=False)

    best_iter = xgb_model.best_iteration # 최적 반복 횟수
    # 검증 데이터로 예측 수행
    preds = xgb_model.predict(bayes_dvalid, 
                              iteration_range=(0, best_iter))
    # 지니계수 계산
    gini_score = eval_gini(y_valid, preds)
    print(f'지니계수 : {gini_score}\n')

    return gini_score

최적화 수행 및 결과 확인

In [14]:
from bayes_opt import BayesianOptimization

# 베이지안 최적화 객체 생성
optimizer = BayesianOptimization(f=eval_function, 
                                 pbounds=param_bounds, 
                                 random_state=0)

# 베이지안 최적화 수행
optimizer.maximize(init_points=3, n_iter=6)

# 평가함수 점수가 최대일 때 하이퍼파라미터
max_params = optimizer.max['params']
print(max_params)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------
하이퍼파라미터 : {'max_depth': 6, 'subsample': 0.778354600156416, 'colsample_bytree': 0.8548813503927325, 'min_child_weight': 6.544883182996897, 'gamma': 10.71518936637242, 'reg_alpha': 8.423654799338905, 'reg_lambda': 1.3645894113066657, 'scale_pos_weight': 1.4437587211262692, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.2846225977005399

| [0m1        [0m | [0m0.2846   [0m | [0m0.8549   [0m | [0m10.72    [0m | [0m6.206    [0m | [0m6.545    [0m | [0m8.424    [0m | [0m1.365    [0m | [0m1.444    [0m | [0m0.7784   [0m |
하이퍼파라미터 : {'max_depth': 7, 'subsample': 0.6174258599403081, 'colsample_bytree': 0.896366276050103, 'min_child_weight': 6.528894919752904, 'gamma': 10.383441518825778, 'reg_alpha': 8.568044561093933, 'reg_lambda': 1.392559663829266, 'scale_pos_weight': 1.4071036058197885, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}
지니계수 : 0.28389589534334453

| [0m2        [0m | [0m0.2839   [0m | [0m0.8964   [0m | [0m10.38    [0m | [0m6.583    [0m | [0m6.529    [0m | [0m8.568    [0m | [0m1.393    [0m | [0m1.407    [0m | [0m0.6174   [0m |
하이퍼파라미터 : {'max_depth': 7, 'subsample': 0.7561058352572911, 'colsample_bytree': 0.8020218397440326, 'min_child_weight': 6.870012148246819, 'gamma': 10.832619845547939, 'reg_alpha': 8.978618342

  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.2825190196652289

| [0m4        [0m | [0m0.2825   [0m | [0m0.8614   [0m | [0m10.82    [0m | [0m6.419    [0m | [0m6.747    [0m | [0m8.776    [0m | [0m1.37     [0m | [0m1.412    [0m | [0m0.6668   [0m |
하이퍼파라미터 : {'max_depth': 6, 'subsample': 0.7690155783567096, 'colsample_bytree': 0.8765861935025867, 'min_child_weight': 6.624782214679624, 'gamma': 10.651878489167856, 'reg_alpha': 8.705756109918921, 'reg_lambda': 1.3811150020734444, 'scale_pos_weight': 1.4708403613290768, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.28493105508583977

| [95m5        [0m | [95m0.2849   [0m | [95m0.8766   [0m | [95m10.65    [0m | [95m6.405    [0m | [95m6.625    [0m | [95m8.706    [0m | [95m1.381    [0m | [95m1.471    [0m | [95m0.769    [0m |
하이퍼파라미터 : {'max_depth': 6, 'subsample': 0.6308412679379533, 'colsample_bytree': 0.83643064288649, 'min_child_weight': 6.539999638446021, 'gamma': 10.51797225886217, 'reg_alpha': 8.930198181129049, 'reg_lambda': 1.3762685005779836, 'scale_pos_weight': 1.4334272347669137, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.2839463068140855

| [0m6        [0m | [0m0.2839   [0m | [0m0.8364   [0m | [0m10.52    [0m | [0m6.266    [0m | [0m6.54     [0m | [0m8.93     [0m | [0m1.376    [0m | [0m1.433    [0m | [0m0.6308   [0m |
하이퍼파라미터 : {'max_depth': 6, 'subsample': 0.6620761291198162, 'colsample_bytree': 0.8605933910026189, 'min_child_weight': 6.996167101820936, 'gamma': 10.337808243365048, 'reg_alpha': 8.320429194969563, 'reg_lambda': 1.3640701478928907, 'scale_pos_weight': 1.4247231289657456, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.28342292034365724

| [0m7        [0m | [0m0.2834   [0m | [0m0.8606   [0m | [0m10.34    [0m | [0m6.464    [0m | [0m6.996    [0m | [0m8.32     [0m | [0m1.364    [0m | [0m1.425    [0m | [0m0.6621   [0m |
하이퍼파라미터 : {'max_depth': 5, 'subsample': 0.6174376278029394, 'colsample_bytree': 0.858771507354015, 'min_child_weight': 6.067661676778794, 'gamma': 10.131651500546607, 'reg_alpha': 8.975221523543922, 'reg_lambda': 1.3808912274792062, 'scale_pos_weight': 1.4608812586905784, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.28380397639308513

| [0m8        [0m | [0m0.2838   [0m | [0m0.8588   [0m | [0m10.13    [0m | [0m5.28     [0m | [0m6.068    [0m | [0m8.975    [0m | [0m1.381    [0m | [0m1.461    [0m | [0m0.6174   [0m |
하이퍼파라미터 : {'max_depth': 6, 'subsample': 0.7719811602033093, 'colsample_bytree': 0.886127374840979, 'min_child_weight': 6.5791525277017815, 'gamma': 10.435110925111482, 'reg_alpha': 8.600464217045577, 'reg_lambda': 1.3392759897023265, 'scale_pos_weight': 1.4350133466747808, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}


  "`feval` is deprecated, use `custom_metric` instead.  They have "


지니계수 : 0.28373820331350924

| [0m9        [0m | [0m0.2837   [0m | [0m0.8861   [0m | [0m10.44    [0m | [0m5.797    [0m | [0m6.579    [0m | [0m8.6      [0m | [0m1.339    [0m | [0m1.435    [0m | [0m0.772    [0m |
{'colsample_bytree': 0.8765861935025867, 'gamma': 10.651878489167856, 'max_depth': 6.4046373479391105, 'min_child_weight': 6.624782214679624, 'reg_alpha': 8.705756109918921, 'reg_lambda': 1.3811150020734444, 'scale_pos_weight': 1.4708403613290768, 'subsample': 0.7690155783567096}


In [15]:
# max_depth는 트리 깊이를 의미하므로 정수형이어야하니 정수형으로 바꾸고, 고정 하이퍼파라미터도 추가한다.
# 정수형 하이퍼파라미터 변환
max_params['max_depth'] = int(round(max_params['max_depth']))

# 값이 고정된 하이퍼파라미터 추가
max_params.update(fixed_params)
max_params

{'colsample_bytree': 0.8765861935025867,
 'gamma': 10.651878489167856,
 'max_depth': 6,
 'min_child_weight': 6.624782214679624,
 'reg_alpha': 8.705756109918921,
 'reg_lambda': 1.3811150020734444,
 'scale_pos_weight': 1.4708403613290768,
 'subsample': 0.7690155783567096,
 'objective': 'binary:logistic',
 'learning_rate': 0.02,
 'random_state': 1991}

모델 훈련 및 성능 검증

In [16]:
# OOF 방식을 이용해 XGBoost 모델을 훈련한다.

from sklearn.model_selection import StratifiedKFold

# 층화 K 폴드 교차 검증기 생성
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

# OOF 방식으로 훈련된 모델로 검증 데이터 타깃값을 예측한 확률을 담을 1차원 배열
oof_val_preds = np.zeros(X.shape[0]) 
# OOF 방식으로 훈련된 모델로 테스트 데이터 타깃값을 예측한 확률을 담을 1차원 배열
oof_test_preds = np.zeros(X_test.shape[0]) 

# OOF 방식으로 모델 훈련, 검증, 예측
for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    # 각 폴드를 구분하는 문구 출력
    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)

    # 훈련용 데이터, 검증용 데이터 설정
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # XGBoost 전용 데이터셋 생성 
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    dtest = xgb.DMatrix(X_test)
    # XGBoost 모델 훈련
    xgb_model = xgb.train(params=max_params, 
                          dtrain=dtrain,
                          num_boost_round=2000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=100)

    # 모델 성능이 가장 좋을 때의 부스팅 반복 횟수 저장
    best_iter = xgb_model.best_iteration
    # 테스트 데이터를 활용해 OOF 예측
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits

    # 모델 성능 평가를 위한 검증 데이터 타깃값 예측 
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))

    # 검증 데이터 예측 확률에 대한 정규화 지니계수
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'폴드 {idx+1} 지니계수 : {gini_score}\n')

######################################## 폴드 1 / 폴드 5 ########################################


  "`feval` is deprecated, use `custom_metric` instead.  They have "


[0]	valid-logloss:0.67671	valid-gini:0.16215
[100]	valid-logloss:0.19194	valid-gini:0.24637
[200]	valid-logloss:0.15849	valid-gini:0.27603
[300]	valid-logloss:0.15515	valid-gini:0.28673
[400]	valid-logloss:0.15462	valid-gini:0.29120
[500]	valid-logloss:0.15445	valid-gini:0.29397
[600]	valid-logloss:0.15440	valid-gini:0.29558
[700]	valid-logloss:0.15434	valid-gini:0.29680
[800]	valid-logloss:0.15435	valid-gini:0.29706
[900]	valid-logloss:0.15431	valid-gini:0.29766
[1000]	valid-logloss:0.15430	valid-gini:0.29821
[1100]	valid-logloss:0.15428	valid-gini:0.29841
[1200]	valid-logloss:0.15427	valid-gini:0.29875
[1300]	valid-logloss:0.15425	valid-gini:0.29905
[1400]	valid-logloss:0.15425	valid-gini:0.29931
[1500]	valid-logloss:0.15424	valid-gini:0.29944
[1600]	valid-logloss:0.15424	valid-gini:0.29970
[1700]	valid-logloss:0.15424	valid-gini:0.29972
[1800]	valid-logloss:0.15423	valid-gini:0.29968
[1900]	valid-logloss:0.15425	valid-gini:0.29959
[1907]	valid-logloss:0.15425	valid-gini:0.29959
폴드 1

In [17]:
print('OOF 검증 데이터 지니계수 :', eval_gini(y, oof_val_preds))

OOF 검증 데이터 지니계수 : 0.28878246656069356


In [22]:
submission['target'] = oof_test_preds
submission.to_csv('submission_01.csv')