# Binary classification : Tabular data

# 2nd level. Porto Seguro’s Safe Driver Prediction

- https://www.kaggle.com/aharless/xgboost-cv-lb-284

## Setting

gc
- https://blog.winterjung.dev/2018/02/18/python-gc
- python은 기본적으로 garbage collection과 reference counting을 통해 할당된 메모리를 관리
- gc module은 오로지 순환 참조를 탐지하고 해결하기 위해 존재
- gc module로 cyclic garbage collection을 직접 제어하고, 이를 통해 reference cycles를 해결
- gc 파이썬 공식문서에서도 순환 참조를 만들지 않는다고 확신할 수 있으면 gc.disable()을 통해 garbage collector를 비활성화시켜도 된다고 함

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

## Load Data

In [2]:
train = pd.read_csv('../datasets_for_practice/kaggle/safe_driver/train.csv')
test = pd.read_csv('../datasets_for_practice/kaggle/safe_driver/test.csv')

train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


### Extract columns' names

In [3]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]

# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [4]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [5]:
id_train = train['id'].values
id_test = test['id'].values
y = train['target']

### Make a new column

<b>print('\r')</b>
- 커서 위치 맨 앞으로 이동
- 출력 부분에서 'ps_reg_01_plus_ps_car_04_cat    2 in   0.1' 이 부분을 계속 update 함

In [6]:
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + '_plus_' + f2
    print('current feature %60s %4d in %5.1f'
         % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    
    train[name1] = train[f1].apply(lambda x: str(x)) + '_' + train[f2].apply(lambda x: str(x))
    test[name1] = test[f1].apply(lambda x: str(x)) + '_' + test[f2].apply(lambda x: str(x))
    
    # Label Encoder
    lbl = LabelEncoder()
    lbl.fit(list(train[name1].values) + list(test[name1].values))
    train[name1] = lbl.transform(list(train[name1].values))
    test[name1] = lbl.transform(list(test[name1].values))
    
    train_features.append(name1)  

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

### Select columns

In [7]:
X = train[train_features]
test = test[train_features]

### Category columns

In [8]:
f_cats = [f for f in X.columns if "_cat" in f]

### Predicted value

In [9]:
y_valid_pred = 0 * y
y_test_pred = 0

# XGBoost

## 1. Create a Model

I recommend initially setting <b>MAX_ROUNDS</b> fairly high and using <b>OPTIMIZE_ROUNDS</b> to get an idea of the appropriate number of rounds (which, in my judgment, should be close to the maximum value of best_ntree_limit among all folds, maybe even a bit higher if your model is adequately regularized...or alternatively, you could set <b>verbose=True</b> and look at the details to try to find a number of rounds that works well for all folds). <b>Then I would turn off OPTIMIZE_ROUNDS and set MAX_ROUNDS to the appropraite number of total rounds.</b>

In [10]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
np.random.seed(0)

- objective : training을 할 때, 해당 값을 최소화 하는 방향으로 training
- scale_pos_weight : ratio of number of negative class to the positive class

In [11]:
max_rounds = 400
optimize_rounds = False
learning_rate = 0.07
early_stopping_rounds = 50

model = XGBClassifier(n_estimators=max_rounds,
                      max_depth=4,
                      objective='binary:logistic',
                      learning_rate=learning_rate,
                      subsample=.8,
                      min_child_weight=6,
                      colsample_bytree=.8,
                      scale_pos_weight=1.6,
                      gamma=10,
                      reg_alpha=8,
                      reg_lambda=1.3,
                      )

## 2. Gini Function

<b>Numba : JIT (just-in-time) Compiler</b>
- https://gurujung.github.io/dev/numba_user_jit/
- 대부분의 무거운 작업 수행
- 가능한 많이 효율적인 CPU 및 GPU로 변환

<b>np.asarray</b>
- ndarray 데이터 형태인 경우, 입력이 복사되지 않은 상태로 반환
- array에서 copy를 False로 설정한 것과 같음

In [12]:
from numba import jit

@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    (ntrue, gini, delta, n) = (0, 0, 0, len(y_true))
    
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    
    gini = 1 - 2 * gini/(ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

## 3. Add Noise Function

In [13]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

## 4. Target Encode Function

- min_samples_leaf (int) : minimum samples to take category average into account
- smoothing (int) : smoothing effect to balance categorical average vs prior

In [14]:
def target_encode(trn_series=None, val_series=None, tst_series=None, target=None,
                  min_samples_leaf=1, smoothing=1, noise_level=0):
    
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    
    temp = pd.concat([trn_series, target], axis=1)
    
    # target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean','count'])
    
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))
    
    # Apply average function to al target data
    prior = target.mean()
    
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages['mean'] + smoothing
    averages.drop(['mean','count'], axis=1, inplace=True)
    
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index':target.name, target.name:'average'}),
        on=trn_series.name,
        how='left'
    )['average'].rename(trn_series.name + '_mean').fillna(prior)
    
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index':target.name, target.name:'average'}),
        on=val_series.name,
        how='left'
    )['average'].rename(trn_series.name + '_mean').fillna(prior)
    
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index':target.name, target.name:'average'}),
        on=tst_series.name,
        how='left'
    )['average'].rename(trn_series.name + '_mean').fillna(prior)
    
    ft_tst_series.index = tst_series.index
    
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)    

<b>'target_encode' 함수 작동 원리 reference</b>

In [15]:
s = X['ps_ind_05_cat']
print('s.name : ', s.name)

# target mean
a = pd.concat([s, y], axis=1)
avg = a.groupby(by=s.name)[y.name].agg(['mean', 'count'])
avg

s.name :  ps_ind_05_cat


Unnamed: 0_level_0,mean,count
ps_ind_05_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.083319,5809
0,0.033865,528009
1,0.047825,8322
2,0.075526,4184
3,0.042998,8233
4,0.052061,18344
5,0.048514,1649
6,0.059336,20662


In [16]:
smoothing = 1
min_samples_leaf = 400 
smoothing = 1 / (1 + np.exp(-(avg['count'] - min_samples_leaf) / smoothing))
smoothing

ps_ind_05_cat
-1    1.0
 0    1.0
 1    1.0
 2    1.0
 3    1.0
 4    1.0
 5    1.0
 6    1.0
Name: count, dtype: float64

## 5. Apply

In [18]:
for i, (train_idx, test_idx) in enumerate(kf.split(train)):
    X_train, X_val = X.iloc[train_idx, :].copy(), X.iloc[test_idx, :].copy()
    y_train, y_val = y.iloc[train_idx].copy(), y.iloc[test_idx]
    X_test = test.copy()
    print("\nFold ", i)
    
    # Encode data
    for f in f_cats:
        X_train[f + '_avg'], X_val[f + '_avg'], X_test[f + '_avg'] = target_encode(trn_series = X_train[f],
                                                                                   val_series = X_val[f],
                                                                                   tst_series = X_test[f],
                                                                                   target = y_train,
                                                                                   min_samples_leaf = 200,
                                                                                   smoothing = 10,
                                                                                   noise_level = 0
                                                                                   )
    
    if optimize_rounds:
        eval_set = [(X_val, y_val)]
        fit_model = model.fit(X_train, y_train,
                              eval_set=eval_set, eval_metric=gini_xgb,
                              early_stopping_rounds=early_stopping_rounds, verbose=False)
        print("Best N trees : ", model.best_ntree_limit)
        print("Best gini    : ", model.best_score)
    else:
        fit_model = model.fit(X_train, y_train)
        
    # Generate validation prediction for this fold
    pred = fit_model.predict_proba(X_val)[:,1]
    print("Gini = ", eval_gini(y_val, pred))
    y_valid_pred.iloc[test_idx] = pred
    
    # Accumulate test set prediction
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_val, y_train


Fold  0
Gini =  0.2893594639975673

Fold  1
Gini =  0.2757238051730051

Fold  2
Gini =  0.301326002967506

Fold  3
Gini =  0.28117757655654063

Fold  4
Gini =  0.2787517861326404


Average test set prediction

In [20]:
y_test_pred /= k
print("\nGini for full training set: ", eval_gini(y, y_valid_pred))


Gini for full training set:  0.285250787653606


<b>작동 원리 reference</b>

In [21]:
for i, (train_idx, test_idx) in enumerate(kf.split(train)):
    print(i, train_idx, test_idx)

0 [     1      3      4 ... 595208 595209 595210] [     0      2      6 ... 595201 595203 595211]
1 [     0      1      2 ... 595208 595209 595211] [    10     14     30 ... 595190 595206 595210]
2 [     0      1      2 ... 595209 595210 595211] [    11     16     17 ... 595200 595205 595208]
3 [     0      2      3 ... 595209 595210 595211] [     1      4      8 ... 595197 595199 595202]
4 [     0      1      2 ... 595208 595210 595211] [     3      5     13 ... 595204 595207 595209]


## 6. Save validation predictions for stacking/ensembling

In [22]:
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

## 7. Create submission file

In [23]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('xgb_submit.csv', float_format='%.6f', index=False)