In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

In [9]:
# Normalized Gini coefficient (evaluation metric)
def Gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The labels are ordered twice, highest first: once by their own value
    (the ideal ranking) and once by the predicted score. A Gini value is
    computed from the cumulative label share of each ordering, and the
    prediction-based value is divided by the ideal one.

    Parameters
    ----------
    y_true : array-like exposing ``.shape``
        Ground-truth labels.
    y_pred : array-like exposing ``.shape``
        Predicted scores; must have the same shape as ``y_true``.

    Returns
    -------
    float
        ``G_pred / G_true``.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # The number of labels and predictions must match
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the labels, column 1 the predictions
    pairs = np.array([y_true, y_pred]).transpose()
    rank_by_label = pairs[:, 0].argsort()
    rank_by_score = pairs[:, 1].argsort()
    # argsort is ascending, so reverse the rows to get the labels highest-first
    labels_ideal = pairs[rank_by_label][::-1, 0]
    labels_ranked = pairs[rank_by_score][::-1, 0]

    # Lorenz curves: cumulative share of the label total along each ordering
    lorenz_ideal = np.cumsum(labels_ideal) * 1. / np.sum(labels_ideal)
    lorenz_ranked = np.cumsum(labels_ranked) * 1. / np.sum(labels_ranked)
    diagonal = np.linspace(1 / n_samples, 1, n_samples)

    # Gini values: summed gap between the diagonal and each Lorenz curve
    gini_ideal = np.sum(diagonal - lorenz_ideal)
    gini_ranked = np.sum(diagonal - lorenz_ranked)

    # Normalize by the value of the ideal ordering
    return gini_ranked * 1. / gini_ideal

# Custom evaluation function passed to LightGBM during training
def evalerror(preds, dtrain):
    """Score predictions with the normalized Gini coefficient.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Dataset whose labels the predictions are scored against.

    Returns
    -------
    tuple
        ``('gini', score, True)``: metric name, metric value, and a flag
        that is always ``True`` (presumably "higher is better" in
        LightGBM's feval convention - confirm against the LightGBM docs).
    """
    score = Gini(dtrain.get_label(), preds)
    return 'gini', score, True

In [2]:
# Load the training and test data.
# pop() returns the column and removes it from the frame in one step, so
# `train` / `test` end up holding feature columns only.
train = pd.read_csv("../../data/train.csv")
train_label = train.pop('target')
train_id = train.pop('id')

test = pd.read_csv("../../data/test.csv")
test_id = test.pop('id')

In [3]:
# Derived feature 01: per-row count of cells equal to -1
# (-1 is the value this notebook treats as "missing"), stored as float.
train['missing'] = train.eq(-1).sum(axis=1).astype(float)
test['missing'] = test.eq(-1).sum(axis=1).astype(float)

In [4]:
# Derived feature 02: row-wise sum of the binary columns.
# Columns are selected by the substring 'bin' in their name. 'bin_sum' is
# excluded explicitly: its own name contains 'bin', so re-running this cell
# would otherwise add the previous sum into the new one.
bin_features = [c for c in train.columns if 'bin' in c and c != 'bin_sum']
train['bin_sum'] = train[bin_features].sum(axis=1)
test['bin_sum'] = test[bin_features].sum(axis=1)

In [5]:
# Derived feature 03: columns to target-encode. They were chosen from a
# per-variable target-ratio analysis; the encoding itself is computed
# inside the cross-validation loop.
features = (
    # binary columns
    ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
     'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin']
    # categorical columns
    + ['ps_ind_04_cat', 'ps_ind_05_cat',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_11_cat']
    # remaining columns
    + ['ps_ind_01', 'ps_ind_03', 'ps_ind_15', 'ps_car_11']
)

In [6]:
# LightGBM model configuration.
# Parameter reference: https://lightgbm.readthedocs.io/en/latest/Parameters.html
num_boost_round = 10000  # upper bound on boosting rounds
params = dict(
    # --- Core parameters ---
    objective="binary",        # binary classification
    boosting_type="gbdt",      # gradient boosting decision tree (the default)
    learning_rate=0.1,         # shrinkage rate
    num_leaves=15,             # maximum number of leaves per tree
    # --- Learning control parameters ---
    max_bin=256,
    feature_fraction=0.6,
    verbosity=0,
    # NOTE(review): the LightGBM docs list drop_rate / max_drop as DART-only;
    # with boosting_type="gbdt" they appear to be unused - confirm.
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): per the LightGBM docs, row subsampling only takes effect
    # when bagging_freq > 0, which is not set here - confirm this is intended.
    subsample=0.9,
    seed=2018,
)

In [10]:
# Stratified 5-fold internal cross-validation with fold-wise target encoding.
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)  # random_state fixes the fold assignment
kf = kfold.split(train, train_label)

cv_train = np.zeros(len(train_label))  # out-of-fold predictions, one per training row
cv_pred = np.zeros(len(test_id))       # test predictions, summed over folds and averaged at the end
best_trees = []                        # best iteration found by early stopping in each fold
fold_scores = []                       # normalized Gini on each validation fold

for i, (train_fold, validate) in enumerate(kf):  # train_fold / validate: positional row indices of the fold
    # Split into training and validation parts. Explicit copies are taken
    # because new columns are added below; assigning into a slice of `train`
    # is what triggered pandas' SettingWithCopyWarning.
    X_train = train.iloc[train_fold, :].copy()
    X_validate = train.iloc[validate, :].copy()
    # The fold indices are positions, so select the labels positionally too
    # (plain [] indexing only works while the index is a default RangeIndex).
    label_train = train_label.iloc[train_fold]
    label_validate = train_label.iloc[validate]

    # Target encoding, fitted on this fold's training part only.
    for feature in features:
        # Mean target for every distinct value of the feature.
        target_mean = label_train.groupby(X_train[feature]).mean()
        enc_col = feature + '_target_enc'
        # Map the means onto train / validation / test; values not seen in
        # the training part get 0.
        X_train[enc_col] = X_train[feature].map(target_mean).fillna(0)
        X_validate[enc_col] = X_validate[feature].map(target_mean).fillna(0)
        # `test` is updated in place, so its encoding columns are overwritten
        # on every fold and always match the model trained in that fold.
        test[enc_col] = test[feature].map(target_mean).fillna(0)

    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

    # Train with early stopping driven by the normalized Gini (evalerror) on
    # the validation fold to find the best number of trees.
    # NOTE(review): newer LightGBM releases replace the verbose_eval /
    # early_stopping_rounds keyword arguments with callbacks - confirm
    # against the installed version before upgrading.
    bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100, early_stopping_rounds=100)
    best_trees.append(bst.best_iteration)

    # Accumulate the test predictions, using the best number of trees.
    cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
    # Out-of-fold predictions, explicitly using the same best iteration.
    valid_pred = bst.predict(X_validate, num_iteration=bst.best_iteration)
    cv_train[validate] += valid_pred

    # Report the score on this validation fold.
    score = Gini(label_validate, valid_pred)
    print(score)
    fold_scores.append(score)

# Average the accumulated test predictions over the folds.
cv_pred /= NFOLDS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.289057
[200]	valid_0's gini: 0.289507
Early stopping, best iteration is:
[128]	valid_0's gini: 0.290886
0.290885768895
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.263882
[200]	valid_0's gini: 0.265789
Early stopping, best iteration is:
[163]	valid_0's gini: 0.266903
0.266902573663
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.276838
[200]	valid_0's gini: 0.278797
Early stopping, best iteration is:
[192]	valid_0's gini: 0.27924
0.279239808758
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.28051
[200]	valid_0's gini: 0.280624
Early stopping, best iteration is:
[147]	valid_0's gini: 0.281718
0.281718470101
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's gini: 0.286562
[200]	valid_0's gini: 0.284692
Early stopping, best iteration is:
[117]	valid_0's gini: 0.286767
0.286767320145

In [11]:
# Report the cross-validation results: the overall out-of-fold Gini,
# the per-fold scores, and the best tree count per fold with its mean.
print("cv score:", Gini(train_label, cv_train), fold_scores, sep="\n")
print(best_trees, np.mean(best_trees))

cv score:
0.280973903402
[0.29088576889478723, 0.26690257366267167, 0.27923980875765608, 0.28171847010071399, 0.28676732014459044]
[128, 163, 192, 147, 117] 149.4


In [12]:
# Save the fold-averaged test predictions as a two-column CSV (id, target).
pd.DataFrame({'id': test_id}).assign(target=cv_pred).to_csv('../model/lgbm_baseline.csv', index=False)