출처 - https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution

In [None]:
%config Completer.use_jedi = False

In [None]:
# part of 2nd place solution: lightgbm model with private score 0.29124 and public lb score 0.28555
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
def Gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The rows are ranked by descending prediction and the cumulative share of
    the true target along that ranking (a Lorenz curve) is compared with the
    uniform diagonal. The same quantity computed for the ranking by the true
    target itself is used as the normalizer.

    Args:
        y_true: 1-D array-like of ground-truth targets (list, ndarray, Series).
        y_pred: 1-D array-like of scores, same shape as ``y_true``.

    Returns:
        ``G_pred / G_true``. A ranking as good as the true ordering gives 1.0
        and the exactly reversed ranking gives -1.0.

    Raises:
        AssertionError: if the two inputs do not have the same shape.

    Notes:
        Zero denominators are not guarded: if the targets sum to zero, or the
        true Gini area is zero, NumPy's division result (nan/inf) is returned.
        Tied values are ordered however the default ``argsort`` orders them.
    """
    # Coerce first so plain Python sequences are accepted as well; arrays and
    # Series pass through with unchanged values.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the targets, column 1 the predictions.
    arr = np.array([y_true, y_pred]).transpose()

    # Targets ordered from largest to smallest by their own value, and by the
    # prediction column respectively.
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # Lorenz curves: cumulative share of the target along each ordering.
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # Gini coefficients (summed gap between the diagonal and each curve).
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [None]:
# Notebook-level run configuration.
NROWS = 5000  # row cap passed as `nrows` to both pd.read_csv calls
cv_only = True  # gates the cross-validation body of the final multi-seed loop
save_cv = True  # not referenced anywhere else in the visible notebook
full_train = False  # not referenced anywhere else in the visible notebook

In [None]:
# A custom LightGBM metric function takes (predictions, Dataset) as arguments.
def evalerror(preds, dtrain):
    """Custom evaluation metric: normalized Gini on the Dataset's labels.

    Returns the tuple (metric name, metric value, higher-is-better flag).
    """
    return 'gini', Gini(dtrain.get_label(), preds), True

In [None]:
# Directory prefix that is string-concatenated with the CSV file names when loading.
path = "../input/porto-seguro-safe-driver-prediction/"

In [None]:
%%time
# Read at most NROWS rows from each CSV, then keep the id and target columns
# as separate Series for scoring and for writing the predictions later.
train = pd.read_csv(path + 'train.csv', nrows=NROWS)
test = pd.read_csv(path + 'test.csv', nrows=NROWS)
train_id, test_id = train['id'], test['id']
train_label = train['target']

In [None]:
# Number of CV folds; also the divisor used to average the per-fold test predictions.
NFOLDS = 5

In [None]:
# Shuffled, stratified splitter with a fixed random_state; the same object is
# used for every split() call in this notebook.
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

In [None]:
# Columns that are not model inputs, and the target as a NumPy array.
drop_feature = ['id', 'target']
y = train['target'].values

In [None]:
# Feature frame: the training data without the id and target columns.
X = train.drop(drop_feature, axis=1)

In [None]:
X.columns.tolist()

In [None]:
# Raw feature names, captured before any engineered column is added.
feature_names = X.columns.tolist()

In [None]:
# Split the feature names by naming convention.
# Categorical: the name contains 'cat' but not 'count'.
cat_features = [name for name in feature_names if 'cat' in name and 'count' not in name]
# Numeric: every name containing neither 'cat' nor 'calc'
# ('calc' columns are excluded because they were reported to carry no signal).
num_features = [name for name in feature_names if not ('cat' in name or 'calc' in name)]

# Feature engineering

CV와 LB 점수를 물어보는 이유
CV와 LB가 선형적으로 함께 움직이는 것이 이상적이다.
1. 이 Competition에서 Shake-up이 생길지 가늠해 본다. CV와 LB의 차이가 크고 불안정(unstable)하면 Shake-up이 발생할 여지가 있다.
2. 고수들의 CV, LB 값을 기준으로 삼아 본인 PC에서도 그 정도 점수까지 끌어올린다.


In [None]:
# Create features from the null / missing values of each column.
# The code treats the value -1 as the missing marker; this shows the boolean mask.
train == -1

In [None]:
(train==-1).sum(axis=1).value_counts()

In [None]:
# Per-row count of cells equal to -1, stored as float in a new 'missing'
# column of both frames and registered as a numeric feature.
for frame in (train, test):
    frame['missing'] = (frame == -1).sum(axis=1).astype(float)
num_features.append('missing')

In [None]:
num_features

In [None]:
# Label-encode the categorical columns.
# The encoder is fitted on train and test together: only NROWS rows are read
# from each file, so test can contain a category that never occurs in train,
# and LabelEncoder.transform raises ValueError on labels it has not seen.
# When both frames share the same category set, the resulting codes are the
# same as with a train-only fit.
for c in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([train[c], test[c]]))
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [None]:
train

In [None]:
# One-hot encoder fitted on the training categories only.
# handle_unknown='ignore' makes transform() encode a category that is absent
# from train as an all-zero group instead of raising ValueError, which matters
# because test is transformed with this train-fitted encoder.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train[cat_features])

In [None]:
# One-hot encode the categorical columns of both frames with the encoder fitted on train.
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [None]:
# Names of all raw features whose name contains 'ind'.
ind_features = [c for c in feature_names if 'ind' in c]

In [None]:
# Build 'new_ind': the string values of every 'ind' column joined into one
# key, each value followed by an underscore. The first column creates the
# new column; later columns are appended to it.
count = 0
for c in ind_features:
    train_part = train[c].astype(str) + '_'
    test_part = test[c].astype(str) + '_'
    if count:
        train['new_ind'] += train_part
        test['new_ind'] += test_part
        continue
    train['new_ind'] = train_part
    test['new_ind'] = test_part
    count += 1

In [None]:
train['new_ind'].value_counts().shape[0]

In [None]:
# The ind features were merged into one new categorical key.
# Discussion posts suggested the ind features are intertwined, so they were all combined.
# Frequency encoding...
cat_count_features = []
# Step-through helper: break on the first pass so that `c` holds the first
# column name for the inspection cells that follow.
for c in cat_features+['new_ind']:
    break;

In [None]:
c

In [None]:
train[c],test[c]

In [None]:
pd.concat([train[c],test[c]]).value_counts().to_dict() # replace a high-cardinality category (about 7000 values per the original note) with these counts

In [None]:
d = pd.concat([train[c],test[c]]).value_counts().to_dict() # value -> occurrence count over train and test combined; replaces the high-cardinality category (about 7000 values per the original note)

In [None]:
# Look up each value's combined train+test frequency (0 if the value is not in d).
train[c].apply(lambda x:d.get(x,0))

In [None]:
train[c]

In [None]:
# Frequency-encode every categorical column plus the combined 'new_ind' key:
# each value is replaced by how often it occurs in train and test together.
# The name list is rebuilt here so that re-running this cell cannot append the
# same '<col>_count' names twice (duplicates would select duplicated columns
# when the feature matrix is assembled).
cat_count_features = []
for c in cat_features + ['new_ind']:
    count_col = '%s_count' % c
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    train[count_col] = train[c].apply(lambda x: d.get(x, 0))
    test[count_col] = test[c].apply(lambda x: d.get(x, 0))
    cat_count_features.append(count_col)

In [None]:
cat_count_features

In [None]:
X_cat[0][0][0]

In [None]:
# Categorical columns are represented twice: frequency encoding + one-hot encoding.
# Each list pairs the dense numeric/count values with the one-hot matrix.
train_list = [train[num_features+cat_count_features].values,X_cat]
test_list = [test[num_features+cat_count_features].values,X_t_cat]

In [None]:
train_list # one-hot encoding produces so many zeros that Compressed Sparse Row storage is applied

In [None]:
len(train_list)

In [None]:
ssp.hstack(train_list)

In [None]:
ssp.hstack(train_list).tocsr() # CSR: Compressed Sparse Row 

In [None]:
X = ssp.hstack(train_list).tocsr() # stack all feature parts side by side into one sparse CSR matrix

In [None]:
X[0][0]

In [None]:
# Same horizontal stacking for the test rows.
X_test = ssp.hstack(test_list).tocsr()

# Model development

In [None]:
# Hyperparameter values used by the params dict and the lgbm.train calls.
learning_rate = 0.1
num_leaves = 15 # maximum number of leaves per tree
min_data_in_leaf = 2000 # minimum data left in a leaf; NOTE: this variable is never passed to params (params sets "min_child_samples": 10)
feature_fraction = 0.6 # share of features sampled for each tree
num_boost_round = 10000

In [None]:
# The meaning of each parameter should be understood before using it.
# Parameter tuning comes later, when squeezing out the last bit of score.
# NOTE(review): per the LightGBM parameter docs, "subsample" (bagging_fraction)
# only takes effect when a bagging frequency > 0 is also set — confirm.
params = {"objective": "binary",
          "boosting_type": "gbdt", # gradient boosting decision trees
          "learning_rate": learning_rate,
          "num_leaves": num_leaves,
          "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": -1, # controls whether LightGBM prints log output
          "drop_rate": 0.1, # dropout-like setting; per the original note it is not applied with GBDT
          "is_unbalance": False,
          "max_drop": 50, # per the original note, not used with GBDT
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }

In [None]:
# Accumulators summed over seeds: per-seed CV scores, out-of-fold train
# predictions and test predictions.
x_score = []
final_cv_train = np.zeros(len(train_label)) # out-of-fold predictions, usable for stacking
final_cv_pred = np.zeros(len(test_id))

In [None]:
final_cv_train

In [None]:
# Step-through helper: break immediately so that `s` is the first seed (0).
for s in np.arange(16): # s: the random seed
    break

In [None]:
s

In [None]:
# Per-seed buffers: out-of-fold predictions for the train rows and the summed
# per-fold predictions for the test rows.
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))

In [None]:
params['seed'] = s # important
# An ensemble works because its members are diverse.
# Ways to obtain diversity:
# vary the features...
# changing the random seed also provides diversity.

In [None]:
# split() returns a one-shot generator of (train indices, validation indices)
# pairs; every iteration over `kf` consumes it further.
kf = kfold.split(X, train_label)

In [None]:
# Per-fold bookkeeping: best boosting iteration and validation Gini.
best_trees = []
fold_scores = []

In [None]:
# Step-through helper: take only the first fold. This advances the `kf`
# generator by one split, so a later loop over the same `kf` starts at the
# second fold.
for i, (train_fold, validate) in enumerate(kf):
    break

In [None]:
train_fold

In [None]:
validate

In [None]:
# Slice the sparse feature matrix and the label Series with the fold's row indices.
X_train, X_validate = X[train_fold, :], X[validate, :]
label_train, label_validate = train_label[train_fold], train_label[validate]

In [None]:
X_train

In [None]:
label_train

In [None]:
# As with XGBoost's DMatrix, the data has to be wrapped in a Dataset before training.
dtrain = lgbm.Dataset(X_train, label_train)
# The validation set references dtrain so that it is built consistently with it.
dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

In [None]:
# Train on the first fold with the custom Gini metric on the validation set,
# logging every 100 rounds and stopping after 100 rounds without improvement.
# NOTE(review): LightGBM 4.0 removed the verbose_eval / early_stopping_rounds
# keyword arguments of train() in favour of callbacks — confirm the installed
# version before running.
bst = lgbm.train(params, 
           dtrain,
           num_boost_round, 
           valid_sets=dvalid, 
           feval=evalerror, 
           verbose_eval=100,
           early_stopping_rounds=100)

In [None]:
bst.best_iteration

In [None]:
# Record the booster's best iteration for this fold.
best_trees.append(bst.best_iteration)

In [None]:
best_trees

In [None]:
cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
# A model is trained per fold and its test predictions keep being added up.

In [None]:
cv_pred

In [None]:
cv_train[validate] += bst.predict(X_validate)
# Once every fold has been visited, the validation folds together cover the whole training set.
# So for out-of-fold (OOF) predictions it is enough to predict on each fold's validation rows.

In [None]:
# Normalized Gini of this fold's out-of-fold predictions.
score = Gini(label_validate, cv_train[validate])

In [None]:
score

In [None]:
fold_scores.append(score)

In [None]:
fold_scores

In [None]:
# Run the complete K-fold pass for the current seed.
# The fold generator and both prediction buffers are re-created here because
# the step-through cells above already pulled the first fold out of `kf` and
# added that fold's predictions to cv_train / cv_pred. Iterating the
# half-consumed generator would skip the first fold in best_trees /
# fold_scores, and re-running the cell would divide cv_pred again without
# training anything.
best_trees = []
fold_scores = []
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))
kf = kfold.split(X, train_label)

for i, (train_fold, validate) in enumerate(kf):
    print('#'*30, '{} of {}'.format(i+1, NFOLDS))
    X_train, X_validate = X[train_fold, :], X[validate, :]
    label_train, label_validate = train_label[train_fold], train_label[validate]
    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
    bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                    early_stopping_rounds=100)
    best_trees.append(bst.best_iteration)
    # Test and validation predictions both use the best iteration explicitly.
    cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
    cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

    score = Gini(label_validate, cv_train[validate])
    print(score)
    fold_scores.append(score)

# Average the summed test predictions over the folds.
cv_pred /= NFOLDS

In [None]:
fold_scores

In [None]:
# Add this seed's out-of-fold train predictions and fold-averaged test
# predictions to the running totals.
final_cv_train += cv_train
final_cv_pred += cv_pred

In [None]:
print("cv score:") # CV score for the single random seed selected above
print(Gini(train_label, cv_train))

In [None]:
print("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1) # the running sum is divided by the number of seeds accumulated so far (s + 1)
print(fold_scores)
print(best_trees, np.mean(best_trees))

# 16 random seeds * 5 folds --> 80 training runs in total

In [None]:
# Complete code: 16 seeds, each running a full K-fold cross-validation.
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

for s in np.arange(16):
    print('#'*30, 'random number outer iteration: {}'.format(s))
    # Fresh per-seed buffers for out-of-fold and test predictions.
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    # Nothing else happens for this seed unless cross-validation is enabled.
    if not cv_only:
        continue

    kf = kfold.split(X, train_label)
    best_trees, fold_scores = [], []

    for i, (train_fold, validate) in enumerate(kf):
        print('#'*10, 'inner cross validation system: {}'.format(i))
        X_train, label_train = X[train_fold, :], train_label[train_fold]
        X_validate, label_validate = X[validate, :], train_label[validate]
        dtrain = lgbm.Dataset(X_train, label_train)
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgbm.train(
            params,
            dtrain,
            num_boost_round,
            valid_sets=dvalid,
            feval=evalerror,
            verbose_eval=100,
            early_stopping_rounds=100,
        )
        best_trees.append(bst.best_iteration)
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        cv_train[validate] += bst.predict(X_validate)

        score = Gini(label_validate, cv_train[validate])
        print(score)
        fold_scores.append(score)

    # Average test predictions over folds, then add this seed to the totals.
    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    seed_score = Gini(train_label, cv_train)
    print("cv score:")
    print(seed_score)
    # Score of the seed-averaged out-of-fold predictions so far.
    print("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
    print(fold_scores)
    print(best_trees, np.mean(best_trees))

    x_score.append(seed_score)

print(x_score)

In [None]:
# Divide the accumulated predictions by 16 (the seed count of the loop above)
# and write the test predictions, then the out-of-fold train predictions.
outputs = (
    ('./lgbm3_pred_avg.csv', test_id, final_cv_pred),
    ('./lgbm3_cv_avg.csv', train_id, final_cv_train),
)
for csv_path, ids, summed in outputs:
    pd.DataFrame({'id': ids, 'target': summed / 16.}).to_csv(csv_path, index=False)