## Libraries

In [1]:
# Data Handling
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import warnings;warnings.filterwarnings('ignore')
pd.set_option('max_columns', 30, 'max_rows', 20)


# Data Split
from sklearn.model_selection import StratifiedKFold
seed = 42
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)


# Data Preprocess
from sklearn.decomposition import PCA

# Modeling
#  - Bagging,models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

#  - Boosting models
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score


# Hyperparameter Optimization
from bayes_opt import BayesianOptimization

# Evaluation
from sklearn.metrics import log_loss

## Load Data

In [2]:
train = pd.read_csv('./data/temp_train.csv')
test = pd.read_csv('./data/temp_test.csv')
target = pd.read_csv('./data/y_train.csv').LABEL

In [3]:
print(train.shape, test.shape, target.shape)

(150000, 20) (113104, 20) (150000,)


## Modeling


In [4]:
rf_clf = RandomForestClassifier(random_state=seed)
extra_clf = ExtraTreesClassifier(random_state=seed)
gbm_clf = GradientBoostingClassifier(random_state=seed) # gbm이 오래 걸림
xgb_clf = XGBClassifier(random_state=seed, n_jobs=-1)
lgb_clf = LGBMClassifier(random_state=seed, n_jobs=-1)

models = [rf_clf, extra_clf, gbm_clf, xgb_clf, lgb_clf]

#### Cross_validation

In [5]:
# Check models' bacis score

for model in models:
    loglosses = cross_val_score(model, train.values, target, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    logloss_mean = -np.mean(loglosses)
    print(f'{model.__class__.__name__} 평균 성능: {logloss_mean:.4f}')

RandomForestClassifier 평균 성능: 2.6038
ExtraTreesClassifier 평균 성능: 4.0941
GradientBoostingClassifier 평균 성능: 1.3566
XGBClassifier 평균 성능: 1.3653
LGBMClassifier 평균 성능: 1.3562


#### Tuning

In [9]:
# LGB_clf bayesian example

pbounds = {'learning_rate' : (0.01,0.1),
           'n_estimators' : (100,300),
           'max_depth' : (10,30),
           'subsample' : (0.8,1),
           'colsample_bytree' : (0.75,1.0),
           'min_child_samples' : (20, 30), 
           'num_leaves': (20, 35)
           }

def lgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, min_child_samples, num_leaves):
    
    params = {
        'learning_rate' : learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'subsample' : subsample,
        'colsample_bytree' : colsample_bytree,
        'min_child_samples' : int(round(min_child_samples)),
        'num_leaves' : int(round(num_leaves)),
        'objective' : 'binary',
        'random_state' : seed,
        'n_jobs' : -1
    }

    lgb_reg = LGBMClassifier(**params)
    
    scores = cross_val_score(lgb_reg, train.values, target,scoring = 'neg_log_loss', cv=skf, n_jobs=-1)
    mean_score = np.mean(scores)
    
    return mean_score


BO_lgb = BayesianOptimization(f = lgb_opt, pbounds = pbounds, random_state=seed)

In [10]:
BO_lgb.maximize(init_points=5, n_iter=5)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.356   [0m | [0m 0.8436  [0m | [0m 0.09556 [0m | [0m 24.64   [0m | [0m 25.99   [0m | [0m 131.2   [0m | [0m 22.34   [0m | [0m 0.8116  [0m |
| [0m 2       [0m | [0m-1.358   [0m | [0m 0.9665  [0m | [0m 0.0641  [0m | [0m 24.16   [0m | [0m 20.21   [0m | [0m 294.0   [0m | [0m 32.49   [0m | [0m 0.8425  [0m |
| [95m 3       [0m | [95m-1.355   [0m | [95m 0.7955  [0m | [95m 0.02651 [0m | [95m 16.08   [0m | [95m 25.25   [0m | [95m 186.4   [0m | [95m 24.37   [0m | [95m 0.9224  [0m |
| [95m 4       [0m | [95m-1.355   [0m | [95m 0.7849  [0m | [95m 0.03629 [0m | [95m 17.33   [0m | [95m 24.56   [0m | [95m 257.0   [0m | [95m 23.0    [0m | [95m 0.9028  [0m |
| [0m 5       [0m | [0m-1.362  

In [11]:
max_params = BO_lgb.max['params']

max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['min_child_samples'] = int(round(max_params['min_child_samples']))
max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['objective'] = 'binary'
max_params['n_jobs'] = -1
max_params['random_state'] = seed

max_params

{'colsample_bytree': 0.7848734651630105,
 'learning_rate': 0.03629301836816964,
 'max_depth': 17,
 'min_child_samples': 25,
 'n_estimators': 257,
 'num_leaves': 23,
 'subsample': 0.9028468876827224,
 'objective': 'binary',
 'n_jobs': -1,
 'random_state': 42}

In [12]:
# 해당 코드는 cross_val_score를 풀어 쓴 것임
# cross_val_score와의 차이점:
# - cross_val_score는 한 줄로 간단명료하게 cross_validation을 할 수 있는 패키지로 모델이 fit이 되지 않음
                        #   해당 패키지는 모델의 기본적 평균적인 성능을 빠르게 체크하여 볼 수 있음
# - cross_val_score를 아래와 같이 풀어써서 모델을 fit 시킬 수 있음

lgb_clf = LGBMClassifier(**max_params)

scores = []
for iter_count, (train_idx, valid_idx) in enumerate(skf.split(train, target)):
    
    X_train, X_valid = train.values[train_idx], train.values[valid_idx]
    y_train, y_valid = target.values[train_idx], target.values[valid_idx]
    
    lgb_clf.fit(X_train, y_train)
    
    pred = lgb_clf.predict_proba(X_valid)
    score = log_loss(y_valid, pred)
    scores.append(score)
print(f'LGBM 모델의 튜닝 평균 성능:  {np.mean(scores):.4f}')

LGBM 모델의 튜닝 평균 성능:  1.3550


## Evaluation

In [13]:
sub_pred = lgb_clf.predict_proba(test)

columns = ['F20','F30','F40','M20','M30','M40']
index_df = pd.DataFrame(test.index)
sub = pd.DataFrame(sub_pred, columns=columns)

submission = pd.concat([index_df, sub], axis=1)
submission

Unnamed: 0,0,F20,F30,F40,M20,M30,M40
0,0,0.109960,0.404298,0.347190,0.020232,0.055943,0.062377
1,1,0.143275,0.327016,0.334074,0.027427,0.090947,0.077261
2,2,0.073110,0.344245,0.402682,0.013123,0.068432,0.098409
3,3,0.021482,0.447789,0.462944,0.001293,0.017197,0.049295
4,4,0.463715,0.317726,0.181747,0.009244,0.011122,0.016446
...,...,...,...,...,...,...,...
113099,113099,0.213674,0.461255,0.276609,0.009629,0.017996,0.020837
113100,113100,0.126198,0.462531,0.348812,0.007444,0.027846,0.027170
113101,113101,0.145075,0.340562,0.336845,0.030168,0.069697,0.077654
113102,113102,0.095116,0.465259,0.324009,0.015219,0.051622,0.048774


In [None]:
# submission.to_csv('./data/submission.csv', index=False)

## **────────────────────────End of Pipeline──────────────────────**