# 1. Import

In [1]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
import xgboost as xgb

In [2]:
def create_dir(dir):
    """Make sure the directory ``dir`` exists and report what was done.

    If the path already exists nothing is created; otherwise the directory is
    created together with any missing parent directories. A one-line status
    message is printed in both cases.

    Args:
        dir: Path of the directory to create.
    """
    # Guard clause: nothing to do when the path is already present.
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return

    os.makedirs(dir)
    print("Created Directory :", dir)
# Make sure the folders used for pickles, models and submissions exist.
for output_dir in ("../pickle", "../model", "../submission"):
    create_dir(output_dir)

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [3]:
# Read the training table, the test table and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train_f2.csv",
        "../data/test_f2.csv",
        "../data/sample_submission.csv",
    )
)

In [4]:
# Global configuration shared by the tuning, training and export cells.
rows_train = train.shape[0] # number of rows in the given train data
rows_test = test.shape[0] # number of rows in the given test data
num_trial = 100 # number of hyperparameter-tuning trials to run
splits_hp = 5 # number of k-fold splits used during hyperparameter tuning
splits_tr = 15 # number of k-fold splits used during model training
basic_seed = 42 # default seed
# NOTE(review): num_seed_hp is not referenced by the visible cells; lgb_objective
# hard-codes its own seed list - confirm whether the two should agree.
num_seed_hp = 3 # number of seeds for hyperparameter tuning
num_seed_tr = 10 # number of seeds for training
sel_seed = 3 # number of seeds to select

In [5]:
# Accumulators filled by the training loop: one entry per model/seed run.
pred_dict = {}  # out-of-fold predictions on the training rows
pred_test_dict = {}  # fold-averaged predictions on the test rows

# 2. LightGBM

In [6]:
# Remove every column whose name starts with "Gender#" from both tables.
# The column list is taken from the training table and applied to both.
is_gender_col = train.columns.str.startswith("Gender#")
del_cols = train.columns[is_gender_col]
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

In [7]:
# Split the data: 'Target' is the label, every other column is a feature.
train_y = train['Target']
train_x = train.drop(columns=['Target'])
test_x = test.copy()

In [8]:
# Work on copies so the `train` / `test` frames themselves are left untouched.
train_lab = train.copy()
test_lab = test.copy()

# Cast each column that is object-typed in the training frame to pandas
# `category`, in both frames.
for col in train_lab.columns:
    if train_lab[col].dtypes != 'object':
        continue
    train_lab[col] = train_lab[col].astype('category')
    test_lab[col] = test_lab[col].astype('category')

# Split the data again, this time from the category-typed frames.
train_y = train_lab['Target']
train_x = train_lab.drop(columns=['Target'])
test_x = test_lab.copy()

print('Category Encoding Completed')

Category Encoding Completed


In [9]:
def lgb_objective(trial: Trial) -> float:
    """Optuna objective: mean out-of-fold MAE of a LightGBM regressor.

    One set of hyperparameters is sampled from ``trial``. For each tuning seed
    a ``splits_hp``-fold stratified CV is run on ``train_x`` / ``train_y``; every
    fold model is fitted with early stopping (30 rounds) monitored on its own
    validation fold, and its predictions fill the out-of-fold vector. The MAE
    of that vector against ``train_y`` is the score for the seed.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-seed out-of-fold MAE values (lower is better).
    """
    # Sample once per trial. The sampling order matches the original dict so
    # that a seeded sampler draws the same values.
    # NOTE(review): LightGBM requires feature_fraction / bagging_fraction to be
    # strictly positive; the 0.0 lower bounds are kept unchanged so the search
    # space is not altered - consider raising them.
    sampled_params = {
        "learning_rate": trial.suggest_float("learning_rate", 5e-3, 1e-1, log=True),  # default=0.1
        "max_depth": trial.suggest_int("max_depth", 4, 12),  # default=-1
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1e+2, log=True),  # default=0
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1e+2, log=True),  # default=0
        "num_leaves": trial.suggest_int("num_leaves", 31, 5000),  # default=31
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.0, 1.0),  # feature_fraction, default=1
        "subsample": trial.suggest_float("subsample", 0.0, 1.0),  # bagging_fraction, default=1
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 20),  # bagging_freq, default=0
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 40),  # min_data_in_leaf, default=20
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    score_hp = []
    for seed_hp in (0, 42):
        params_lgb = {
            "random_state": seed_hp,
            "verbosity": -1,
            "n_estimators": 10000,  # upper bound only; early stopping picks the real count
            **sampled_params,
        }

        # StratifiedKFold stratifies on train_y, so the target is presumably
        # discrete-valued - TODO confirm.
        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        cv = np.zeros(rows_train)  # out-of-fold predictions for this seed

        for train_idx, val_idx in kfold.split(train_x, train_y):
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

            lgbmodel = LGBMRegressor(**params_lgb)
            # Early stopping goes through the callback API: the
            # `early_stopping_rounds` / `verbose` keywords of fit() were removed
            # in LightGBM 4.0. Set verbose=True to log when stopping triggers.
            lgbmodel.fit(
                x_train,
                y_train,
                eval_set=[(x_val, y_val)],
                callbacks=[early_stopping(stopping_rounds=30, verbose=False)],
            )
            cv[val_idx] = lgbmodel.predict(x_val)

        score_hp.append(mean_absolute_error(train_y, cv))

    return float(np.mean(score_hp))

In [None]:
# Run the TPE search with a seeded sampler, minimising the objective's MAE.
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(
    study_name="lgb_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

# Best sampled parameters plus the fixed (non-tuned) settings, appended last.
lgb_base_hyperparams = {'n_estimators':10000}
lgb_best_hyperparams = {**lgb_study.best_trial.params, **lgb_base_hyperparams}

# Persist the result so later sessions can skip the search.
with open('../pickle/lgb_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(lgb_best_hyperparams, fw)

print("The best hyperparameters are:\n", lgb_best_hyperparams)

[32m[I 2022-03-31 01:58:23,642][0m A new study created in memory with name: lgb_parameter_opt[0m
[32m[I 2022-03-31 01:59:24,119][0m Trial 0 finished with value: 1.659318533855895 and parameters: {'learning_rate': 0.015355286838886862, 'max_depth': 12, 'reg_alpha': 8.471801418819979, 'reg_lambda': 2.481040974867813, 'num_leaves': 806, 'colsample_bytree': 0.15599452033620265, 'subsample': 0.05808361216819946, 'subsample_freq': 18, 'min_child_samples': 25, 'max_bin': 383}. Best is trial 0 with value: 1.659318533855895.[0m
[32m[I 2022-03-31 02:02:33,484][0m Trial 1 finished with value: 1.5223233479876928 and parameters: {'learning_rate': 0.005318033256270142, 'max_depth': 12, 'reg_alpha': 21.368329072358772, 'reg_lambda': 0.07068974950624607, 'num_leaves': 934, 'colsample_bytree': 0.18340450985343382, 'subsample': 0.3042422429595377, 'subsample_freq': 11, 'min_child_samples': 18, 'max_bin': 216}. Best is trial 1 with value: 1.5223233479876928.[0m
[32m[I 2022-03-31 02:04:43,573][

[32m[I 2022-03-31 04:13:29,664][0m Trial 19 finished with value: 1.5527466288231353 and parameters: {'learning_rate': 0.06320136705383216, 'max_depth': 6, 'reg_alpha': 0.02862461857079291, 'reg_lambda': 3.114674975980468, 'num_leaves': 3362, 'colsample_bytree': 0.707550416346461, 'subsample': 0.4344090618553158, 'subsample_freq': 20, 'min_child_samples': 19, 'max_bin': 244}. Best is trial 3 with value: 1.5032426487838153.[0m
[32m[I 2022-03-31 04:45:22,548][0m Trial 20 finished with value: 1.5318844778399612 and parameters: {'learning_rate': 0.011614261866644411, 'max_depth': 8, 'reg_alpha': 3.5951661085031215, 'reg_lambda': 45.14399961853961, 'num_leaves': 4527, 'colsample_bytree': 0.5693811304438392, 'subsample': 0.5818163242527762, 'subsample_freq': 3, 'min_child_samples': 8, 'max_bin': 323}. Best is trial 3 with value: 1.5032426487838153.[0m
[32m[I 2022-03-31 04:58:53,016][0m Trial 21 finished with value: 1.5061377799261957 and parameters: {'learning_rate': 0.018431374036174

[32m[I 2022-03-31 06:32:23,635][0m Trial 38 finished with value: 2.3040116247633033 and parameters: {'learning_rate': 0.03659429583336455, 'max_depth': 9, 'reg_alpha': 0.039231652606136466, 'reg_lambda': 0.5931225163121119, 'num_leaves': 4812, 'colsample_bytree': 0.9462689263533774, 'subsample': 0.014800630642531398, 'subsample_freq': 1, 'min_child_samples': 11, 'max_bin': 205}. Best is trial 33 with value: 1.4966871607610237.[0m
[32m[I 2022-03-31 06:35:12,708][0m Trial 39 finished with value: 1.5563536406599598 and parameters: {'learning_rate': 0.05698993820292772, 'max_depth': 5, 'reg_alpha': 0.01053871780699714, 'reg_lambda': 11.45701433355122, 'num_leaves': 2060, 'colsample_bytree': 0.7443629518731241, 'subsample': 0.3035909219770531, 'subsample_freq': 10, 'min_child_samples': 22, 'max_bin': 479}. Best is trial 33 with value: 1.4966871607610237.[0m
[32m[I 2022-03-31 06:38:58,163][0m Trial 40 finished with value: 1.62744457013827 and parameters: {'learning_rate': 0.007906242

In [None]:
# Plot hyperparameter importances for the study (matplotlib backend);
# the trailing ';' suppresses the returned object's repr in the cell output.
optuna.visualization.matplotlib.plot_param_importances(lgb_study);

In [None]:
# Slice plot of the study's trials, one panel per hyperparameter (matplotlib backend);
# the trailing ';' suppresses the returned object's repr in the cell output.
optuna.visualization.matplotlib.plot_slice(lgb_study);

In [None]:
# with open('../pickle/lgb_best_hyperparams.pickle', 'rb') as fw:
#     lgb_best_hyperparams = pickle.load(fw)

In [9]:
# Hard-coded LightGBM hyperparameters used by the training loop; this assignment
# replaces whatever value `lgb_best_hyperparams` held before.
# NOTE(review): presumably copied from a finished tuning run - TODO confirm provenance.
lgb_best_hyperparams = {'n_estimators':10000, 'learning_rate': 0.0385744726440635, 'max_depth': 5, 'reg_alpha': 0.019211565937469453, 'reg_lambda': 48.31817229397436, 'num_leaves': 4712, 'colsample_bytree': 0.9508380949542059, 'subsample': 0.2812178703760207, 'subsample_freq': 1, 'min_child_samples': 21, 'max_bin': 393}

In [10]:
# Draw `num_seed_tr` distinct fold seeds from a generator seeded with `basic_seed`.
# Seeding makes a fresh-kernel re-run pick the same seeds; sampling without
# replacement guarantees that no two runs share a key in the dictionaries below
# (a duplicate seed would silently overwrite an earlier run).
seed_rng = np.random.default_rng(basic_seed)
lucky_seeds = seed_rng.choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:
    seed = int(seed)  # plain int for sklearn's random_state and for the dict key

    # One stratified K-fold split per seed (try increasing the number of CV folds).
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros(rows_train)        # out-of-fold predictions on the training rows
    pred_test = np.zeros(rows_test)  # running average of the fold models' test predictions

    for train_idx, val_idx in kfold.split(train_x, train_y):
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

        lgbmodel = LGBMRegressor(**lgb_best_hyperparams)
        # Early stopping goes through the callback API: the
        # `early_stopping_rounds` / `verbose` keywords of fit() were removed in
        # LightGBM 4.0.
        lgbmodel.fit(
            x_train,
            y_train,
            eval_set=[(x_val, y_val)],
            callbacks=[early_stopping(stopping_rounds=30, verbose=False)],
        )

        cv[val_idx] = lgbmodel.predict(x_val)
        pred_test += lgbmodel.predict(test_x) / splits_tr

    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

seed 420 mean_absolute_error : 1.5020670823278075
seed 67 mean_absolute_error : 1.4965953746706389
seed 815 mean_absolute_error : 1.4905855596668207
seed 639 mean_absolute_error : 1.4987759415663127
seed 732 mean_absolute_error : 1.4902158004540975


KeyboardInterrupt: 

# 3. Export

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Select the best-scoring runs of one model family.

    Entries whose key contains ``model`` are taken from both dictionaries. The
    out-of-fold entries are ranked by mean absolute error against the global
    ``train_y`` (lowest first) and the first ``sel_seed`` (global) are kept; the
    test-prediction entries with the same keys are returned alongside them.

    Args:
        model: Substring identifying the model family in the keys (e.g. 'lgb').
        pred_dict: Mapping of run name -> out-of-fold predictions.
        pred_test_dict: Mapping of run name -> test predictions.

    Returns:
        Tuple ``(selected_oof, selected_test)`` of dictionaries, both ordered
        from lowest to highest out-of-fold MAE.

    Raises:
        KeyError: If a selected run has no entry in ``pred_test_dict``.
    """
    oof_for_model = {name: oof for name, oof in pred_dict.items() if model in name}
    test_for_model = {name: pred for name, pred in pred_test_dict.items() if model in name}

    def _oof_mae(item):
        # item is a (run name, out-of-fold predictions) pair.
        return mean_absolute_error(train_y, list(item[1]))

    ranked = sorted(oof_for_model.items(), key=_oof_mae)
    best_oof = dict(ranked[:sel_seed])
    best_test = {name: test_for_model[name] for name in best_oof}

    return best_oof, best_test

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the two prediction dictionaries of one model family.

    Writes ``pred_dict`` to '../pickle/pred_dict_<model>.pickle' and
    ``pred_test_dict`` to '../pickle/pred_test_dict_<model>.pickle'; existing
    files are overwritten.

    Args:
        model: Model name inserted into both file names (must be a str).
        pred_dict: Object written to the first file.
        pred_test_dict: Object written to the second file.
    """
    oof_path = '../pickle/pred_dict_' + model + '.pickle'
    test_path = '../pickle/pred_test_dict_' + model + '.pickle'

    for target_path, payload in ((oof_path, pred_dict), (test_path, pred_test_dict)):
        with open(target_path, 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
# Keep the `sel_seed` LightGBM runs with the lowest out-of-fold MAE, then pickle
# their out-of-fold and test predictions under ../pickle/.
pred_dict_lgb, pred_test_dict_lgb = sort_dict('lgb', pred_dict, pred_test_dict)
save_dict('lgb', pred_dict_lgb, pred_test_dict_lgb)