Lasso, Ridge, RandomForest, ElasticNet, GradientBoostingRegressor, LGBM (dart), XGB (other boosting)

# 1. Import

In [470]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import xgboost as xgb

In [586]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

In [587]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [588]:
def create_dir(dir):
    """Create the directory ``dir`` (with parents) unless it already exists.

    Prints whether the directory was created or was already present.

    Args:
        dir: Path of the directory to create. The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.

    Returns:
        None.
    """
    # Attempt the creation directly instead of checking first: a separate
    # exists() check can race with another process creating the same path.
    try:
        os.makedirs(dir)
    except FileExistsError:
        print("Directory already existed :", dir)
    else:
        print("Created Directory :", dir)
create_dir("../pickle")
create_dir("../model")
create_dir("../submission")

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [589]:
# Run-wide settings read by the tuning and training cells.
rows_train = train.shape[0] # number of rows in the provided train data
rows_test = test.shape[0] # number of rows in the provided test data
num_trial = 100 # number of hyperparameter-tuning trials
splits_hp = 5 # number of k-fold splits used during hyperparameter tuning
splits_tr = 15 # number of k-fold splits used during model training
basic_seed = 42 # default seed
num_seed_hp = 3 # number of seeds used during hyperparameter tuning
num_seed_tr = 10 # number of seeds used during training
sel_seed = 3 # number of seeds to select

# 2. Preprocessing

In [590]:
# Partition the columns by dtype: object columns count as categorical and
# float64 columns as numeric; any other dtype lands in neither list.
cat_cols = [name for name in train.columns if train[name].dtypes == 'object']
num_cols = [name for name in train.columns if train[name].dtypes == 'float64']

In [591]:
# Shared scaler instances; feature_num_scaler re-fits each of them once per
# numeric column, so they only ever hold the fit of the last column processed.
scaler1 = StandardScaler()
scaler2 = MinMaxScaler()
scaler3 = QuantileTransformer()
def feature_num_scaler(train_df, test_df):
    """Add scaled and log-transformed copies of every numeric column, in place.

    For each column name in the module-level ``num_cols`` list, the three
    module-level scalers (``scaler1``, ``scaler2``, ``scaler3``) are fitted on
    that column of ``train_df`` and applied to both frames. The natural,
    base-2 and base-10 logarithms of the raw column are added as well. The new
    columns are named ``<col>#scaler1`` ... ``<col>#scaler3``, ``<col>#log``,
    ``<col>#log2`` and ``<col>#log10``.

    Args:
        train_df: Training frame. The scalers are fitted on it; new columns
            are added to it in place.
        test_df: Test frame. It is transformed with the scalers fitted on
            ``train_df``; new columns are added to it in place.

    Returns:
        None.
    """
    fitted_scalers = (
        ('#scaler1', scaler1),
        ('#scaler2', scaler2),
        ('#scaler3', scaler3),
    )
    log_transforms = (
        ('#log', np.log),
        ('#log2', np.log2),
        ('#log10', np.log10),
    )

    for num_col in num_cols:
        for suffix, scaler in fitted_scalers:
            # Fit and transform the frame that was passed in. The previous
            # version read the global ``train`` here, which silently ignored
            # the ``train_df`` argument.
            scaler.fit(train_df[[num_col]])
            train_df[num_col + suffix] = scaler.transform(train_df[[num_col]])
            test_df[num_col + suffix] = scaler.transform(test_df[[num_col]])

        # NOTE: zero values produce -inf and negative values produce NaN here.
        for suffix, log_fn in log_transforms:
            train_df[num_col + suffix] = log_fn(train_df[num_col])
            test_df[num_col + suffix] = log_fn(test_df[num_col])

In [592]:
feature_num_scaler(train, test)

In [593]:
# Rebuild the dtype partition so that the columns added by the scaling step
# are picked up: object -> categorical, float64 -> numeric.
cat_cols = [name for name in train.columns if train[name].dtypes == 'object']
num_cols = [name for name in train.columns if train[name].dtypes == 'float64']

In [594]:
# Ratio and product features for every ordered pair of distinct numeric
# columns, added to both train and test under the names "<a>/<b>" and "<a>*<b>".
for num_col_first, num_col_second in permutations(num_cols, 2):
    ratio_name = num_col_first + '/' + num_col_second
    product_name = num_col_first + '*' + num_col_second
    train[ratio_name] = train[num_col_first] / train[num_col_second]
    train[product_name] = train[num_col_first] * train[num_col_second]
    test[ratio_name] = test[num_col_first] / test[num_col_second]
    test[product_name] = test[num_col_first] * test[num_col_second]

In [599]:
# Rebuild the dtype partition once more so the pairwise interaction columns
# are included: object -> categorical, float64 -> numeric.
cat_cols = [name for name in train.columns if train[name].dtypes == 'object']
num_cols = [name for name in train.columns if train[name].dtypes == 'float64']

In [600]:
def feature_cat_generation(df):
    """Attach per-category summary statistics of the numeric columns to ``df``.

    For every pair of a categorical column (module-level ``cat_cols``) and a
    numeric column (module-level ``num_cols``), the numeric column is
    aggregated per category value and the aggregate is mapped back onto each
    row as a new column named ``<cat><tag><num>``. The statistics are computed
    from ``df`` itself.

    Args:
        df: Frame to extend; the new columns are added in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    def percentile_of(q):
        # Build an aggregator returning the q-th percentile of each group.
        return lambda grouped: grouped.agg(lambda x: np.percentile(x, q))

    # (column-name tag, aggregator) in the order the columns are created.
    # Some tags end in "#" and some do not; they are part of the column names.
    summaries = (
        ("#mean#", lambda grouped: grouped.mean()),
        ("#std#", lambda grouped: grouped.std(ddof = 1)),
        ("#var#", lambda grouped: grouped.var(ddof = 1)),
        ("#max#", lambda grouped: grouped.max()),
        ("#min#", lambda grouped: grouped.min()),
        ("#ptp#", lambda grouped: grouped.agg(np.ptp)),
        ("#median", lambda grouped: grouped.median()),
        ("#skew", lambda grouped: grouped.skew()),
        ("#percentile_10", percentile_of(10)),
        ("#percentile_60", percentile_of(60)),
        ("#percentile_90", percentile_of(90)),
    )

    for cat_col in cat_cols:
        for num_col in num_cols:
            for tag, summarise in summaries:
                per_category = summarise(df.groupby(cat_col)[num_col])
                df[cat_col + tag + num_col] = df[cat_col].map(per_category)

    return df

In [601]:
feature_cat_generation(train)
feature_cat_generation(test)

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Lenght#scaler1,Lenght#scaler2,...,Gender#std#Shell Weight#log10*Shell Weight#log2,Gender#var#Shell Weight#log10*Shell Weight#log2,Gender#max#Shell Weight#log10*Shell Weight#log2,Gender#min#Shell Weight#log10*Shell Weight#log2,Gender#ptp#Shell Weight#log10*Shell Weight#log2,Gender#medianShell Weight#log10*Shell Weight#log2,Gender#skewShell Weight#log10*Shell Weight#log2,Gender#percentile_10Shell Weight#log10*Shell Weight#log2,Gender#percentile_60Shell Weight#log10*Shell Weight#log2,Gender#percentile_90Shell Weight#log10*Shell Weight#log2
0,F,0.595,0.470,0.155,1.1210,0.4515,0.1780,0.1550,0.600176,0.723881,...,0.926354,0.858132,8.526048,0.016549,8.509499,0.933756,2.361114,0.381003,1.105024,2.326363
1,M,0.580,0.450,0.150,0.9270,0.2760,0.1815,0.3600,0.475366,0.701493,...,1.636947,2.679595,17.588742,0.007403,17.581339,1.038377,4.284991,0.406227,1.179412,2.930899
2,I,0.260,0.205,0.070,0.0970,0.0415,0.0190,0.0305,-2.187241,0.223881,...,3.143925,9.884261,26.490585,0.252546,26.238039,2.966591,2.305698,1.239471,3.632890,7.704082
3,M,0.590,0.460,0.130,1.1020,0.4550,0.2055,0.3300,0.558572,0.716418,...,1.636947,2.679595,17.588742,0.007403,17.581339,1.038377,4.284991,0.406227,1.179412,2.930899
4,F,0.595,0.465,0.140,1.1130,0.5175,0.2440,0.3050,0.600176,0.723881,...,0.926354,0.858132,8.526048,0.016549,8.509499,0.933756,2.361114,0.381003,1.105024,2.326363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,I,0.170,0.105,0.035,0.0340,0.0120,0.0085,0.0050,-2.936099,0.089552,...,3.143925,9.884261,26.490585,0.252546,26.238039,2.966591,2.305698,1.239471,3.632890,7.704082
2920,I,0.435,0.345,0.115,0.4180,0.2220,0.0735,0.1060,-0.731128,0.485075,...,3.143925,9.884261,26.490585,0.252546,26.238039,2.966591,2.305698,1.239471,3.632890,7.704082
2921,I,0.570,0.450,0.135,0.7940,0.3815,0.1415,0.2450,0.392159,0.686567,...,3.143925,9.884261,26.490585,0.252546,26.238039,2.966591,2.305698,1.239471,3.632890,7.704082
2922,I,0.460,0.350,0.120,0.4885,0.1930,0.1050,0.1550,-0.523112,0.522388,...,3.143925,9.884261,26.490585,0.252546,26.238039,2.966591,2.305698,1.239471,3.632890,7.704082


## 이상치 제거

In [288]:
col_name = 'Whole Weight'

In [289]:
# Three-sigma bounds for the selected column: mean +/- 3 standard deviations.
col_mean = train[col_name].mean()
col_std = train[col_name].std()
high = col_mean + 3*col_std
low = col_mean - 3*col_std

In [290]:
print("Highest allowed", high)
print("Lowest allowed", low)

Highest allowed 2.2465514892030605
Lowest allowed -0.6197810769152661


In [291]:
train[(train[col_name] > high) | (train[col_name] < low)]

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
17,M,0.765,0.6,0.22,2.302,1.007,0.509,0.6205,12
551,M,0.74,0.58,0.205,2.381,0.8155,0.4695,0.488,12


In [292]:
# Drop the rows whose value falls outside [low, high].
is_outlier = (train[col_name] > high) | (train[col_name] < low)
train = train[~is_outlier]

In [108]:
# Cap the column to [low, high]: values above `high` become `high`, values
# below `low` become `low`, everything else is kept.
# NOTE(review): this looks like an alternative to dropping the outlier rows;
# if that filter already ran with the same bounds, nothing is left to cap.
train[col_name] = np.where(train[col_name]>high, high, np.where(train[col_name]<low, low, train[col_name]))

# 2. Modeling

In [604]:
pred_dict = {}
pred_test_dict = {}

## (1) LightGBM

In [605]:
# Work on copies so the engineered frames themselves are left untouched.
train_lab = train.copy()
test_lab = test.copy()

# Cast every object-typed training column to the pandas category dtype in
# both frames.
object_cols = [name for name in train_lab.columns if train_lab[name].dtypes == 'object']
for col in object_cols:
    train_lab[col] = train_lab[col].astype('category')
    test_lab[col] = test_lab[col].astype('category')

# Split the data into features and target.
train_y = train_lab['Target']
train_x = train_lab.drop(['Target'], axis=1)
test_x = test_lab.copy()

print('Category Encoding Completed')

Category Encoding Completed


In [606]:
# Re-evaluate the run-wide settings; rows_train / rows_test are read again
# from the current train / test frames.
rows_train = train.shape[0] # number of rows in the provided train data
rows_test = test.shape[0] # number of rows in the provided test data
num_trial = 100 # number of hyperparameter-tuning trials
splits_hp = 5 # number of k-fold splits used during hyperparameter tuning
splits_tr = 15 # number of k-fold splits used during model training
basic_seed = 42 # default seed
num_seed_hp = 3 # number of seeds used during hyperparameter tuning
num_seed_tr = 10 # number of seeds used during training
sel_seed = 3 # number of seeds to select

In [607]:
def lgb_objective(trial: Trial) -> float:
    """Optuna objective for LightGBM: seed-averaged out-of-fold MAE.

    One set of hyperparameters is drawn from ``trial``. For each of
    ``num_seed_hp`` randomly drawn model seeds, a ``splits_hp``-fold stratified
    cross-validation is run on the module-level ``train_x`` / ``train_y`` with
    early stopping on the validation fold, and the MAE of the out-of-fold
    predictions is recorded.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-seed out-of-fold MAE values (lower is better).
    """
    # Sample once per trial. Optuna returns the stored value when a name is
    # suggested again within the same trial, so suggesting inside the seed
    # loop only repeated the lookups.
    # NOTE(review): the 0.0 lower bound of colsample_bytree / subsample lies
    # outside the range LightGBM documents for these fractions (0 < x <= 1).
    tuned_params = {
        'learning_rate': trial.suggest_loguniform("learning_rate", 2e-3, 1e-1), # default=0.1, range=[0,1]
        "max_depth": trial.suggest_int("max_depth", 3, 10), # default=-1
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-2, 1e+2), # default=0
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-2, 1e+2), # default=0
        "num_leaves": trial.suggest_int("num_leaves", 31, 3000), # default=31, range=(1,130172]
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.0, 1.0), # feature_fraction, default=1
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0), # bagging_fraction, default=1, range=[0,1]
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 20), # bagging_freq, default=0
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 40), # min_data_in_leaf, default=20
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    # The splitter has a fixed random_state, so every seed is evaluated on the
    # same folds; build it once.
    kfold = StratifiedKFold(n_splits=splits_hp, random_state=basic_seed, shuffle=True)

    score_hp = []
    # The model seeds come from the global NumPy RNG, so they differ per trial.
    for seed_hp in np.random.randint(0, 1000, num_seed_hp):
        params_lgb = {
            "random_state": int(seed_hp),
            "verbosity": -1,
            "n_estimators": 10000,  # upper bound; early stopping picks the actual count
            **tuned_params,
        }

        # Size the out-of-fold buffer from the target itself so that it cannot
        # drift from a stale ``rows_train``.
        cv = np.zeros(len(train_y))

        for train_idx, val_idx in kfold.split(train_x, train_y):
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

            lgbmodel = LGBMRegressor(**params_lgb)
            # To see training progress, change verbose from -1 to 100.
            # NOTE(review): newer LightGBM releases replace these two fit()
            # keywords with callbacks - confirm against the installed version.
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=-1)
            cv[val_idx] = lgbmodel.predict(x_val)

        score_hp.append(mean_absolute_error(train_y, cv))

    return float(np.mean(score_hp))

In [None]:
# Run the Optuna study for LightGBM (TPE sampler seeded with basic_seed,
# num_trial trials) and persist the best hyperparameters.
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(study_name="lgb_parameter_opt", direction="minimize", sampler=sampler)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

lgb_best_hyperparams = lgb_study.best_trial.params
# n_estimators is fixed rather than suggested inside lgb_objective, so it is
# not part of best_trial.params and is added back here.
lgb_base_hyperparams = {'n_estimators':10000,
#                         'lambda_l1':lgb_best_hyperparams['reg_alpha'],
#                         'lambda_l2':lgb_best_hyperparams['reg_lambda']
                       }
lgb_best_hyperparams.update(lgb_base_hyperparams)

# Opening with 'wb' overwrites any previously saved hyperparameters.
with open('../pickle/lgb_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(lgb_best_hyperparams, fw)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

[32m[I 2022-03-23 22:08:31,284][0m A new study created in memory with name: lgb_parameter_opt[0m
[32m[I 2022-03-23 22:15:16,160][0m Trial 0 finished with value: 1.6772579607307059 and parameters: {'learning_rate': 0.008656900442587762, 'max_depth': 10, 'reg_alpha': 8.471801418819979, 'reg_lambda': 2.481040974867813, 'num_leaves': 494, 'colsample_bytree': 0.15599452033620265, 'subsample': 0.05808361216819946, 'subsample_freq': 18, 'min_child_samples': 25, 'max_bin': 383}. Best is trial 0 with value: 1.6772579607307059.[0m


In [365]:
# Same statements as the preceding tuning cell: running this starts a fresh
# study and overwrites the saved hyperparameter pickle.
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(study_name="lgb_parameter_opt", direction="minimize", sampler=sampler)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

lgb_best_hyperparams = lgb_study.best_trial.params
# n_estimators is fixed rather than suggested inside lgb_objective, so it is
# not part of best_trial.params and is added back here.
lgb_base_hyperparams = {'n_estimators':10000,
#                         'lambda_l1':lgb_best_hyperparams['reg_alpha'],
#                         'lambda_l2':lgb_best_hyperparams['reg_lambda']
                       }
lgb_best_hyperparams.update(lgb_base_hyperparams)

with open('../pickle/lgb_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(lgb_best_hyperparams, fw)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

[32m[I 2022-03-23 19:16:18,684][0m A new study created in memory with name: lgb_parameter_opt[0m
[32m[I 2022-03-23 19:16:20,291][0m Trial 0 finished with value: 2.0181724994339656 and parameters: {'learning_rate': 0.008656900442587762, 'max_depth': 10, 'reg_alpha': 8.471801418819979, 'reg_lambda': 2.481040974867813, 'num_leaves': 494, 'colsample_bytree': 0.15599452033620265, 'subsample': 0.05808361216819946, 'subsample_freq': 18, 'min_child_samples': 25, 'max_bin': 383}. Best is trial 0 with value: 2.0181724994339656.[0m
[32m[I 2022-03-23 19:16:34,399][0m Trial 1 finished with value: 1.7547988488345874 and parameters: {'learning_rate': 0.00216771625386895, 'max_depth': 10, 'reg_alpha': 21.368329072358772, 'reg_lambda': 0.07068974950624607, 'num_leaves': 571, 'colsample_bytree': 0.18340450985343382, 'subsample': 0.3042422429595377, 'subsample_freq': 11, 'min_child_samples': 18, 'max_bin': 216}. Best is trial 1 with value: 1.7547988488345874.[0m
[32m[I 2022-03-23 19:16:35,886]

[32m[I 2022-03-23 19:18:50,838][0m Trial 19 finished with value: 1.5717390301487673 and parameters: {'learning_rate': 0.016053647133999992, 'max_depth': 7, 'reg_alpha': 0.038819097888976355, 'reg_lambda': 0.8533570324637013, 'num_leaves': 2625, 'colsample_bytree': 0.6398425719810308, 'subsample': 0.8516831114413721, 'subsample_freq': 14, 'min_child_samples': 40, 'max_bin': 252}. Best is trial 12 with value: 1.5485472461502203.[0m
[32m[I 2022-03-23 19:19:17,087][0m Trial 20 finished with value: 1.612590797343131 and parameters: {'learning_rate': 0.0023650168354119715, 'max_depth': 5, 'reg_alpha': 1.0240172282710127, 'reg_lambda': 21.174905148304283, 'num_leaves': 1915, 'colsample_bytree': 0.26359447387982743, 'subsample': 0.9608771169328999, 'subsample_freq': 1, 'min_child_samples': 19, 'max_bin': 155}. Best is trial 12 with value: 1.5485472461502203.[0m
[32m[I 2022-03-23 19:19:32,578][0m Trial 21 finished with value: 1.5512285225730231 and parameters: {'learning_rate': 0.004549

[32m[I 2022-03-23 19:23:16,467][0m Trial 38 finished with value: 1.5544115178699711 and parameters: {'learning_rate': 0.008527013454904364, 'max_depth': 8, 'reg_alpha': 0.7008366763578057, 'reg_lambda': 0.02919145966551565, 'num_leaves': 986, 'colsample_bytree': 0.7520493169937362, 'subsample': 0.38187802396007803, 'subsample_freq': 2, 'min_child_samples': 13, 'max_bin': 154}. Best is trial 12 with value: 1.5485472461502203.[0m
[32m[I 2022-03-23 19:23:19,498][0m Trial 39 finished with value: 1.6342384020341811 and parameters: {'learning_rate': 0.018922182732019205, 'max_depth': 5, 'reg_alpha': 10.816334046376584, 'reg_lambda': 28.62882208883035, 'num_leaves': 1995, 'colsample_bytree': 0.626760704542541, 'subsample': 0.25760904507729043, 'subsample_freq': 13, 'min_child_samples': 20, 'max_bin': 122}. Best is trial 12 with value: 1.5485472461502203.[0m
[32m[I 2022-03-23 19:23:35,456][0m Trial 40 finished with value: 1.5978968888750043 and parameters: {'learning_rate': 0.003068155

[32m[I 2022-03-23 19:29:14,375][0m Trial 57 finished with value: 1.5393628104600312 and parameters: {'learning_rate': 0.003786703439718293, 'max_depth': 7, 'reg_alpha': 0.022295278241118394, 'reg_lambda': 7.675338450843856, 'num_leaves': 2876, 'colsample_bytree': 0.8935207198402152, 'subsample': 0.3421148496044024, 'subsample_freq': 1, 'min_child_samples': 3, 'max_bin': 377}. Best is trial 56 with value: 1.5386974622751846.[0m
[32m[I 2022-03-23 19:29:35,307][0m Trial 58 finished with value: 1.5381246610156722 and parameters: {'learning_rate': 0.003470329190252938, 'max_depth': 7, 'reg_alpha': 0.021244865487961458, 'reg_lambda': 7.649969901587285, 'num_leaves': 2828, 'colsample_bytree': 0.9870490564739482, 'subsample': 0.2673277435432703, 'subsample_freq': 1, 'min_child_samples': 3, 'max_bin': 382}. Best is trial 58 with value: 1.5381246610156722.[0m
[32m[I 2022-03-23 19:29:54,320][0m Trial 59 finished with value: 1.5385173414617446 and parameters: {'learning_rate': 0.0036144826

[32m[I 2022-03-23 19:36:17,837][0m Trial 76 finished with value: 1.7238961323597177 and parameters: {'learning_rate': 0.0036012002452255054, 'max_depth': 7, 'reg_alpha': 0.014845548801619235, 'reg_lambda': 3.8452899296771155, 'num_leaves': 2843, 'colsample_bytree': 0.13016471491999915, 'subsample': 0.27333488385297366, 'subsample_freq': 3, 'min_child_samples': 4, 'max_bin': 418}. Best is trial 61 with value: 1.5363140060520655.[0m
[32m[I 2022-03-23 19:36:32,393][0m Trial 77 finished with value: 1.748390425061115 and parameters: {'learning_rate': 0.0030855351993067646, 'max_depth': 6, 'reg_alpha': 0.03416951199446686, 'reg_lambda': 23.695452373108086, 'num_leaves': 2990, 'colsample_bytree': 0.9191246430030446, 'subsample': 0.11227664231745255, 'subsample_freq': 1, 'min_child_samples': 31, 'max_bin': 449}. Best is trial 61 with value: 1.5363140060520655.[0m
[32m[I 2022-03-23 19:36:49,325][0m Trial 78 finished with value: 1.5414838756813711 and parameters: {'learning_rate': 0.0046

[32m[I 2022-03-23 19:41:02,150][0m Trial 95 finished with value: 1.539814888089064 and parameters: {'learning_rate': 0.003978442447116503, 'max_depth': 7, 'reg_alpha': 0.018751338636676083, 'reg_lambda': 2.7142585702161828, 'num_leaves': 2931, 'colsample_bytree': 0.96376972301785, 'subsample': 0.3884218345495154, 'subsample_freq': 2, 'min_child_samples': 1, 'max_bin': 467}. Best is trial 79 with value: 1.5360140314738084.[0m
[32m[I 2022-03-23 19:41:21,169][0m Trial 96 finished with value: 1.5488232503674688 and parameters: {'learning_rate': 0.003995922205461012, 'max_depth': 7, 'reg_alpha': 0.017917716186518592, 'reg_lambda': 1.438846604277167, 'num_leaves': 2942, 'colsample_bytree': 0.9699896824703876, 'subsample': 0.4111040847293402, 'subsample_freq': 2, 'min_child_samples': 1, 'max_bin': 469}. Best is trial 79 with value: 1.5360140314738084.[0m
[32m[I 2022-03-23 19:41:23,994][0m Trial 97 finished with value: 1.6049403930733213 and parameters: {'learning_rate': 0.027848440318

The best hyperparameters are:
 {'learning_rate': 0.004517118436756375, 'max_depth': 7, 'reg_alpha': 0.05501213811040662, 'reg_lambda': 4.584272521448968, 'num_leaves': 2539, 'colsample_bytree': 0.8566170764932776, 'subsample': 0.3697332639859875, 'subsample_freq': 3, 'min_child_samples': 2, 'max_bin': 392, 'n_estimators': 10000}


In [116]:
# optuna.visualization.matplotlib.plot_param_importances(lgb_study);

In [117]:
# optuna.visualization.matplotlib.plot_slice(lgb_study);

In [118]:
# Reload the tuned LightGBM hyperparameters from disk.
# NOTE: pickle.load can execute code embedded in the file; only load pickles
# that were written by this notebook.
with open('../pickle/lgb_best_hyperparams.pickle', 'rb') as fw:
    lgb_best_hyperparams = pickle.load(fw)

In [539]:
# Train LightGBM with num_seed_tr different fold shuffles and collect, per
# seed, the out-of-fold predictions (pred_dict) and the fold-averaged test
# predictions (pred_test_dict).
# NOTE: the seeds come from the global NumPy RNG; randint may repeat a value,
# in which case the later run overwrites the earlier entry under the same key.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for i, seed in enumerate(lucky_seeds):

    # `seed` only controls the fold assignment; the model below is built from
    # lgb_best_hyperparams alone.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True) # try increasing the number of CV folds
    cv = np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        lgbmodel = LGBMRegressor(**lgb_best_hyperparams)
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=-1)

        # Out-of-fold prediction for the validation rows of this fold.
        cv[val_idx] = lgbmodel.predict(x_val)
        # Dividing by the fold count turns the running sum into the fold average.
        pred_test += lgbmodel.predict(test_x) / splits_tr

    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

seed 507 mean_absolute_error : 1.5081423908779208
seed 557 mean_absolute_error : 1.5055449688236962
seed 545 mean_absolute_error : 1.501742195958065
seed 33 mean_absolute_error : 1.5059633389527785
seed 575 mean_absolute_error : 1.502893918808439
seed 519 mean_absolute_error : 1.4865623208532666
seed 270 mean_absolute_error : 1.503224000687485
seed 293 mean_absolute_error : 1.5077689372687273
seed 625 mean_absolute_error : 1.5048008027569406
seed 593 mean_absolute_error : 1.5117782582354276


## (2) XGBoost

In [None]:
train_lab = train.copy()
test_lab = test.copy()

# Integer-encode every object-typed column. The encoder is fitted on the
# training column only and then applied to both frames, so a category always
# maps to the same code. Re-fitting on the test column (as the previous
# fit_transform call did) gives different codes whenever the two frames do
# not contain exactly the same set of categories.
enc = LabelEncoder()
for col in train_lab.columns:
    if train_lab[col].dtypes=='object':
        enc.fit(train_lab[col])
        train_lab[col] = enc.transform(train_lab[col])
        # transform() raises ValueError for a category that was not seen in
        # train, which surfaces the mismatch instead of hiding it.
        test_lab[col] = enc.transform(test_lab[col])

train_x = train_lab.drop(['Target'], axis=1) # split the data into features and target
train_y = train_lab['Target']
test_x = test_lab.copy()

print('Label Encoding Completed')

In [None]:
def xgb_objective(trial: Trial) -> float:
    """Optuna objective for XGBoost: seed-averaged out-of-fold MAE.

    One set of hyperparameters is drawn from ``trial``. For each of
    ``num_seed_hp`` randomly drawn model seeds, a ``splits_hp``-fold stratified
    cross-validation is run on the module-level ``train_x`` / ``train_y`` with
    early stopping on the validation fold, and the MAE of the out-of-fold
    predictions is recorded.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-seed out-of-fold MAE values (lower is better).
    """
    # Sample once per trial. Optuna returns the stored value when a name is
    # suggested again within the same trial, so suggesting inside the seed
    # loop only repeated the lookups.
    # NOTE(review): the 0.0 lower bound of subsample / colsample_* lies outside
    # the (0,1] range noted next to each parameter.
    tuned_params = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 2e-3, 1e-1), # eta, default=0.3, range=[0,1]
        "gamma": trial.suggest_loguniform("gamma", 1e-2, 1e+2), # min_split_loss, default=0, range=[0,∞]
        "max_depth": trial.suggest_int("max_depth", 4, 10), # default=5, range=[0,∞]
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), #default=1
        "max_delta_step" : trial.suggest_int("max_delta_step", 0, 10), #default=0
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bynode": trial.suggest_uniform("colsample_bynode", 0.0, 1.0), # default=1, range=(0,1]
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-2, 1e+2), # default=0, range=[0,∞]
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-2, 1e+2), # default=1, range=[0,∞]
        "max_bin": trial.suggest_int("max_bin", 100, 400),
    }

    # The splitter has a fixed random_state, so every seed is evaluated on the
    # same folds; build it once.
    kfold = StratifiedKFold(n_splits=splits_hp, random_state=basic_seed, shuffle=True)

    score_hp = []
    # The model seeds come from the global NumPy RNG, so they differ per trial.
    for seed_hp in np.random.randint(0, 1000, num_seed_hp):
        params_xgb = {
            "random_state": int(seed_hp),
            "verbose": None,
            **tuned_params,
        }

        # Size the out-of-fold buffer from the target itself so that it cannot
        # drift from a stale ``rows_train``.
        cv = np.zeros(len(train_y))

        for train_idx, val_idx in kfold.split(train_x, train_y):
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

            dtrain = xgb.DMatrix(x_train, label=y_train)
            dvalid = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

            # To see training progress, change verbose_eval from None to 100.
            xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
            cv[val_idx] = xgbmodel.predict(dvalid)

        score_hp.append(mean_absolute_error(train_y, cv))

    return float(np.mean(score_hp))

In [None]:
# sampler = TPESampler(seed=basic_seed)
# xgb_study = optuna.create_study(study_name="xgb_parameter_opt", direction="minimize", sampler=sampler)
# xgb_study.optimize(xgb_objective, n_trials=num_trial)

# xgb_best_hyperparams = xgb_study.best_trial.params
# xgb_base_hyperparams = {"random_state": basic_seed}
# xgb_best_hyperparams.update(xgb_base_hyperparams)

# with open('../pickle/xgb_best_hyperparams.pickle', 'wb') as fw:
#     pickle.dump(xgb_best_hyperparams, fw)
# print("The best hyperparameters are:\n", xgb_best_hyperparams)

In [None]:
# optuna.visualization.matplotlib.plot_param_importances(xgb_study);

In [None]:
# optuna.visualization.matplotlib.plot_slice(xgb_study);

In [None]:
# Load the tuned XGBoost hyperparameters from disk.
# NOTE: pickle.load can execute code embedded in the file; only load pickles
# that were written by this notebook.
with open('../pickle/xgb_best_hyperparams.pickle', 'rb') as fw:
    xgb_best_hyperparams = pickle.load(fw)

In [None]:
# Train XGBoost with num_seed_tr different fold shuffles and collect, per
# seed, the out-of-fold predictions (pred_dict) and the fold-averaged test
# predictions (pred_test_dict).
# NOTE: the seeds come from the global NumPy RNG; randint may repeat a value,
# in which case the later run overwrites the earlier entry under the same key.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)
xgtest = xgb.DMatrix(test_x)
for i, seed in enumerate(lucky_seeds):

    # `seed` only controls the fold assignment; the booster parameters come
    # from xgb_best_hyperparams unchanged.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle = True) # try increasing the number of CV folds
    cv=np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        # To see training progress, change verbose_eval from None to 100.
        xgbmodel = xgb.train(xgb_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        # Out-of-fold prediction for the validation rows of this fold.
        cv[val_idx] = xgbmodel.predict(dvalid)
        pred_test += xgbmodel.predict(xgtest) / splits_tr # the divisor must equal the number of CV folds

    pred_dict['xgb'+str(seed)] = cv
    pred_test_dict['xgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

# 3. Stacking

## (1) Collect Data

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Keep the ``sel_seed`` best runs of one model family.

    Entries whose key contains ``model`` are taken from both dictionaries. The
    out-of-fold predictions are ranked by their MAE against the module-level
    ``train_y`` (lowest first) and the first ``sel_seed`` are kept (``sel_seed``
    is also module-level). The matching test predictions are returned under
    the same keys.

    Args:
        model: Substring identifying the model family in the keys.
        pred_dict: Mapping of run key to out-of-fold predictions.
        pred_test_dict: Mapping of run key to test predictions.

    Returns:
        Tuple ``(selected_oof, selected_test)`` of dictionaries sharing the
        same keys, ordered from lowest to highest MAE.
    """
    oof_by_run = {run: preds for run, preds in pred_dict.items() if model in run}
    test_by_run = {run: preds for run, preds in pred_test_dict.items() if model in run}

    # Ascending sort puts the lowest-MAE runs first; ties keep insertion order.
    ranked_runs = sorted(
        oof_by_run,
        key=lambda run: mean_absolute_error((train_y), list(oof_by_run[run])),
    )
    best_runs = ranked_runs[:sel_seed]

    selected_oof = {run: oof_by_run[run] for run in best_runs}
    selected_test = {run: test_by_run[run] for run in best_runs}
    return selected_oof, selected_test

In [None]:
pred_dict_lgb, pred_test_dict_lgb = sort_dict('lgb', pred_dict, pred_test_dict)
pred_dict_xgb, pred_test_dict_xgb = sort_dict('xgb', pred_dict, pred_test_dict)

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the two prediction dictionaries of ``model`` under ``../pickle``.

    Args:
        model: Model tag used in the file names.
        pred_dict: Object written to ``pred_dict_<model>.pickle``.
        pred_test_dict: Object written to ``pred_test_dict_<model>.pickle``.

    Returns:
        None.
    """
    targets = (
        ('../pickle/pred_dict_' + model + '.pickle', pred_dict),
        ('../pickle/pred_test_dict_' + model + '.pickle', pred_test_dict),
    )
    for path, payload in targets:
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
save_dict('lgb', pred_dict_lgb, pred_test_dict_lgb)
save_dict('xgb', pred_dict_xgb, pred_test_dict_xgb)

In [None]:
def load_dict(model):
    """Load the two pickled prediction dictionaries of ``model``.

    Reads ``../pickle/pred_dict_<model>.pickle`` and
    ``../pickle/pred_test_dict_<model>.pickle``.

    Args:
        model: Model tag used in the file names.

    Returns:
        Tuple ``(pred_dict, pred_test_dict)`` with the unpickled objects.
    """
    # NOTE: pickle.load can execute code embedded in the file; only load
    # pickles that were written by this notebook.
    loaded = []
    for stem in ('../pickle/pred_dict_', '../pickle/pred_test_dict_'):
        with open(stem + model + '.pickle', 'rb') as handle:
            loaded.append(pickle.load(handle))
    oof_preds, test_preds = loaded
    return oof_preds, test_preds

In [None]:
pred_dict_lgb, pred_test_dict_lgb = load_dict('lgb')
pred_dict_xgb, pred_test_dict_xgb = load_dict('xgb')

In [None]:
pred_dict_total = {**pred_dict_lgb, **pred_dict_xgb}
pred_test_dict_total = {**pred_test_dict_lgb, **pred_test_dict_xgb}

## (2) HP Tuning

In [None]:
def stack_objective(trial: Trial) -> float:
    """Optuna objective for the XGBoost stacking meta-model.

    Samples one set of XGBoost hyperparameters from ``trial`` and scores it
    with a ``splits_hp``-fold cross-validation on the globals ``X_train`` /
    ``train_y``. The cross-validation is repeated for five randomly drawn
    seeds, each seeding both the fold split and the booster, and the
    out-of-fold MAEs are averaged.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean out-of-fold mean absolute error over the seeds (lower is better).
    """
    # Sampled once per trial: the search space does not depend on the seed.
    params_base = {
        "verbose": None,
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1), # eta, default=0.3, range=[0,1]
        "gamma": trial.suggest_loguniform("gamma", 1e-2, 1e+2), # min_split_loss, default=0, range=[0,∞]
        "max_depth": trial.suggest_int("max_depth", 4, 10), # default=5, range=[0,∞]
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), #default=1
        "max_delta_step" : trial.suggest_int("max_delta_step", 0, 10), #default=0
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.0, 1.0), # default=1, range=(0,1]
        "colsample_bynode": trial.suggest_uniform("colsample_bynode", 0.0, 1.0), # default=1, range=(0,1]
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-2, 1e+2), # default=0, range=[0,∞]
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-2, 1e+2), # default=1, range=[0,∞]
        "max_bin": trial.suggest_int("max_bin", 100, 400),
    }

    score_hp = []
    # Seeds come from NumPy's global RNG; call np.random.seed(...) before
    # running the study if the whole search must be reproducible.
    for seed_hp in np.random.randint(0, 1000, 5):
        seed_hp = int(seed_hp)
        # The seed must actually reach the model and the splitter; otherwise
        # every repetition is an exact copy of the first one.
        params_xgb = {"random_state": seed_hp, **params_base}

        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        cv = np.zeros(rows_train)

        # Split the same frame that is indexed below.
        for train_idx, val_idx in kfold.split(X_train, train_y):

            x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            dtrain = xgb.DMatrix(x_train, label=y_train)
            dvalid = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

            # To see training progress, change verbose_eval from None to 100.
            stack_xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
            cv[val_idx] = stack_xgbmodel.predict(dvalid)

        score_hp.append(mean_absolute_error(train_y, cv))

    return np.mean(score_hp)

In [None]:
# Build the stacking feature matrices: one column per selected base prediction,
# one row per sample.
X_train = pd.DataFrame(np.vstack([x for _, x in pred_dict_total.items()]).T)
X_test = pd.DataFrame(np.vstack([x for _, x in pred_test_dict_total.items()]).T)

sampler = TPESampler(seed=basic_seed)
stack_study = optuna.create_study(study_name="stack_parameter_opt", direction="minimize", sampler=sampler)
stack_study.optimize(stack_objective, n_trials=num_trial)

stack_best_hyperparams = stack_study.best_trial.params
stack_base_hyperparams = {"random_state": basic_seed}
stack_best_hyperparams.update(stack_base_hyperparams)

# Persist the result: a later cell reloads the hyperparameters from this
# pickle, so without saving here it would pick up a stale (or missing) file
# and silently discard the tuning that just ran.
with open('../pickle/stack_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(stack_best_hyperparams, fw)
print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# X_train = pd.DataFrame(np.vstack([x for _, x in pred_dict_total.items()]).T)
# X_test = pd.DataFrame(np.vstack([x for _, x in pred_test_dict_total.items()]).T)

# sampler = TPESampler(seed=basic_seed)
# stack_study = optuna.create_study(study_name="stack_parameter_opt", direction="minimize", sampler=sampler)
# stack_study.optimize(stack_objective, n_trials=num_trial)

# stack_best_hyperparams = stack_study.best_trial.params
# stack_base_hyperparams = {"random_state": basic_seed}
# stack_best_hyperparams.update(stack_base_hyperparams)

# with open('../pickle/stack_best_hyperparams.pickle', 'wb') as fw:
#     pickle.dump(stack_best_hyperparams, fw)
# print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# optuna.visualization.matplotlib.plot_param_importances(stack_study);

In [None]:
# optuna.visualization.matplotlib.plot_slice(stack_study);

In [None]:
# Load the stacking hyperparameters stored in the pickle directory.
with open('../pickle/stack_best_hyperparams.pickle', mode='rb') as handle:
    stack_best_hyperparams = pickle.load(handle)

In [None]:
# Fit the stacking model with splits_tr-fold cross-validation.
# `pred` collects the out-of-fold predictions on the training set and
# `pred_test` the average of the per-fold predictions on the test set.
pred = np.zeros(rows_train)
pred_test = np.zeros(rows_test)
kfold = StratifiedKFold(n_splits=splits_tr, random_state=basic_seed, shuffle=True)

# The test matrix is identical for every fold, so build it once.
dtest = xgb.DMatrix(X_test)

for train_idx, val_idx in kfold.split(X_train, train_y):
    x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # To see training progress, change verbose_eval from None to 100.
    stack_xgbmodel = xgb.train(stack_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

    pred[val_idx] = stack_xgbmodel.predict(dvalid)
    pred_test += stack_xgbmodel.predict(dtest) / splits_tr

In [None]:
# Report the MAE of the stacking model's out-of-fold predictions on the training set.
print(f'CV mean_absolute_error: {mean_absolute_error(train_y, pred):.6f}')

# 4. Blending

In [None]:
# Snapshot the stacking predictions; `pred` / `pred_test` are reassigned
# by the blending cells below.
stack_train, stack_test = pred.copy(), pred_test.copy()

In [None]:
# Grid-search integer blending weights for (LightGBM, XGBoost, stacking) and
# keep the five weight tuples with the lowest MAE on the training set.
from itertools import product

candidate = np.arange(0, 15)

# Seed-averaged base predictions do not depend on the weights: compute once.
blend_lgb = sum(pred_dict_lgb.values()) / sel_seed
blend_xgb = sum(pred_dict_xgb.values()) / sel_seed

score = {}
# product (not permutations) so the same weight may be given to several
# models, e.g. the plain average (1, 1, 1).
for i in product(candidate, repeat=3):
    weight_sum = sum(i)
    if weight_sum == 0:
        # All-zero weights do not define a blend (division by zero).
        continue
    pred_permute = blend_lgb * i[0] + blend_xgb * i[1] + stack_train * i[2]
    score[i] = mean_absolute_error(train_y, pred_permute / weight_sum)

score = dict(sorted(score.items(), key=lambda x: x[1], reverse=False)[:5])
score

In [None]:
# Blend the training predictions with the best-scoring weight tuple
# (the first key of `score`) and show the resulting MAE.
best_weights = next(iter(score))
w_lgb, w_xgb, w_stack = best_weights
pred = (
    sum(pred_dict_lgb.values()) / sel_seed * w_lgb
    + sum(pred_dict_xgb.values()) / sel_seed * w_xgb
    + stack_train * w_stack
) / sum(best_weights)
mean_absolute_error(train_y, pred)

In [None]:
# Blend the test predictions with the best-scoring weight tuple (first key of
# `score`). The seed average divides by sel_seed, matching the blend on the
# training side, instead of a hard-coded 3.
best_weights = list(score.keys())[0]
pred_test = (sum(pred_test_dict_lgb.values())/sel_seed * best_weights[0] +
             sum(pred_test_dict_xgb.values())/sel_seed * best_weights[1] +
             stack_test * best_weights[2]
            ) / sum(best_weights)

In [None]:
# Grid-search integer blending weights for (LightGBM, XGBoost, CNN, RCNN,
# stacking) and keep the five weight tuples with the lowest training MAE.
# NOTE(review): pred_dict_cnn / pred_dict_rcnn are not created in the visible
# part of this notebook - they must be loaded before running this cell.
from itertools import product

candidate = np.arange(0, 11)

# Seed-averaged base predictions do not depend on the weights: compute once.
blend_lgb = sum(pred_dict_lgb.values()) / sel_seed
blend_xgb = sum(pred_dict_xgb.values()) / sel_seed
blend_cnn = sum(pred_dict_cnn.values()) / sel_seed
blend_rcnn = sum(pred_dict_rcnn.values()) / sel_seed

score = {}
# product (not permutations) so the same weight may be given to several
# models, e.g. the plain average (1, 1, 1, 1, 1).
for i in product(candidate, repeat=5):
    weight_sum = sum(i)
    if weight_sum == 0:
        # All-zero weights do not define a blend (division by zero).
        continue
    pred_permute = (blend_lgb * i[0] +
                    blend_xgb * i[1] +
                    blend_cnn * i[2] +
                    blend_rcnn * i[3] +
                    stack_train * i[4])
    score[i] = mean_absolute_error(train_y, pred_permute / weight_sum)

score = dict(sorted(score.items(), key=lambda x: x[1], reverse=False)[:5])
score

In [None]:
# Blend the training predictions of all five models with the best-scoring
# weight tuple (the first key of `score`) and show the resulting MAE.
best_weights = next(iter(score))
w_lgb, w_xgb, w_cnn, w_rcnn, w_stack = best_weights
pred = (
    sum(pred_dict_lgb.values()) / sel_seed * w_lgb
    + sum(pred_dict_xgb.values()) / sel_seed * w_xgb
    + sum(pred_dict_cnn.values()) / sel_seed * w_cnn
    + sum(pred_dict_rcnn.values()) / sel_seed * w_rcnn
    + stack_train * w_stack
) / sum(best_weights)
mean_absolute_error(train_y, pred)

In [None]:
# Blend the test predictions of all five models with the best-scoring weight
# tuple (first key of `score`). The seed average divides by sel_seed, matching
# the blend on the training side, instead of a hard-coded 3.
# NOTE(review): pred_test_dict_cnn / pred_test_dict_rcnn are not created in the
# visible part of this notebook - they must be loaded before running this cell.
best_weights = list(score.keys())[0]
pred_test = (sum(pred_test_dict_lgb.values())/sel_seed * best_weights[0] +
             sum(pred_test_dict_xgb.values())/sel_seed * best_weights[1] +
             sum(pred_test_dict_cnn.values())/sel_seed * best_weights[2] +
             sum(pred_test_dict_rcnn.values())/sel_seed * best_weights[3] +
             stack_test * best_weights[4]
            ) / sum(best_weights)

# 5. Weight

In [None]:
# For every unit-wide prediction bucket (target, target + 1), search the
# multiplicative correction in [0.7, 1.3] that minimises the training MAE
# when applied only to the predictions inside that bucket.
weight_dict = {}
for target in tqdm(np.arange(4, 17)):
    # The bucket membership does not depend on the weight: compute it once.
    in_bucket = (pred > target) & (pred < target + 1)

    if not in_bucket.any():
        # No training prediction falls in this bucket, so every weight scores
        # the same; keep the neutral factor instead of an arbitrary one that
        # would later distort test predictions landing here.
        weight_dict[target] = 1.0
        continue

    score_dict = {}
    for weight in np.linspace(0.7, 1.3, 6001):
        score_dict[weight] = mean_absolute_error(train_y, np.where(in_bucket, pred*weight, pred))
    weight_dict[target] = min(score_dict, key=score_dict.get)

In [None]:
# MAE of the blended training predictions before the per-bucket rescaling below is applied.
mean_absolute_error(train_y, pred)

In [None]:
def _rescale_by_bucket(values, weights):
    """Scale each prediction by the weight of the bucket it originally falls in.

    Every mask is taken from the unscaled ``values``, so a prediction is
    multiplied by exactly one factor. Re-evaluating the masks after each
    scaling step would let a value drift into a neighbouring bucket and be
    scaled a second time (e.g. 4.2 * 0.9 = 3.78 would then also receive the
    "below the first bucket" factor), which is not how the weights were fitted.

    Args:
        values: 1-D array of predictions.
        weights: Mapping bucket lower bound -> factor, in ascending key order.

    Returns:
        A new float array with the rescaled predictions.
    """
    values = np.asarray(values, dtype=float)
    bounds = list(weights.keys())
    lowest, highest = bounds[0], bounds[-1]

    scale = np.ones_like(values)
    for key, value in weights.items():
        scale[(values > key) & (values < key + 1)] = value
    # Predictions outside the searched range reuse the weight of the nearest bucket.
    scale[values < lowest] = weights[lowest]
    scale[values > highest + 1] = weights[highest]
    return values * scale


min_target = list(weight_dict.keys())[0]
max_target = list(weight_dict.keys())[-1]
min_weight = weight_dict[min_target]
max_weight = weight_dict[max_target]

pred = _rescale_by_bucket(pred, weight_dict)
pred_test = _rescale_by_bucket(pred_test, weight_dict)

In [None]:
# Write the final test predictions into the submission frame. Bracket
# assignment sets the column whether or not it already exists, whereas
# attribute assignment only sets a plain attribute when the column is missing.
submission['Target'] = pred_test

In [None]:
# Write the submission as ../submission/<name>-<number>.csv without the index column.
submission_name = '20220324'
submission_number = '1'
submission_path = f'../submission/{submission_name}-{submission_number}.csv'
submission.to_csv(submission_path, index=False)