# 전복 나이 예측 경진대회

https://github.com/pobredward/dacon-competition/tree/main/abalone-age-prediction <br>
모든 코드는 깃허브에 업로드하였고, 각각의 코드를 파일 한 개로 모을 시간이 마땅치 않아 nbviewer 링크로 업로드합니다.

#### 1. 파생 변수를 각각 6000여 개, 8000여 개 생성하여 두 개의 데이터프레임을 만들었습니다.
https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/1_FeatureEngineering.ipynb

#### 2. 얻어낸 데이터프레임 두 개를 따로따로 모델에 학습시켰습니다.
(LGB1) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/2_Modeling_lightgbm-f1.ipynb<br>
(LGB2) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/2_Modeling_lightgbm-f2.ipynb<br>
(XGB1) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/2_Modeling_xgboost-f1.ipynb<br>
(XGB2) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/X_Modeling_xgboost-f2.ipynb<br>
(CAT1) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/X_Modeling_catboost-f1.ipynb<br>
(CAT2) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/X_Modeling_catboost-f2.ipynb<br>
(NN) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/2_Modeling_nn.ipynb

#### 3. 총 2번의 Stacking을 거친 후 Blending하였습니다. Blending 가중치는 seed가 바뀔 때마다 permutation 탐색으로 적절한 값을 직접 다시 찾아야 합니다.
(STACK1) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/3_stacking.ipynb<br>
(STACK2) https://nbviewer.org/github/pobredward/dacon-competition/blob/main/abalone-age-prediction/code/3_stacking2.ipynb

![structure.png](../image/structure.png)

# Import

In [None]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

In [None]:
def create_dir(dir):
    """Create the directory ``dir`` (with parents) and report the outcome.

    Args:
        dir: Path of the directory to create. The name shadows the builtin
            ``dir``; it is kept so that existing keyword callers still work.

    Prints ``Created Directory : <dir>`` when the directory was created and
    ``Directory already existed : <dir>`` when the path was already present.
    """
    # Attempt the creation and react to the failure instead of testing
    # os.path.exists first: a path that appears between the check and the
    # call can then no longer raise out of this function.
    try:
        os.makedirs(dir)
    except FileExistsError:
        print("Directory already existed :", dir)
    else:
        print("Created Directory :", dir)
# Make sure the output folders exist before anything is written to them.
# "../pickle" receives the pickled predictions/hyperparameters and
# "../submission" the final CSV; "../model" is not written to in this
# notebook (presumably used by sibling notebooks - confirm).
create_dir("../pickle")
create_dir("../model")
create_dir("../submission")

In [None]:
# Read the competition files; the "id" column is removed from the train and
# test frames, while the sample submission is kept exactly as read.
train = pd.read_csv("../data/train.csv").drop(columns=["id"])
test = pd.read_csv("../data/test.csv").drop(columns=["id"])
submission = pd.read_csv("../data/sample_submission.csv")

In [None]:
# --- Dataset sizes ---
rows_train = len(train)  # number of rows in the provided train data
rows_test = len(test)  # number of rows in the provided test data

# --- Hyperparameter tuning ---
num_trial = 100  # number of tuning trials to run
splits_hp = 5  # number of k-fold splits used while tuning

# --- Model training ---
splits_tr = 15  # number of k-fold splits used while training
basic_seed = 42  # default seed
num_seed_tr = 10  # number of seeds to train with
sel_seed = 3  # number of seeds to keep

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Keep the ``sel_seed`` best-scoring prediction sets of one model.

    Entries whose key contains ``model`` are ranked by the mean absolute
    error between the module-level ``train_y`` and the entry's values
    (lowest first); the first ``sel_seed`` of them are kept, together with
    the test predictions stored under the same keys.

    Args:
        model: Tag looked for inside each key with ``in``. For string keys
            this is a substring match, so e.g. ``'lgb'`` also matches keys
            containing ``'lgb2'``.
        pred_dict: Mapping of key -> train-set predictions.
        pred_test_dict: Mapping of key -> test-set predictions.

    Returns:
        ``(selected_train, selected_test)``: two dicts sharing the same keys
        in the same (best-first) order.

    Raises:
        KeyError: If a selected key has no counterpart in ``pred_test_dict``.
    """
    oof_candidates = {name: preds for name, preds in pred_dict.items() if model in name}
    test_candidates = {name: preds for name, preds in pred_test_dict.items() if model in name}

    def oof_mae(entry):
        # entry is a (key, predictions) pair taken from oof_candidates.
        return mean_absolute_error((train_y), list(entry[1]))

    ranked = sorted(oof_candidates.items(), key=oof_mae)
    pred_dict_new_local = dict(ranked[:sel_seed])
    pred_test_dict_new_local = {
        name: test_candidates[name] for name in pred_dict_new_local
    }

    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the train and test prediction dicts of ``model``.

    Args:
        model: Tag inserted into the file names.
        pred_dict: Object written to ``../pickle/pred_dict_<model>.pickle``.
        pred_test_dict: Object written to
            ``../pickle/pred_test_dict_<model>.pickle``.
    """
    outputs = (
        ('../pickle/pred_dict_', pred_dict),
        ('../pickle/pred_test_dict_', pred_test_dict),
    )
    for prefix, payload in outputs:
        with open(prefix + model + '.pickle', 'wb') as sink:
            pickle.dump(payload, sink)

In [None]:
def load_dict(model):
    """Load the pickled train and test prediction dicts of ``model``.

    Reads ``../pickle/pred_dict_<model>.pickle`` and
    ``../pickle/pred_test_dict_<model>.pickle``.

    Args:
        model: Tag that was inserted into the file names when saving.

    Returns:
        ``(pred_dict, pred_test_dict)`` as unpickled from the two files.
    """
    # NOTE(review): pickle executes code on load; only read files produced
    # by this pipeline.
    loaded = []
    for prefix in ('../pickle/pred_dict_', '../pickle/pred_test_dict_'):
        with open(prefix + model + '.pickle', 'rb') as source:
            loaded.append(pickle.load(source))
    pred_dict_new_local, pred_test_dict_new_local = loaded
    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
# Empty accumulators for the stacker's train-set and test-set predictions;
# they are filled per seed further down.
pred_dict, pred_test_dict = {}, {}

In [None]:
# Load the pickled (train, test) prediction dicts of every base model.
# Each call reads ../pickle/pred_dict_<tag>.pickle and
# ../pickle/pred_test_dict_<tag>.pickle; the files are presumably written
# by the modeling notebooks - confirm. Note that no 'xgb2' set is loaded.
pred_dict_lgb, pred_test_dict_lgb = load_dict('lgb')
pred_dict_lgb2, pred_test_dict_lgb2 = load_dict('lgb2')
pred_dict_xgb, pred_test_dict_xgb = load_dict('xgb')
pred_dict_cat, pred_test_dict_cat = load_dict('cat')
pred_dict_cat2, pred_test_dict_cat2 = load_dict('cat2')
pred_dict_nn, pred_test_dict_nn = load_dict('nn')

In [None]:
# Merge every base model's predictions into one train dict and one test
# dict. The insertion order (lgb, cat, lgb2, xgb, cat2, nn) is significant:
# it becomes the column order of the stacking feature matrices. On a key
# clash the later dict wins, exactly as with {**a, **b}.
pred_dict_total = {}
pred_test_dict_total = {}
for oof_part, test_part in (
    (pred_dict_lgb, pred_test_dict_lgb),
    (pred_dict_cat, pred_test_dict_cat),
    (pred_dict_lgb2, pred_test_dict_lgb2),
    (pred_dict_xgb, pred_test_dict_xgb),
    (pred_dict_cat2, pred_test_dict_cat2),
    (pred_dict_nn, pred_test_dict_nn),
):
    pred_dict_total.update(oof_part)
    pred_test_dict_total.update(test_part)

# Stacking

## (2) HP Tuning

In [None]:
# Build the stacking feature matrices: one column per base-model prediction
# set, one row per sample. Both matrices are built from the SAME key list so
# that column j of X_test always holds the test predictions belonging to
# column j of X_train; relying on the two dicts merely sharing an iteration
# order would silently misalign features if they ever diverged. A key that
# is missing from the test dict raises KeyError here instead.
stack_keys = list(pred_dict_total)
X_train = pd.DataFrame(np.vstack([pred_dict_total[key] for key in stack_keys]).T)
X_test = pd.DataFrame(np.vstack([pred_test_dict_total[key] for key in stack_keys]).T)
train_y = train['Target']

In [None]:
def stack_objective(trial: Trial) -> float:
    """Optuna objective: cross-validated MAE of the XGBoost stacker.

    Samples one XGBoost parameter set from ``trial``, trains it with
    ``splits_hp``-fold stratified CV on the module-level ``X_train`` /
    ``train_y`` and scores the out-of-fold predictions.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean (over the CV seeds, currently only seed 0) of the mean absolute
        error between ``train_y`` and the out-of-fold predictions.
    """
    score_hp = []
    for seed_hp in [0]:
        # suggest_float(..., log=True) / suggest_float(...) replace the
        # deprecated suggest_loguniform / suggest_uniform with the same
        # parameter names and ranges.
        params_xgb = {
            "random_state": basic_seed,
            # NOTE(review): "verbose" is not a documented XGBoost training
            # parameter ("verbosity" is); left unchanged - confirm intent.
            "verbose": None,
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True), # eta, default=0.3, range=[0,1]
            "gamma": trial.suggest_float("gamma", 1e-2, 1e+2, log=True), # min_split_loss, default=0, range=[0,∞]
            "max_depth": trial.suggest_int("max_depth", 4, 12), # default=6, range=[0,∞]
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), #default=1
            "max_delta_step" : trial.suggest_int("max_delta_step", 0, 10), #default=0
            "subsample": trial.suggest_float("subsample", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bynode": trial.suggest_float("colsample_bynode", 0.2, 1.0), # default=1, range=(0,1]
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1e+2, log=True), # default=0, range=[0,∞]
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1e+2, log=True), # default=1, range=[0,∞]
            "max_bin": trial.suggest_int("max_bin", 100, 500),
        }

        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        # Out-of-fold predictions; every row is filled by the fold in which
        # it is part of the validation split.
        cv = np.zeros(rows_train)

        for train_idx, val_idx in kfold.split(X_train, train_y):

            x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            dtrain = xgb.DMatrix(x_train, label=y_train)
            dvalid = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

            # Up to 100000 boosting rounds with early stopping
            # (early_stopping_rounds=30) on the watchlist.
            stack_xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
            cv[val_idx] = stack_xgbmodel.predict(dvalid)

        score_hp.append(mean_absolute_error(train_y, cv))

    return np.mean(score_hp)

In [None]:
# Run the hyperparameter search: a TPE sampler seeded with basic_seed
# minimizes stack_objective over num_trial trials.
sampler = TPESampler(seed=basic_seed)
stack_study = optuna.create_study(study_name="stack_parameter_opt", direction="minimize", sampler=sampler)
stack_study.optimize(stack_objective, n_trials=num_trial)

# Take the best trial's parameters and add the fixed random_state, which is
# not part of the search space.
stack_best_hyperparams = stack_study.best_trial.params
stack_base_hyperparams = {"random_state": basic_seed}
stack_best_hyperparams.update(stack_base_hyperparams)
print("The best hyperparameters are:\n", stack_best_hyperparams)

# Persist the result so that it can be reloaded without re-running the study.
with open('../pickle/stack_best_hyperparams2.pickle', 'wb') as fw:
    pickle.dump(stack_best_hyperparams, fw)
# NOTE(review): prints the same dict a second time - confirm this is wanted.
print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# Plot the hyperparameter importances of the finished study (matplotlib
# backend); the trailing semicolon suppresses the cell's repr output.
optuna.visualization.matplotlib.plot_param_importances(stack_study);

In [None]:
# Slice plot of the finished study (matplotlib backend); the trailing
# semicolon suppresses the cell's repr output.
optuna.visualization.matplotlib.plot_slice(stack_study);

In [None]:
# Reload the tuned hyperparameters from disk (lets the study be skipped
# when the pickle already exists).
with open('../pickle/stack_best_hyperparams2.pickle', 'rb') as fw:
    stack_best_hyperparams = pickle.load(fw)

In [None]:
# Re-derive the best hyperparameters from the in-memory study and save them
# again. NOTE(review): this needs stack_study to exist and overwrites
# whatever was just loaded from the pickle - skip this cell when the study
# was not run in this session.
stack_best_hyperparams = stack_study.best_trial.params
stack_base_hyperparams = {"random_state": basic_seed}
stack_best_hyperparams.update(stack_base_hyperparams)
print("The best hyperparameters are:\n", stack_best_hyperparams)

with open('../pickle/stack_best_hyperparams2.pickle', 'wb') as fw:
    pickle.dump(stack_best_hyperparams, fw)

In [None]:
# Draw num_seed_tr DISTINCT seeds from [0, 1000). Sampling without
# replacement matters: results are stored under the key 'stack' + seed, so a
# repeated seed would overwrite an earlier run and leave fewer than
# num_seed_tr prediction sets. (Raises ValueError if num_seed_tr > 1000.)
lucky_seeds = np.random.choice(1000, num_seed_tr, replace=False)

# The test matrix is the same for every seed and fold; build it once.
dtest = xgb.DMatrix(X_test)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros(rows_train)  # out-of-fold predictions for this seed
    pred_test = np.zeros(rows_test)  # fold-averaged test predictions

    for train_idx, val_idx in kfold.split(X_train, train_y):
        x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        stack_xgbmodel = xgb.train(stack_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx] = stack_xgbmodel.predict(dvalid)
        # Each fold contributes 1/splits_tr of the final test prediction.
        pred_test += stack_xgbmodel.predict(dtest) / splits_tr

    pred_dict['stack'+str(seed)] = cv
    pred_test_dict['stack'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

In [None]:
# Keep the sel_seed best 'stack' runs (ranked by MAE inside sort_dict) and
# pickle them under the 'stack2' tag, i.e. ../pickle/pred_dict_stack2.pickle
# and ../pickle/pred_test_dict_stack2.pickle.
pred_dict_stack2, pred_test_dict_stack2 = sort_dict('stack', pred_dict, pred_test_dict)
save_dict('stack2', pred_dict_stack2, pred_test_dict_stack2)

In [None]:
# Average the selected stacking runs into a single train-set prediction.
pred = np.zeros(rows_train)
for seed_pred in pred_dict_stack2.values():
    pred += seed_pred
pred /= len(pred_dict_stack2)

In [None]:
# Report the MAE of the averaged predictions after rounding them to the
# nearest integer.
print(f'CV mean_absolute_error: {mean_absolute_error(train_y, np.round(pred)):.6f}')

# 4. Blending

In [None]:
# Load the prediction dicts saved under the 'stack' tag (presumably the
# output of the first stacking notebook - confirm).
pred_dict_stack1, pred_test_dict_stack1 = load_dict('stack')

In [None]:
# Candidate blend weights. Every ordered 7-tuple of distinct candidates is
# scored, so the pool has to stay small: np.arange(0, 600) would produce
# 600!/593! (about 2.7e19) tuples, which can be neither enumerated nor
# stored. The pool below gives 9!/2! = 181,440 tuples; adjust it by hand
# whenever the seeds change.
candidate = [0, 1, 2, 3, 4, 5, 10, 140, 520]
# candidate = np.arange(0, 600)

# Per-model mean train-set prediction. These do not depend on the weights,
# so they are computed once instead of once per permutation. XGB is left out
# of the blend (its term was commented out).
blend_members = [
    sum(pred_dict_lgb.values())/sel_seed,
    sum(pred_dict_lgb2.values())/sel_seed,
    sum(pred_dict_cat.values())/sel_seed,
    sum(pred_dict_cat2.values())/sel_seed,
    sum(pred_dict_nn.values())/sel_seed,
    sum(pred_dict_stack1.values())/sel_seed,
    sum(pred_dict_stack2.values())/sel_seed,
]
num_weights = len(blend_members)

# Number of permutations n!/(n-r)!; only used as the progress-bar total so
# the iterator does not have to be materialized into a list.
num_permute = 1
for factor in range(len(candidate) - num_weights + 1, len(candidate) + 1):
    num_permute *= factor

permute = permutations(candidate, num_weights)
score = {}
for i in tqdm(permute, total=num_permute):
    # Weighted sum of the model means, normalized by the total weight.
    pred_permute = sum(member * weight for member, weight in zip(blend_members, i))
    score[i] = mean_absolute_error(train_y, pred_permute/sum(i))

# Keep the five weight tuples with the lowest MAE, best first.
score = dict(sorted(score.items(), key=lambda x: x[1], reverse=False)[:5])
score

In [None]:
# Blend the train-set predictions with the best weight tuple (the first key
# of `score`). XGB is not part of the blend.
best_weights = list(score.keys())[0]
w_lgb, w_lgb2, w_cat, w_cat2, w_nn, w_stack1, w_stack2 = best_weights

pred = (
    sum(pred_dict_lgb.values())/sel_seed * w_lgb
    + sum(pred_dict_lgb2.values())/sel_seed * w_lgb2
    + sum(pred_dict_cat.values())/sel_seed * w_cat
    + sum(pred_dict_cat2.values())/sel_seed * w_cat2
    + sum(pred_dict_nn.values())/sel_seed * w_nn
    + sum(pred_dict_stack1.values())/sel_seed * w_stack1
    + sum(pred_dict_stack2.values())/sel_seed * w_stack2
) / sum(best_weights)

# MAE of the blended predictions after rounding to the nearest integer.
mean_absolute_error(train_y, np.round(pred))

In [None]:
# Blend the test-set predictions with the same weights as the train blend.
# Each model's seed runs are averaged with sel_seed (not a hard-coded 3), so
# the test blend stays on the same scale as the train blend if the number of
# selected seeds is ever changed. XGB is not part of the blend.
best_weights = list(score.keys())[0]
w_lgb, w_lgb2, w_cat, w_cat2, w_nn, w_stack1, w_stack2 = best_weights

pred_test = (
    sum(pred_test_dict_lgb.values())/sel_seed * w_lgb
    + sum(pred_test_dict_lgb2.values())/sel_seed * w_lgb2
    + sum(pred_test_dict_cat.values())/sel_seed * w_cat
    + sum(pred_test_dict_cat2.values())/sel_seed * w_cat2
    + sum(pred_test_dict_nn.values())/sel_seed * w_nn
    + sum(pred_test_dict_stack1.values())/sel_seed * w_stack1
    + sum(pred_test_dict_stack2.values())/sel_seed * w_stack2
) / sum(best_weights)

In [None]:
# Round the blended test predictions to integers and store them in the
# "Target" column. Item assignment is used instead of attribute assignment:
# it always writes a column, whereas `submission.Target = ...` would only
# set a plain Python attribute if the column did not already exist.
submission["Target"] = np.round(pred_test).astype(int)

In [None]:
# Write the submission to ../submission/<name>-<number>.csv without the
# DataFrame index; both parts of the file name are edited by hand per run.
submission_name = '20220401'
submission_number = '3'
submission.to_csv(f'../submission/{submission_name}-{submission_number}.csv', index = False)