# 1. Import

In [None]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool

In [None]:
def create_dir(dir):
    """Create the directory `dir` (with any missing parents) unless the path exists.

    Prints one line saying whether the directory was created or was already
    present. Returns None.
    """
    if os.path.exists(dir):
        # Nothing to do: report and leave the existing path untouched.
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Make sure the output directories exist before anything is written.
# Of these, the cells shown here only write into ../pickle.
create_dir("../pickle")
create_dir("../model")
create_dir("../submission")

In [None]:
# Read the train / test feature tables and the sample submission in one pass.
train, test, submission = map(
    pd.read_csv,
    (
        "../data/train_f2.csv",
        "../data/test_f2.csv",
        "../data/sample_submission.csv",
    ),
)

In [None]:
# Dataset sizes.
rows_train = len(train)  # number of rows in the provided train data
rows_test = len(test)  # number of rows in the provided test data

# Hyperparameter tuning.
num_trial = 100  # how many tuning trials to run
splits_hp = 5  # number of k-fold splits used while tuning

# Model training.
splits_tr = 15  # number of k-fold splits used while training
basic_seed = 42  # default seed
num_seed_tr = 15  # number of training seeds
sel_seed = 3  # number of seeds to select

In [None]:
# Two independent containers filled by the training loop:
# out-of-fold predictions and test-set predictions, keyed by run name.
pred_dict, pred_test_dict = {}, {}

# 2. CatBoost

In [None]:
# Separate the label column from the predictors; the test frame is copied as-is.
train_y = train['Target']
train_x = train.drop(columns=['Target'])
test_x = test.copy()

# Columns passed to CatBoost as categorical features.
cat_features = ["Gender"]

In [None]:
def cat_objective(trial: Trial) -> float:
    """Optuna objective: cross-validated MAE of a CatBoostRegressor.

    Samples one hyperparameter configuration from `trial`, fits one model per
    stratified fold of (`train_x`, `train_y`), collects the out-of-fold
    predictions and scores them against `train_y` with mean absolute error.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the out-of-fold MAE over the seeds in the seed list
        (currently the single seed 0). Lower is better.
    """
    score_hp = []
    for seed_hp in [0]:
        params_cat = {
            "cat_features": cat_features,
            "random_state": seed_hp,
            "loss_function": "MAE",
            "eval_metric": "MAE",
            "iterations": 10000,
            "od_type": "iter",
            # NOTE(review): fit() below also passes early_stopping_rounds=50.
            # The CatBoost docs describe that argument as setting od_wait, so
            # this tuned value is probably overridden - confirm before relying
            # on it.
            "od_wait": trial.suggest_int("od_wait", 300, 600),
            # suggest_float replaces the deprecated suggest_uniform /
            # suggest_loguniform; log=True keeps the log-uniform sampling.
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 5e-2),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 0.8),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 1e-1, 1e+2, log=True),
            "random_strength": trial.suggest_int("random_strength", 0, 30),
            "depth": trial.suggest_int("depth", 4, 7),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 1e+0, log=True),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 30),
            "max_bin": trial.suggest_int("max_bin", 200, 500),
        }

        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        # Out-of-fold buffer, sized from the frame it is indexed against.
        cv = np.zeros(len(train_x))

        for train_idx, val_idx in kfold.split(train_x, train_y):
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

            catmodel = CatBoostRegressor(**params_cat)
            catmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=50, verbose=False)
            cv[val_idx] = catmodel.predict(x_val)

        score_hp.append(mean_absolute_error(train_y, cv))

    return np.mean(score_hp)

In [None]:
# Search for the configuration that minimises cat_objective, using a seeded TPE sampler.
sampler = TPESampler(seed=basic_seed)
cat_study = optuna.create_study(
    study_name="cat_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
cat_study.optimize(cat_objective, n_trials=num_trial)

# Start from the tuned values of the best trial...
cat_best_hyperparams = dict(cat_study.best_trial.params)
# ...then layer the fixed settings on top (a fixed key wins on any clash).
cat_base_hyperparams = {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'cat_features': cat_features,
    'random_state': basic_seed,
    'od_type': 'iter',
    'iterations': 10000,
}
cat_best_hyperparams = {**cat_best_hyperparams, **cat_base_hyperparams}

# Persist the merged configuration so it can be reloaded later.
with open('../pickle/cat_best_hyperparams2.pickle', 'wb') as fw:
    pickle.dump(cat_best_hyperparams, fw)
print("The best hyperparameters are:\n", cat_best_hyperparams)

In [None]:
# Plot the hyperparameter importances of the finished study (matplotlib backend).
# The trailing ';' suppresses the cell's text output.
optuna.visualization.matplotlib.plot_param_importances(cat_study);

In [None]:
# Slice plot of the finished study (matplotlib backend).
# The trailing ';' suppresses the cell's text output.
optuna.visualization.matplotlib.plot_slice(cat_study);

In [None]:
# Reload the hyperparameter dict from the pickle file written by the tuning cell.
# Presumably this lets training run without repeating the search - confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
with open('../pickle/cat_best_hyperparams2.pickle', 'rb') as fw:
    cat_best_hyperparams = pickle.load(fw)

In [None]:
# Draw the fold-shuffling seeds from a generator seeded with basic_seed so a
# Restart & Run All reproduces the same runs. Sample WITHOUT replacement:
# a repeated seed would overwrite an earlier 'cat<seed>' entry in the
# prediction dicts and silently leave fewer than num_seed_tr runs.
lucky_seeds = np.random.default_rng(basic_seed).choice(1000, size=num_seed_tr, replace=False)

# NOTE(review): `seed` only changes how the folds are shuffled. The model's
# random_state comes from cat_best_hyperparams (basic_seed where the tuning
# cell built it) - confirm that is intended.
# NOTE(review): early_stopping_rounds=50 is documented by CatBoost as setting
# od_wait, so an od_wait stored in cat_best_hyperparams is probably
# overridden - confirm.
for seed in lucky_seeds.tolist():

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros(rows_train)  # out-of-fold predictions for this seed
    pred_test = np.zeros(rows_test)  # test predictions averaged over the folds

    for train_idx, val_idx in kfold.split(train_x, train_y):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

        catmodel = CatBoostRegressor(**cat_best_hyperparams)
        catmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=50, verbose=False)

        cv[val_idx] = catmodel.predict(x_val)
        # Each fold contributes an equal share to the averaged test prediction.
        pred_test += catmodel.predict(test_x) / splits_tr

    pred_dict['cat'+str(seed)] = cv
    pred_test_dict['cat'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

# 3. Export

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Pick the `sel_seed` runs of `model` with the lowest out-of-fold MAE.

    Args:
        model: Substring selecting the runs to consider; a key is kept when
            `model in key`.
        pred_dict: Mapping of run name -> out-of-fold predictions, scored
            against the module-level `train_y`.
        pred_test_dict: Mapping of run name -> test predictions.

    Returns:
        A pair `(oof, test)` of dicts holding the selected runs, ordered from
        lowest to highest MAE; `test` has the same keys as `oof`.

    Raises:
        KeyError: If a selected run has no matching entry in `pred_test_dict`.
    """
    oof_by_run = {name: preds for name, preds in pred_dict.items() if model in name}
    test_by_run = {name: preds for name, preds in pred_test_dict.items() if model in name}

    def oof_mae(item):
        # item is a (run name, out-of-fold predictions) pair.
        return mean_absolute_error((train_y), list(item[1]))

    ranked = sorted(oof_by_run.items(), key=oof_mae)
    best_oof = dict(ranked[:sel_seed])
    best_test = {name: test_by_run[name] for name in best_oof}

    return best_oof, best_test

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle both prediction dicts under ../pickle, tagged with `model`.

    Writes `../pickle/pred_dict_<model>.pickle` first, then
    `../pickle/pred_test_dict_<model>.pickle`. Returns None.
    """
    outputs = (
        ('../pickle/pred_dict_', pred_dict),
        ('../pickle/pred_test_dict_', pred_test_dict),
    )
    for prefix, payload in outputs:
        with open(prefix+model+'.pickle', 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
# Keep the sel_seed CatBoost runs with the lowest out-of-fold MAE.
# They are selected with the 'cat' key substring and pickled under the 'cat2' tag.
pred_dict_cat, pred_test_dict_cat = sort_dict('cat', pred_dict, pred_test_dict)
save_dict('cat2', pred_dict_cat, pred_test_dict_cat)