# 1. Import

In [11]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [12]:
def create_dir(dir):
    """Create directory `dir` (including parents) unless it already exists.

    Prints one line saying whether the directory was created or was
    already present. Returns None.
    """
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Make sure the output folders for pickles, models and submissions exist.
for output_dir in ("../pickle", "../model", "../submission"):
    create_dir(output_dir)

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [13]:
# Load the train/test tables without their 'id' column, plus the sample
# submission that is filled in at the end of the notebook.
train = pd.read_csv("../data/train.csv").drop(columns='id')
test = pd.read_csv("../data/test.csv").drop(columns='id')
submission = pd.read_csv("../data/sample_submission.csv")

In [14]:
# Number of rows in the given train / test data.
rows_train, rows_test = len(train), len(test)

# Number of hyperparameter-tuning trials.
num_trial = 100
# Number of K-fold splits used during hyperparameter tuning.
splits_hp = 5
# Number of K-fold splits used during model training.
splits_tr = 15
# Default seed.
basic_seed = 42
# Number of seeds used during hyperparameter tuning.
num_seed_hp = 3
# Number of seeds used during training.
num_seed_tr = 10
# Number of seeds to keep when selecting the best runs.
sel_seed = 3

In [15]:
# Containers for per-run predictions: cross-validation predictions on the
# train set and averaged predictions on the test set.
pred_dict, pred_test_dict = {}, {}

# 2. NN

In [16]:
# One-hot encode the 'Gender' column of both tables with the same settings.
# NOTE(review): train and test are encoded independently, so the resulting
# columns only line up if both contain the same Gender values -- confirm.
gender_dummy_kwargs = dict(columns=['Gender'], prefix='Gender')
train = pd.get_dummies(data=train, **gender_dummy_kwargs)
test = pd.get_dummies(data=test, **gender_dummy_kwargs)

In [17]:
# Add the shucked-to-whole weight ratio as a feature on both tables.
for frame in (train, test):
    frame['Weight Ratio'] = frame['Shucked Weight'] / frame['Whole Weight']

In [18]:
# 'Foreign Body' = whole weight minus the sum of the shucked, viscera and shell
# weights; negative differences are floored at 0.
for frame in (train, test):
    parts_weight = frame['Shucked Weight'] + frame['Viscra Weight'] + frame['Shell Weight']
    frame['Foreign Body'] = (frame['Whole Weight'] - parts_weight).clip(lower=0)

In [19]:
# Run pd.get_dummies (no explicit column list) on copies of the tables so the
# originals stay untouched.
train_ohe = pd.get_dummies(train.copy())
test_ohe = pd.get_dummies(test.copy())

# Split the data: 'Target' is the label, every other column is a feature.
train_y = train_ohe['Target']
train_x = train_ohe.drop(columns=['Target'])
test_x = test_ohe.copy()

print('One-Hot Encoding Completed')

One-Hot Encoding Completed


In [None]:
def cat_objective(trial: Trial) -> float:
    """Optuna objective: cross-validated MAE of a CatBoost regressor.

    For each of `num_seed_hp` randomly drawn seeds, a CatBoostRegressor with
    the trial's suggested hyperparameters is fitted on every fold of a
    `splits_hp`-fold StratifiedKFold split of (train_x, train_y). The
    validation-fold predictions are assembled into one vector and scored
    against train_y with mean absolute error.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-seed MAE scores (lower is better).

    Notes:
        Reads the notebook globals `train_x`, `train_y`, `rows_train`,
        `num_seed_hp`, `splits_hp` and `cat_features`.
        NOTE(review): `cat_features` is not defined in the visible cells --
        it must be defined before this function is called.
        The seeds come from the global NumPy RNG, which is not seeded in the
        visible cells, so repeated evaluations use different seeds.
    """
    score_hp = []
    for seed_hp in np.random.randint(0, 1000, num_seed_hp):
        params_cat = {
            "cat_features": cat_features,
            "random_state": seed_hp,
            "loss_function": "MAE",
            "eval_metric": "MAE",
            "iterations": 10000,
            "od_type": "iter",
            "od_wait": trial.suggest_int("od_wait", 50, 500),
            # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
            # and matches the call style already used for colsample_bylevel.
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 5e-1),  # searched range: [0.01, 0.5]
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 0.5),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 1e-1, 1e+2, log=True),
            "random_strength": trial.suggest_int("random_strength", 0, 30),
            "depth": trial.suggest_int("depth", 4, 6),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-6, 1e+0, log=True),  # searched range: [1e-6, 1]
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20),
            "max_bin": trial.suggest_int("max_bin", 300, 500),
        }

        # Cross-validation with `splits_hp` shuffled, stratified folds.
        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        cv = np.zeros(rows_train)

        for train_idx, val_idx in kfold.split(train_x, train_y):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values, train_y.iloc[val_idx].values

            catmodel = CatBoostRegressor(**params_cat)
            # Original note: change the verbose value (-1 -> 100) to watch training progress;
            # it is currently set to 200.
            # NOTE(review): early_stopping_rounds=50 is passed alongside the tuned `od_wait`;
            # confirm which of the two CatBoost actually applies.
            catmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=50, verbose=200)
            cv[val_idx] = catmodel.predict(x_val)

        score_hp.append(mean_absolute_error(train_y, cv))

    return np.mean(score_hp)

In [None]:
# Run the hyperparameter search with a seeded TPE sampler, minimizing the
# value returned by cat_objective.
sampler = TPESampler(seed=basic_seed)
cat_study = optuna.create_study(
    study_name="cat_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
cat_study.optimize(cat_objective, n_trials=num_trial)

# Fixed settings that are merged over the tuned values.
cat_base_hyperparams = dict(
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=cat_features,
    random_state=basic_seed,
    od_type='iter',
    iterations=10000,
)
cat_best_hyperparams = cat_study.best_trial.params
cat_best_hyperparams.update(cat_base_hyperparams)

# Persist the final parameter set so it can be reloaded without re-tuning.
with open('../pickle/cat_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(cat_best_hyperparams, fw)
print("The best hyperparameters are:\n", cat_best_hyperparams)

In [None]:
# Plot hyperparameter importances for `cat_study` (matplotlib backend).
# The trailing ';' suppresses the returned object's repr in the cell output.
optuna.visualization.matplotlib.plot_param_importances(cat_study);

In [None]:
# Slice plot for `cat_study` (matplotlib backend).
# The trailing ';' suppresses the returned object's repr in the cell output.
optuna.visualization.matplotlib.plot_slice(cat_study);

In [None]:
# with open('../pickle/cat_best_hyperparams.pickle', 'rb') as fw:
#     cat_best_hyperparams = pickle.load(fw)

In [None]:
def create_model(layer_size, dropout_rates):
    """Build and compile a fully connected regression network.

    The hidden Dense layers have widths layer_size * (1, 2, 4, 2, 1), all with
    ELU activation and each followed by Dropout(dropout_rates); a single-unit
    Dense layer produces the output. The input width is taken from the
    notebook-level `train_x`.

    Args:
        layer_size: Width of the first and last hidden layers.
        dropout_rates: Dropout rate applied after every hidden layer.

    Returns:
        A Sequential model compiled with MAE loss, the 'Nadam' optimizer and
        an 'mae' metric.
    """
    width_multipliers = (1, 2, 4, 2, 1)

    model = Sequential()
    for position, multiplier in enumerate(width_multipliers):
        units = layer_size * multiplier
        if position == 0:
            # Only the first layer declares the input dimension.
            model.add(Dense(units, input_dim=len(train_x.columns), activation='elu'))
        else:
            model.add(Dense(units, activation='elu'))
        model.add(Dropout(dropout_rates))
    model.add(Dense(1))

    model.compile(loss='mean_absolute_error', optimizer='Nadam', metrics=['mae'])
    return model

In [20]:
# Folder for Keras checkpoints.
# NOTE(review): this is './model/', not the '../model' folder created at the top
# of the notebook -- confirm which location is intended.
MODEL_DIR = './model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# Checkpoint file name pattern: epoch number and validation loss.
modelpath = MODEL_DIR + "{epoch:02d}-{val_loss:.4f}.hdf5"

# All three callbacks watch the validation MAE and treat lower as better.
monitor_settings = dict(monitor='val_mae', mode='min')

# Save the model whenever the monitored value improves.
cp = ModelCheckpoint(filepath=modelpath, verbose=0, save_best_only=True, **monitor_settings)

# Stop training automatically after 50 epochs without improvement.
es = EarlyStopping(patience=50, **monitor_settings)

# Multiply the learning rate by 0.2 after 40 epochs without improvement.
rlrp = ReduceLROnPlateau(factor=0.2, patience=40, **monitor_settings)

In [None]:
# NOTE(review): `y_val`, `cv` and `val_idx` are only assigned inside the training
# loop in a later cell, so this cell raises NameError when the notebook is run
# top to bottom on a fresh kernel. The same message is printed inside that loop.
print("fold:", mean_absolute_error(y_val, cv[val_idx]))

In [22]:
# Train the neural network with K-fold cross-validation for two randomly drawn
# split seeds. For every seed this stores:
#   pred_dict['nn<seed>']      -> out-of-fold predictions on the train set
#   pred_test_dict['nn<seed>'] -> test predictions averaged over the folds
# NOTE(review): the global NumPy RNG is not seeded in the visible cells, so the
# drawn seeds (and therefore the dictionary keys) change from run to run.
lucky_seeds = np.random.randint(0, 1000, 2)

for seed in lucky_seeds:

    # Try increasing the number of CV folds (splits_tr) over time.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for train_idx, val_idx in kfold.split(train_x, train_y):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # A fresh model is built for every fold.
        nnmodel = Sequential()
        nnmodel.add(Dense(16, input_dim=len(train_x.columns), activation='elu'))
        nnmodel.add(Dense(32, activation='elu'))
        nnmodel.add(Dense(64, activation='elu'))
        nnmodel.add(Dropout(0.5))
        nnmodel.add(Dense(32, activation='elu'))
        nnmodel.add(Dense(16, activation='elu'))
        nnmodel.add(Dense(1))

        nnmodel.compile(loss='mean_absolute_error',
              optimizer='Nadam',
              metrics=['mae'])

        # Fit on the training fold only. Fitting on the full (train_x, train_y) would
        # include the validation rows in training, so the monitored val_mae and the
        # predictions stored in `cv` would no longer be out-of-fold.
        # The same callback instances (es, cp, rlrp) are reused for every fold.
        nnmodel.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=1000, batch_size=32, verbose=None, callbacks=[es, cp, rlrp])

        cv[val_idx] = nnmodel.predict(x_val).flatten()
        # Average the test predictions over the folds.
        pred_test += nnmodel.predict(test_x).flatten() / splits_tr
        print("fold:", mean_absolute_error(y_val, cv[val_idx]))

    pred_dict['nn'+str(seed)] = cv
    pred_test_dict['nn'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

KeyboardInterrupt: 

In [None]:
# Use the test predictions of the run stored under key 'nn86'.
# NOTE(review): keys are 'nn' + seed and the seeds are drawn at random in the
# training cell, so 'nn86' only exists if 86 was drawn in this session.
pred_test = pred_test_dict['nn86']

In [None]:
# MAE of the cross-validation predictions after rounding them to whole numbers.
# NOTE(review): 'nn191' is a hardcoded key; it only exists if seed 191 was drawn
# at random in the training cell.
mean_absolute_error(train_y, np.round(pred_dict['nn191']))

# 3. Export

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Select the best runs of one model family by train-set MAE.

    Only entries whose key contains the substring `model` are considered. They
    are ranked by mean absolute error between the notebook-level `train_y` and
    the stored train-set predictions (lowest first), and the first `sel_seed`
    (notebook-level) entries are kept.

    Args:
        model: Substring identifying the model family in the dictionary keys.
        pred_dict: Mapping of run key -> predictions on the train set.
        pred_test_dict: Mapping of run key -> predictions on the test set.

    Returns:
        A tuple (selected train-set predictions, selected test predictions);
        both dicts share the same keys in the same ranked order.
    """
    cv_subset = {key: value for key, value in pred_dict.items() if model in key}
    test_subset = {key: value for key, value in pred_test_dict.items() if model in key}

    def _cv_mae(item):
        # item is a (key, predictions) pair
        return mean_absolute_error((train_y), list(item[1]))

    ranked = sorted(cv_subset.items(), key=_cv_mae)
    best_cv = dict(ranked[:sel_seed])
    best_test = {key: test_subset[key] for key in best_cv}

    return best_cv, best_test

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the two prediction dictionaries of one model family.

    Writes `pred_dict` to '../pickle/pred_dict_<model>.pickle' and
    `pred_test_dict` to '../pickle/pred_test_dict_<model>.pickle'.

    Args:
        model: Name inserted into the two file names.
        pred_dict: Object written to the first file.
        pred_test_dict: Object written to the second file.
    """
    targets = (
        ('../pickle/pred_dict_' + model + '.pickle', pred_dict),
        ('../pickle/pred_test_dict_' + model + '.pickle', pred_test_dict),
    )
    for path, payload in targets:
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
# Keep the best `sel_seed` neural-network runs and pickle them.
# The training loop stores its results under 'nn<seed>' keys, so the filter must
# be 'nn'; filtering on 'cat' would match nothing and pickle two empty dicts.
pred_dict_nn, pred_test_dict_nn = sort_dict('nn', pred_dict, pred_test_dict)
save_dict('nn', pred_dict_nn, pred_test_dict_nn)

In [None]:
# Load previously saved predictions from another run (auto-sklearn, per the file name).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
# NOTE(review): `pred_test_2` is not used by any of the visible cells.
with open('../pickle/autosklearn_cv10_seed0.pickle', 'rb') as fw:
    pred_test_2 = pickle.load(fw)

In [None]:
# Round the predictions to the nearest integer and write them to the 'Target' column.
# Item assignment is used because attribute-style assignment (submission.Target = ...)
# only sets a Python attribute, not a column, when the column does not already exist.
submission['Target'] = np.round(pred_test).astype(int)

In [None]:
# Write the submission as '<submission_name>-<submission_number>.csv' without the index column.
submission_name = '20220329'
submission_number = '2'
submission_path = f'../submission/{submission_name}-{submission_number}.csv'
submission.to_csv(submission_path, index=False)