# 1. Import

In [None]:
# Basic Library
import os
import pickle
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# Modeling
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [None]:
def create_dir(dir):
    """Ensure that the directory `dir` exists and report what happened.

    Parameters
    ----------
    dir : str
        Path of the directory to create (missing parents are created too).

    Returns
    -------
    None
        A one-line message is printed saying whether the directory was
        created or was already present.
    """
    # Guard clause: nothing to create when the path is already there.
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return

    os.makedirs(dir)
    print("Created Directory :", dir)
# Output folders used by the notebook: pickled predictions, model
# checkpoints and submission files.
for output_dir in ("../pickle", "../model", "../submission"):
    create_dir(output_dir)

In [None]:
# Read the train/test tables and drop the "id" column from both right away,
# so it is never used as a feature.
train = pd.read_csv("../data/train.csv").drop(columns=["id"])
test = pd.read_csv("../data/test.csv").drop(columns=["id"])

# Sample submission file, loaded as-is.
submission = pd.read_csv("../data/sample_submission.csv")

In [None]:
# Data set sizes
rows_train = len(train)  # number of rows in the given train data
rows_test = len(test)    # number of rows in the given test data

# Hyperparameter tuning
num_trial = 100  # number of parameter-tuning trials
splits_hp = 5    # number of k-folds used while tuning parameters

# Training
splits_tr = 5     # number of k-folds used while training the model
basic_seed = 42   # default seed
num_seed_tr = 10  # number of training seeds
sel_seed = 3      # number of seeds to select

In [None]:
# Containers for predictions: one for the train rows, one for the test rows.
pred_dict, pred_test_dict = {}, {}

# 2. NN

In [None]:
# Add the shucked-to-whole weight ratio as a feature on both frames.
for frame in (train, test):
    frame['Weight Ratio'] = frame['Shucked Weight'] / frame['Whole Weight']

In [None]:
# 'Foreign Body' = whole weight minus the shucked, viscera and shell weights.
# Values below 0.0005 (including negative ones) are raised to 0.0005.
for frame in (train, test):
    parts_weight = frame['Shucked Weight'] + frame['Viscra Weight'] + frame['Shell Weight']
    frame['Foreign Body'] = frame['Whole Weight'] - parts_weight
    too_small = frame['Foreign Body'] < 0.0005
    frame.loc[too_small, "Foreign Body"] = 0.0005

In [None]:
# One-hot encode the non-numeric columns of both frames.
# dtype=float keeps every column numeric with a single dtype, so the frames
# convert to a plain float array when they are handed to the network
# (newer pandas versions otherwise emit bool dummy columns).
train_ohe = pd.get_dummies(train, dtype=float)
test_ohe = pd.get_dummies(test, dtype=float)

# Split the encoded train frame into features and target.
train_x = train_ohe.drop(columns=['Target'])
train_y = train_ohe['Target']

# Train and test are encoded independently, so a category that is missing
# from one of them would give the two frames different columns. Align test to
# the train feature layout: absent dummies are filled with 0, columns unknown
# to the model are dropped, and the column order matches train_x.
test_x = test_ohe.reindex(columns=train_x.columns, fill_value=0.0)

print('One-Hot Encoding Completed')

In [None]:
# Folder that receives the checkpoints written during training.
MODEL_DIR = '../model/'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(MODEL_DIR, exist_ok=True)

# Checkpoint file name template, built from MODEL_DIR so the folder is only
# spelled out once; Keras fills in the epoch number and the validation loss.
# NOTE(review): newer Keras releases may only accept ".keras"/".h5" file
# names here - confirm against the installed version.
modelpath = os.path.join(MODEL_DIR, "{epoch:02d}-{val_loss:.4f}.hdf5")

# All three callbacks watch the validation MAE (lower is better):
#   cp   - saves the model whenever the monitored value improves
#   es   - stops training after 50 epochs without improvement
#   rlrp - multiplies the learning rate by 0.2 after 40 epochs without improvement
# NOTE(review): these instances are shared by every fit() call that receives
# them; ModelCheckpoint appears to keep its best score between runs - confirm
# whether fresh per-fold instances are wanted.
cp = ModelCheckpoint(filepath=modelpath, monitor='val_mae', verbose=0, save_best_only=True, mode = 'min')
es = EarlyStopping(monitor='val_mae', patience=50, mode='min')
rlrp = ReduceLROnPlateau(monitor='val_mae', factor=0.2, patience=40, mode='min')

In [None]:
def _build_nn(input_dim):
    """Build and compile the fully connected regression network.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    Sequential
        Model with ELU hidden layers (16-32-64-dropout-32-16) and a single
        linear output unit, compiled with MAE loss and the Nadam optimizer.
    """
    model = Sequential()
    model.add(Dense(16, input_dim=input_dim, activation='elu'))
    model.add(Dense(32, activation='elu'))
    model.add(Dense(64, activation='elu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='elu'))
    model.add(Dense(16, activation='elu'))
    model.add(Dense(1))

    model.compile(loss='mean_absolute_error',
                  optimizer='Nadam',
                  metrics=['mae'])
    return model


# Draw the training seeds from a generator seeded with basic_seed so that a
# re-run uses the same seeds. Sampling without replacement guarantees distinct
# seeds; a repeated seed would overwrite an earlier 'nn<seed>' entry below.
seed_rng = np.random.default_rng(basic_seed)
lucky_seeds = seed_rng.choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:
    seed = int(seed)  # plain Python int for sklearn / TensorFlow / dict keys

    # Seed TensorFlow's global generator as well, so the seed affects more
    # than just the fold split.
    tf.random.set_seed(seed)

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros(rows_train)         # out-of-fold predictions for the train rows
    pred_test = np.zeros(rows_test)   # test predictions averaged over the folds

    for train_idx, val_idx in kfold.split(train_x, train_y):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        nnmodel = _build_nn(len(train_x.columns))

        # Fit on the training part of the fold only. Fitting on the full
        # train_x/train_y would let the model see the validation rows, which
        # makes early stopping and the out-of-fold score meaningless.
        nnmodel.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=1000, batch_size=32,
                    verbose=0, callbacks=[es, cp, rlrp])

        cv[val_idx] = nnmodel.predict(x_val).flatten()
        pred_test += nnmodel.predict(test_x).flatten() / splits_tr

    pred_dict['nn'+str(seed)] = cv
    pred_test_dict['nn'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

# 3. Export

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Pick the best-scoring prediction sets of one model.

    Entries whose key contains `model` are ranked by mean absolute error
    against the global `train_y` (lowest error first) and the first
    `sel_seed` (global) of them are kept.

    Parameters
    ----------
    model : str
        Text that a key must contain to be considered.
    pred_dict : dict
        Maps a key to predictions for the train rows.
    pred_test_dict : dict
        Maps the same keys to predictions for the test rows.

    Returns
    -------
    tuple of (dict, dict)
        The selected train-row predictions, in ranking order, and the test
        predictions stored under the same keys.
    """
    def _matching(source):
        # Keep only the entries that belong to the requested model.
        return {name: preds for name, preds in source.items() if model in name}

    def _oof_mae(item):
        # item is a (key, predictions) pair.
        return mean_absolute_error((train_y), list(item[1]))

    oof_matches = _matching(pred_dict)
    test_matches = _matching(pred_test_dict)

    ranked = sorted(oof_matches.items(), key=_oof_mae)
    best_oof = dict(ranked[:sel_seed])
    best_test = {name: test_matches[name] for name in best_oof}

    return best_oof, best_test

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle both prediction dictionaries of a model into ../pickle.

    Parameters
    ----------
    model : str
        Model tag used in the file names
        (pred_dict_<model>.pickle and pred_test_dict_<model>.pickle).
    pred_dict : dict
        Object written to the first file.
    pred_test_dict : dict
        Object written to the second file.
    """
    targets = (
        ('../pickle/pred_dict_' + model + '.pickle', pred_dict),
        ('../pickle/pred_test_dict_' + model + '.pickle', pred_test_dict),
    )
    for path, payload in targets:
        with open(path, 'wb') as fw:
            pickle.dump(payload, fw)

In [None]:
# Select the best 'nn' prediction sets and write them to disk.
pred_dict_nn, pred_test_dict_nn = sort_dict(
    model='nn',
    pred_dict=pred_dict,
    pred_test_dict=pred_test_dict,
)
save_dict(model='nn', pred_dict=pred_dict_nn, pred_test_dict=pred_test_dict_nn)