# 1. Import

In [20]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [21]:
def create_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)
        print("Created Directory :", dir)
    else:
        print("Directory already existed :", dir)
create_dir("../pickle")
create_dir("../model")
create_dir("../submission")

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [22]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

train = train.drop(["id"], axis=1)
test = test.drop(["id"], axis=1)

In [23]:
rows_train = train.shape[0] # 주어진 train data의 row 수
rows_test = test.shape[0] # 주어진 test data의 row 수
num_trial = 100 # 파라미터 튜닝을 몇 번 진행하는지의 수
splits_hp = 5 # 파라미터 튜닝을 진행할 때의 kfold 수
splits_tr = 5 # 모델 트레이닝을 진행할 때의 kfold 수
basic_seed = 42 # default seed
num_seed_hp = 3 # 파라미터 튜닝 seed 개수
num_seed_tr = 10 # 트레이닝 seed 개수
sel_seed = 3 # 선택할 seed 개수

In [24]:
pred_dict = {}
pred_test_dict = {}

# 2. NN

In [25]:
train['Weight Ratio'] = train['Shucked Weight'] / train['Whole Weight']
test['Weight Ratio'] = test['Shucked Weight'] / test['Whole Weight']

In [26]:
train['Foreign Body'] = train['Whole Weight'] - (train['Shucked Weight'] + train['Viscra Weight'] + train['Shell Weight'])
test['Foreign Body'] = test['Whole Weight'] - (test['Shucked Weight'] + test['Viscra Weight'] + test['Shell Weight'])
train.loc[train[(train['Foreign Body']<0.0005)].index, "Foreign Body"] = 0.0005
test.loc[test[(test['Foreign Body']<0.0005)].index, "Foreign Body"] = 0.0005

In [27]:
# cat_cols = []
# num_cols = []
# for col in train.columns:
#     if train[col].dtypes=='object':
#         cat_cols.append(col)
#     elif train[col].dtypes=='float64':
#         num_cols.append(col)
        
# for num_col_first in num_cols:
#     for num_col_second in num_cols:
#         if (num_col_first != num_col_second):
# #             train[num_col_first+'/'+num_col_second] = train[num_col_first] / train[num_col_second]
#             train[num_col_first+'*'+num_col_second] = train[num_col_first] * train[num_col_second]
# #             test[num_col_first+'/'+num_col_second] = test[num_col_first] / test[num_col_second]
#             test[num_col_first+'*'+num_col_second] = test[num_col_first] * test[num_col_second]

In [28]:
train_ohe = train.copy()
test_ohe = test.copy()

train_ohe = pd.get_dummies(train_ohe)
test_ohe = pd.get_dummies(test_ohe)

train_x = train_ohe.drop(['Target'], axis=1) # 데이터 나누기
train_y = train_ohe['Target']
test_x = test_ohe.copy()

print('One-Hot Encoding Completed')

One-Hot Encoding Completed


In [29]:
# with open('../pickle/cat_best_hyperparams.pickle', 'rb') as fw:
#     cat_best_hyperparams = pickle.load(fw)

In [30]:
MODEL_DIR = './model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

modelpath = "./model/{epoch:02d}-{val_loss:.4f}.hdf5"

# 모델 업데이트 및 저장
cp = ModelCheckpoint(filepath=modelpath, monitor='val_mae', verbose=0, save_best_only=True, mode = 'min')

# 학습 자동 중단 설정
es = EarlyStopping(monitor='val_mae', patience=50, mode='min')

rlrp = ReduceLROnPlateau(monitor='val_mae', factor=0.2, patience=40, mode='min')

In [48]:
pred = (pred_dict['nn245'] + pred_dict['nn389'] + pred_dict['nn546']) / 3

In [54]:
pred_test = (pred_test_dict['nn245'] + pred_test_dict['nn389'] + pred_test_dict['nn546']) / 3

In [53]:
mean_absolute_error(train_y, np.round(pred))

1.3328012769353552

In [36]:
pred_dict

{'nn546': array([12.00800419,  7.0596571 , 13.46709633, ...,  7.82458782,
         7.96741009, 10.96669102]),
 'nn245': array([11.93707943,  7.03967094, 13.71122074, ...,  7.70122814,
         7.99127817, 10.82646084]),
 'nn389': array([11.66647053,  7.13539171, 14.2495203 , ...,  7.57735634,
         7.96422815, 10.68979359]),
 'nn565': array([12.12048531,  7.1705966 , 13.22999001, ...,  8.15041637,
         7.98031664, 10.97032833])}

In [35]:
pred_dict['nn389']

array([11.66647053,  7.13539171, 14.2495203 , ...,  7.57735634,
        7.96422815, 10.68979359])

In [32]:
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True) # CV 늘려가면서 하기
    cv = np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()
        
        nnmodel = Sequential()
        nnmodel.add(Dense(16, input_dim=len(train_x.columns), activation='elu'))
        nnmodel.add(Dense(32, activation='elu'))    
        nnmodel.add(Dense(64, activation='elu'))
        nnmodel.add(Dropout(0.5))  
        nnmodel.add(Dense(32, activation='elu'))
        nnmodel.add(Dense(16, activation='elu'))
        nnmodel.add(Dense(1))


        nnmodel.compile(loss='mean_absolute_error',
              optimizer='Nadam', 
              metrics=['mae'])

        nnmodel.fit(train_x, train_y, validation_data=(x_val, y_val), epochs=1000, batch_size=32, verbose=None, callbacks=[es, cp, rlrp])
        
        cv[val_idx] = nnmodel.predict(x_val).flatten()
        pred_test += nnmodel.predict(test_x).flatten() / splits_tr
        #print("fold", n, mean_absolute_error(y_val, cv[val_idx]))
        
    pred_dict['nn'+str(seed)] = cv
    pred_test_dict['nn'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

seed 245 mean_absolute_error : 1.381848666540452
seed 389 mean_absolute_error : 1.3886267177980611
seed 565 mean_absolute_error : 1.3847721446920944


KeyboardInterrupt: 

In [12]:
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True) # CV 늘려가면서 하기
    cv = np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()
        
        nnmodel = Sequential()
        nnmodel.add(Dense(16, input_dim=len(train_x.columns), activation='elu'))
        nnmodel.add(Dense(32, activation='elu'))    
        nnmodel.add(Dense(64, activation='elu'))  
        nnmodel.add(Dropout(0.5))  
        nnmodel.add(Dense(32, activation='elu'))
        nnmodel.add(Dense(16, activation='elu'))
        nnmodel.add(Dense(1))


        nnmodel.compile(loss='mean_absolute_error',
              optimizer='Nadam', 
              metrics=['mae'])

        nnmodel.fit(train_x, train_y, validation_data=(x_val, y_val), epochs=1000, batch_size=32, verbose=None, callbacks=[es, cp, rlrp])
        
        cv[val_idx] = nnmodel.predict(x_val).flatten()
        pred_test += nnmodel.predict(test_x).flatten() / splits_tr
        #print("fold", n, mean_absolute_error(y_val, cv[val_idx]))
        
    pred_dict['nn'+str(seed)] = cv
    pred_test_dict['nn'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

seed 560 mean_absolute_error : 1.3828230486426083
seed 345 mean_absolute_error : 1.3986986345609282
seed 348 mean_absolute_error : 1.3956041113434081


KeyboardInterrupt: 

In [13]:
pred = (pred_dict['nn779'] + pred_dict['nn672'] + pred_dict['nn456']) / 3

In [None]:
pred_test = (pred_test_dict['nn0'] + pred_test_dict['nn42']) / 2

In [None]:
pred_1 = (pred_dict['nn0'] + pred_dict['nn42']) / 2

In [None]:
pred_test_1 = (pred_test_dict['nn0'] + pred_test_dict['nn42']) / 2

In [None]:
pred = (pred_1 + pred_2) / 2

In [None]:
pred_test = (pred_test_1 + pred_test_2) / 2

In [14]:
mean_absolute_error(train_y, np.round(pred))

1.3743016759776536

In [None]:
mean_absolute_error(train_y, np.round(pred_dict['nn0']))

In [None]:
mean_absolute_error(train_y, np.round(pred_dict['nn42']))

In [None]:
mean_absolute_error(train_y, np.round(pred))

In [None]:
pred_test = pred_test_dict['nn0']

In [None]:
pred_test = np.where(pred_test>1, pred_test, 1)

In [None]:
for i, seed in enumerate([0, 42]):

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True) # CV 늘려가면서 하기
    cv = np.zeros(rows_train)
    pred_test = np.zeros(rows_test)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()
        
        nnmodel = Sequential()
        nnmodel.add(Dense(2**13, input_dim=len(train_x.columns), activation='elu'))
        nnmodel.add(Dense(2**11, activation='elu')) 
        nnmodel.add(Dense(2**9, activation='elu'))  
        nnmodel.add(Dense(2**7, activation='elu'))
        nnmodel.add(Dense(2**5, activation='elu'))
        nnmodel.add(Dense(2**3, activation='elu'))
        nnmodel.add(Dense(1))


        nnmodel.compile(loss='mean_absolute_error',
              optimizer='Nadam', 
              metrics=['mae'])

        nnmodel.fit(train_x, train_y, validation_data=(x_val, y_val), epochs=1000, batch_size=32, verbose=None, callbacks=[es, cp, rlrp])
        
        cv[val_idx] = nnmodel.predict(x_val).flatten()
        pred_test += nnmodel.predict(test_x).flatten() / splits_tr
        
    pred_dict['nn'+str(seed)] = cv
    pred_test_dict['nn'+str(seed)] = pred_test
    print(f'seed {seed}', 'mean_absolute_error :', mean_absolute_error(train_y, cv))

In [None]:
mean_absolute_error(train_y, np.round(pred_dict['nn191']))

# 3. Export

In [16]:
def sort_dict(model, pred_dict, pred_test_dict):
    pred_dict_local = {}
    for key, value in pred_dict.items():
        if model in key:
            pred_dict_local[key]=value

    pred_test_dict_local = {}
    for key, value in pred_test_dict.items():
        if model in key:
            pred_test_dict_local[key]=value

    pred_dict_new_local = dict(sorted(
        pred_dict_local.items(), 
        key=lambda x:mean_absolute_error((train_y), list(x[1])), reverse=False)[:sel_seed])
    pred_test_dict_new_local = {}
    for key, value in pred_dict_new_local.items():
        pred_test_dict_new_local[key]=pred_test_dict_local[key]
        
    return pred_dict_new_local, pred_test_dict_new_local

In [17]:
def save_dict(model, pred_dict, pred_test_dict):
    with open('../pickle/pred_dict_'+model+'.pickle', 'wb') as fw:
        pickle.dump(pred_dict, fw)
    with open('../pickle/pred_test_dict_'+model+'.pickle', 'wb') as fw:
        pickle.dump(pred_test_dict, fw)

In [19]:
pred_dict_nn, pred_test_dict_nn = sort_dict('nn', pred_dict, pred_test_dict)
save_dict('nn', pred_dict_nn, pred_test_dict_nn)

In [None]:
with open('../pickle/autosklearn_cv10_seed0.pickle', 'rb') as fw:
    pred_test_2 = pickle.load(fw)

In [55]:
submission.Target = np.round(pred_test).astype(int)

In [56]:
submission_name = '20220331'
submission_number = '2'
submission.to_csv(f'../submission/{submission_name}-{submission_number}.csv', index = False)