# 1. Import

In [None]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
from kaggler.model import AutoLGB

In [None]:
def create_dir(dir):
    """Create directory ``dir`` (including missing parents) and report the outcome.

    Prints "Created Directory" when this call created the directory and
    "Directory already existed" when it was already present.

    Args:
        dir: Path of the directory to create. The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    try:
        # Attempt the creation directly instead of checking first, so a
        # directory that appears between a check and the call cannot crash us.
        os.makedirs(dir)
    except FileExistsError:
        # A regular file at this path is not a usable output directory;
        # surface that instead of reporting success.
        if not os.path.isdir(dir):
            raise
        print("Directory already existed :", dir)
    else:
        print("Created Directory :", dir)
# Make sure the output folders exist before anything is written to them.
for out_dir in ("../pickle", "../model", "../submission"):
    create_dir(out_dir)

In [None]:
# Read the train and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_f2.csv", "../data/test_f2.csv")
)

In [None]:
# Run-wide settings.
rows_train, rows_test = len(train), len(test)  # row counts of the given train / test data
num_trial = 100    # number of hyper-parameter tuning trials
splits_hp = 5      # number of k-fold splits used during hyper-parameter tuning
splits_tr = 5      # number of k-fold splits used during model training
basic_seed = 42    # default seed
num_seed_tr = 10   # number of training seeds
sel_seed = 3       # number of seeds to select

In [None]:
# Prediction stores: one for predictions on the training rows, one for
# predictions on the test rows. Both start empty and are filled later.
pred_dict, pred_test_dict = {}, {}

# 2. AutoLGB

In [None]:
# One-hot encode the categorical columns. get_dummies returns a new frame,
# so `train` and `test` themselves are left untouched.
train_ohe = pd.get_dummies(train)
test_ohe = pd.get_dummies(test)

train_x = train_ohe.drop(['Target'], axis=1)  # split features from the target
train_y = train_ohe['Target']

# train and test are encoded independently, so a category that occurs in only
# one of them produces different dummy columns. Align the test features to the
# training feature columns (missing columns filled with 0, test-only columns
# dropped) so selecting training feature names from test_x cannot fail.
test_x = test_ohe.reindex(columns=train_x.columns, fill_value=0)

print('One-Hot Encoding Completed')

In [None]:
# Draw a seed for this run; it drives the shuffling of the CV folds.
unique_seed = np.random.randint(0, 1000, 1)[0]
kfold = StratifiedKFold(
    n_splits=splits_tr,
    shuffle=True,
    random_state=unique_seed,
)  # TODO: try progressively larger CV split counts

# Zero-initialised prediction buffers, one slot per train row / test row.
pred, pred_test = np.zeros(rows_train), np.zeros(rows_test)

In [None]:
# Start the test-set accumulator from zero so that re-running this cell does
# not add a second round of fold predictions on top of the first. `pred`
# needs no reset: every row is overwritten by the fold that validates on it.
pred_test = np.zeros(rows_test)

for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
    # Fold indices are positional, so rows are selected with .iloc for both
    # features and target (label-based train_y[train_idx] is only correct
    # when the index happens to be 0..n-1).
    fold_x = train_x.iloc[train_idx]
    fold_y = train_y.iloc[train_idx]

    if n == 0:
        # First fold: tune hyper-parameters and select features once; the
        # resulting settings are reused unchanged for the remaining folds.
        # NOTE(review): tuning on fold 0's training rows means the later
        # folds' validation rows took part in tuning -- confirm acceptable.
        autolgb = AutoLGB(objective='regression', metric='mae',
                          feature_selection=True, n_est=10000, random_state=42)
        autolgb.tune(fold_x, fold_y)
        n_best = autolgb.n_best
        features = autolgb.features
        params = autolgb.params
        print(f'best iteration: {n_best}')
        print(f'selected features ({len(features)}): {features}')
        print(params)
        autolgb.fit(fold_x, fold_y)
    else:
        # Remaining folds: train plain LightGBM with the tuned settings.
        train_data = lgb.Dataset(fold_x[features], label=fold_y)
        # verbose_eval is not passed: it was removed from lgb.train in
        # LightGBM 4.0 and had nothing to report here because no validation
        # set is supplied.
        autolgb = lgb.train(params, train_data, num_boost_round=n_best)

    # Out-of-fold prediction for this fold's validation rows.
    pred[val_idx] = autolgb.predict(train_x[features].iloc[val_idx])
    # Test prediction is the average over the folds.
    pred_test += autolgb.predict(test_x[features]) / splits_tr

In [None]:
# Store this seed's train-row and test-row predictions under one shared key,
# then report the mean absolute error of the train-row predictions.
model_key = 'lgb' + str(unique_seed)
pred_dict[model_key] = pred
pred_test_dict[model_key] = pred_test

oof_mae = mean_absolute_error(train_y, pred)
print(f'seed {unique_seed}', 'mean_absolute_error :', oof_mae)