In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font',family='malgun gothic')
plt.rc('axes',unicode_minus=False)
import seaborn as sns

# EDA
import klib

# Preprocessing&Feature Engineering
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from gensim.models import word2vec

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
import kerastuner as kt

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge, Lars
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import mean_squared_error # squared=False시 RMSE
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import pickle

### Read Data

In [None]:
# Folder that holds every input CSV, resolved relative to the current working directory.
input_dir = os.path.abspath("../input")

# Raw train/test tables; the first CSV column becomes the index.
x_train = pd.read_csv(input_dir+'/x_train_raw.csv', index_col=0)
x_test = pd.read_csv(input_dir+'/x_test_raw.csv', index_col=0)

# cp949-encoded source tables; the target table is indexed by 'custid'.
df_train = pd.read_csv(input_dir+'/X_train.csv', encoding='cp949')
y_train = pd.read_csv(input_dir+'/y_train.csv', encoding='cp949').set_index('custid')
df_test = pd.read_csv(input_dir+'/X_test.csv', encoding='cp949')

# Distinct customer ids of the test table, used later as the submission key.
test_id = df_test['custid'].unique()

In [None]:
input_dir = os.path.abspath("../input")

# Load the `*_cat` feature tables and convert them to NumPy arrays in one step.
x_train_cat = np.array(pd.read_csv(input_dir+'/x_train_cat.csv', index_col=0))
x_test_cat = np.array(pd.read_csv(input_dir+'/x_test_cat.csv', index_col=0))

# Fixed-seed 70/30 split into a training part and a dev part.
x_train_cat2, x_dev_cat, y_train2, y_dev = train_test_split(
    x_train_cat, y_train, test_size=0.3, random_state=0
)

In [None]:
input_dir = os.path.abspath("../input")

# Load the `*_num` feature tables and convert them to NumPy arrays in one step.
x_train_num = np.array(pd.read_csv(input_dir+'/x_train_num.csv', index_col=0))
x_test_num = np.array(pd.read_csv(input_dir+'/x_test_num.csv', index_col=0))

# Same y_train, test_size and random_state as the split of the `*_cat` arrays;
# y_train2 / y_dev are simply re-assigned here.
x_train_num2, x_dev_num, y_train2, y_dev = train_test_split(
    x_train_num, y_train, test_size=0.3, random_state=0
)

In [None]:
input_dir = os.path.abspath("../input")

# Previously saved prediction files; only the 'age' column is kept.
num_stk = pd.read_csv(input_dir+'/numeric_stkenlas6lgbm14_ard_lgbm_cat.csv')['age']
cat_stk_dnn = pd.read_csv(input_dir+'/categorical_stken14laslgbm_rid_lgbm_cat_dnn6.csv')['age']

In [None]:
input_dir = os.path.abspath("../input")

# The `_dev` counterparts of the saved prediction files, kept as full DataFrames.
num_stk_dev = pd.read_csv(input_dir+'/numeric_stkenlas6lgbm14_ard_lgbm_cat_dev.csv')
cat_stk_dnn_dev = pd.read_csv(input_dir+'/categorical_stken14laslgbm_rid_lgbm_cat_dnn6_dev.csv')

In [None]:
# A third saved `_dev` prediction file, compared with the other two below.
input_dir = os.path.abspath("../input")
cat_copy1 = pd.read_csv(input_dir+'/cat_copy1_dev.csv')

### Ensemble

In [None]:
# Side-by-side table of the three dev-set 'age' predictions.
pd.DataFrame({
    'num_stk_dev': num_stk_dev.age,
    'cat_stk_dnn_dev': cat_stk_dnn_dev.age,
    'cat_copy1': cat_copy1.age,
})

In [None]:
# Correlation between the three dev-set 'age' predictions, drawn as an annotated heatmap.
dev_preds = pd.DataFrame({
    'num_stk_dev': num_stk_dev.age,
    'cat_stk_dnn_dev': cat_stk_dnn_dev.age,
    'cat_copy1': cat_copy1.age,
})
sns.heatmap(dev_preds.corr(), annot=True, cmap='Blues')
plt.show()

In [None]:
# RMSE of the plain 50/50 average of two dev-set predictions against the dev target.
avg_dev_pred = (num_stk_dev+cat_copy1)/2
mean_squared_error(avg_dev_pred, y_dev, squared=False)

In [None]:
# Grid-search the blend weights (whole percentages) of the two dev-set predictions.
# The two weights must add up to 100, so the second one is fully determined by the
# first; one loop over i visits exactly the pairs the original nested loop accepted,
# in the same order.
weights_nc = []
rmse_best = np.inf  # any finite RMSE beats this, so weights_nc is always filled in
for i in tqdm(range(0,101)):
    j = 100 - i
    pred = (num_stk_dev*i + cat_stk_dnn_dev*j)/100
    rmse = np.sqrt(mean_squared_error(y_dev, pred))
    if rmse < rmse_best:
        # New best blend: remember it and log the improvement.
        weights_nc = [i,j]
        rmse_best = rmse
        print(rmse, i,j)
print(rmse_best, weights_nc)

### Modeling

In [None]:
def _pct_scaled(preds, pct):
    """Flatten `preds` to 1-D and scale it by `pct` percent."""
    return preds.values.flatten()*pct/100

# Two-column feature tables ('num', 'cat') built from the pre-weighted predictions.
# NOTE(review): 29/71 are presumably the weights found by the search that fills
# `weights_nc` — confirm they still match if the inputs change.
mix_dev = pd.DataFrame({'num': _pct_scaled(num_stk_dev, 29), 'cat': _pct_scaled(cat_stk_dnn_dev, 71)})
mix = pd.DataFrame({'num': _pct_scaled(num_stk, 29), 'cat': _pct_scaled(cat_stk_dnn, 71)})

In [None]:
# Preview the first rows of `mix`.
mix.head()

In [None]:
# Fixed-seed 70/30 split of the dev-set blend features: mix_train / y_mix are used for
# fitting and mix_test / y_mix_dev for scoring in the cells below.
mix_train, mix_test, y_mix, y_mix_dev = train_test_split(mix_dev,y_dev, test_size=0.3, random_state=0)

In [None]:
# Search range for the Ridge penalty strength.
pbounds = {'alpha': (0, 50)}


def rid_opt(alpha):
    """Bayesian-optimisation objective: negative hold-out RMSE of a Ridge model.

    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev).
    The RMSE is negated because the optimiser maximises its target.
    """
    model = Ridge(random_state=0, alpha=alpha)
    model.fit(mix_train, y_mix)
    holdout_pred = model.predict(mix_test)
    return -mean_squared_error(holdout_pred, y_mix_dev, squared=False)


BO_rid = BayesianOptimization(rid_opt, pbounds, random_state=0)
# init_points: exploration, n_iter: iteration
BO_rid.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the Lasso penalty strength.
pbounds = {'alpha': (0, 50)}


def las_opt(alpha):
    """Bayesian-optimisation objective: negative hold-out RMSE of a Lasso model.

    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev).
    The RMSE is negated because the optimiser maximises its target.
    """
    model = Lasso(random_state=0, alpha=alpha)
    model.fit(mix_train, y_mix)
    holdout_pred = model.predict(mix_test)
    return -mean_squared_error(holdout_pred, y_mix_dev, squared=False)


BO_las = BayesianOptimization(las_opt, pbounds, random_state=0)
# init_points: exploration, n_iter: iteration
BO_las.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the ElasticNet penalty strength.
pbounds = {'alpha': (0, 50)}


def ela_opt(alpha):
    """Bayesian-optimisation objective: negative hold-out RMSE of an ElasticNet model.

    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev).
    The RMSE is negated because the optimiser maximises its target.
    """
    model = ElasticNet(random_state=0, alpha=alpha)
    model.fit(mix_train, y_mix)
    holdout_pred = model.predict(mix_test)
    return -mean_squared_error(holdout_pred, y_mix_dev, squared=False)


BO_ela = BayesianOptimization(ela_opt, pbounds, random_state=0)
# init_points: exploration, n_iter: iteration
BO_ela.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the ARDRegression hyperparameters.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def ard_opt(n_iter, alpha_1, alpha_2, lambda_1, lambda_2):
    """Bayesian-optimisation objective: negative hold-out RMSE of an ARDRegression model.

    `n_iter` arrives as a float from the optimiser and is rounded to an integer.
    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev).
    """
    model = ARDRegression(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    model.fit(mix_train, y_mix)
    holdout_pred = model.predict(mix_test)
    return -mean_squared_error(holdout_pred, y_mix_dev, squared=False)


BO_ard = BayesianOptimization(ard_opt, pbounds, random_state=0)
# init_points: exploration, n_iter: iteration
BO_ard.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the BayesianRidge hyperparameters.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def bay_opt(n_iter, alpha_1, alpha_2, lambda_1, lambda_2):
    """Bayesian-optimisation objective: negative hold-out RMSE of a BayesianRidge model.

    `n_iter` arrives as a float from the optimiser and is rounded to an integer.
    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev).
    """
    model = BayesianRidge(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    model.fit(mix_train, y_mix)
    holdout_pred = model.predict(mix_test)
    return -mean_squared_error(holdout_pred, y_mix_dev, squared=False)


BO_bay = BayesianOptimization(bay_opt, pbounds, random_state=0)
BO_bay.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the LightGBM hyperparameters.
pbounds = {
    'n_estimators':(100,1000),
    'learning_rate':(0.000000000000000000000000001,1),
    'max_depth':(2, 32),
    'num_leaves':(2, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}
def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Bayesian-optimisation objective: negative hold-out RMSE of an LGBMRegressor.

    The optimiser proposes floats for every parameter, so count-like parameters are
    rounded to integers and the two sampling fractions are clamped to [0, 1].
    The model is fitted on (mix_train, y_mix) and scored on (mix_test, y_mix_dev);
    the RMSE is negated because the optimiser maximises its target.
    """
    params = {
        "n_estimators":int(round(n_estimators)),
        "learning_rate":learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        # max_bin was searched over but never handed to the model, so the optimiser
        # was tuning a value with no effect while the final model (built from
        # BO_lgbm.max['params']) did use it. Pass it through so the score reflects it.
        'max_bin': int(round(max_bin)),
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha
    }
    lgbm = LGBMRegressor(random_state=0, **params)
    lgbm.fit(mix_train,y_mix)
    score = mean_squared_error(lgbm.predict(mix_test),y_mix_dev,squared=False)
    return -score
BO_lgbm = BayesianOptimization(lgbm_opt, pbounds, random_state=0)
BO_lgbm.maximize(init_points=50, n_iter=50)

In [None]:
# Best parameter set found by each optimiser, in the order
# Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge, LGBMRegressor.
(max_params_rid,
 max_params_las,
 max_params_ela,
 max_params_ard,
 max_params_bay,
 max_params_lgbm) = [opt.max['params'] for opt in (BO_rid, BO_las, BO_ela, BO_ard, BO_bay, BO_lgbm)]

In [None]:
# The optimiser returns floats; apply the same rounding / clamping the objective
# functions use before the values are handed to the estimators.
max_params_ard['n_iter'] = int(round(max_params_ard['n_iter']))
max_params_bay['n_iter'] = int(round(max_params_bay['n_iter']))

# Integer-valued LightGBM parameters.
for key in ('num_leaves', 'n_estimators', 'max_depth',
            'min_child_samples', 'min_child_weight', 'max_bin'):
    max_params_lgbm[key] = int(round(max_params_lgbm[key]))

# Sampling fractions are clamped to [0, 1].
for key in ('subsample', 'colsample_bytree'):
    max_params_lgbm[key] = max(min(max_params_lgbm[key], 1), 0)

In [None]:
# Tuned second-stage regressors (CatBoost is used with its defaults).
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Label every estimator with its class name. Reading the name from the type works
# uniformly for all of them, so the CatBoost label no longer needs patching by hand
# as it did when the name was parsed out of str(reg).
regs_tuned = [(type(reg).__name__, reg) for reg in regs_tuned]

# Fit on the training part of the dev split and record the hold-out RMSE:
# each entry is (name, fitted estimator, rmse).
regs_trained = [(name, reg.fit(mix_train,y_mix), float(mean_squared_error(reg.predict(mix_test),y_mix_dev,squared=False)))
                    for name, reg in tqdm(regs_tuned)]

In [None]:
# Fresh, unfitted copies of the tuned regressors (CatBoost is used with its defaults).
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Label every estimator with its class name. Reading the name from the type works
# uniformly for all of them, so the CatBoost label no longer needs patching by hand
# as it did when the name was parsed out of str(reg).
regs_tuned = [(type(reg).__name__, reg) for reg in regs_tuned]

# Refit on the complete dev-set blend features; these models produce the submissions.
regs_trained_for_submissions = [(name, reg.fit(mix_dev,y_dev)) for name, reg in tqdm(regs_tuned)]

In [None]:
# Fresh, unfitted copies of the tuned regressors (CatBoost is used with its defaults).
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Label every estimator with its class name. Reading the name from the type works
# uniformly for all of them, so the CatBoost label no longer needs patching by hand
# as it did when the name was parsed out of str(reg).
regs_tuned = [(type(reg).__name__, reg) for reg in regs_tuned]

In [None]:
# One prediction column per trained model; the column label carries the model name
# and its hold-out RMSE.
pred_results = [
    pd.Series(list(reg.predict(mix_dev)), name=f'{name} \n({reg_score:.4f})')
    for name, reg, reg_score in regs_trained
]
ensemble_results = pd.concat(pred_results, axis=1)
# Coerce every cell to a plain float.
ensemble_results = ensemble_results.applymap(lambda x: float(x))

# Draw a heatmap to inspect how strongly the models' predictions correlate.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results.corr(), annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
# Mean correlation of each model with all the others (its self-correlation of 1 is
# removed before averaging).
corr_matrix = ensemble_results.corr()
corr = (corr_matrix.sum()-1)/(corr_matrix.shape[0]-1)
names = corr.index
# Every column label ends in "(<rmse>)". Extract the number with a pattern rather
# than a fixed character slice, so scores with two or more integer digits are read
# correctly as well.
rmse = names.str.extract(r'\(([0-9.]+)\)$', expand=False).astype(float).to_numpy()
df = pd.DataFrame({'model': names, 'rmse': rmse, 'cor': corr})

plt.figure(figsize=(8,6))
g = sns.scatterplot(x="cor", y="rmse", data=df, s=40, color='red')
# Annotate each point with its model label. Iterating over the values avoids looking
# rows up by integer position on a string-labelled index.
for model_name, model_rmse, model_cor in zip(df['model'], df['rmse'], df['cor']):
    g.text(model_cor+0.003, model_rmse-0.003,
           model_name, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min()-0.01,df.cor.max()+0.01))
plt.ylim((df.rmse.min()-0.01,df.rmse.max()+0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('RMSE')
plt.grid()
plt.show()

In [None]:
# Hold-out RMSE of the third entry of regs_trained (ElasticNet, by construction order).
mean_squared_error(regs_trained[2][1].predict(mix_test),y_mix_dev,squared=False)

### Deep Neural Network
dnn.save('dnn_en')

In [None]:
def reset_seeds(reset_graph_with_backend=None):
    """Seed NumPy, `random` and TensorFlow with fixed values.

    If `reset_graph_with_backend` is given, its `clear_session()` is called and the
    TensorFlow default graph is reset before the seeds are set.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    # Author's note: a well-chosen seed can improve the resulting score.
    np.random.seed(99)
    random.seed(9)
    tf.compat.v1.set_random_seed(16)
    # os.environ['CUDA_VISIBLE_DEVICES'] = ''  # for GPU
    print("RANDOM SEEDS RESET")  # optional


reset_seeds()

In [None]:
# Positional 80/20 split (no shuffling): the first 80% of the rows train the network,
# the remaining rows validate it.
i = int(round(mix_dev.shape[0] * 0.8))
mix_3, y_mix_3 = mix_dev[:i], y_dev[:i]
mix_val, y_val = mix_dev[i:], y_dev[i:]

In [None]:
def model_fn(hp):
    """Build and compile a dense regression network from the tuner's hyperparameters.

    Tuned values: number of hidden layers (2-3), units per layer (16-64 in steps of
    16), one shared activation ('relu' or 'tanh'), a dropout rate per layer (0-0.7)
    and the Adam learning rate. The input width is taken from `mix_3`.
    """
    inputs = tf.keras.Input(shape=(mix_3.shape[1],))
    hidden = inputs
    n_layers = hp.Int('num_layers', 2, 3, step=1)
    for layer_idx in range(n_layers):
        # Hyperparameters are requested in the order unit -> activation -> dropout.
        units = hp.Int(f'unit_{layer_idx}', 16, 64, step=16)
        activation = hp.Choice('activation',['relu','tanh'])
        drop_rate = hp.Float(f'dropout_{layer_idx}', 0, 0.7, step=0.1, default=0.5)
        hidden = tf.keras.layers.Dense(units, activation=activation)(hidden)
        hidden = tf.keras.layers.Dropout(drop_rate)(hidden)
    outputs = tf.keras.layers.Dense(1, activation='linear')(hidden)

    model = tf.keras.Model(inputs, outputs)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5, 1e-6])
    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

# Keras Tuner offers four kinds of tuner: hyperband, grid search, random search, bayesian optimization
tuner = kt.Hyperband(model_fn,
                     objective=kt.Objective('val_root_mean_squared_error', direction="min"), 
                     max_epochs=30,
                     hyperband_iterations=2,
                     overwrite=True,
                     directory='dnn_tuning')
# objective: the metric the tuning optimises; hyperband_iterations: the Hyperband procedure itself is repeated twice
# overwrite: when False, existing results are reused and nothing is retrained

# Search using the first 80% of the rows for training and the rest for validation.
tuner.search(mix_3, y_mix_3, validation_data=(mix_val, y_val),
             callbacks=[tf.keras.callbacks.EarlyStopping()])

In [None]:
tuner.results_summary(1) # 1 = show only the best-performing trial

In [None]:
# Loss & RMSE
dnn = tuner.get_best_models(1)[0] # best model 중 가장 좋은 모델
dnn.evaluate(mix_val, y_val)

In [None]:
# Print the architecture summary of the selected network.
dnn.summary()

In [None]:
# Hold-out RMSE of the plain average of the second regs_trained entry (Lasso, by
# construction order) and the tuned network.
lasso_pred = regs_trained[1][1].predict(mix_test)
dnn_pred = dnn.predict(mix_test).flatten()
mean_squared_error((lasso_pred+dnn_pred)/2, y_mix_dev, squared=False)

### Deployment

In [None]:
# Blend the two test-set predictions with the percentage weights stored in weights_nc
# and write the result as a submission file.
w0, w1 = weights_nc
blended_age = (num_stk*w0+cat_stk_dnn*w1)/100
submission = pd.DataFrame({'custid': test_id, 'age': blended_age})
submission.to_csv('Wnum_avg&Wcat_avg_dnn.csv', index=False)

In [None]:
# Names of the models to submit; un-comment an entry to select it.
selected = [
    #'LinearRegression',
    #'Ridge',
    'Lasso',
    #'ElasticNet',
    #'ARDRegression',
    #'BayesianRidge',
    #'RandomForestRegressor',
    #'XGBRegressor',
    #'LGBMRegressor',
    #'CatBoostRegressor'
]
model = [reg for name, reg in regs_trained_for_submissions if name in selected]

# Only the first selected model is used for this submission file.
submission = pd.DataFrame({'custid': test_id, 'age': model[0].predict(mix)})
submission.to_csv('las_mix.csv', index=False)