In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font',family='malgun gothic')
plt.rc('axes',unicode_minus=False)
import seaborn as sns

# EDA
import klib

# Preprocessing&Feature Engineering
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from gensim.models import word2vec

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
import kerastuner as kt

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge, Lars
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import mean_squared_error # squared=False시 RMSE
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import pickle

### Read Data

In [None]:
# Load the raw train/test tables and the training targets (CP949-encoded CSVs).
input_dir = os.path.abspath("../input")
df_train = pd.read_csv(os.path.join(input_dir, 'X_train.csv'), encoding='cp949')
y_train = pd.read_csv(os.path.join(input_dir, 'y_train.csv'), encoding='cp949').set_index('custid')
df_test = pd.read_csv(os.path.join(input_dir, 'X_test.csv'), encoding='cp949')

# Distinct customer ids of the test table, in order of first appearance.
test_id = df_test['custid'].unique()

# Recreate the 30% hold-out targets.  train_test_split chooses rows by position
# from (n_samples, random_state) only, so splitting y_train on its own yields the
# same y_dev as splitting it next to a per-customer aggregate of df_train, without
# summing every column of the raw table.  The explicit length check keeps the
# guarantee the joint split used to provide (one target row per customer).
n_customers = df_train['custid'].nunique()
if n_customers != len(y_train):
    raise ValueError(
        f"X_train has {n_customers} customers but y_train has {len(y_train)} rows"
    )
_, y_dev = train_test_split(y_train, test_size=0.3, random_state=0)

In [None]:
# Test-set prediction files produced by the individual first-round models.
input_dir = os.path.abspath("../input")
round1_1st = pd.read_csv(os.path.join(input_dir, '1등feature모델링.csv'))
round1_2nd = pd.read_csv(os.path.join(input_dir, '2등feature모델링.csv'))
round1_my = pd.read_csv(os.path.join(input_dir, 'averagingstk77_ridlgbmcat_dnn23.csv'))
cat_num = pd.read_csv(os.path.join(input_dir, 'las_mix.csv'))
cy = pd.read_csv(os.path.join(input_dir, 'averaging_rid_lgbm_cat.csv'))

In [None]:
# Hold-out ("_dev") prediction files of the same first-round models.
input_dir = os.path.abspath("../input")
round1_1st_dev = pd.read_csv(os.path.join(input_dir, '1등feature모델링_dev.csv'))
round1_2nd_dev = pd.read_csv(os.path.join(input_dir, '2등feature모델링_dev.csv'))
round1_my_dev = pd.read_csv(os.path.join(input_dir, 'averagingstk77_ridlgbmcat_dnn23_dev.csv'))
cat_num_dev = pd.read_csv(os.path.join(input_dir, 'las_mix_dev.csv'))
cy_dev = pd.read_csv(os.path.join(input_dir, 'averaging_rid_lgbm_cat_dev.csv'))

### Ensemble

In [None]:
# One column per first-round model: its `age` prediction for the test set.
results_sub = pd.DataFrame(
    {
        '1등': round1_1st['age'],
        '2등': round1_2nd['age'],
        'round1_수정': round1_my['age'],
        '범주형,수치형분리': cat_num['age'],
        '창용': cy['age'],
    }
)

In [None]:
# Same layout as results_sub, built from the hold-out ("_dev") predictions.
results = pd.DataFrame(
    {
        '1등': round1_1st_dev['age'],
        '2등': round1_2nd_dev['age'],
        'round1_수정': round1_my_dev['age'],
        '범주형,수치형분리': cat_num_dev['age'],
        '창용': cy_dev['age'],
    }
)

In [None]:
# Pairwise correlation between the hold-out predictions of the base models.
dev_pred_corr = results.corr()
sns.heatmap(dev_pred_corr, annot=True, cmap='Blues')
plt.show()

In [None]:
# Hold-out RMSE of every first-round model, printed in the same order as before.
# Only the `age` column is scored for every file (previously only the last call
# did so; passing a whole frame would also score any other column it carries or
# fail on a shape mismatch).  y_true goes first, and the root is taken explicitly
# instead of relying on the deprecated `squared=False` flag.
for dev_frame in (round1_1st_dev, round1_2nd_dev, round1_my_dev, cat_num_dev, cy_dev):
    print(np.sqrt(mean_squared_error(y_dev, dev_frame.age)))

In [None]:
# Exhaustive search for integer blending weights (summing to 100) over the five
# base models, with the first model's weight restricted to 45..100.
#
# The last weight is fully determined by the other four, and each inner range is
# capped by the budget that is still left, so only valid combinations are
# visited.  They are visited in the same order as a full 0..100 scan of every
# weight, hence the printed progress and the final result are unchanged.
TOTAL_WEIGHT = 100
MIN_FIRST_WEIGHT = 45

# Plain arrays avoid per-iteration pandas overhead inside the hot loop.
dev_pred_1st = np.asarray(round1_1st_dev.age)
dev_pred_2nd = np.asarray(round1_2nd_dev.age)
dev_pred_my = np.asarray(round1_my_dev.age)
dev_pred_cat_num = np.asarray(cat_num_dev.age)
dev_pred_cy = np.asarray(cy_dev.age)

weights_avg = []
rmse_best = 1000
for i in tqdm(range(MIN_FIRST_WEIGHT, TOTAL_WEIGHT + 1)):
    for j in range(0, TOTAL_WEIGHT - i + 1):
        for k in range(0, TOTAL_WEIGHT - i - j + 1):
            for l in range(0, TOTAL_WEIGHT - i - j - k + 1):
                m = TOTAL_WEIGHT - i - j - k - l  # remaining budget goes to the fifth model
                pred = (dev_pred_1st * i + dev_pred_2nd * j + dev_pred_my * k
                        + dev_pred_cat_num * l + dev_pred_cy * m) / TOTAL_WEIGHT
                rmse = np.sqrt(mean_squared_error(y_dev, pred))
                if rmse < rmse_best:
                    weights_avg = [i, j, k, l, m]
                    rmse_best = rmse
                    print(rmse, i, j, k, l, m)

print(rmse_best, weights_avg)

### Modeling

#### Model Tuning

In [None]:
# Switch the meta-features and targets to plain NumPy for the meta-models.
results = np.array(results)
# Flatten the target to 1-D: a single-column frame converts to shape (n, 1),
# which makes some estimators return (n, 1) predictions and others (n,), and
# triggers column-vector warnings at fit time.
y_dev = np.array(y_dev).ravel()

In [None]:
# Second-level 70/30 split of the hold-out predictions, used to tune and
# evaluate the meta-models.
r_train, r_dev, y_train2, y_dev2 = train_test_split(
    results,
    y_dev,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Bayesian optimisation of the Ridge penalty on the second-level split.
pbounds = {
    'alpha': (0, 50)
}


def rid_opt(alpha):
    """Return the negated RMSE on (r_dev, y_dev2) of a Ridge fitted with `alpha`.

    The optimiser maximises its target, so the error is negated.
    """
    rid = Ridge(random_state=0, alpha=alpha)
    rid.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, rid.predict(r_dev)))
    return -score


BO_rid = BayesianOptimization(rid_opt, pbounds, random_state=0)
BO_rid.maximize(init_points=50, n_iter=50)  # init_points: exploration, n_iter: iteration

In [None]:
# Bayesian optimisation of the Lasso penalty on the second-level split.
pbounds = {
    'alpha': (0, 50)
}


def las_opt(alpha):
    """Return the negated RMSE on (r_dev, y_dev2) of a Lasso fitted with `alpha`.

    The optimiser maximises its target, so the error is negated.
    """
    las = Lasso(random_state=0, alpha=alpha)
    las.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, las.predict(r_dev)))
    return -score


BO_las = BayesianOptimization(las_opt, pbounds, random_state=0)
BO_las.maximize(init_points=50, n_iter=50)  # init_points: exploration, n_iter: iteration

In [None]:
# Bayesian optimisation of the ElasticNet penalty on the second-level split.
pbounds = {
    'alpha': (0, 50)
}


def ela_opt(alpha):
    """Return the negated RMSE on (r_dev, y_dev2) of an ElasticNet fitted with `alpha`.

    The optimiser maximises its target, so the error is negated.
    """
    ela = ElasticNet(random_state=0, alpha=alpha)
    ela.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, ela.predict(r_dev)))
    return -score


BO_ela = BayesianOptimization(ela_opt, pbounds, random_state=0)
BO_ela.maximize(init_points=50, n_iter=50)  # init_points: exploration, n_iter: iteration

In [None]:
# Bayesian optimisation of the ARDRegression hyper-parameters.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10)
}


def ard_opt(n_iter, alpha_1, alpha_2, lambda_1, lambda_2):
    """Return the negated RMSE on (r_dev, y_dev2) of an ARDRegression model.

    `n_iter` arrives as a float from the optimiser and is rounded to an int;
    the remaining arguments are passed through unchanged.
    """
    # NOTE(review): recent scikit-learn releases renamed `n_iter` to `max_iter`
    # for this estimator - confirm against the installed version.
    params = {
        'n_iter': int(round(n_iter)),
        'alpha_1': alpha_1,
        'alpha_2': alpha_2,
        'lambda_1': lambda_1,
        'lambda_2': lambda_2
    }
    ard = ARDRegression(**params)
    ard.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, ard.predict(r_dev)))
    return -score


BO_ard = BayesianOptimization(ard_opt, pbounds, random_state=0)
BO_ard.maximize(init_points=50, n_iter=50)  # init_points: exploration, n_iter: iteration

In [None]:
# Bayesian optimisation of the BayesianRidge hyper-parameters.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10)
}


def bay_opt(n_iter, alpha_1, alpha_2, lambda_1, lambda_2):
    """Return the negated RMSE on (r_dev, y_dev2) of a BayesianRidge model.

    `n_iter` arrives as a float from the optimiser and is rounded to an int;
    the remaining arguments are passed through unchanged.
    """
    # NOTE(review): recent scikit-learn releases renamed `n_iter` to `max_iter`
    # for this estimator - confirm against the installed version.
    params = {
        'n_iter': int(round(n_iter)),
        'alpha_1': alpha_1,
        'alpha_2': alpha_2,
        'lambda_1': lambda_1,
        'lambda_2': lambda_2
    }
    bay = BayesianRidge(**params)
    bay.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, bay.predict(r_dev)))
    return -score


BO_bay = BayesianOptimization(bay_opt, pbounds, random_state=0)
BO_bay.maximize(init_points=50, n_iter=50)

In [None]:
# Bayesian optimisation of the LightGBM meta-model.
pbounds = {
    'n_estimators': (100, 1000),
    'learning_rate': (1e-50, 1),
    'max_depth': (2, 32),
    'num_leaves': (2, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50)
}


def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Return the negated RMSE on (r_dev, y_dev2) of an LGBMRegressor.

    The optimiser proposes floats for every parameter: count-like ones are
    rounded to int and the two sampling fractions are clipped to [0, 1].
    """
    params = {
        "n_estimators": int(round(n_estimators)),
        "learning_rate": learning_rate,
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        # max_bin is part of the search space and is later passed to the final
        # model, so it has to reach the model that is scored here as well;
        # otherwise the "best" max_bin is just an unevaluated random draw.
        'max_bin': int(round(max_bin)),
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha
    }
    lgbm = LGBMRegressor(random_state=0, **params)
    lgbm.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    score = np.sqrt(mean_squared_error(y_dev2, lgbm.predict(r_dev)))
    return -score


BO_lgbm = BayesianOptimization(lgbm_opt, pbounds, random_state=0)
BO_lgbm.maximize(init_points=50, n_iter=50)

In [None]:
# Best parameter set found by each optimiser, in the order
# Ridge, Lasso, ElasticNet, ARD, BayesianRidge, LightGBM.
(max_params_rid, max_params_las, max_params_ela,
 max_params_ard, max_params_bay, max_params_lgbm) = (
    optimizer.max['params']
    for optimizer in (BO_rid, BO_las, BO_ela, BO_ard, BO_bay, BO_lgbm)
)

In [None]:
# The optimiser reports floats; apply the same int casts and [0, 1] clipping
# that the objective functions applied while searching.
for tuned_params in (max_params_ard, max_params_bay):
    tuned_params['n_iter'] = int(round(tuned_params['n_iter']))

for int_key in ('num_leaves', 'n_estimators', 'max_depth',
                'min_child_samples', 'min_child_weight', 'max_bin'):
    max_params_lgbm[int_key] = int(round(max_params_lgbm[int_key]))

for fraction_key in ('subsample', 'colsample_bytree'):
    max_params_lgbm[fraction_key] = max(min(max_params_lgbm[fraction_key], 1), 0)

In [None]:
# Show the tuned parameter sets, one per line.
print(
    max_params_rid,
    max_params_las,
    max_params_ela,
    max_params_ard,
    max_params_bay,
    max_params_lgbm,
    sep=' \n ',
)

In [None]:
# Instantiate every meta-model with its tuned parameters (CatBoost keeps defaults).
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Label each model with its class name.  type(reg).__name__ gives the class name
# for every estimator, so the last entry no longer needs to be patched by hand.
regs_tuned = [(type(reg).__name__, reg) for reg in regs_tuned]

# Fit on the second-level training split and keep (name, fitted model, dev RMSE).
regs_trained = []
for name, reg in tqdm(regs_tuned):
    reg.fit(r_train, y_train2)
    # y_true first; explicit square root instead of the deprecated `squared=False`.
    reg_rmse = float(np.sqrt(mean_squared_error(y_dev2, reg.predict(r_dev))))
    regs_trained.append((name, reg, reg_rmse))

In [None]:
# Fresh, unfitted copies of the tuned meta-models (CatBoost keeps defaults).
fresh_regs = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Name = text before the first '(' of the estimator's string form; the last
# (CatBoost) entry is labelled explicitly.
regs_tuned = [(str(reg).split('(')[0], reg) for reg in fresh_regs[:-1]]
regs_tuned.append(('CatBoostRegressor', fresh_regs[-1]))

# Refit every model on all rows of `results` / `y_dev`.
regs_trained_for_submissions = []
for name, reg in tqdm(regs_tuned.copy()):
    regs_trained_for_submissions.append((name, reg.fit(results, y_dev)))

In [None]:
# Rebuild the list of (name, unfitted tuned model) pairs.
fresh_regs = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Name = text before the first '(' of the estimator's string form; the last
# (CatBoost) entry is labelled explicitly.
regs_tuned = [(str(reg).split('(')[0], reg) for reg in fresh_regs[:-1]]
regs_tuned.append(('CatBoostRegressor', fresh_regs[-1]))

In [None]:
# Collect every meta-model's predictions on r_dev, one column per model.
pred_results = []
for name, reg, reg_score in regs_trained:
    # np.ravel gives a flat float vector whether predict returns (n,) or (n, 1),
    # so no per-element float() pass (deprecated DataFrame.applymap) is needed.
    pred = np.ravel(reg.predict(r_dev)).astype(float)
    # Column label carries the dev RMSE in parentheses; the next cell reads it back.
    name = f'{name} \n({reg_score:.4f})'
    pred_results.append(pd.Series(pred, name=name))
ensemble_results = pd.concat(pred_results, axis=1)

# Draw a heatmap to inspect the correlation between the models' predictions.
plt.figure(figsize=(8, 6))
g = sns.heatmap(ensemble_results.corr(), annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
# Mean correlation of each model with all the *other* models
# (the self-correlation of 1 is removed before averaging).
corr_matrix = ensemble_results.corr()
corr = (corr_matrix.sum() - 1) / (corr_matrix.shape[0] - 1)
names = corr.index
# Column labels end with "(<rmse>)".  Take the text between the last '(' and the
# closing ')' rather than a fixed-width slice, which drops the leading digit of
# any RMSE >= 10.
rmse = np.array(names.str.rsplit('(', n=1).str[-1].str.rstrip(')')).astype(float)
df = pd.DataFrame({'model': names, 'rmse': rmse, 'cor': corr})

plt.figure(figsize=(8, 6))
g = sns.scatterplot(x="cor", y="rmse", data=df, s=40, color='red')
# Label every point.  Rows are iterated directly because `df` is indexed by model
# name, so integer lookups such as df.cor[0] rely on a deprecated positional fallback.
for row in df.itertuples(index=False):
    g.text(row.cor + 0.003, row.rmse - 0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min() - 0.01, df.cor.max() + 0.01))
plt.ylim((df.rmse.min() - 0.01, df.rmse.max() + 0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('RMSE')
plt.grid()
plt.show()

In [None]:
# Models to blend; names must match the labels stored in regs_trained.
selected = [
    'ElasticNet',
    'LGBMRegressor',
]
models_for_ensemble = [(name, reg) for name, reg, score in regs_trained if name in selected]

# Flatten both outputs so that an (n, 1) prediction can never broadcast against
# an (n,) one into an (n, n) matrix.
first_pred = np.ravel(models_for_ensemble[0][1].predict(r_dev))
second_pred = np.ravel(models_for_ensemble[1][1].predict(r_dev))

# Dev RMSE of the unweighted average (y_true first, explicit square root instead
# of the deprecated `squared=False`); left as the last expression so it is displayed.
np.sqrt(mean_squared_error(y_dev2, (first_pred + second_pred) / 2))

In [None]:
# Find the best integer weights (i, j) with i + j == 30 for the two selected models.
selected = [
    'ElasticNet',
    'LGBMRegressor',
]
models_for_ensemble = [(name, reg) for name, reg, score in regs_trained if name in selected]

# The predictions do not depend on the weights, so compute them once up front.
first_pred = np.ravel(models_for_ensemble[0][1].predict(r_dev))
second_pred = np.ravel(models_for_ensemble[1][1].predict(r_dev))

weights_avg = []
rmse_best = 1000
for i in tqdm(range(1, 30, 1)):
    j = 30 - i  # the second weight is fixed by the first; always within 1..29
    pred = (first_pred * i + second_pred * j) / 30
    rmse = np.sqrt(mean_squared_error(y_dev2, pred))
    if rmse < rmse_best:
        weights_avg = [i, j]
        rmse_best = rmse
        print(rmse, i, j)

print(rmse_best, weights_avg)

In [None]:
# Blend the two selected models on r_dev with fixed weights.
w0, w1 = 26, 4
# Names must match the labels stored in regs_trained.
selected = [
    'ElasticNet',
    'LGBMRegressor',
]
models_for_ensemble = [(name, reg) for name, reg, _ in regs_trained if name in selected]

first_model = models_for_ensemble[0][1]
second_model = models_for_ensemble[1][1]
weighted_sum = first_model.predict(r_dev).flatten() * w0 + second_model.predict(r_dev) * w1
avg = weighted_sum / (w0 + w1)

### Deployment

In [None]:
# Blend the two selected models on the test-set meta-features and write the submission.
w0, w1 = 26, 4
# Names must match the labels stored in regs_trained.
selected = [
    'ElasticNet',
    'LGBMRegressor',
]
# NOTE(review): these models were fitted on r_train only.  The notebook also
# builds `regs_trained_for_submissions` (fitted on all of `results`), which is not
# used in the visible cells - confirm which set is meant for the submission.
models_for_ensemble = [(name, reg) for name, reg, score in regs_trained if name in selected]

# The models were fitted on NumPy arrays, so predict on an array as well; the
# column order of results_sub matches the one used to build `results`.
sub_features = np.asarray(results_sub)
# Flatten both outputs so (n, 1) and (n,) predictions cannot broadcast to (n, n).
first_pred = np.ravel(models_for_ensemble[0][1].predict(sub_features))
second_pred = np.ravel(models_for_ensemble[1][1].predict(sub_features))
avg_pred = (first_pred * w0 + second_pred * w1) / (w0 + w1)

# NOTE(review): assumes the rows of the prediction files follow the order of
# `test_id` (first appearance in X_test) - verify before submitting.
pd.DataFrame({'custid': test_id, 'age': avg_pred}).to_csv('results_ensemble_ela_lgbm.csv', index=False)