# ProbSpace: YouTube動画視聴回数予測

In [None]:
# Directory (relative to the working directory) that output_results writes submission CSVs into.
out_dir = "out_tmp"

In [None]:
import pandas as pd
import numpy as np
import scipy

import itertools
import os, datetime, gc, glob, re, random
import time, datetime
import pickle
from tqdm.notebook import tqdm
from imblearn.over_sampling import SMOTE
import optuna
import bhtsne, umap

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
import unicodedata

import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, Lasso, LogisticRegression, Ridge, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR

from ngboost import NGBRegressor
from ngboost.ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE, CRPS, LogScore
from ngboost.distns import Normal, LogNormal

from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LogisticRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, cross_validate, cross_val_predict, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectFromModel, RFE, SelectPercentile, SelectKBest

import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.initializers import he_normal, he_uniform, GlorotNormal, GlorotUniform
from tensorflow.keras.optimizers import Adadelta, Adagrad, Adam, Adamax, Ftrl, Nadam, RMSprop, SGD
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping, TensorBoard, LambdaCallback, ReduceLROnPlateau
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras import layers
from tensorflow.keras.layers import Concatenate, Lambda
from tensorflow.keras.layers import Activation, Average, Dense, Dropout, Flatten, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.layers import GaussianDropout, GaussianNoise
from tensorflow.keras.layers import Conv2D, SeparableConv2D, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno 

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide/long frames are shown in full.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [None]:
# Timestamp taken when this cell runs; it is not read anywhere in the visible part of the notebook.
start = datetime.datetime.now()

In [None]:
# Function for variable description
def description(df):
    """Build a summary table with one row per column of ``df``.

    Each row holds the column name, its dtype, the missing and unique counts,
    NaN-aware mean/std/min/max, the values of the first three rows and the
    frame's shape (the same string on every row).

    NOTE(review): the ``np.nan*`` reductions are applied to the whole frame, so
    this presumably requires every column to be numeric; ``df`` also needs at
    least three rows for the ``iloc[0]``..``iloc[2]`` lookups.
    """
    summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
    # Turn the column names (the index of df.dtypes) into a regular "Name" column.
    summary = summary.reset_index()
    summary["Name"] = summary['index']
    summary = summary[["Name",'dtypes']]
    summary["Missing"] = df.isnull().sum().values    
    summary["Uniques"] = df.nunique().values
    # NOTE(review): astype() receives the Series of per-column dtypes rather than
    # a single dtype - confirm which dtype the four statistics columns end up with.
    summary["Mean"] = np.nanmean(df, axis=0).astype(df.dtypes)
    summary["Std"] = np.nanstd(df, axis=0).astype(df.dtypes)
    summary["Minimum"] = np.nanmin(df, axis=0).astype(df.dtypes)
    summary["Maximum"] = np.nanmax(df, axis=0).astype(df.dtypes)
    summary["First Value"] = df.iloc[0].values
    summary["Second Value"] = df.iloc[1].values
    summary["Third Value"] = df.iloc[2].values
    # The shape is stored as text and broadcast to every row.
    summary["dimension"] = str(df.shape)
    return summary

In [None]:
def get_hist(target):
    """Draw a 100-bin histogram of ``target`` and print its max, min, mean and std.

    ``target`` must provide ``max()``, ``min()``, ``mean()`` and ``std()``
    methods. Nothing is returned.
    """
    plt.hist(target, bins=100)

    # (format template, name of the statistic method) in print order.
    report = (
        ("max:  {:>10,.6f}", "max"),
        ("min:  {:>10,.6f}", "min"),
        ("mean: {:>10,.6f}", "mean"),
        ("std:  {:>10,.6f}", "std"),
    )
    for template, stat_name in report:
        print(template.format(getattr(target, stat_name)()))

def get_hist4(target1, title1, target2, title2, target3, title3, target4, title4):
    """Plot four titled histograms stacked vertically plus an overlay panel.

    Panels 1-4 show ``target1``..``target4`` (100 bins each). The fifth panel,
    titled "OVERALL", overlays the first three series in red, green and blue;
    ``target4`` is not part of the overlay. Nothing is returned.
    """
    fig = plt.figure(figsize=(18, 18))
    panels = [fig.add_subplot(5, 1, row) for row in range(1, 6)]

    for panel, title in zip(panels, (title1, title2, title3, title4, "OVERALL")):
        panel.set_title(title)

    # zip stops after the four individual series, leaving the last panel for the overlay.
    for panel, values in zip(panels, (target1, target2, target3, target4)):
        panel.hist(values, bins=100)

    overlay = panels[-1]
    for values, colour in ((target1, 'red'), (target2, 'green'), (target3, 'blue')):
        overlay.hist(values, bins=100, alpha=0.2, color=colour)

    fig.show()


## Load Data

In [None]:
%%time

# Load the raw train/test tables.
train_data = pd.read_csv("./input/train_data.csv")
test_data = pd.read_csv("./input/test_data.csv")

# The models are trained on the log1p scale of the target.
y = np.log1p(train_data['y']).copy()
# Bucket the raw target by order of magnitude (presumably the stratification
# labels handed to run_cv_model as target_skf - confirm against the callers).
# include_lowest=True keeps y == 0 inside the first bucket; with the default
# half-open (0, 10] interval such rows become NaN and astype(int) then fails.
y_bin = pd.cut(train_data['y'],
               [0, 10, 100, 1000, 10000, 100000, 1000000, 10000000000],
               labels=[1, 2, 3, 4, 5, 6, 7],
               include_lowest=True)
y_bin = y_bin.astype(int)
test_id = test_data.id

# Feature frames: drop the identifier (and the target on the training side).
train = train_data.drop(['id', 'y'], axis=1).copy()
test  = test_data.drop(['id'], axis=1).copy()


## 目的変数の分布

In [None]:
# Histogram and summary statistics of the log1p-transformed target.
get_hist(y)

# seedの固定化

In [None]:
def seed_everything(seed=1234):
    """Seed Python's ``random``, NumPy and TensorFlow with ``seed``.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


In [None]:
# Absolute log1p-scale error above which check_results lists a training row.
DIFF_THRESHOLD = 5

################################################################################
# RESULTS
################################################################################
def output_results(target, results, test_id, MODEL):
    """Print the overall score and write the submission CSV.

    Args:
        target: True target values on the log1p scale; ``target.values`` is read.
        results: Dict whose ``'train'`` entry is scored against ``target`` and
            whose ``'test'`` entry holds log1p-scale test predictions.
        test_id: Values written to the ``id`` column.
        MODEL: Label embedded in the output file name.

    Returns:
        The submission DataFrame (columns ``id`` and ``y``) that was written.
    """
    RMSLE = mean_squared_error(target.values, results['train'], squared=False)
    print(f"Overall RMSLE={RMSLE}")

    # Make submission
    print("Saving submission file")
    # Predictions are on the log1p scale, so expm1 maps them back to raw values.
    submission = pd.DataFrame({'id': test_id, 'y': np.expm1(results['test'])})
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(f"./{out_dir}", exist_ok=True)
    submission.to_csv(f"./{out_dir}/submission_{MODEL}_CV{RMSLE:.6f}.csv", index=False)

    return submission

def check_results(y, results):
    """Display diagnostics comparing out-of-fold predictions with the target.

    Shows the rows whose absolute log1p error exceeds ``DIFF_THRESHOLD``
    (index values, the matching ``train_data`` rows, and truth next to
    prediction), draws the histograms via ``get_hist4``, then displays
    ``describe()`` tables and the overall RMSLE.

    Args:
        y: Series of log1p-scale true values.
        results: Dict with ``"train"`` and ``"test"`` prediction arrays.
    """
    pred_train = results["train"]
    y_diff = np.abs(np.expm1(y) - np.expm1(pred_train))
    y_log1p_diff = np.abs(y - pred_train)

    # Rows whose error on the log1p scale is above the threshold.
    is_outlier = y_log1p_diff > DIFF_THRESHOLD
    outlier_index = y_diff[is_outlier].index.values

    display(outlier_index)
    display(train_data[is_outlier])
    outlier_truth = pd.DataFrame(y[is_outlier], columns=['y'])
    outlier_pred = pd.DataFrame(pred_train[is_outlier],
                                index=outlier_index, columns=["pred_train"])
    display(pd.concat([outlier_truth, outlier_pred], axis=1))

    get_hist4(pred_train, "pred_train",
              y, "y",
              results["test"], "pred_test",
              y_log1p_diff, "diff")

    train_summary = [
        pd.DataFrame(pred_train, columns=["pred_train"]),
        pd.DataFrame(y, columns=["y"]),
        y_log1p_diff.rename("y_log1p_diff"),
    ]
    display(pd.concat(train_summary, axis=1).describe())

    display(pd.DataFrame(results["test"], columns=["pred_test"]).describe())

    RMSLE = mean_squared_error(y, pred_train, squared=False)
    display(f"Overall RMSLE={RMSLE:.6f}")

In [None]:
# Absolute log1p-scale error above which check_results lists a training row.
DIFF_THRESHOLD = 5

################################################################################
# METRICS
################################################################################
def rmsle(y, pred_y):
    """Return the root mean squared error between ``y`` and ``pred_y``.

    The value is an RMSLE only when both inputs are already on a log scale;
    no log transform is applied here.
    """
    score = mean_squared_error(y, pred_y, squared=False)
    return score

################################################################################
# CROSS-VALIDATION
################################################################################
def print_cv_scores(label, cv_scores):
    """Print a separator, the type and values of ``cv_scores``, and their mean and std.

    Args:
        label: Text prefixed to each score line.
        cv_scores: Per-fold scores accepted by ``np.mean`` / ``np.std``.
    """
    report = (
        "*"*40,
        f"type(cv_scores): {type(cv_scores)}",
        f"{label} cv scores : {cv_scores}",
        f"{label} cv mean score : {np.mean(cv_scores)}",
        f"{label} cv std score : {np.std(cv_scores)}",
    )
    print(*report, sep="\n")
    
def _make_cv_splitter(cv, repeats, seed, stratified):
    """Build the scikit-learn splitter for the requested CV scheme."""
    if repeats == 1:
        splitter_cls = StratifiedKFold if stratified else KFold
        return splitter_cls(n_splits=cv, shuffle=True, random_state=seed)
    if stratified:
        # Imported locally because the notebook's import cell does not bring
        # this class into scope.
        from sklearn.model_selection import RepeatedStratifiedKFold
        return RepeatedStratifiedKFold(n_splits=cv, n_repeats=repeats, random_state=seed)
    return RepeatedKFold(n_splits=cv, n_repeats=repeats, random_state=seed)


def _target_encode_fold(tr_X, tr_y, val_X, columns, seed):
    """Target-encode ``columns`` of one fold's train/validation frames in place."""
    for c in columns:
        # Mean target per category over the whole training part of the fold.
        data_tmp = pd.DataFrame({c: tr_X[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        # Plain column assignment replaces the column together with its dtype;
        # `.loc[:, c] = ...` may instead write into the existing column and keep
        # its original dtype on newer pandas versions.
        val_X[c] = val_X[c].map(target_mean)

        # Training rows are encoded out-of-fold so that no row sees its own target.
        encoded = np.repeat(np.nan, tr_X.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=seed)
        for idx_1, idx_2 in kf_encoding.split(tr_X):
            inner_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            encoded[idx_2] = tr_X[c].iloc[idx_2].map(inner_mean)
        tr_X[c] = encoded


def run_cv_model(train, test, target, target_skf, encoding, model_fn, params=None,
                 eval_fn=None, label='model', cv=5,  repeats=5, seed=43):
    """Cross-validate ``model_fn`` and collect out-of-fold and test predictions.

    Args:
        train: Training features (DataFrame, indexed positionally via ``iloc``).
        test: Test features, passed unchanged to ``model_fn`` in every fold.
        target: Series of training targets.
        target_skf: Labels used for stratified splitting, or ``None`` for plain
            (Repeated)KFold on ``target``.
        encoding: Iterable of column names to target-encode per fold; falsy to skip.
        model_fn: Callable ``(tr_X, tr_y, val_X, val_y, test, params)`` returning
            ``(model, pred_val_y, pred_test_y)``.
        params: Hyperparameter dict; a copy is handed to ``model_fn`` per fold.
        eval_fn: Optional ``(val_y, pred_val_y) -> score`` recorded per fold.
        label: Name used in log output and in the result dict.
        cv: Number of folds.
        repeats: Number of repetitions; 1 selects the non-repeated splitters.
        seed: Random state for all splitters.

    Returns:
        Dict with ``"label"``, ``"train"`` (out-of-fold predictions), ``"test"``
        (test predictions averaged over all splits) and ``"cv"`` (fold scores).

    NOTE(review): ``test`` is not target-encoded here, so columns listed in
    ``encoding`` presumably have to be encoded consistently by the caller -
    confirm. With ``repeats > 1`` the out-of-fold predictions of later repeats
    overwrite those of earlier ones.
    """
    params = {} if params is None else params
    stratified = target_skf is not None
    target_y = target_skf if stratified else target
    kf = _make_cv_splitter(cv, repeats, seed, stratified)
    # Total number of train/validation splits (cv * repeats for repeated schemes).
    divide_counts = kf.get_n_splits()

    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))

    for fold_id, (train_idx, val_idx) in enumerate(kf.split(train, target_y)):
        print("*"*40)
        print(f"Started {label} fold:{fold_id+1} / {divide_counts}")
        tr_X, val_X = train.iloc[train_idx].copy(), train.iloc[val_idx].copy()
        tr_y, val_y = target.iloc[train_idx], target.iloc[val_idx]

        if encoding:
            _target_encode_fold(tr_X, tr_y, val_X, encoding, seed)

        # Each fold gets its own copy so model_fn cannot leak changes across folds.
        params2 = params.copy()
        model, pred_val_y, pred_test_y = model_fn(
            tr_X, tr_y, val_X, val_y, test, params2)

        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_idx] = pred_val_y
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(f"{label} cv score {fold_id+1}: {cv_score}")

    print_cv_scores(label, cv_scores)
    pred_full_test = pred_full_test / divide_counts
    results = {"label": label,
               "train": pred_train,
               "test": pred_full_test,
               "cv": cv_scores}

    RMSLE = mean_squared_error(target.values, results["train"], squared=False)
    print(f"Overall RMSLE={RMSLE}")

    return results

################################################################################
# RESULTS
################################################################################
def output_results(target, results, test_id, MODEL):
    """Print the overall score and write the submission CSV.

    Args:
        target: True target values on the log1p scale; ``target.values`` is read.
        results: Dict whose ``"train"`` entry is scored against ``target`` and
            whose ``"test"`` entry holds log1p-scale test predictions.
        test_id: Values written to the ``id`` column.
        MODEL: Label embedded in the output file name.

    Returns:
        The submission DataFrame (columns ``id`` and ``y``) that was written.
    """
    RMSLE = mean_squared_error(target.values, results["train"], squared=False)
    print(f"Overall RMSLE={RMSLE}")

    # Make submission
    print("Saving submission file")
    # Predictions are on the log1p scale, so expm1 maps them back to raw values.
    submission = pd.DataFrame({'id': test_id, 'y': np.expm1(results["test"])})
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(f"./{out_dir}", exist_ok=True)
    submission.to_csv(f"./{out_dir}/submission_{MODEL}_CV{RMSLE:.6f}.csv", index=False)

    return submission

def check_results(y, results):
    """Display diagnostics comparing out-of-fold predictions with the target.

    Shows the rows whose absolute log1p error exceeds ``DIFF_THRESHOLD``
    (index values, the matching ``train_data`` rows, and truth next to
    prediction), draws the histograms via ``get_hist4``, then displays
    ``describe()`` tables and the overall RMSLE.

    Args:
        y: Series of log1p-scale true values.
        results: Dict with ``"train"`` and ``"test"`` prediction arrays.
    """
    pred_train = results["train"]
    y_diff = np.abs(np.expm1(y) - np.expm1(pred_train))
    y_log1p_diff = np.abs(y - pred_train)

    # Rows whose error on the log1p scale is above the threshold.
    is_outlier = y_log1p_diff > DIFF_THRESHOLD
    outlier_index = y_diff[is_outlier].index.values

    display(outlier_index)
    display(train_data[is_outlier])
    outlier_truth = pd.DataFrame(y[is_outlier], columns=['y'])
    outlier_pred = pd.DataFrame(pred_train[is_outlier],
                                index=outlier_index, columns=["pred_train"])
    display(pd.concat([outlier_truth, outlier_pred], axis=1))

    get_hist4(pred_train, "pred_train",
              y, "y",
              results["test"], "pred_test",
              y_log1p_diff, "diff")

    train_summary = [
        pd.DataFrame(pred_train, columns=["pred_train"]),
        pd.DataFrame(y, columns=["y"]),
        y_log1p_diff.rename("y_log1p_diff"),
    ]
    display(pd.concat(train_summary, axis=1).describe())

    display(pd.DataFrame(results["test"], columns=["pred_test"]).describe())

    RMSLE = mean_squared_error(y, pred_train, squared=False)
    display(f"Overall RMSLE={RMSLE:.6f}")
    
################################################################################
# MODEL
################################################################################
def runLGB(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``LGBMRegressor`` with early stopping on the validation fold.

    Returns:
        ``(model, validation predictions, test predictions)``.
    """
    fit_options = {
        'eval_set': (val_X, val_y),
        'early_stopping_rounds': 100,
        'eval_metric': 'rmse',
        'verbose': 100,
    }
    regressor = lgb.LGBMRegressor(**params)
    regressor.fit(train_X, train_y, **fit_options)
    return regressor, regressor.predict(val_X), regressor.predict(test_X)

def runXGB(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``XGBRegressor`` with early stopping on the validation fold.

    Returns:
        ``(model, validation predictions, test predictions)``.
    """
    fit_options = {
        'eval_set': [[val_X, val_y]],
        'early_stopping_rounds': 100,
        'eval_metric': 'rmse',
        'verbose': 100,
    }
    regressor = xgb.XGBRegressor(**params)
    regressor.fit(train_X, train_y, **fit_options)
    return regressor, regressor.predict(val_X), regressor.predict(test_X)

def runCAT(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``CatBoostRegressor`` with early stopping on the validation fold.

    No ``cat_features`` are passed to ``fit``.

    Returns:
        ``(model, validation predictions, test predictions)``.
    """
    fit_options = {
        'eval_set': (val_X, val_y),
        'early_stopping_rounds': 100,
        'use_best_model': True,
        'verbose': 100,
    }
    regressor = CatBoostRegressor(**params)
    regressor.fit(train_X, train_y, **fit_options)
    return regressor, regressor.predict(val_X), regressor.predict(test_X)

def runNGB(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``NGBRegressor`` using the validation fold for monitoring.

    Args:
        train_X, train_y: Training fold.
        val_X, val_y: Validation fold, passed to ``fit`` as ``X_val`` / ``Y_val``.
        test_X: Test features to predict.
        params: Keyword arguments for ``NGBRegressor``.

    Returns:
        ``(model, validation predictions, test predictions)``.
    """
    # Build the model from the params handed in by the caller; the previous
    # code read the module-level ngb_params and silently ignored this argument.
    model = NGBRegressor(**params)
    model.fit(train_X, train_y, X_val=val_X, Y_val=val_y)
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(test_X)
    return model, pred_val_y, pred_test_y

def runLR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``LogisticRegression`` and return the second ``predict_proba`` column.

    ``val_y`` is unused; it is kept for the signature shared by the run* functions.

    Returns:
        ``(model, validation probabilities, test probabilities)`` where the
        probabilities are column index 1 of ``predict_proba``.
    """
    classifier = LogisticRegression(**params)
    classifier.fit(train_X, train_y, sample_weight=None)
    val_proba = classifier.predict_proba(val_X)[:, 1]
    test_proba = classifier.predict_proba(test_X)[:, 1]
    return classifier, val_proba, test_proba

def _fit_and_predict(model, train_X, train_y, val_X, test_X):
    """Fit ``model`` on the training fold and predict the validation and test sets."""
    model.fit(train_X, train_y)
    return model, model.predict(val_X), model.predict(test_X)


# Every run* function below shares the signature used by run_cv_model:
# (train_X, train_y, val_X, val_y, test_X, params) -> (model, pred_val_y, pred_test_y).
# ``val_y`` is unused by these estimators and kept only for that shared signature.
# The fit() keyword arguments that used to be spelled out were all the library
# defaults, so fitting with just (X, y) is equivalent.

def runLINR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``LinearRegression`` built from ``params``."""
    return _fit_and_predict(LinearRegression(**params), train_X, train_y, val_X, test_X)

def runBAYRIDGE(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``BayesianRidge`` built from ``params``."""
    return _fit_and_predict(BayesianRidge(**params), train_X, train_y, val_X, test_X)

def runRDG(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``Ridge`` built from ``params``."""
    return _fit_and_predict(Ridge(**params), train_X, train_y, val_X, test_X)

def runELASTIC(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``ElasticNet`` built from ``params``."""
    return _fit_and_predict(ElasticNet(**params), train_X, train_y, val_X, test_X)

def runLASSO(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``Lasso`` built from ``params``."""
    return _fit_and_predict(Lasso(**params), train_X, train_y, val_X, test_X)

def runKN(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``KNeighborsRegressor`` built from ``params``."""
    return _fit_and_predict(KNeighborsRegressor(**params), train_X, train_y, val_X, test_X)

def runRFR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``RandomForestRegressor`` built from ``params``."""
    return _fit_and_predict(RandomForestRegressor(**params), train_X, train_y, val_X, test_X)

def runSGD(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``SGDRegressor`` built from ``params``."""
    return _fit_and_predict(SGDRegressor(**params), train_X, train_y, val_X, test_X)

def runETR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``ExtraTreesRegressor`` built from ``params``."""
    return _fit_and_predict(ExtraTreesRegressor(**params), train_X, train_y, val_X, test_X)

def runGBR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``GradientBoostingRegressor`` built from ``params``."""
    return _fit_and_predict(GradientBoostingRegressor(**params), train_X, train_y, val_X, test_X)

def runBAG(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``BaggingRegressor`` built from ``params``."""
    return _fit_and_predict(BaggingRegressor(**params), train_X, train_y, val_X, test_X)

def runABR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit an ``AdaBoostRegressor`` built from ``params``."""
    return _fit_and_predict(AdaBoostRegressor(**params), train_X, train_y, val_X, test_X)

def runLINSVR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a ``LinearSVR`` built from ``params``."""
    return _fit_and_predict(LinearSVR(**params), train_X, train_y, val_X, test_X)

################################################################################
# MODEL PARAMETERS
################################################################################
# Module-level default hyperparameters, one dict per estimator.
# The *_regressor / regression wrappers defined later overwrite some entries of
# ngb_params, logr_params, linr_params, bayridge_params, rdg_params,
# elastic_params, lasso_params, sgd_params, kn_params, rfr_params, etr_params
# and gbr_params in place; lgb_regressor, xgb_regressor and catboost_regressor
# build their own local dicts instead of using the ones below.

# LightGBM (LGBMRegressor)
lgb_params = {'boosting_type': 'gbdt', 'tree_learner': 'feature', # one of 'serial', 'feature', 'data', 'voting'
              'num_leaves': 31, 'max_depth': -1,
              'learning_rate': 5e-2, 'n_estimators': 10000, 'importance_type': 'gain',
              'subsample_for_bin': 200000, 'objective': 'regression', 'min_split_gain': 0.0, 'min_child_weight': 1e-3, 'min_child_samples': 20, 
              'bagging_freq': 0, 'bagging_fraction': 1.0, 'feature_fraction': 1.0,
              'reg_alpha': 0.2, 'reg_lambda': 0.2,
              'random_state': 43, 'data_random_seed': 1,
              'n_jobs': -1, 'silent': False}

# XGBoost (XGBRegressor); configured for GPU training.
xgb_params = {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0,
              'learning_rate': 5e-2, 'n_estimators': 20000, 'importance_type': 'gain',
              'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 0,
              'objective': 'reg:squarederror', 'reg_alpha': 0.2, 'reg_lambda': 0.2, 'scale_pos_weight': 1,
              'subsample': 0.9,
              'silent': None, 'verbosity': 0,
              'random_state': 43, 'seed': 43,
              'tree_method': 'gpu_hist', 'gpu_id': 0}

# CatBoost (CatBoostRegressor); configured for GPU training.
cat_params = {'iterations':10000, 'depth': 8, 'boosting_type': 'Ordered', # alternatives: 'Ordered', 'Plain'
              'loss_function': 'RMSE', 'eval_metric': 'RMSE',
              'learning_rate': 5e-2, 'leaf_estimation_method': 'Gradient', # alternatives: 'Newton', 'Exact'
              'l2_leaf_reg': 1.0, 'random_strength': 1.0, 'bagging_temperature': 1.0, 'has_time': False,
              'grow_policy': 'SymmetricTree', # alternatives: 'Depthwise', 'Lossguide'
              'min_data_in_leaf': 1, 'max_leaves': 31,
              'random_seed': 43,
#              'one_hot_max_size': len(cat_features),
              'task_type': 'GPU'}

# NGBoost (NGBRegressor)
ngb_params = {'Base': default_tree_learner, # decision tree; for Ridge regression use default_linear_learner
              'Dist': Normal, 
              'Score': LogScore, # CRPS and MLE are also possible
              'learning_rate': 1e-2, 'natural_gradient': True, 'verbose': True, 'verbose_eval': 100,
              'tol': 1e-4, 'random_state': 43, 'n_estimators': 100, 'minibatch_frac': 0.5}

# LogisticRegression
logr_params = {'penalty':'l2',  'solver': 'newton-cg', # one of 'newton-cg', 'lbfgs', 'sag', 'saga'
               'C': 0.05,
#               'class_weight':'balanced', 
               'max_iter': 500, 'random_state': 43, 'n_jobs': -1}

# LinearRegression
linr_params = {'fit_intercept': True, 'normalize': False, 'copy_X': True, 'n_jobs': -1}

# BayesianRidge
bayridge_params = {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'alpha_init': None, 'compute_score': False,
                   'copy_X': True, 'fit_intercept': True,
                   'lambda_1': 1e-06, 'lambda_2': 1e-06, 'lambda_init': None,
                   'n_iter': 200, 'normalize': False, 'tol': 1e-3,
                   'verbose': True}

# Ridge
rdg_params = {'alpha': 0.01, 'copy_X': True, 'fit_intercept': True,
              'max_iter': 100, 'normalize': False,
              'random_state': 43, 'solver': 'auto', 'tol': 1e-3}

# ElasticNet
elastic_params = {'alpha': 0.0001, 'copy_X': True, 'fit_intercept': True, 'l1_ratio': 0.5,
                  'max_iter': 200, 'normalize': False, 'positive': False, 'precompute': False,
                  'random_state': 43, 'selection': 'cyclic', 'tol': 1e-4, 'warm_start': False}

# Lasso
lasso_params = {'alpha': 0.0001, 'copy_X': True, 'fit_intercept': True, 'max_iter': 200,
                'normalize': False, 'positive': False, 'precompute': False,
                'random_state': 43, 'selection': 'random', 'tol': 1e-4, 'warm_start': False}

# SGDRegressor
sgd_params = {'alpha': 1e-4, 'average': False, 'early_stopping': True,
              'epsilon': 1e-1, 'eta0': 1e-4, 'fit_intercept': True, 'l1_ratio': 0.15,
              'learning_rate': 'invscaling', 'loss': 'squared_loss', 'penalty': 'l2', 'power_t': 0.25,
              'max_iter': 3000, 'n_iter_no_change': 10, 'validation_fraction': 0.5,
              'random_state': 43, 'shuffle': True, 'tol': 1e-3, 'verbose': False, 'warm_start': False}
# KNeighborsRegressor
kn_params = {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto',
             'leaf_size': 30, 'p': 2, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1}

# RandomForestRegressor
rfr_params = {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse',
             'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None,
             'min_impurity_decrease': 0.0, 'min_impurity_split': 1e-7, 'max_samples': None,
             'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0,
             'n_estimators': 1000, 'n_jobs': -1, 'oob_score': False,
             'random_state': 43, 'verbose': 1, 'warm_start': False}

# ExtraTreesRegressor
etr_params = {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'mse',
              'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None,
              'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'max_samples': None,
              'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0,  
              'n_estimators': 100, 'n_jobs': -1, 'oob_score': False,
              'random_state': 43, 'verbose': 1, 'warm_start': False}

# GradientBoostingRegressor
gbr_params = {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None,
              'learning_rate': 5e-2, 'n_estimators': 200, 'loss': 'ls',
              'max_depth': 6, 'max_features': None, 'max_leaf_nodes': None,
              'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2,
              'min_weight_fraction_leaf': 0.0, 'subsample': 1.0, 'validation_fraction': 0.2,
              'n_iter_no_change': None, 'presort': 'deprecated',
              'random_state': 43, 'tol': 1e-4, 'verbose': 1, 'warm_start': False}

# BaggingRegressor
bag_params = {'base_estimator': None,
              'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0,
              'n_estimators': 5, 'n_jobs': None, 'oob_score': False, 
              'random_state': 43, 'verbose': 1, 'warm_start': False}

# AdaBoostRegressor
abr_params = {'base_estimator': None, 
              'learning_rate': 1.0, 'loss': 'linear', 'n_estimators': 5,
              'random_state': 43}

# LinearSVR
linsvr_params = {'epsilon': 0.0, 'tol': 0.0001, 'C': 1.0,
                 'loss': 'epsilon_insensitive', 'fit_intercept': True, 'intercept_scaling': 1.0,
                 'dual': True, 'verbose': 1, 'random_state': 43, 'max_iter': 1000}

In [None]:
def lgb_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate LightGBM with a fixed hyperparameter set.

    Runs ``run_cv_model`` with ``runLGB``, ``rmsle`` scoring, ``n_folds`` folds
    and a single repeat, and returns its result dict. ``seed`` feeds both the
    model's random state and the fold splitter.
    """
    fold_params = dict(
        boosting_type='gbdt',
        objective='regression',
        metric='rmse',
        tree_learner='feature',  # one of 'serial', 'feature', 'data', 'voting'
        max_depth=-1,
        min_child_samples=10,
        min_split_gain=0.01,
        min_child_weight=1e-2,
        reg_alpha=0.1,
        reg_lambda=1,
        num_leaves=35,
        max_bin=300,
        learning_rate=2e-2,
        bagging_fraction=0.9,
        bagging_freq=1,
        bagging_seed=4590,
        feature_fraction=0.85,
        n_estimators=50000,
        importance_type='gain',
        subsample_for_bin=200000,
        random_state=seed,
        data_random_seed=seed,
        n_jobs=-1,
        silent=False,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runLGB,
                        fold_params, rmsle, 'LGBMRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def xgb_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate XGBoost (GPU histogram method) with a fixed hyperparameter set.

    Runs ``run_cv_model`` with ``runXGB``, ``rmsle`` scoring, ``n_folds`` folds
    and a single repeat, and returns its result dict.
    """
    fold_params = dict(
        base_score=0.5,
        booster='gbtree',
        objective='reg:squarederror',
        colsample_bylevel=0.6,
        colsample_bynode=0.6,
        colsample_bytree=0.6,
        gamma=0,
        learning_rate=1e-2,
        n_estimators=50000,
        importance_type='gain',
        max_delta_step=0,
        max_depth=8,
        min_child_weight=0,
        reg_alpha=0.1,
        reg_lambda=1,
        scale_pos_weight=1,
        subsample=0.8,
        silent=None,
        verbosity=0,
        random_state=seed,
        seed=seed,
        tree_method='gpu_hist',
        gpu_id=0,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runXGB,
                        fold_params, rmsle, 'XGBRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def catboost_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate CatBoost (GPU task type) with a fixed hyperparameter set.

    Runs ``run_cv_model`` with ``runCAT``, ``rmsle`` scoring, ``n_folds`` folds
    and a single repeat, and returns its result dict.
    """
    fold_params = dict(
        bootstrap_type='Bayesian',
        boosting_type='Plain',  # alternatives: 'Ordered', 'Plain'
        iterations=50000,
        depth=8,
        loss_function='RMSE',
        eval_metric='RMSE',
        learning_rate=1e-2,
        leaf_estimation_method='Gradient',  # alternatives: 'Newton', 'Exact'
        l2_leaf_reg=1.0,
        random_strength=0.8,
        bagging_temperature=0.9,
        has_time=False,
        grow_policy='SymmetricTree',  # alternatives: 'Depthwise', 'Lossguide'
        min_data_in_leaf=1,
        max_leaves=31,
        random_seed=seed,
        task_type='GPU',
    )
    return run_cv_model(train, test, target, target_skf, encoding, runCAT,
                        fold_params, rmsle, 'CatBoostRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def ngboost_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate NGBoost and return the ``run_cv_model`` result dict.

    Overwrites four entries of the module-level ``ngb_params`` in place before
    running ``n_folds``-fold CV (single repeat) with ``runNGB`` and ``rmsle``.
    """
    ngb_params.update(
        learning_rate=5e-2,
        n_estimators=500,
        minibatch_frac=1.0,
        random_state=seed,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runNGB,
                        ngb_params, rmsle, 'NGBoost',
                        cv=n_folds, repeats=1, seed=seed)

def logistic_regression(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``LogisticRegression`` and return the ``run_cv_model`` result dict.

    Overwrites ``max_iter`` and ``random_state`` of the module-level
    ``logr_params`` in place, then runs ``n_folds``-fold CV (single repeat)
    with ``runLR`` and ``rmsle``.
    """
    logr_params.update(max_iter=500, random_state=seed)
    return run_cv_model(train, test, target, target_skf, encoding, runLR,
                        logr_params, rmsle, 'LogisticRegression',
                        cv=n_folds, repeats=1, seed=seed)

def lin_regression(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``LinearRegression`` and return the ``run_cv_model`` result dict.

    Sets ``n_jobs`` of the module-level ``linr_params`` in place, then runs
    ``n_folds``-fold CV (single repeat) with ``runLINR`` and ``rmsle``.
    """
    linr_params.update(n_jobs=-1)
    return run_cv_model(train, test, target, target_skf, encoding, runLINR,
                        linr_params, rmsle, 'LinearRegression',
                        cv=n_folds, repeats=1, seed=seed)

def bayesianridge(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``BayesianRidge`` and return the ``run_cv_model`` result dict.

    Overwrites the prior and iteration settings of the module-level
    ``bayridge_params`` in place, then runs ``n_folds``-fold CV (single repeat)
    with ``runBAYRIDGE`` and ``rmsle``. ``seed`` only affects the fold split.
    """
    bayridge_params.update(
        alpha_1=1e-06,
        alpha_2=1e-06,
        lambda_1=1.0,
        lambda_2=1e-07,
        n_iter=1000,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runBAYRIDGE,
                        bayridge_params, rmsle, 'BayesianRidge',
                        cv=n_folds, repeats=1, seed=seed)

def ridge(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``Ridge`` and return the ``run_cv_model`` result dict.

    Overwrites ``alpha``, ``random_state`` and ``max_iter`` of the module-level
    ``rdg_params`` in place, then runs ``n_folds``-fold CV (single repeat) with
    ``runRDG`` and ``rmsle``.
    """
    rdg_params.update(alpha=1.0, random_state=seed, max_iter=1000)
    return run_cv_model(train, test, target, target_skf, encoding, runRDG,
                        rdg_params, rmsle, 'Ridge',
                        cv=n_folds, repeats=1, seed=seed)

def elastic(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``ElasticNet`` and return the ``run_cv_model`` result dict.

    Overwrites four entries of the module-level ``elastic_params`` in place,
    then runs ``n_folds``-fold CV (single repeat) with ``runELASTIC`` and
    ``rmsle``.
    """
    elastic_params.update(
        alpha=1e-04,
        l1_ratio=0.5,
        random_state=seed,
        max_iter=1000,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runELASTIC,
                        elastic_params, rmsle, 'ELasticNet',
                        cv=n_folds, repeats=1, seed=seed)

def lasso(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``Lasso`` and return the ``run_cv_model`` result dict.

    Overwrites ``alpha``, ``random_state`` and ``max_iter`` of the module-level
    ``lasso_params`` in place, then runs ``n_folds``-fold CV (single repeat)
    with ``runLASSO`` and ``rmsle``.
    """
    lasso_params.update(alpha=1e-04, random_state=seed, max_iter=1000)
    return run_cv_model(train, test, target, target_skf, encoding, runLASSO,
                        lasso_params, rmsle, 'Lasso',
                        cv=n_folds, repeats=1, seed=seed)

def sgd(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate ``SGDRegressor`` and return the ``run_cv_model`` result dict.

    Overwrites nine entries of the module-level ``sgd_params`` in place, then
    runs ``n_folds``-fold CV (single repeat) with ``runSGD`` and ``rmsle``.
    """
    sgd_params.update(
        alpha=1e-04,
        early_stopping=True,
        epsilon=1e-1,
        eta0=1e-4,
        l1_ratio=0.15,
        learning_rate='invscaling',
        loss='squared_loss',
        validation_fraction=0.2,
        random_state=seed,
    )
    return run_cv_model(train, test, target, target_skf, encoding, runSGD,
                        sgd_params, rmsle, 'SGD',
                        cv=n_folds, repeats=1, seed=seed)

def kn_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate a k-nearest-neighbours regressor through ``run_cv_model``.

    The hyper-parameters are written into the module-level ``kn_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runKN``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: forwarded as the CV seed only (no model parameter is seeded here).
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    kn_params.update({
        'n_neighbors': 5,
        'weights': 'distance',
        'algorithm': 'auto',  # one of: auto, ball_tree, kd_tree, brute
        'leaf_size': 60,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runKN, kn_params, rmsle, 'KNeighbors',
                        cv=n_folds, repeats=1, seed=seed)

def rf_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate a random-forest regressor through ``run_cv_model``.

    The hyper-parameters are written into the module-level ``rfr_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runRFR``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the model ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    rfr_params.update({
        'ccp_alpha': 0,
        'criterion': 'mse',
        'max_depth': 63,
        'min_samples_leaf': 20,
        'min_samples_split': 50,
        'random_state': seed,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runRFR, rfr_params, rmsle, 'RandomForestRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def et_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate an extra-trees regressor through ``run_cv_model``.

    The hyper-parameters are written into the module-level ``etr_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runETR``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the model ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    etr_params.update({
        'ccp_alpha': 0,
        'criterion': 'mse',
        'max_depth': 63,
        'min_samples_leaf': 20,
        'min_samples_split': 50,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 1000,
        'random_state': seed,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runETR, etr_params, rmsle, 'ExtraTreesRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def gb_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate a gradient-boosting regressor through ``run_cv_model``.

    The hyper-parameters are written into the module-level ``gbr_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runGBR``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the model ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    gbr_params.update({
        'alpha': 0.9,
        'ccp_alpha': 0,
        'criterion': 'friedman_mse',
        'learning_rate': 5e-2,
        'n_estimators': 100,
        'max_depth': 31,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'subsample': 1.0,
        'validation_fraction': 0.2,
        'random_state': seed,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runGBR, gbr_params, rmsle, 'GradientBoostingRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def bag_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate a bagging ensemble of BayesianRidge models.

    The hyper-parameters are written into the module-level ``bag_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runBAG``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the ensemble ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    bag_params['base_estimator'] = BayesianRidge(n_iter=1000, lambda_1=1.0, lambda_2=1e-7)
    bag_params['bootstrap'] = True
    # Must be a plain bool. A trailing comma here previously stored the tuple
    # ``(True,)``, which only worked because it is truthy and is rejected by
    # scikit-learn versions that validate parameter types.
    bag_params['bootstrap_features'] = True
    bag_params['max_features'] = 1.0
    bag_params['max_samples'] = 1.0
    bag_params['n_estimators'] = 96
    bag_params['n_jobs'] = -1
    bag_params['random_state'] = seed

    bag_results = run_cv_model(train, test, target, target_skf, encoding,
                               runBAG, bag_params, rmsle, 'BaggingRegressor',
                               cv=n_folds, repeats=1, seed=seed)
    return bag_results

def ada_regressor(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate an AdaBoost ensemble of BayesianRidge models.

    The hyper-parameters are written into the module-level ``abr_params``
    mapping and the run is delegated to ``run_cv_model`` with the ``runABR``
    runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the ensemble ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    weak_learner = BayesianRidge(n_iter=1000, lambda_1=1.0, lambda_2=1e-7)
    abr_params.update({
        'base_estimator': weak_learner,
        'learning_rate': 2.0,
        'loss': 'linear',
        'n_estimators': 100,
        'random_state': seed,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runABR, abr_params, rmsle, 'AdaBoostRegressor',
                        cv=n_folds, repeats=1, seed=seed)

def lin_svr(train, test, target, target_skf, seed, n_folds, encoding):
    """Cross-validate a linear support-vector regressor through ``run_cv_model``.

    The hyper-parameters are written into the module-level ``linsvr_params``
    mapping and the run is delegated to ``run_cv_model`` with the
    ``runLINSVR`` runner and the ``rmsle`` metric.

    Args:
        train, test, target, target_skf, encoding: forwarded unchanged.
        seed: used both as the model ``random_state`` and as the CV seed.
        n_folds: forwarded as ``cv``.

    Returns:
        Whatever ``run_cv_model`` returns.
    """
    linsvr_params.update({
        'loss': 'squared_epsilon_insensitive',
        'max_iter': 1000,
        'random_state': seed,
    })

    return run_cv_model(train, test, target, target_skf, encoding,
                        runLINSVR, linsvr_params, rmsle, 'LinearSVR',
                        cv=n_folds, repeats=1, seed=seed)


# Ensemble & Stacking
---

In [None]:
%%time

NUM_DATASETS = 1

# Names of the first-layer out-of-fold (train) prediction files to stack.
# The loader cell matches each name as a substring of a pickle path, and the
# position in this list becomes the column number ``stacking_{j}``.
base_prediction_names = [
    "XGBRegressor_train_SEED47_FOLDS8_0623",
    "XGBRegressor2_train_SEED47_FOLDS8_0623",
    "LGBMRegressor_train_SEED47_FOLDS8_0623",
    "LGBMRegressor2_train_SEED47_FOLDS8_0623",
    "CatBoostRegressor_train_SEED47_FOLDS8_0623",
    "CatBoostRegressor2_train_SEED47_FOLDS8_0623",
    "XGBRegressor_train_SEED47_FOLDS8_0624",
    "XGBRegressor2_train_SEED47_FOLDS8_0624",
    "LGBMRegressor_train_SEED47_FOLDS8_0624",
    "LGBMRegressor2_train_SEED47_FOLDS8_0624",
    "CatBoostRegressor_train_SEED47_FOLDS8_0624",
    "CatBoostRegressor2_train_SEED47_FOLDS8_0624",
    "XGBRegressor_train_SEED47_FOLDS8_addon_0622",
    "LGBMRegressor_train_SEED47_FOLDS8_addon_0622",
    "CatBoostRegressor_train_SEED47_FOLDS8_addon_0622",
    "XGBRegressor_train_SEED47_FOLDS8_addon_0623",
    "LGBMRegressor_train_SEED47_FOLDS8_addon_0623",
    "CatBoostRegressor_train_SEED47_FOLDS8_addon_0623",
    "Ridge_train_SEED51_FOLDS10",
    "Ridge_train_SEED51_FOLDS10_0627",
    "RandomForestRegressor_train_SEED47_FOLDS8",
    "RandomForestRegressor_train_SEED47_FOLDS8_0627",
    "ExtraTreesRegressor_train_SEED47_FOLDS8",
    "ExtraTreesRegressor_train_SEED47_FOLDS8_0627",
    "NN_train_SEED47_FOLDS10",
    "NN2_train_SEED47_FOLDS10",
    "NN_train_SEED47_FOLDS10_0626",
    "NN2_train_SEED47_FOLDS10_0626",
    "NN_train_SEED47_FOLDS10_0627",
    "NN2_train_SEED47_FOLDS10_0627",
]

# One independent copy of the name list per dataset.
stacking_train_lists = [list(base_prediction_names) for _ in range(NUM_DATASETS)]

# The test names are derived from the train names instead of being typed out a
# second time: train and test columns are paired by position, so the two lists
# must never drift apart.
stacking_test_lists = [
    [name.replace("_train_", "_test_", 1) for name in base_prediction_names]
    for _ in range(NUM_DATASETS)
]

# Stacking 2層目
---

In [None]:
%%time

stacking_train_df_l = []
stacking_test_df_l  = []

pickle_l = glob.glob(f"./{out_dir}/*.pickle")

# Extra LightGBM predictions stored as CSV (column ``lgb_y``). Within each
# split they become columns stacking_addon1..4 in the order listed.
addon_csv_paths = {
    'train': [f"./{out_dir}/train_lgb_817.csv",
              f"./{out_dir}/train_lgb_0623.csv",
              f"./{out_dir}/train_lgb_0624.csv",
              f"./{out_dir}/train_lgb_0624_2.csv"],
    'test': [f"./{out_dir}/test_lgb_817.csv",
             f"./{out_dir}/test_lgb_0623.csv",
             f"./{out_dir}/test_lgb_0624.csv",
             f"./{out_dir}/test_lgb_0624_2.csv"],
}


def _find_prediction_pickle(name, candidates):
    """Return the path in *candidates* that matches the prediction *name*.

    Matching is by substring, so a short name such as
    ``Ridge_train_SEED51_FOLDS10`` also matches the file of
    ``Ridge_train_SEED51_FOLDS10_0627``. ``glob`` returns paths in arbitrary
    order, so taking "the first match" is not reproducible. The shortest
    matching path is chosen instead, with ties broken alphabetically.

    NOTE(review): this assumes the intended file is the one with the fewest
    extra characters around the name -- confirm against how the pickles are
    named when they are written.

    Raises:
        FileNotFoundError: if no candidate contains *name*.
    """
    matches = [path for path in candidates if name in path]
    if not matches:
        raise FileNotFoundError(f"no pickle file containing '{name}' was found")
    return min(matches, key=lambda path: (len(path), path))


def _load_stacking_frame(prediction_names, split):
    """Build one stacking feature table for *split* ('train' or 'test').

    Column ``stacking_{j}`` holds the pickled prediction for the j-th name in
    *prediction_names*; the add-on CSV predictions are appended after them.
    """
    frame = pd.DataFrame()
    for j, name in enumerate(prediction_names):
        # pickle.load can execute arbitrary code: only load files that this
        # pipeline wrote itself.
        with open(_find_prediction_pickle(name, pickle_l), 'rb') as f:
            frame[f'stacking_{j}'] = pickle.load(f)

    for n, csv_path in enumerate(addon_csv_paths[split], start=1):
        frame[f'stacking_addon{n}'] = pd.read_csv(csv_path).lgb_y
    return frame


for stacking_train_l in stacking_train_lists:
    stacking_train_df_l.append(_load_stacking_frame(stacking_train_l, 'train'))

for stacking_test_l in stacking_test_lists:
    stacking_test_df_l.append(_load_stacking_frame(stacking_test_l, 'test'))


In [None]:
# Registry of the model wrappers defined in this notebook, keyed by model name.
# Every wrapper is called with the same keyword arguments
# (train, test, target, target_skf, seed, n_folds, encoding) by the stacking
# cells, which pick entries from here by name.
_model_registry = (
    ('LGBMRegressor', lgb_regressor),
    ('XGBRegressor', xgb_regressor),
    ('CatBoostRegressor', catboost_regressor),
    ('NGBRegressor', ngboost_regressor),
    ('LogisticRegression', logistic_regression),
    ('LinearRegression', lin_regression),
    ('BayesianRidge', bayesianridge),
    ('Ridge', ridge),
    ('ElasticNet', elastic),
    ('Lasso', lasso),
    ('SGDRegressor', sgd),
    ('KNeighborsRegressor', kn_regressor),
    ('RandomForestRegressor', rf_regressor),
    ('ExtraTreesRegressor', et_regressor),
    ('GradientBoostingRegressor', gb_regressor),
    ('BaggingRegressor', bag_regressor),
    ('AdaBoostRegressor', ada_regressor),
    ('LinearSVR', lin_svr),
)
fnc_l = dict(_model_registry)

In [None]:
%%time

# Second stacking layer: every model in fnc_list is cross-validated on the
# first-layer prediction tables. Its out-of-fold / test predictions, averaged
# over the datasets, become column ``stacking2_{j}`` of stacking_train2 /
# stacking_test2.
stacking_train2 = pd.DataFrame()
stacking_test2  = pd.DataFrame()


# KNeighborsRegressor is registered in fnc_l but not used in this layer.
fnc_list = [fnc_l['LGBMRegressor'], fnc_l['XGBRegressor'], fnc_l['CatBoostRegressor'],
            fnc_l['BayesianRidge'], fnc_l['SGDRegressor'],
            fnc_l['RandomForestRegressor'], fnc_l['ExtraTreesRegressor']]

# Add the row-wise mean of the first-layer predictions as one more feature.
# It is computed once, and any existing 'mean' column is excluded, so every
# model sees exactly the same feature and re-running the cell does not fold
# the old mean into the new one. The frames are modified in place.
for frame in stacking_train_df_l + stacking_test_df_l:
    frame['mean'] = frame.drop(columns='mean', errors='ignore').mean(axis=1)

for j, target_fn in enumerate(fnc_list):
    # Recover the registry name of this wrapper; it is used in output file names.
    keys = [name for name, fn in fnc_l.items() if fn == target_fn]
    stacking_train_tmp = 0
    stacking_test_tmp = 0

    # NOTE: the names stacking_train / stacking_test are left bound to the last
    # dataset after this loop, and the NN stacking cells further down read them.
    for k, (stacking_train, stacking_test) in enumerate(zip(stacking_train_df_l, stacking_test_df_l)):
        stacking_SEED = 47
        stacking_N_FOLDS = 8
        encoding = []

        results_stacking = target_fn(train=stacking_train, test=stacking_test, target=y, target_skf=None,
                                     seed=stacking_SEED, n_folds=stacking_N_FOLDS, encoding=encoding)
        submission_stacking = output_results(y, results_stacking, test_id, f"STACKING_{keys[0]}_MODELSEL{k}")

        # Save this model's out-of-fold and test predictions for later inspection.
        oof_train = pd.DataFrame()
        oof_test  = pd.DataFrame()
        oof_train['id'] = train_data['id']
        oof_train['pred_y'] = results_stacking['train']
        oof_train['y'] = np.log1p(train_data['y'])
        oof_test['id'] = test_data['id']
        oof_test['pred_y'] = results_stacking['test']
        oof_train.to_csv(f"./{out_dir}/train_stacking_{keys[0]}_MODELSEL{k}.csv",index=False)
        oof_test.to_csv(f"./{out_dir}/test_stacking_{keys[0]}_MODELSEL{k}.csv",index=False)

        stacking_train_tmp += results_stacking['train']
        stacking_test_tmp  += results_stacking['test']

    # Average over the datasets.
    stacking_train2[f'stacking2_{j}'] = stacking_train_tmp/len(stacking_train_df_l)
    stacking_test2[f'stacking2_{j}']  = stacking_test_tmp/len(stacking_test_df_l)


In [None]:
# --- Settings for the neural-network stackers below ---

# Cross-validation: number of stratified folds, and the seed shared by the
# fold split and the layer initialisers.
SEED = 47
N_SPLITS = 10

# Optimisation: learning rate handed to nn()/nn2(), mini-batch size and the
# upper bound on epochs per fold.
LEARNING_RATE = 1e-3
BATCH_SIZE = 32
EPOCHS = 200

# Early stopping: epochs without improvement tolerated before training stops.
PATIENCE = 20

In [None]:
def create_callbacks():
    """Build the Keras callbacks used when fitting a stacking network.

    Returns:
        A two-element list: an ``EarlyStopping`` followed by a
        ``ReduceLROnPlateau``, both monitoring
        ``val_root_mean_squared_error``.
    """
    # Stop once the validation RMSE has not improved for PATIENCE epochs and
    # roll the model back to the best weights seen.
    early_stopping = EarlyStopping(
        monitor='val_root_mean_squared_error',
        min_delta=0,
        patience=PATIENCE,
        verbose=1,
        mode='auto',
        baseline=None,
        restore_best_weights=True,
    )

    # Multiply the learning rate by 0.95 after an epoch in which the validation
    # RMSE did not improve by at least 1e-4; never go below 1e-6.
    lr_decay = ReduceLROnPlateau(
        monitor='val_root_mean_squared_error',
        factor=0.95,
        patience=1,
        verbose=0,
        mode='auto',
        min_delta=1e-4,
        cooldown=0,
        min_lr=1e-6,
    )

    return [early_stopping, lr_decay]

In [None]:
def nn(lr, seed, input_shape):
    """Build and compile the deeper of the two stacking MLPs.

    Architecture: ReLU ``Dense`` layers of 256, 128, 64, 32, 16, 8 and 8
    units, each He-normal initialised with ``seed``, followed by a single
    linear output unit.

    The model is compiled with the Nadam optimizer, mean-squared-error loss
    and a root-mean-squared-error metric. Keras reports that metric as
    ``root_mean_squared_error``, which is the name the training callbacks
    monitor (with the ``val_`` prefix).

    Args:
        lr: learning rate for the Nadam optimizer.
        seed: seed given to every ``he_normal`` initialiser.
        input_shape: number of input features (passed as ``input_dim``).

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_units = [2 ** 8, 2 ** 7, 2 ** 6, 2 ** 5, 2 ** 4, 2 ** 3, 2 ** 3]

    # Only the first layer declares the input width.
    layers = [Dense(hidden_units[0], activation='relu', input_dim=input_shape,
                    kernel_initializer=he_normal(seed=seed))]
    layers += [Dense(units, activation='relu', kernel_initializer=he_normal(seed=seed))
               for units in hidden_units[1:]]
    layers.append(Dense(1))

    model = Sequential(layers)

    # Only the optimizer that is actually used is created. The alternatives
    # that used to be built here and thrown away needed tensorflow-addons and
    # an SGD ``decay`` argument just to construct this model.
    nadam_opt = Nadam(learning_rate=lr, beta_1=0.9, beta_2=0.999)

    model.compile(optimizer=nadam_opt,
                  loss='mean_squared_error',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

In [None]:
def nn2(lr, seed, input_shape):
    """Build and compile the shallower of the two stacking MLPs.

    Architecture: ReLU ``Dense`` layers of 256, 128, 64, 32 and 8 units, each
    He-normal initialised with ``seed``, followed by a single linear output
    unit.

    The model is compiled with the Nadam optimizer, mean-squared-error loss
    and a root-mean-squared-error metric. Keras reports that metric as
    ``root_mean_squared_error``, which is the name the training callbacks
    monitor (with the ``val_`` prefix).

    Args:
        lr: learning rate for the Nadam optimizer.
        seed: seed given to every ``he_normal`` initialiser.
        input_shape: number of input features (passed as ``input_dim``).

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_units = [2 ** 8, 2 ** 7, 2 ** 6, 2 ** 5, 2 ** 3]

    # Only the first layer declares the input width.
    layers = [Dense(hidden_units[0], activation='relu', input_dim=input_shape,
                    kernel_initializer=he_normal(seed=seed))]
    layers += [Dense(units, activation='relu', kernel_initializer=he_normal(seed=seed))
               for units in hidden_units[1:]]
    layers.append(Dense(1))

    model = Sequential(layers)

    # Only the optimizer that is actually used is created. The alternatives
    # that used to be built here and thrown away needed tensorflow-addons and
    # an SGD ``decay`` argument just to construct this model.
    nadam_opt = Nadam(learning_rate=lr, beta_1=0.9, beta_2=0.999)

    model.compile(optimizer=nadam_opt,
                  loss='mean_squared_error',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

In [None]:
%%time

# Second-layer stacker #1: the nn() network, cross-validated on the first-layer
# prediction table that the model loop above left in stacking_train /
# stacking_test.
history, history_false = [], []  # the *_false lists are not used in this cell
score, score_false = [], []
pred_train = np.zeros((stacking_train.shape[0]))  # out-of-fold predictions
pred_full_test = 0                                # sum of per-fold test predictions

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold_id, (train_idx, val_idx) in enumerate(tqdm(skf.split(stacking_train, y_bin), total=N_SPLITS)):
    print("*"*80)
    print(f"Started TF learning(1) fold:{fold_id+1} / {N_SPLITS}")

    # A fresh model per fold: fit on the training part, validate on the held-out part.
    model = nn(lr=LEARNING_RATE, seed=SEED, input_shape=stacking_train.shape[1])
    callbacks = create_callbacks()

    tr_X, val_X = stacking_train.iloc[train_idx].copy(), stacking_train.iloc[val_idx].copy()
    tr_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    history.append(model.fit(tr_X, tr_y, batch_size=BATCH_SIZE,
                             epochs=EPOCHS,
                             verbose=2,
                             validation_data=(val_X, val_y),
                             callbacks=callbacks))

    pred_train[val_idx] = model.predict(val_X).reshape(-1)
    score.append(model.evaluate(val_X, val_y, batch_size=BATCH_SIZE, verbose=0, return_dict=True))
    pred_full_test = pred_full_test + model.predict(stacking_test)

    # val_y is the positional selection made above; y[val_idx] would look the
    # fold up by index label instead. The square root is taken explicitly so
    # the score does not depend on the `squared=` keyword of mean_squared_error.
    RMSLE = np.sqrt(mean_squared_error(val_y, pred_train[val_idx]))
    print(f"RMSLE={RMSLE}")

RMSLE_overall = np.sqrt(mean_squared_error(y, pred_train))
print(f"Overall RMSLE={RMSLE_overall}")

# Fold-averaged test prediction, flattened to one value per test row.
pred_test = (pred_full_test/N_SPLITS).reshape(-1)

# Make submission
# NOTE(review): expm1 here presumably undoes a log1p applied to the target -- confirm where y is built.
print("Saving submission file")
submission = pd.DataFrame({'id': test_id, 'y': np.expm1(pred_test)})
submission.to_csv(f"./{out_dir}/submission_STACKING_NN1_CV{RMSLE_overall:.6f}.csv", index=False)

oof_train = pd.DataFrame()
oof_test  = pd.DataFrame()
oof_train['id'] = train_data['id']
oof_train['pred_y'] = pred_train
oof_train['y'] = np.log1p(train_data['y'])
oof_test['id'] = test_data['id']
oof_test['pred_y'] = pred_test
oof_train.to_csv(f"./{out_dir}/train_stacking_NN1.csv",index=False)
oof_test.to_csv(f"./{out_dir}/test_stacking_NN1.csv",index=False)

# Append as the next second-layer column, after the models in fnc_list.
stacking_train2[f'stacking2_{len(fnc_list)}'] = pred_train
stacking_test2[f'stacking2_{len(fnc_list)}']  = pred_test


In [None]:
%%time

# Second-layer stacker #2: the nn2() network, cross-validated on the
# first-layer prediction table that the model loop above left in
# stacking_train / stacking_test.
history, history_false = [], []  # the *_false lists are not used in this cell
score, score_false = [], []
pred_train = np.zeros((stacking_train.shape[0]))  # out-of-fold predictions
pred_full_test = 0                                # sum of per-fold test predictions

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)  # not used in this cell

for fold_id, (train_idx, val_idx) in enumerate(tqdm(skf.split(stacking_train, y_bin), total=N_SPLITS)):
    print("*"*80)
    print(f"Started TF learning(2) fold:{fold_id+1} / {N_SPLITS}")

    # A fresh model per fold: fit on the training part, validate on the held-out part.
    model = nn2(lr=LEARNING_RATE, seed=SEED, input_shape=stacking_train.shape[1])
    callbacks = create_callbacks()

    tr_X, val_X = stacking_train.iloc[train_idx].copy(), stacking_train.iloc[val_idx].copy()
    tr_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    history.append(model.fit(tr_X, tr_y, batch_size=BATCH_SIZE,
                             epochs=EPOCHS,
                             verbose=2,
                             validation_data=(val_X, val_y),
                             callbacks=callbacks))

    pred_train[val_idx] = model.predict(val_X).reshape(-1)
    score.append(model.evaluate(val_X, val_y, batch_size=BATCH_SIZE, verbose=0, return_dict=True))
    pred_full_test = pred_full_test + model.predict(stacking_test)

    # val_y is the positional selection made above; y[val_idx] would look the
    # fold up by index label instead. The square root is taken explicitly so
    # the score does not depend on the `squared=` keyword of mean_squared_error.
    RMSLE = np.sqrt(mean_squared_error(val_y, pred_train[val_idx]))
    print(f"RMSLE={RMSLE}")

RMSLE_overall = np.sqrt(mean_squared_error(y, pred_train))
print(f"Overall RMSLE={RMSLE_overall}")

# Fold-averaged test prediction, flattened to one value per test row.
pred_test = (pred_full_test/N_SPLITS).reshape(-1)

# Make submission
# NOTE(review): expm1 here presumably undoes a log1p applied to the target -- confirm where y is built.
print("Saving submission file")
submission = pd.DataFrame({'id': test_id, 'y': np.expm1(pred_test)})
submission.to_csv(f"./{out_dir}/submission_STACKING_NN2_CV{RMSLE_overall:.6f}.csv", index=False)

oof_train = pd.DataFrame()
oof_test  = pd.DataFrame()
oof_train['id'] = train_data['id']
oof_train['pred_y'] = pred_train
oof_train['y'] = np.log1p(train_data['y'])
oof_test['id'] = test_data['id']
oof_test['pred_y'] = pred_test
oof_train.to_csv(f"./{out_dir}/train_stacking_NN2.csv",index=False)
oof_test.to_csv(f"./{out_dir}/test_stacking_NN2.csv",index=False)

# Append as the second-layer column after the one written by the NN1 cell.
stacking_train2[f'stacking2_{len(fnc_list)+1}'] = pred_train
stacking_test2[f'stacking2_{len(fnc_list)+1}']  = pred_test


# Stacking 3層目

In [None]:
%%time

# Persist the complete second-layer prediction tables (one column per
# second-layer model), train first and then test.
layer2_tables = {
    f"./{out_dir}/train_stacking2_AllModel.csv": stacking_train2,
    f"./{out_dir}/test_stacking2_AllModel.csv": stacking_test2,
}
for csv_path, table in layer2_tables.items():
    table.to_csv(csv_path, index=False)


In [None]:
%%time

# Third stacking layer: fit each final stacker on the second-layer prediction
# tables, using 10 folds with y_bin passed as target_skf.
fnc_list2 = [fnc_l[model_key] for model_key in ('LinearRegression', 'BaggingRegressor')]

cols_to_stack = list(stacking_train2.columns)

for target_fn in fnc_list2:
    # Recover the registry name of this wrapper; it is used in output file names.
    keys = [name for name, fn in fnc_l.items() if fn == target_fn]
    stacking_SEED = 51
    stacking_N_FOLDS = 10
    encoding = []

    layer3_train = stacking_train2[cols_to_stack]
    layer3_test = stacking_test2[cols_to_stack]
    results_stacking2 = target_fn(
        train=layer3_train,
        test=layer3_test,
        target=y,
        target_skf=y_bin,
        seed=stacking_SEED,
        n_folds=stacking_N_FOLDS,
        encoding=encoding,
    )
    submission_stacking2 = output_results(y, results_stacking2, test_id, f"STACKING2_full_{keys[0]}")

    # Save the out-of-fold (train) predictions next to the log1p target ...
    oof_train = pd.DataFrame()
    oof_train['id'] = train_data['id']
    oof_train['pred_y'] = results_stacking2['train']
    oof_train['y'] = np.log1p(train_data['y'])
    oof_train.to_csv(f"./{out_dir}/train_stacking2_full_{keys[0]}.csv",index=False)

    # ... and the test predictions.
    oof_test = pd.DataFrame()
    oof_test['id'] = test_data['id']
    oof_test['pred_y'] = results_stacking2['test']
    oof_test.to_csv(f"./{out_dir}/test_stacking2_full_{keys[0]}.csv",index=False)

# Ensemble 3層目

In [None]:
%%time
# Hand-tuned blending weights, one per second-layer column stacking2_{j},
# in column order:
#         LGB   XGB   CAT   BAY   SGD   RFR   ETR   NN1   NN2
coef_l = [0.05, 0.00, 0.00, 0.65, 0.00, 0.05, 0.00, 0.15, 0.10]

# Fail loudly instead of silently blending a truncated set of models when the
# number of weights does not match the number of second-layer columns.
if len(coef_l) != stacking_train2.shape[1]:
    raise ValueError(
        f"expected {stacking_train2.shape[1]} ensemble weights "
        f"(one per stacking2 column), got {len(coef_l)}"
    )

results_train = 0
results_test  = 0
for j, coef in enumerate(coef_l):
    results_train += stacking_train2[f'stacking2_{j}'] * coef
    results_test  += stacking_test2[f'stacking2_{j}'] * coef

results = {'train': results_train, 'test':  results_test}

submission_ensemble = output_results(y, results, test_id, f"ENSEMBLE2")

In [None]:
# Report the wall-clock time elapsed since `start` (set elsewhere in the notebook).
elapsed = datetime.datetime.now() - start
print(elapsed)