# In [None]:  (notebook cell marker left over from the export)
import numpy as np
import pandas as pd
import gc
# Show up to 400 rows / columns when a DataFrame is displayed.
# The fully qualified "display." keys are used on purpose: the bare
# 'max_rows' / 'max_columns' patterns can match more than one registered
# option in newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 400)

import os
import glob

# Step (in seconds_in_bucket units) between the window start offsets used
# for the windowed book / trade features.
book_inc = 100
trade_inc = 100

# Window start offsets: every multiple of the step that is below 600.
book_intervals = list(range(book_inc, 600, book_inc))
trade_intervals = list(range(trade_inc, 600, trade_inc))

# Kaggle dir:
data_dir = '../input/optiver-realized-volatility-prediction/'

# Local dir:
# data_dir = './input/optiver-realized-volatility-prediction/'

############################################################################
# Feature Engineering Part I ###############################################
############################################################################

# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 WAP: each price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 WAP: each price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap3(df):
    """Return the level-1 WAP variant where each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

def calc_wap4(df):
    """Return the level-2 WAP variant where each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.size


# Create book features

def preprocessor_book(file_path):
    """Aggregate one stock's order-book parquet into one feature row per time_id.

    Args:
        file_path: Path handed to ``pd.read_parquet``. It must contain ``=``:
            ``file_path.split('=')[1]`` (e.g. ``.../stock_id=0`` -> ``0``) is
            used as the stock id when building ``row_id``.

    Returns:
        DataFrame with the whole-bucket aggregates, the realized volatilities
        restricted to ``seconds_in_bucket >= s`` for every ``s`` in
        ``book_intervals`` (column suffix ``_<s>``), and a ``row_id`` column
        formatted as ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)

    # Weighted average prices
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)

    # Log returns are computed inside each time_id. transform() always returns
    # a result aligned with df's index; groupby.apply() may prepend the group
    # key to the index (depending on the pandas version), which makes the
    # column assignment fail.
    for level in (1, 2, 3, 4):
        df[f'log_return{level}'] = (
            df.groupby(['time_id'])[f'wap{level}'].transform(log_return)
        )

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])

    # Relative spreads (spread divided by the mid price) for both levels
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)

    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])

    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations over the whole bucket
    create_feature_dict = {
        'wap1': [np.sum, np.std, np.mean],
        'wap2': [np.sum, np.std, np.mean],
        'wap3': [np.sum, np.std, np.mean],
        'wap4': [np.sum, np.std, np.mean],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_balance': [np.sum, np.max, np.mean],
        'price_spread':[np.sum, np.max, np.mean],
        'price_spread2':[np.sum, np.max, np.mean],
        'bid_spread':[np.sum, np.max, np.mean],
        'ask_spread':[np.sum, np.max, np.mean],
        'bid_ask_spread':[np.sum,  np.max, np.mean],
        'total_volume':[np.sum, np.max, np.mean],
        'volume_imbalance':[np.sum, np.max, np.mean],
    }

    # Aggregations repeated on each "seconds_in_bucket >= s" window
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
    }

    # Group by time_id over all seconds
    df_feature = pd.DataFrame(df.groupby(['time_id']).agg(create_feature_dict)).reset_index()

    # Flatten the two-level column index; time_id becomes 'time_id_'
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Group by time_id over the rows at or after each window start
    for s_min in book_intervals:
        df_feature_sec = pd.DataFrame(
            df.query(f'seconds_in_bucket >= {s_min}')
              .groupby(['time_id'])
              .agg(create_feature_dict_time)
        ).reset_index()

        # time_id becomes 'time_id_' and then 'time_id__<s_min>'
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(s_min))

        # Left merge: time_ids with no rows in the window get NaN features
        df_feature = pd.merge(df_feature, df_feature_sec, how='left',
                              left_on='time_id_', right_on=f'time_id__{s_min}')
        df_feature = df_feature.drop([f'time_id__{s_min}'], axis=1)

    # Create row_id and drop the raw time_id key
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis=1)

    return df_feature

# Example
# file_path = data_dir + "book_train.parquet/stock_id=0"
# preprocessor_book(file_path)

# Create Trade features

def preprocessor_trade(file_path):
    """Aggregate one stock's trade parquet into one feature row per time_id.

    Args:
        file_path: Path handed to ``pd.read_parquet``. It must contain ``=``:
            ``file_path.split('=')[1]`` is used as the stock id in ``row_id``.

    Returns:
        DataFrame whose feature columns all carry the ``trade_`` prefix, plus
        a ``row_id`` column formatted as ``"<stock_id>-<time_id>"``. The merge
        with the hand-made statistics keeps their ``time_id`` column, so the
        result also contains ``trade_time_id``.
    """
    df = pd.read_parquet(file_path)

    # transform() keeps the result aligned with df's index; groupby.apply()
    # may prepend the group key to the index (depending on the pandas
    # version), which makes the column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount'] = df['price'] * df['size']

    # Aggregations over the whole bucket
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max,np.mean],
        'amount':[np.sum,np.max,np.min],
    }

    # Aggregations repeated on each "seconds_in_bucket >= s" window
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }

    df_feature = df.groupby('time_id').agg(create_feature_dict).reset_index()
    # Flatten the two-level column index; time_id becomes 'time_id_'
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Group by time_id over the rows at or after each window start
    for s_min in trade_intervals:
        df_feature_sec = (
            df.query(f'seconds_in_bucket >= {s_min}')
              .groupby('time_id')
              .agg(create_feature_dict_time)
              .reset_index()
        )
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(s_min))

        # Left merge: time_ids with no trades in the window get NaN features
        df_feature = pd.merge(df_feature, df_feature_sec, how='left',
                              left_on='time_id_', right_on=f'time_id__{s_min}')
        df_feature = df_feature.drop([f'time_id__{s_min}'], axis=1)

    def tendency(price, vol):
        # Sum of percentage price changes, each weighted by the size of the
        # trade that produced it.
        df_diff = np.diff(price)
        val = (df_diff / price[1:]) * 100
        return np.sum(val * vol[1:])

    # Hand-made per-time_id statistics. A single groupby pass replaces
    # re-filtering the whole frame with a boolean mask for every time_id.
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        prices = df_id['price'].values
        sizes = df_id['size'].values
        price_mean = np.mean(prices)
        size_mean = np.mean(sizes)
        price_steps = np.diff(prices)

        lis.append({
            'time_id': n_time_id,
            'tendency': tendency(prices, sizes),
            # number of trades priced above / below the bucket mean
            'f_max': np.sum(prices > price_mean),
            'f_min': np.sum(prices < price_mean),
            # number of up-ticks / down-ticks between consecutive trades
            'df_max': np.sum(price_steps > 0),
            'df_min': np.sum(price_steps < 0),
            # price dispersion statistics
            'abs_diff': np.median(np.abs(prices - price_mean)),
            'energy': np.mean(prices ** 2),
            'iqr_p': np.percentile(prices, 75) - np.percentile(prices, 25),
            # same statistics on trade sizes (energy_v is a sum, not a mean)
            'abs_diff_v': np.median(np.abs(sizes - size_mean)),
            'energy_v': np.sum(sizes ** 2),
            'iqr_p_v': np.percentile(sizes, 75) - np.percentile(sizes, 25),
        })

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how='left', left_on='time_id_', right_on='time_id')

    # Prefix everything with trade_, then build row_id from the key column
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis=1)

    return df_feature

# Example
# file_path = data_dir + "trade_train.parquet/stock_id=0"
# preprocessor_trade(file_path)

def preprocessor(list_stock_ids, is_train = True):
    """Build the book + trade feature table for several stocks in parallel.

    Args:
        list_stock_ids: Iterable of stock ids; each one selects the
            ``stock_id=<id>`` partition under ``data_dir``.
        is_train: Read the ``*_train.parquet`` datasets when True, the
            ``*_test.parquet`` datasets otherwise.

    Returns:
        A single DataFrame with the rows of all stocks and a fresh 0..n-1
        index. Trade features are left-joined onto the book features on
        ``row_id``.
    """
    from joblib import Parallel, delayed # parallel computing to save time

    split = 'train' if is_train else 'test'

    def for_joblib(stock_id):
        file_path_book = data_dir + "book_" + split + ".parquet/stock_id=" + str(stock_id)
        file_path_trade = data_dir + "trade_" + split + ".parquet/stock_id=" + str(stock_id)

        # The per-stock frame is returned as is; concatenating it with an
        # empty DataFrame first added nothing.
        return pd.merge(preprocessor_book(file_path_book),
                        preprocessor_trade(file_path_trade),
                        on='row_id', how='left')

    frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )

    return pd.concat(frames, ignore_index=True)

# Train

# Debug example:
# train = pd.read_csv(data_dir + 'train.csv')
# list_s = [0,1]
# df_train = preprocessor(list_s, is_train = True)

# Real:
# Read the training table and build features for every stock id found in it.
train = pd.read_csv(data_dir + 'train.csv')
train_ids = train.stock_id.unique()
df_train = preprocessor(list_stock_ids= train_ids, is_train = True)

# Test: same procedure, reading the *_test.parquet datasets instead.
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test.stock_id.unique()
df_test = preprocessor(list_stock_ids = test_ids, is_train = False)


############################################################################
# Feature Engineering Part II ##############################################
############################################################################

# TRAIN
####################################################################################
# Build the same "<stock_id>-<time_id>" key the preprocessors emit, keep only
# the key, time_id and target, and attach the engineered features to it.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','time_id','target']]
df_train = train.merge(df_train, on = ['row_id'], how = 'left')
# stock_id is recovered from row_id as a string (text before the first '-').
df_train['stock_id'] = df_train['row_id'].apply(lambda x:x.split('-')[0])

#stock_id target encoding
# The mean is taken over the full training target of each stock; the same
# Series is reused later to encode the test set.
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_train['stock_id_target_enc'] = df_train['stock_id'].map(stock_id_target_mean) # train_set

# Stock_id as integer
df_train['stock_id'] = df_train['stock_id'].astype(int)

# tau
# sqrt(1 / number of distinct seconds with trades), for the whole bucket and
# for the windows starting at 400, 300 and 200.
df_train['size_tau'] = np.sqrt( 1/ df_train['trade_seconds_in_bucket_count_unique'] )
df_train['size_tau_400'] = np.sqrt( 1/ df_train['trade_seconds_in_bucket_count_unique_400'] )
df_train['size_tau_300'] = np.sqrt( 1/ df_train['trade_seconds_in_bucket_count_unique_300'] )
df_train['size_tau_200'] = np.sqrt( 1/ df_train['trade_seconds_in_bucket_count_unique_200'] )

# The tau2 variants all use the whole-bucket order count, scaled by a fixed
# constant (0.33, 0.5, 0.66) instead of a windowed count.
df_train['size_tau2'] = np.sqrt( 1/ df_train['trade_order_count_sum'] )
df_train['size_tau2_400'] = np.sqrt( 0.33/ df_train['trade_order_count_sum'] )
df_train['size_tau2_300'] = np.sqrt( 0.5/ df_train['trade_order_count_sum'] )
df_train['size_tau2_200'] = np.sqrt( 0.66/ df_train['trade_order_count_sum'] )

df_train['size_tau2_d'] = df_train['size_tau2_400'] - df_train['size_tau2']

# target to the end
# (pop 'target' from its current position and re-insert it as the last column)
cols=list(df_train)
cols.insert(len(cols), cols.pop(cols.index('target')))
df_train = df_train.reindex(columns= cols)


# TEST
####################################################################################
# Attach the engineered features to the test table via row_id.
df_test = test.merge(df_test, on = ['row_id'], how = 'left')
# stock_id is recovered from row_id as a string (text before the first '-').
df_test['stock_id'] = df_test['row_id'].apply(lambda x:x.split('-')[0])

# stock_id to the end (preserve the same column ordering as train)
cols=list(df_test)
cols.insert(len(cols), cols.pop(cols.index('stock_id')))
df_test = df_test.reindex(columns= cols)

#stock_id target encoding
# Uses the per-stock means computed on the training set.
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean) # test_set

# Stock_id as integer
df_test['stock_id'] = df_test['stock_id'].astype(int)

# tau
# Same definitions as the ones applied to df_train.
df_test['size_tau'] = np.sqrt( 1/ df_test['trade_seconds_in_bucket_count_unique'] )
df_test['size_tau_400'] = np.sqrt( 1/ df_test['trade_seconds_in_bucket_count_unique_400'] )
df_test['size_tau_300'] = np.sqrt( 1/ df_test['trade_seconds_in_bucket_count_unique_300'] )
df_test['size_tau_200'] = np.sqrt( 1/ df_test['trade_seconds_in_bucket_count_unique_200'] )

df_test['size_tau2'] = np.sqrt( 1/ df_test['trade_order_count_sum'] )
df_test['size_tau2_400'] = np.sqrt( 0.33/ df_test['trade_order_count_sum'] )
df_test['size_tau2_300'] = np.sqrt( 0.5/ df_test['trade_order_count_sum'] )
df_test['size_tau2_200'] = np.sqrt( 0.66/ df_test['trade_order_count_sum'] )

df_test['size_tau2_d'] = df_test['size_tau2_400'] - df_test['size_tau2']

# row_id to the beginning
cols=list(df_test)
cols.insert(0, cols.pop(cols.index('row_id')))

df_test = df_test.reindex(columns= cols)

# Check
# Prints True when train (minus its last column, the target) and test have
# the same columns in the same order.
print(list(df_train)[0:-1]==list(df_test))


def get_time_stock(df):
    """Append per-stock and per-time_id statistics of the volatility columns.

    For every volatility column the mean, std, max and min are computed once
    per ``stock_id`` (columns suffixed ``_<stat>_stock``) and once per
    ``time_id`` (columns suffixed ``_<stat>_time``), then left-merged back
    onto ``df``. Returns the merged frame; ``df`` itself is not modified.
    """
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_500',
        'log_return2_realized_volatility_500',
        'log_return1_realized_volatility_400',
        'log_return2_realized_volatility_400',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_200',
        'log_return2_realized_volatility_200',
        'log_return1_realized_volatility_100',
        'log_return2_realized_volatility_100',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_500',
        'trade_log_return_realized_volatility_400',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_200',
        'trade_log_return_realized_volatility_100',
    ]
    stats = ['mean', 'std', 'max', 'min']

    def _group_stats(key, tag):
        # Aggregate by `key`, flatten the two-level column names and tag them.
        # The key column itself ends up named '<key>__<tag>'.
        summary = df.groupby([key])[vol_cols].agg(stats).reset_index()
        summary.columns = ['_'.join(pair) for pair in summary.columns]
        return summary.add_suffix('_' + tag)

    # Group by the stock id
    per_stock = _group_stats('stock_id', 'stock')
    # Group by the time id
    per_time = _group_stats('time_id', 'time')

    # Merge with the original dataframe, then discard the duplicated keys
    merged = df.merge(per_stock, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
    merged = merged.merge(per_time, how='left', left_on=['time_id'], right_on=['time_id__time'])
    return merged.drop(columns=['stock_id__stock', 'time_id__time'])


# Add the per-stock / per-time_id volatility statistics to both sets. Each
# set is aggregated on its own rows.
df_train = get_time_stock(df_train)
df_test = get_time_stock(df_test)

# target to the end
# (the columns added above were appended after it)
cols=list(df_train)
cols.insert(len(cols), cols.pop(cols.index('target')))
df_train = df_train.reindex(columns= cols)

# Check
# Prints True when train (minus the target) and test columns still line up.
print(list(df_train)[0:-1]==list(df_test))

############################################################################
# Feature Engineering Part III #############################################
############################################################################

from sklearn.cluster import KMeans
# making agg features

# Pivot the raw targets to a time_id x stock_id matrix and correlate the
# stocks with each other.
train_p = pd.read_csv(data_dir + 'train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

ids = corr.index

# Cluster the stocks into 7 groups, using each stock's row of the correlation
# matrix as its feature vector.
kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)
print(kmeans.labels_)

# l[n] = list of stock ids assigned to cluster n.
# The ids are shifted by +1 before being multiplied by the boolean mask so
# that a stock id of 0 is not confused with "not in this cluster" (which
# yields 0); the shift is undone with x-1.
l = []
for n in range(7):
    l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )


# Per-cluster aggregates for train (mat) and test (matTest)
mat = []
matTest = []

# Columns averaged per cluster and time_id
nnn = ['time_id',
     'log_return1_realized_volatility',
     'log_return2_realized_volatility',
     'total_volume_sum',
     'trade_size_sum',
     'trade_order_count_sum',
     'price_spread_sum',
     'bid_spread_sum',
     'ask_spread_sum',
     'volume_imbalance_sum',
     'bid_ask_spread_sum',
     'size_tau',
     'size_tau2',
     'stock_id']

# n is the cluster index; it advances for every cluster, including the ones
# skipped for having fewer than 5 stocks.
n = 0
for ind in l:
    if len(ind)>=5:

        print(ind)
        # NaN-ignoring mean of the selected columns over the cluster's stocks,
        # per time_id; stock_id is then overwritten with the label 'c<n>'.
        newDf = df_train.loc[df_train['stock_id'].isin(ind),nnn]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:,'stock_id'] = 'c'+str(n)
        mat.append ( newDf )

        newDf = df_test.loc[df_test['stock_id'].isin(ind),nnn]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:,'stock_id'] = 'c'+str(n)
        matTest.append ( newDf )

    n+=1

mat1 = pd.concat(mat).reset_index()
mat2 = pd.concat(matTest).reset_index()
# The train cluster rows of time_id 5 are appended to the test aggregates.
# NOTE(review): presumably this guarantees that every cluster column exists
# after the pivot below even when the test set covers few stocks — confirm.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])


# Wide format: one row per time_id, one column per (feature, cluster) pair,
# with the two-level column names joined as '<feature>_<cluster label>'.
mat1 = mat1.pivot(index='time_id', columns='stock_id')
mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
mat1.reset_index(inplace=True)

mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
mat2.reset_index(inplace=True)

df_train = pd.merge(df_train,mat1,how='left',on='time_id')
# cluster_id = index of the first cluster list containing the stock id.
# Inside the comprehension the loop variable `l` shadows the global list `l`;
# enumerate(l) is evaluated with the global before the shadowing starts.
df_train['cluster_id']=df_train['stock_id'].apply(lambda s_id:[i for i, l in enumerate(l) if s_id in l][0])

# target to the end
cols=list(df_train)
cols.insert(len(cols), cols.pop(cols.index('target')))
df_train = df_train.reindex(columns= cols)

df_test = pd.merge(df_test,mat2,how='left',on='time_id')
df_test['cluster_id']=df_test['stock_id'].apply(lambda s_id:[i for i, l in enumerate(l) if s_id in l][0])

# Check
# Prints True when train (minus the target) and test columns still line up.
print(list(df_train)[0:-1]==list(df_test))

del mat1,mat2
gc.collect()

############################################################################
# GBM Models ###############################################################
############################################################################

# Predictors are all df_train columns except the first two and the last one
# (the target was moved to the end above).
# NOTE(review): assumes the first two columns are row_id and time_id — confirm.
pred_features=list(df_train)[2:-1]
X_train=df_train[pred_features].reset_index(drop=True)
Y_train=df_train.target.reset_index(drop=True)
X_test=df_test[pred_features].reset_index(drop=True)

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred against y_true."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_RMSPE(preds, lgbm_train):
    """Custom evaluation function: returns (name, value, is_higher_better).

    The labels are read from ``lgbm_train.get_label()``; the RMSPE is rounded
    to 5 decimals and flagged as lower-is-better.
    """
    y_true = lgbm_train.get_label()
    value = round(rmspe(y_true = y_true, y_pred = preds), 5)
    return 'RMSPE', value, False


# LightGBM Model
############################################################################
from sklearn.model_selection import KFold
import lightgbm as lgbm


def Model_lgbm_cv(params, k, X_train, X_test, Y_train, RS, cat, esr=100, makepred=True):
    """Train LightGBM with k-fold CV and return out-of-fold / test predictions.

    Args:
        params: LightGBM parameter dict passed to ``lgbm.train``.
        k: Number of folds.
        X_train, Y_train: Training features / target. Rows are selected by
            label with the positions produced by KFold, so both are expected
            to carry a default 0..n-1 index.
        X_test: Features to predict with every fold model.
        RS: Random state of the shuffled KFold.
        cat: Names of the categorical features.
        esr: Early stopping rounds.
        makepred: When true, also predict ``X_test``.

    Returns:
        ``(Level_1_train, Level_1_test, score_total)`` when ``makepred`` is
        true, where ``Level_1_train['train_yhat']`` holds the out-of-fold
        predictions and ``Level_1_test['test_yhat']`` the mean of the k fold
        predictions; otherwise only ``score_total`` (RMSPE of the out-of-fold
        predictions).
    """
    # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments
    # used below belong to the older lightgbm.train signature; newer releases
    # expect callbacks instead — confirm against the installed version.
    kf = KFold(n_splits=k, shuffle=True, random_state=RS)

    # Out-of-fold predictions for the training rows
    Level_1_train = pd.DataFrame(np.zeros((X_train.shape[0], 1)), columns=['train_yhat'])
    # One single-column frame (p_1, p_2, ...) per fold when makepred is set
    fold_test_preds = []

    for count, (train_index, test_index) in enumerate(kf.split(X_train, Y_train), start=1):
        train_rows = train_index.tolist()
        valid_rows = test_index.tolist()

        fold_train = X_train.loc[train_rows, :]
        fold_test = X_train.loc[valid_rows, :]
        fold_ytrain = Y_train[train_rows]
        fold_ytest = Y_train[valid_rows]

        # Each row is weighted by 1 / target^2 in both datasets
        lgbm_train = lgbm.Dataset(fold_train, fold_ytrain,
                                  weight=1/np.square(fold_ytrain),
                                  categorical_feature=cat)
        lgbm_valid = lgbm.Dataset(fold_test, fold_ytest, reference=lgbm_train,
                                  weight=1/np.square(fold_ytest),
                                  categorical_feature=cat)

        # (k-1)-folds model adjusting
        MODEL = lgbm.train(params=params,
                          train_set=lgbm_train,
                          valid_sets=[lgbm_train, lgbm_valid],
                          num_boost_round=15000,
                          feval=feval_RMSPE,
                          verbose_eval=100,
                          early_stopping_rounds=esr,
                          categorical_feature=cat)

        # Predict only the held-out fold; the prediction on the fitting rows
        # was never used, so it is no longer computed.
        p_fold = MODEL.predict(fold_test)
        Level_1_train.loc[valid_rows, 'train_yhat'] = p_fold

        score = rmspe(fold_ytest, p_fold)
        print('\n',k, '- cv, Fold', count, 'RMSPE:', round(score,5),'\n')

        if makepred:
            name = 'p_' + str(count)
            fold_test_preds.append(pd.DataFrame({name: MODEL.predict(X_test)}, columns=[name]))

    # Metric of the concatenated out-of-fold predictions
    score_total = rmspe(Y_train, Level_1_train['train_yhat'])
    print('\n',k, '- cv, TOTAL RMSPE:', round(score_total,5),'\n')

    if not makepred:
        return score_total

    # Mean of the k fold predictions on the test rows
    Level_1_test = pd.concat(fold_test_preds, axis=1)
    Level_1_test['model'] = Level_1_test.mean(axis=1)
    return Level_1_train, pd.DataFrame({'test_yhat': Level_1_test['model']}), score_total

# LightGBM parameters for the level-1 model.
params = {
      "objective": "rmse",
      "metric": "rmse",
      "boosting_type": "gbdt",
      'learning_rate': 0.05,
      'max_depth': -1,
      'max_bin':100,
      'min_data_in_leaf':500,
      'subsample': 0.72,
      'subsample_freq': 4,
      'feature_fraction': 0.5,
      'lambda_l1': 0.5,
      'lambda_l2': 1.0
  }

# Columns handed to LightGBM as categorical features
categorical_feature=['stock_id','cluster_id']


# 5-fold CV: out-of-fold train predictions, averaged test predictions and the
# overall RMSPE.
lgbm_train, lgbm_test, slgbm = Model_lgbm_cv(params,5,X_train, X_test, Y_train, RS=2305, cat=categorical_feature, esr=100, makepred=True)


# Catboost Model
############################################################################
from sklearn.model_selection import KFold
import catboost as catb


def Model_catb_cv(params, k, X_train, X_test, Y_train, RS, cat, esr=100, makepred=True):
    """Train CatBoost with k-fold CV and return out-of-fold / test predictions.

    Args:
        params: CatBoost parameter dict passed to ``catb.train``.
        k: Number of folds.
        X_train, Y_train: Training features / target. Rows are selected by
            label with the positions produced by KFold, so both are expected
            to carry a default 0..n-1 index.
        X_test: Features to predict with every fold model.
        RS: Random state of the shuffled KFold.
        cat: Names of the categorical features (passed by name to the Pools).
        esr: Early stopping rounds.
        makepred: When true, also predict ``X_test``.

    Returns:
        ``(Level_1_train, Level_1_test, score_total)`` when ``makepred`` is
        true, otherwise only ``score_total`` (RMSPE of the out-of-fold
        predictions).

    Raises:
        KeyError: If a name in ``cat`` is not a column of ``X_train``.
    """
    # The original looked every categorical column up with get_loc() but never
    # used the positions; only the early KeyError for a missing column is kept.
    missing = [col for col in cat if col not in X_train.columns]
    if missing:
        raise KeyError(missing[0])

    kf = KFold(n_splits=k, shuffle=True, random_state=RS)

    # Out-of-fold predictions for the training rows
    Level_1_train = pd.DataFrame(np.zeros((X_train.shape[0], 1)), columns=['train_yhat'])
    # One single-column frame (p_1, p_2, ...) per fold when makepred is set
    fold_test_preds = []

    for count, (train_index, test_index) in enumerate(kf.split(X_train, Y_train), start=1):
        train_rows = train_index.tolist()
        valid_rows = test_index.tolist()

        fold_train = X_train.loc[train_rows, :]
        fold_test = X_train.loc[valid_rows, :]
        fold_ytrain = Y_train[train_rows]
        fold_ytest = Y_train[valid_rows]

        # Each row is weighted by 1 / target^2 in both pools
        train_pool = catb.Pool(fold_train, fold_ytrain,
                               weight=1/np.square(fold_ytrain), cat_features=cat)
        valid_pool = catb.Pool(fold_test, fold_ytest,
                               weight=1/np.square(fold_ytest), cat_features=cat)

        # (k-1)-folds model adjusting
        MODEL = catb.train(params=params,
                          pool=train_pool,
                          eval_set=valid_pool,
                          num_boost_round=15000,
                          verbose_eval=100,
                          early_stopping_rounds=esr)

        # Predict only the held-out fold; the prediction on the fitting rows
        # was never used, so it is no longer computed.
        p_fold = MODEL.predict(fold_test)
        Level_1_train.loc[valid_rows, 'train_yhat'] = p_fold

        score = rmspe(fold_ytest, p_fold)
        print('\n',k, '- cv, Fold', count, 'RMSPE:', round(score,5),'\n')

        if makepred:
            name = 'p_' + str(count)
            fold_test_preds.append(pd.DataFrame({name: MODEL.predict(X_test)}, columns=[name]))

    # Metric of the concatenated out-of-fold predictions
    score_total = rmspe(Y_train, Level_1_train['train_yhat'])
    print('\n',k, '- cv, TOTAL RMSPE:', round(score_total,5),'\n')

    if not makepred:
        return score_total

    # Mean of the k fold predictions on the test rows
    Level_1_test = pd.concat(fold_test_preds, axis=1)
    Level_1_test['model'] = Level_1_test.mean(axis=1)
    return Level_1_train, pd.DataFrame({'test_yhat': Level_1_test['model']}), score_total


# CatBoost parameters for the level-1 model (this rebinds the `params` name
# previously used for LightGBM).
params = {'objective': 'RMSE',
            'learning_rate': 0.05,
            'depth': 4,
            'min_data_in_leaf': 700,
            'rsm': 0.8,
            'subsample': 0.8,
            'bootstrap_type': 'Bernoulli'}

# Columns handed to CatBoost as categorical features
categorical_feature=['stock_id','cluster_id']

# 5-fold CV with the same fold seed as the LightGBM run
catb_train, catb_test, scatb = Model_catb_cv(params,5,X_train, X_test, Y_train, RS=2305, cat=categorical_feature, esr=100, makepred=True)


# The GBM design matrices are not needed by the NN section; free them.
del X_train, Y_train, X_test
gc.collect()

############################################################################
# NN Models ################################################################
############################################################################

# 1) Imports and definitions
############################################################################
from numpy.random import seed
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error (NumPy version)."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def root_mean_squared_per_error(y_true, y_pred):
    """RMSPE written with Keras backend ops so it can be used as a loss."""
    relative_error = (y_true - y_pred) / y_true
    return K.sqrt(K.mean(K.square(relative_error)))

# Stop training after 20 epochs without a lower val_loss and restore the
# weights of the best epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, verbose=0,
    mode='min',restore_best_weights=True)

# Multiply the learning rate by 0.2 after 7 epochs without a lower val_loss.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=7, verbose=0,
    mode='min')

from tensorflow.keras.backend import sigmoid
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.layers import Activation

def swish(x, beta = 1):
    """Swish activation: x multiplied by sigmoid(beta * x)."""
    gate = sigmoid(beta * x)
    return x * gate

# Register the activation under the name 'swish' in Keras' custom objects;
# base_model requests it with activation='swish'.
get_custom_objects().update({'swish': Activation(swish)})

# 2) Basic Model Structure
############################################################################

def base_model(hidden_units, stock_embedding_size, max_cat_data, num_features=317):
    """Build the embedding + dense network used for the NN models.

    Args:
        hidden_units: Iterable with the width of each hidden Dense layer.
        stock_embedding_size: Output dimension of the stock_id embedding.
        max_cat_data: Largest stock_id value; the embedding is created with
            ``max_cat_data + 1`` rows.
        num_features: Width of the numeric input. Defaults to 317, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        An uncompiled ``keras.Model`` taking ``[stock_id, num_data]`` and
        producing a single linear output.
    """
    # Each instance will consist of two inputs:
    stock_id_input = keras.Input(shape=(1,), name='stock_id')
    num_input = keras.Input(shape=(num_features,), name='num_data')

    # Embed the stock id, flatten it and concatenate it with the numeric input
    stock_embedded = keras.layers.Embedding(max_cat_data+1, stock_embedding_size,
                                           input_length=1, name='stock_embedding')(stock_id_input)
    stock_flattened = keras.layers.Flatten()(stock_embedded)
    out = keras.layers.Concatenate()([stock_flattened, num_input])

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = keras.layers.Dense(n_hidden, activation='swish')(out)

    # A single output: our predicted vol
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

    return keras.Model(
        inputs = [stock_id_input, num_input],
        outputs = out,
    )

# 3) Transform Dataframes
############################################################################
from sklearn.preprocessing import QuantileTransformer

# Numeric columns to be quantile-transformed: everything except identifiers,
# the target encoding, the cluster id and the target.
colNames = [col for col in list(df_train.columns)
            if col not in {'row_id','time_id','stock_id','stock_id_target_enc','cluster_id','target'}]
# Infinite values are turned into NaN so they are handled by the imputation
# further down.
df_train.replace([np.inf, -np.inf], np.nan,inplace=True)
df_test.replace([np.inf, -np.inf], np.nan,inplace=True)

train_nn=df_train[colNames].copy()
test_nn=df_test[colNames].copy()

# Quantile transformation
# One transformer per column, fitted on train and applied to test.
for col in colNames:
    qt = QuantileTransformer(random_state=21,n_quantiles=2000, output_distribution='normal')
    train_nn[col] = qt.fit_transform(train_nn[[col]])
    test_nn[col] = qt.transform(test_nn[[col]])

# Re-attach the untransformed columns (cluster_id is not carried over).
train_nn[['time_id','stock_id','stock_id_target_enc','target']] = df_train[['time_id','stock_id','stock_id_target_enc','target']]
test_nn[['time_id','stock_id','stock_id_target_enc']] = df_test[['time_id','stock_id','stock_id_target_enc']]

# Missing imputation
# features_to_consider = every train_nn column except time_id and target
# (so it still contains stock_id and stock_id_target_enc).
features_to_consider = list(train_nn)
features_to_consider.remove('time_id')
features_to_consider.remove('target')

# Both sets are filled with the column means of the training set.
train_nn[features_to_consider] = train_nn[features_to_consider].fillna(train_nn[features_to_consider].mean())
test_nn[features_to_consider] = test_nn[features_to_consider].fillna(train_nn[features_to_consider].mean())

del df_train,df_test
gc.collect()


# 4) Kfold Function
############################################################################
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

def Model_NN_cv(params, k, NN_train, NN_test, RS, epochs=300, makepred=True):
    """Train the Keras model with k-fold CV and return out-of-fold / test predictions.

    Args:
        params: Keyword arguments forwarded to ``base_model``.
        k: Number of folds.
        NN_train: Training frame containing the columns listed in the
            module-level ``features_to_consider``, plus ``stock_id`` and
            ``target``. Rows are selected by label with the positions
            produced by KFold, so a default 0..n-1 index is expected.
        NN_test: Test frame with the same feature columns and ``stock_id``.
        RS: Random state of the shuffled KFold.
        epochs: Maximum number of epochs per fold.
        makepred: When true, also predict ``NN_test``.

    Returns:
        ``(Level_1_train, Level_1_test, score_total)`` when ``makepred`` is
        true, where ``Level_1_test['test_yhat']`` is the mean over the folds
        of the test predictions clipped to [0, 1e10]; otherwise only
        ``score_total`` (RMSPE of the out-of-fold predictions).
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=RS)

    # Out-of-fold predictions for the training rows
    Level_1_train = pd.DataFrame(np.zeros((NN_train.shape[0], 1)), columns=['train_yhat'])
    if makepred:
        Level_1_test = np.zeros(NN_test.shape[0])

    # Numeric inputs are every considered feature except stock_id, which is
    # fed to the embedding separately. A local list is built instead of
    # removing/re-appending 'stock_id' on the shared features_to_consider
    # list behind a bare except, so the global is never mutated.
    num_cols = [col for col in features_to_consider if col != 'stock_id']

    for count, (train_index, test_index) in enumerate(kf.split(NN_train), start=1):
        print('CV {}/{}'.format(count, k))

        train_rows = train_index.tolist()
        valid_rows = test_index.tolist()
        fold_ytrain = NN_train.loc[train_rows, 'target']
        fold_ytest = NN_train.loc[valid_rows, 'target']

        model = base_model(**params)
        model.compile(
            keras.optimizers.Adam(learning_rate=0.006),
            loss=root_mean_squared_per_error
        )

        # The scaler is fitted on the fitting rows of this fold only
        scaler = MinMaxScaler(feature_range=(-1,1))

        num_data = scaler.fit_transform(NN_train.loc[train_rows, num_cols].values)
        cat_data = NN_train.loc[train_rows, 'stock_id']

        num_data_test = scaler.transform(NN_train.loc[valid_rows, num_cols].values)
        cat_data_test = NN_train.loc[valid_rows, 'stock_id']

        model.fit([cat_data, num_data],
                  fold_ytrain,
                  batch_size=2048,
                  epochs=epochs,
                  validation_data=([cat_data_test, num_data_test], fold_ytest),
                  callbacks=[es, plateau],
                  validation_batch_size=len(fold_ytest),
                  shuffle=True,
                  verbose = 1)

        preds = model.predict([cat_data_test, num_data_test]).reshape(1,-1)[0]

        score = round(rmspe(y_true = fold_ytest, y_pred = preds),5)
        print('Fold {} {}: {}'.format(count, 'Neural Network', score))

        # Train prediction
        Level_1_train.loc[valid_rows, 'train_yhat'] = preds

        # Test prediction: each fold contributes 1/k of the clipped output
        if makepred:
            tt = scaler.transform(NN_test[num_cols].values)
            Level_1_test += model.predict([NN_test['stock_id'], tt]).reshape(1,-1)[0].clip(0,1e10)/k

    score_total = rmspe(NN_train['target'], Level_1_train['train_yhat'])
    print('\n',k, '- cv, TOTAL RMSPE:', round(score_total,5),'\n')

    if not makepred:
        return score_total

    Level_1_test = pd.DataFrame(Level_1_test)
    Level_1_test.columns = ['test_yhat']
    return Level_1_train, Level_1_test, score_total


# 5.1) Train NN 1
############################################################################
# Seed NumPy and TensorFlow before building the first network.
seed(55)
tf.random.set_seed(55)

# Five hidden layers; the embedding table is sized from the largest stock_id.
params = {'hidden_units': (128,64,32,16,8),
          'stock_embedding_size': 36,
          'max_cat_data': max(train_nn['stock_id'])
          }

neuraln_train1, neuraln_test1, s1 = Model_NN_cv(params, 5, NN_train=train_nn, NN_test=test_nn, RS=2305, epochs=300, makepred=True)

# 5.2) Train NN 2
############################################################################
# Different seeds and one hidden layer fewer; same folds (RS=2305).
seed(33)
tf.random.set_seed(33)

params = {'hidden_units': (128,64,32,16),
          'stock_embedding_size': 36,
          'max_cat_data': max(train_nn['stock_id'])
          }

neuraln_train2, neuraln_test2, s2 = Model_NN_cv(params, 5, NN_train=train_nn, NN_test=test_nn, RS=2305, epochs=300, makepred=True)


############################################################################
# Stacking Level 2 #########################################################
############################################################################

# Level-2 training matrix: stock_id plus the out-of-fold predictions of the
# four level-1 models.
X1_train=pd.DataFrame({
                       'stock_id': train_nn['stock_id'],
                       'lgbm':lgbm_train['train_yhat'],
                       'catb':catb_train['train_yhat'],
                       'neuraln1':neuraln_train1['train_yhat'],
                       'neuraln2':neuraln_train2['train_yhat']
                      })

# Level-2 test matrix: stock_id plus the fold-averaged test predictions.
X1_test=pd.DataFrame({
                       'stock_id': test_nn['stock_id'],
                       'lgbm':lgbm_test['test_yhat'],
                       'catb':catb_test['test_yhat'],
                       'neuraln1':neuraln_test1['test_yhat'],
                       'neuraln2':neuraln_test2['test_yhat']
                      })

# Small LightGBM used as the level-2 (stacking) model
params = {
     'objective': 'rmse',
     'metric': 'rmse',
     'boosting_type': 'gbdt',
     'learning_rate': 0.05,
     'num_leaves': 5,
     'min_data_in_leaf': 500,
     'colsample_bytree': 0.8,
  }

# Model LightGBM
# The target is taken from `train`, which was reduced to row_id, time_id and
# target earlier in the script.
categorical_feature=['stock_id']
lgbm_train2, lgbm_test2, s = Model_lgbm_cv(params,5,X1_train, X1_test, Y_train=train['target'], RS=2305, cat=categorical_feature, esr=50, makepred=True)

# Output
# Submission frame: row_id from the test table, target from the level-2 model
y_pred = pd.DataFrame({'row_id': test['row_id'],'target': lgbm_test2['test_yhat']})

# Kaggle output:
y_pred.to_csv('submission.csv',index = False)

# Local output:
# y_pred.to_csv('./output/submission.csv',index = False)