#### Thanks to https://www.kaggle.com/manels/lgb-starter for the initial inspiration

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
from timeit import default_timer as timer
import random
import gc

# Seed every random-number source used in this file with the same value.
GLOBAL_SEED_VALUE = 0
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only influences processes spawned later — confirm.
os.environ['PYTHONHASHSEED']=str(GLOBAL_SEED_VALUE)
random.seed(GLOBAL_SEED_VALUE)  # Python's built-in `random` module
np.random.seed(GLOBAL_SEED_VALUE)  # NumPy's global generator
from tensorflow.random import set_seed
set_seed(GLOBAL_SEED_VALUE)  # TensorFlow's global seed

In [None]:
# Root folder of the competition input data.
DATA_DIR = '/kaggle/input/optiver-realized-volatility-prediction'

# Number of rows per batch used by the neural-network code in this file.
BATCH_SIZE = 1024
# NOTE(review): named like a small tolerance but set to 1e20; in the visible part of the
# file it is only referenced from commented-out code — confirm the intended value.
EPSILON = 1e20
# Stock ids handed to the training pre-processing step (an empty list selects all stocks).
TRAIN_STOCK_IDS = []

# Width of one sub-window of `seconds_in_bucket`, and the ids of the sub-windows covering 600.
TIMEID_WINSIZE = 150
TIMEID_SUBWINS = list(range(int(600 / TIMEID_WINSIZE)))
print('TIMEID_SUBWINS:', TIMEID_SUBWINS)

In [None]:
from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` relative to `y_true`."""
    relative_error = (y_pred - y_true) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

def get_all_stock_ids(data_type):
    """List the stock ids that have an entry under the `book_{data_type}.parquet` folder.

    Each matched path is expected to contain `=<stock_id>`; the text after the
    first '=' is converted to int.
    """
    pattern = os.path.join(DATA_DIR, f'book_{data_type}.parquet/*')
    stock_ids = []
    for path in glob.glob(pattern, recursive=True):
        stock_ids.append(int(path.split('=')[1]))
    return stock_ids

def read_parquet_file(path):
    """Read one parquet path and return `(stock_id, dataframe)`.

    The stock id is the text following the first '=' in `path` and is returned
    as a string.
    """
    path_parts = path.split('=')
    return path_parts[1], pd.read_parquet(path)

def read_parquet_file_for_stock(stock_id, data_type):
    """Load the book and trade parquet partitions of one stock.

    Returns:
        Tuple `(df_book, df_trade)` read from the `stock_id=<stock_id>` partition of
        `book_{data_type}.parquet` and `trade_{data_type}.parquet` under `DATA_DIR`.
    """
    partition = f'stock_id={stock_id}'
    book_path = os.path.join(DATA_DIR, f'book_{data_type}.parquet', partition)
    trade_path = os.path.join(DATA_DIR, f'trade_{data_type}.parquet', partition)
    return pd.read_parquet(book_path), pd.read_parquet(trade_path)

def logr(series):
    """Return the first difference of the natural log of `series` (first entry is NaN)."""
    log_values = np.log(series)
    return log_values.diff()

def time_diff(seconds_in_bucket):
    """Return the row-to-row difference of `seconds_in_bucket` (first entry is NaN)."""
    elapsed = seconds_in_bucket.diff()
    return elapsed

def rv(returns):
    """Return the square root of the sum of squared `returns`."""
    squared_returns = returns ** 2
    return np.sqrt(np.sum(squared_returns))

def fill_fb(series):
    """Fill missing values forward, then backward for any leading gaps.

    Uses `ffill()` / `bfill()` rather than `fillna(method=...)`, which recent
    pandas releases deprecate; the filled values are the same.
    """
    return series.ffill().bfill()

def cnz(series):
    """Return how many entries of `series` are non-zero."""
    nonzero_total = np.count_nonzero(series)
    return nonzero_total

### Data model by pre-processing raw book & trade data

In [None]:
def _log_change_by_time_id(frame, column):
    """Per-`time_id` log difference of `column`, index-aligned with `frame`; gaps become 0."""
    # `transform` keeps the caller's index, so the result can be assigned straight back as
    # a column on every pandas version (a like-indexed `groupby.apply` gains the group key
    # as an extra index level from pandas 2.0 on).
    return frame.groupby(['time_id'])[column].transform(logr).fillna(0)


def preprocess_one_stock_id_data(stock_id:int, data_type, export=False, verbose=False):
    """Build per-(time_id, sib_id) aggregate features for one stock from its book and trade data.

    Args:
        stock_id: id of the stock whose parquet partitions are read.
        data_type: split name inserted into the parquet folder names
            (this file calls it with 'train' and 'test').
        export: when True, write the merged rows to 'df_merged.csv' and the
            aggregated features to 'syn_df.csv'.
        verbose: when True, print timer checkpoints and the final log line.

    Returns:
        Tuple `(stock_id, syn_df, logstr)` where `syn_df` holds one row per
        (time_id, sib_id) with 'stock_id', 'time_id', 'sib_id' as leading columns,
        and `logstr` summarises the raw/processed shapes and any null columns.
    """
    # Model v100.5 --------------------------------------------------------------------------
    df_book, df_trade = read_parquet_file_for_stock(stock_id, data_type)
    df_trade.drop('order_count', axis=1, inplace=True)
    logstr = f'Preprocessing log stock_id: {stock_id} raw shape: book: {df_book.shape}, trade: {df_trade.shape}'

    if verbose: print('\nTime_check 1:', round(timer(), 0))
    # --- Book features ---------------------------------------------------------------------
    # sib_id = index of the TIMEID_WINSIZE-wide sub-window the row falls into.
    df_book['sib_id'] = (df_book['seconds_in_bucket']/TIMEID_WINSIZE).astype(int)
    # wap1/wap2 weight each price by the opposite side's size; wap3/wap4 by its own side's size.
    df_book['wap1']   = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2']   = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])
    df_book['wap3']   = (df_book['bid_price1'] * df_book['bid_size1'] + df_book['ask_price1'] * df_book['ask_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap4']   = (df_book['bid_price2'] * df_book['bid_size2'] + df_book['ask_price2'] * df_book['ask_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])
    for level in range(1, 5):
        df_book[f'ret_w{level}'] = _log_change_by_time_id(df_book, f'wap{level}')
    df_book['vimb1'] = (df_book['ask_size1'] - df_book['bid_size1']) / (df_book['ask_size1'] + df_book['bid_size1'])
    size_change_sources = (('chg_bs1', 'bid_size1'), ('chg_as1', 'ask_size1'),
                           ('chg_bs2', 'bid_size2'), ('chg_as2', 'ask_size2'))
    for target_col, size_col in size_change_sources:
        df_book[target_col] = _log_change_by_time_id(df_book, size_col)
    # NOTE(review): 179700 equals 0+1+...+599, presumably normalising the time weights
    # over a 600-second bucket — confirm.
    df_book['sib_wt'] = df_book['seconds_in_bucket']/179700.0
    for level in range(1, 5):
        df_book[f'wret_w{level}'] = df_book['sib_wt']*df_book[f'ret_w{level}']

    # --- Trade features --------------------------------------------------------------------
    df_trade['sib_id'] = (df_trade['seconds_in_bucket']/TIMEID_WINSIZE).astype(int)
    df_trade['ret_p1'] = _log_change_by_time_id(df_trade, 'price')
    df_trade['sib_wt'] = df_trade['seconds_in_bucket']/179700.0
    df_trade['wret_p1'] = df_trade['sib_wt']*df_trade['ret_p1']

    if verbose: print('Time_check 2:', round(timer(), 0))

    # --- Merge book and trade rows ---------------------------------------------------------
    df_merged = pd.merge(df_book, df_trade, how='left', on=['time_id', 'sib_id', 'seconds_in_bucket'])
    # Book rows without a trade get the neighbouring trade price within the same time_id
    # (marked "compromise for now" by the author); every other gap is set to 0.
    df_merged['price'] = df_merged.groupby(['time_id'])['price'].transform(fill_fb)
    df_merged.fillna(0, inplace=True)
    # Spreads are computed as bid minus ask, exactly as in the original model.
    df_merged['basprd1'] = (df_merged['bid_price1'] - df_merged['ask_price1'])
    df_merged['basprd2'] = (df_merged['bid_price2'] - df_merged['ask_price2'])

    # --- Aggregate per (time_id, sib_id) ---------------------------------------------------
    # The dict order fixes the order of the resulting feature columns; keep it stable.
    aggregation_model = {
        'ret_w1': [rv, cnz, np.std], 'ret_w2': [rv, cnz, np.std], 'ret_p1': [rv, cnz, np.std], 'ret_w3': [rv, cnz, np.std], 'ret_w4': [rv, cnz, np.std],
        'wap1': [np.std], 'vimb1': [np.std], 'basprd1': [np.mean, np.std], 'basprd2': [np.mean, np.std],
        'wret_w1': [rv, np.std], 'wret_w2': [rv, np.std], 'wret_w3': [rv, np.std], 'wret_w4': [rv, np.std], 'wret_p1': [rv, np.std],
        'chg_bs1': [cnz], 'chg_as1': [cnz], 'chg_bs2': [cnz], 'chg_as2': [cnz]
    }
    syn_df = df_merged.groupby(['time_id', 'sib_id']).agg(aggregation_model).reset_index()
    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names.
    syn_df.columns = ['_'.join(col) for col in syn_df.columns]
    syn_df = syn_df.rename(columns={'time_id_':'time_id', 'sib_id_':'sib_id'})
    if verbose: print('Time_check 3:', round(timer(), 0))

    # --- Put the key columns first ---------------------------------------------------------
    syn_df['stock_id'] = int(stock_id)
    key_cols = ['stock_id', 'time_id', 'sib_id']
    feature_cols = [col for col in syn_df.columns if col not in key_cols]
    syn_df = syn_df[key_cols + feature_cols]
    if verbose: print('Time_check 4:', round(timer(), 0))
    if export:
        df_merged.to_csv('df_merged.csv', index=False)
        syn_df.to_csv('syn_df.csv', index=False)
    if verbose: print('Time_check 5:', round(timer(), 0))

    # Model v100.5 --------------------------------------------------------------------------

    null_row_count = syn_df.isnull().any(axis=1).sum()
    null_columns = syn_df.columns[syn_df.isnull().any()].tolist()
    logstr = logstr + f', pre-processed shape: {syn_df.shape}, {null_row_count} NULLS: {null_columns}'
    if verbose: print(logstr)
    return stock_id, syn_df, logstr

# Manual smoke test: pre-processes a single stock and prints the timing, the log line and
# the resulting column names. Disabled unless the condition below is edited.
if False: #set to True to test
    !rm /kaggle/working/*.csv
    from timeit import default_timer as timer
    STOCK_ID = 31
    time_start = timer()
    print(f'Running pre_processing test for stock_id {STOCK_ID} ... ', end="")
    _, df, logstr = preprocess_one_stock_id_data(STOCK_ID, 'train')
    time_end = timer()
    print(f'Time taken {round(time_end-time_start,1)} seconds')
    print(f'\n{logstr}\n')
    print('Processed data: columns:', list(df.columns))

### Parallelism

In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm
def prepare_feature_data_new(data_type, stock_ids_in_scope=None):
    """Pre-process the requested stocks in parallel and stack the results into one frame.

    Args:
        data_type: split name forwarded to `preprocess_one_stock_id_data`.
        stock_ids_in_scope: stock ids to process; `None` or an empty sequence
            means every stock id returned by `get_all_stock_ids(data_type)`.

    Returns:
        DataFrame with the per-stock feature frames concatenated row-wise
        (index reset).
    """
    if stock_ids_in_scope is None or len(stock_ids_in_scope) == 0:
        stock_ids = get_all_stock_ids(data_type)
    else:
        stock_ids = stock_ids_in_scope
    print(f'Considering {len(stock_ids)} stock ids ... {stock_ids}\n')

    job_inputs = tqdm(stock_ids)
    job_result = Parallel(n_jobs=-1)(delayed(preprocess_one_stock_id_data)(i, data_type) for i in job_inputs)
    print(f'extract_features completed: {len(job_result)} results returned.')
    result_stock_ids, result_dfs, result_output_str = zip(*job_result)
    # Sanity checks: every requested stock came back, each with its frame.
    assert sorted(set(result_stock_ids)) == sorted(set(stock_ids))
    assert len(result_stock_ids) == len(result_dfs)

    print('\nConsolidating the results ...')
    # One concat instead of repeated DataFrame.append: append re-copies the accumulated
    # frame on every call and no longer exists in pandas 2.x.
    consolidated_df = pd.concat(list(result_dfs), ignore_index=True)
    for log_line in result_output_str:
        print(' ', log_line)
    print('Done. Prepared feature data: shape:', consolidated_df.shape, '\n')

    return consolidated_df

### Training data

In [None]:
%%time
# Clear earlier CSV exports from the working folder, then build the training features.
!rm /kaggle/working/*.csv
pre_processed_train_df = prepare_feature_data_new(data_type='train', stock_ids_in_scope=TRAIN_STOCK_IDS)
# Optional: persist the pre-processed frame so later runs can skip the step above.
#pre_processed_train_df.to_csv('pre_processed_train_df.csv', index=False)

# Optional: load a previously exported frame instead of recomputing it.
#pre_processed_train_df = pd.read_csv('/kaggle/input/orvp1005s150/pre_processed_train_df.csv')

print('pre_processed_train_df.shape:', pre_processed_train_df.shape)
pre_processed_train_df.head(5)

In [None]:
# Show the names of all pre-processed training columns.
list(pre_processed_train_df.columns)
# Optional: expose the exported CSV as a download link in the notebook.
#from IPython.display import FileLink
#FileLink(r'pre_processed_train_df.csv')

In [None]:
# Run a garbage-collection pass now that pre-processing has finished.
gc.collect()

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def cluster_stock_ids(n_clusters):
    """Cluster stocks with KMeans on the mean and standard deviation of their target.

    Args:
        n_clusters: number of KMeans clusters.

    Returns:
        Tuple `(df, centers)`: `df` has one row per stock_id with 'target_mean',
        'target_std', 'cluster_label' and the coordinates of the assigned
        centroid ('centroid_mean', 'centroid_std'); `centers` is the fitted
        model's `cluster_centers_` array.
    """
    print('Running KMeans to cluster stock_ids ...')
    targets = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    targets = targets.sample(frac=1, random_state=GLOBAL_SEED_VALUE).reset_index(drop=True)

    per_stock = targets.groupby(['stock_id']).agg({'target': [np.mean, np.std]}).reset_index()
    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names.
    per_stock.columns = ['_'.join(parts) for parts in per_stock.columns]
    per_stock = per_stock.rename(columns={'stock_id_': 'stock_id'})

    kmeans_model = KMeans(n_clusters=n_clusters, random_state=GLOBAL_SEED_VALUE)
    kmeans_model.fit(per_stock[['target_mean', 'target_std']])
    print('\nIdentified stock_id cluster labels:')
    display(kmeans_model.labels_)
    per_stock['cluster_label'] = kmeans_model.labels_

    # Row i of `assigned_centers` is the centroid of the cluster that stock i belongs to.
    centers = kmeans_model.cluster_centers_
    assigned_centers = centers[per_stock['cluster_label'].to_numpy()]
    per_stock['centroid_mean'] = assigned_centers[:, 0]
    per_stock['centroid_std']  = assigned_centers[:, 1]
    return per_stock, centers

# Cluster the stocks into 7 groups; `stock_id_cluster_df` is read later by `flatten`.
stock_id_cluster_df, centroids = cluster_stock_ids(7)
print('\nIdentified centroid values:')
display(centroids)
print('\nResultant stock_id_cluster_df:')
display(stock_id_cluster_df.head(5))

# One colour per cluster label; the map has exactly 7 entries to match the call above.
LABEL_COLOR_MAP = {0:'green', 1:'brown', 2:'blue', 3:'cyan', 4:'red', 5:'magenta', 6: 'black'}
label_color = [LABEL_COLOR_MAP[l] for l in stock_id_cluster_df['cluster_label']]

# Scatter every stock by (target_mean, target_std), coloured by its cluster.
plt.rcParams["figure.figsize"] = (12,8)
ax = stock_id_cluster_df.plot.scatter(x='target_mean', y='target_std', alpha=0.5, c=label_color)

# Overlay each centroid as a blue square.
for cc in range(centroids.shape[0]):
    plt.scatter(centroids[cc,0], centroids[cc,1], s=100, marker='s', c='blue')

# Label every point with its stock_id.
for i, txt in enumerate(stock_id_cluster_df['stock_id']):
    ax.annotate(txt, (stock_id_cluster_df['target_mean'].iat[i], stock_id_cluster_df['target_std'].iat[i]))

plt.show()

### Prepare Training & Test Data

In [None]:
import sklearn.preprocessing as preprocessing

def flatten(df):
    """Pivot per-(stock_id, time_id, sib_id) rows into one wide row per (stock_id, time_id).

    Columns of each sub-window get the suffix `_sib_id_<n>`. Columns whose name
    starts with 'gf_' are treated as not sub-window specific and merged back
    once. The cluster centroid columns from the module-level
    `stock_id_cluster_df` are appended at the end.
    """
    print('Flatteing df ...')
    key_cols = ['stock_id', 'time_id']
    gfcols = [name for name in df.columns if name.startswith('gf_')]
    if gfcols:
        sibcols = [name for name in df.columns if name not in gfcols]
        gfcols += key_cols
        print('  gfcols:', gfcols, 'sib_cols:', sibcols)
        df_gf = df[gfcols].drop_duplicates()
        print('  df_gf.shape:', df_gf.shape, 'df_gf.columns:', df_gf.columns)
    else:
        sibcols, df_gf = df.columns, None

    flattened_df = pd.DataFrame()
    for sib_id in sorted(TIMEID_SUBWINS):
        suffix = f'_sib_id_{sib_id}'
        window = df[sibcols].loc[df['sib_id'] == sib_id]
        window = window.drop(['sib_id'], axis=1).reset_index(drop=True).add_suffix(suffix)
        # The merge keys must keep their plain names.
        window = window.rename(columns={f'time_id{suffix}' : 'time_id', f'stock_id{suffix}' : 'stock_id'})
        if flattened_df.shape[0] == 0:
            # Nothing accumulated yet: this window becomes the base frame.
            flattened_df = window
        else:
            flattened_df = flattened_df.merge(window, how = 'left', on = key_cols)

    print('  before merging df_gf, flattened_df.shape:', flattened_df.shape)

    if df_gf is not None:
        flattened_df = pd.merge(flattened_df, df_gf, how='left', on = key_cols)
        print('  after merging df_gf, flattened_df.shape:', flattened_df.shape)
    print('Flatteing df completed')

    centroid_cols = stock_id_cluster_df[['stock_id', 'centroid_mean', 'centroid_std']]
    flattened_df = pd.merge(flattened_df, centroid_cols, how='left', on='stock_id').reset_index(drop=True)
    print('Kmeans cluster centroid mean/std added ... flattened_df.shape:', flattened_df.shape)

    return flattened_df

def scale_features(train_df, test_df, features):
    """Standardise `features[1:]` in both frames with one scaler fitted on train and test together.

    The first entry of `features` is left unscaled. Both input frames are
    modified in place and also returned.
    """
    scaled_cols = features[1:]
    n_train = train_df.shape[0]
    stacked = pd.concat([train_df[scaled_cols], test_df[scaled_cols]])
    scaled = preprocessing.StandardScaler().fit_transform(stacked)
    # The first n_train rows of the scaled array belong to train, the rest to test.
    train_df[scaled_cols] = scaled[:n_train]
    test_df[scaled_cols]  = scaled[n_train:]
    print(f'Scaled {len(features)-1} features: train_df.shape:', train_df.shape, 'test_df.shape:', test_df.shape)
    return train_df, test_df

def prepare_training_and_test_data(submission_mode, verbose, test_df_fraction):
    """Assemble the scaled train/test frames and the feature list used by the models.

    Args:
        submission_mode: when True the test frame is built from the 'test'
            data; otherwise it is a per-stock sample held out of the training rows.
        verbose: when True, display the first rows of the train and test frames.
        test_df_fraction: fraction of each stock's training rows moved to the
            test frame when `submission_mode` is False.

    Returns:
        Tuple `(train_df, test_df, features)`; `features` holds the flattened
        feature column names without 'time_id'.
    """
    merge_keys = ['stock_id', 'time_id']

    train_csv = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    print('train_csv:', train_csv.shape, 'columns:', list(train_csv.columns))
    test_csv = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
    print('test_csv :', test_csv.shape, 'columns:', list(test_csv.columns), '\n')

    train_examples = flatten(pre_processed_train_df)
    train_null_cols = train_examples.columns[train_examples.isnull().any()].tolist()
    print('flattened train_examples.shape:', train_examples.shape, 'null columns:', train_null_cols)

    # Attach features to the targets, zero-fill unmatched rows, then shuffle reproducibly.
    train_df = pd.merge(train_csv, train_examples, how = 'left', on=merge_keys).fillna(0)
    train_df = train_df.sample(frac=1, random_state=GLOBAL_SEED_VALUE).reset_index(drop=True)
    print('train_df.shape:', train_df.shape)
    if verbose: display(train_df.head(2))

    if not submission_mode:
        print(f'\nNON SUBMISSION MODE --> Taking {test_df_fraction}x sample from training data for test')
        held_out = train_df.groupby("stock_id").sample(frac=test_df_fraction, random_state=GLOBAL_SEED_VALUE)
        train_df = train_df.drop(held_out.index).reset_index(drop=True)
        test_df  = held_out.reset_index(drop=True)
        test_df['row_id'] = test_df['stock_id'].astype(str) + '-' + test_df['time_id'].astype(str)
    else:
        print('\nSUBMISSION MODE ===>')
        test_features = prepare_feature_data_new(data_type='test')
        test_examples = flatten(test_features)
        test_null_cols = test_examples.columns[test_examples.isnull().any()].tolist()
        print('flattened test_examples.shape:', test_examples.shape, 'null columns:', test_null_cols)
        test_df = pd.merge(test_csv, test_examples, how = 'left', on=merge_keys).fillna(0)

    print('test_df.shape:', test_df.shape)
    if verbose: display(test_df.head(2))
    print('\nFinal train/test/feature data: ', 'train_df.shape:', train_df.shape, 'test_df.shape:', test_df.shape)

    # Every flattened column except the time_id key is a model feature.
    features = list(train_examples.columns)
    features.remove('time_id')
    print(f'\n{len(features)} features selected::')
    print(f'{features}\n')

    train_df, test_df = scale_features(train_df, test_df, features)

    return train_df, test_df, features

### LGBM model

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

def feval_rmspe(preds, train_data):
    """Custom LightGBM evaluation callback reporting RMSPE rounded to 5 decimals.

    Returns the triple `('feval_rmspe', value, False)`; the trailing False is
    the "higher is better" flag.
    """
    score = rmspe(y_true = train_data.get_label(), y_pred = preds)
    return 'feval_rmspe', round(score, 5), False

def lgb_model_1(model_name, X_train, Y_train, X_cval, Y_cval, features):
    """Train a LightGBM booster with inverse-squared-target sample weights.

    `model_name` and `features` are accepted but not used inside this function.
    Training runs for up to 5000 rounds with early stopping after 500 rounds,
    evaluated on the cross-validation set with `feval_rmspe`.

    Returns:
        The trained booster returned by `lgb.train`.
    """
    model_params = dict(
        task='train',
        categorical_column=[0],
        boosting_type='gbdt',
        max_depth=-1,
        max_bin=100,
        min_data_in_leaf=500,
        learning_rate=0.05,
        subsample=0.72,
        subsample_freq=4,
        feature_fraction=0.5,
        lambda_l1=0.5,
        lambda_l2=1.0,
        seed=GLOBAL_SEED_VALUE,
        tree_learner='voting',
        verbose=-1,
    )

    # Each sample is weighted by 1 / target**2.
    train_weights = 1/np.power(Y_train,2)
    cval_weights  = 1/np.power(Y_cval,2)
    train_data = lgb.Dataset(X_train, label=Y_train, categorical_feature=['stock_id'], weight=train_weights)
    cval_data  = lgb.Dataset(X_cval,  label=Y_cval, categorical_feature=['stock_id'], weight=cval_weights)
    booster = lgb.train(model_params, train_data, valid_sets=cval_data, feval=feval_rmspe,
                        categorical_feature=['stock_id'], num_boost_round=5000,
                        early_stopping_rounds=500, verbose_eval=False)
    return booster

### NN

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.layers import InputLayer, Dense, Activation, Dropout, Embedding, Flatten, Concatenate, LSTM, Reshape
from tensorflow.keras import Sequential, regularizers, Input, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
import keras.backend as K
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

def my_rmspe(y_true, y_pred):
    """Keras-backend RMSPE, used as the training loss of the neural-network models."""
    relative_error = (y_pred - y_true)/y_true
    return K.sqrt(K.mean(K.square(relative_error)))

### Bayes hyperparameter optimization

In [None]:
# Search space for the Bayesian hyper-parameter optimisation; the `name` of each
# dimension matches a keyword argument of `bayes_one_iteration`.
units             = Categorical(categories=[256, 512, 1024], name='units')
dropout           = Categorical(categories=[0.1, 0.2, 0.3, 0.4, 0.5], name='dropout')
l2_regularization = Real(low=0.001, high=0.5, prior='log-uniform', name='l2_regularization')
learning_rate     = Real(low=0.001, high=0.5, prior='log-uniform', name='learning_rate')
hyperparam_range  = [units, dropout, l2_regularization, learning_rate]
# Starting point handed to the optimiser, in the same order as `hyperparam_range`.
default_params    = [256, 0.3, 0.01, 0.01]
display(hyperparam_range)

# Shared state read and updated by `bayes_one_iteration`; filled by the function below.
nn_bayes_inputs = None
def nn_initialize_bayes_inputs(model_name, train_df, test_df, features):
    """Reset the module-level `nn_bayes_inputs` dict for a new optimisation run.

    Stores the data, the feature list, the iteration counter, the best score so
    far (starting at 1000) and the first train/validation split of a seeded,
    shuffled 4-fold KFold over `train_df`.
    """
    global nn_bayes_inputs
    nn_bayes_inputs = {
        "iteration": 0,
        "best_rmspe": 1000,
        "model_name": model_name,
        "train_df": train_df,
        "test_df": test_df,
        "features": features,
    }

    splitter = KFold(n_splits=4, shuffle=True, random_state=GLOBAL_SEED_VALUE)
    first_fold = next(splitter.split(nn_bayes_inputs["train_df"]))
    nn_bayes_inputs["train_index"], nn_bayes_inputs["cval_index"] = first_fold

    print('nn_initialize_bayes_inputs:', 'train_df.shape:', nn_bayes_inputs["train_df"].shape, 'test_df.shape:', nn_bayes_inputs["test_df"].shape)

@use_named_args(dimensions=hyperparam_range)
def bayes_one_iteration(units, dropout, l2_regularization, learning_rate):
    """Objective for the hyper-parameter search: train one LSTM model and score it.

    All data comes from the module-level `nn_bayes_inputs` dict prepared by
    `nn_initialize_bayes_inputs`, including the test frame and feature list
    (previously read from unrelated global names).

    Returns:
        RMSPE of the clipped predictions on `nn_bayes_inputs["test_df"]`.
    """
    time_start = timer()
    nn_bayes_inputs["iteration"] += 1

    print(f'\nRunning iteration {nn_bayes_inputs["iteration"]}:')
    print(f' Hyperparams: [{units} / {dropout} / {round(l2_regularization,4)} / {round(learning_rate,4)}]')

    test_df  = nn_bayes_inputs["test_df"]
    features = nn_bayes_inputs["features"]

    hyperparams = {'layers':3, 'units':units, 'dropout':dropout, 'l2_regularization':l2_regularization,
                   'learning_rate':learning_rate, 'activation':'relu', 'lstm_units':128, 'epochs':100}
    model, eval_rmspe, history = nn_model_lstm(hyperparams,
                                               nn_bayes_inputs["train_df"], nn_bayes_inputs["train_index"], nn_bayes_inputs["cval_index"],
                                               features, verbose=False)

    # The initial-state width must match the width the model was built with.
    test_preds = nn_make_predictions(model, test_df, features, model_name=nn_bayes_inputs["model_name"],
                                     lstm_units=hyperparams['lstm_units'], verbose=False).clip(0,1e10)
    # NOTE(review): the search minimises the score on the test frame rather than on the
    # validation fold — confirm this is intended.
    test_rmspe = rmspe(y_true = test_df['target'], y_pred = test_preds)
    test_r2    = r2_score(y_true = test_df['target'], y_pred = test_preds)
    print(f' Result: eval_rmspe: {round(eval_rmspe, 3)}, test_rmspe: {round(test_rmspe, 3)}, r2_score: {round(test_r2, 3)}, time taken: {int(timer()-time_start)} secs')

    if test_rmspe < nn_bayes_inputs["best_rmspe"]:
        nn_bayes_inputs["best_rmspe"] = test_rmspe
        print(f' Better hyperparameters found ({round(test_rmspe, 3)}):')
        print(f'   hidden units:   {hyperparams["units"]}')
        print(f'   dropout:        {hyperparams["dropout"]}')
        print(f'   l2 reg rate:    {round(hyperparams["l2_regularization"], 4)}')
        print(f'   learning rate:  {round(hyperparams["learning_rate"], 4)}')
        print()

    # Drop the model and reset the Keras session before the next iteration.
    del model
    K.clear_session()
    return test_rmspe

def run_bayes_optimization(train_df, test_df, features, n_calls=11):
    """Run `gp_minimize` over `hyperparam_range` for the "LSTM" model and return its result object."""
    nn_initialize_bayes_inputs("LSTM", train_df, test_df, features)
    search_settings = dict(
        func=bayes_one_iteration,
        dimensions=hyperparam_range,
        x0=default_params,
        random_state=GLOBAL_SEED_VALUE,
        n_calls=n_calls,
        acq_func='EI',
    )
    optimal_params = gp_minimize(**search_settings)
    print('best_rmspe:', nn_bayes_inputs["best_rmspe"], 'hyperparameters:', list(optimal_params.x))
    return optimal_params

### NN prediction function

In [None]:
def nn_make_predictions(model, X_df, features, model_name=None, verbose=False, lstm_units=None, batch_size=None):
    """Predict in row batches and return all predictions as one flat float64 array.

    Args:
        model: object exposing `predict(X)`.
        X_df: DataFrame holding at least the `features` columns.
        features: feature column names; must contain 'stock_id' when
            `model_name` is "NN1" or "LSTM".
        model_name: "NN1" feeds `[stock_id, numerical]` inputs, "LSTM"
            additionally feeds two zero initial-state arrays; any other value
            feeds `X_df[features]` as a single input.
        verbose: when True, print batch progress.
        lstm_units: width of the zero initial-state arrays (only for "LSTM").
        batch_size: rows per `predict` call; defaults to the module-level
            `BATCH_SIZE`.

    Returns:
        1-D array with one prediction per row of `X_df`, in row order.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    m = X_df.shape[0]
    full_batches = m // batch_size
    if verbose: print(f'  make_predictions: {m} examples, {full_batches} full batches, {m - full_batches*batch_size} leftover')

    uses_stock_embedding = model_name in ("NN1", "LSTM")
    if uses_stock_embedding:
        numerical_features = features.copy()
        numerical_features.remove('stock_id')
        # Select the columns once; the loop below only slices rows.
        Xc, Xn = X_df[['stock_id']], X_df[numerical_features]

    # The leading empty float64 chunk keeps the result float64 and makes the
    # zero-row case return an empty array instead of failing.
    chunks = [np.empty(0, dtype=np.float64)]
    for start in range(0, m, batch_size):
        stop = min(start + batch_size, m)
        n_rows = stop - start
        if verbose:
            if n_rows == batch_size:
                print(f'   predicting {start // batch_size}-th batch: {start} to {stop}')
            else:
                print(f'   predicting last batch: {start} to {m}')

        if uses_stock_embedding:
            X = [Xc.iloc[start:stop], Xn.iloc[start:stop]]
            if model_name == "LSTM":
                # Zero initial hidden and cell state, one row per example in the batch.
                X.append(np.zeros((n_rows, lstm_units)))
                X.append(np.zeros((n_rows, lstm_units)))
        else:
            X = X_df.iloc[start:stop][features]

        chunks.append(np.ravel(model.predict(X)))

    predictions = np.concatenate(chunks)
    if verbose: print(f'  predictions.shape:', predictions.shape)
    # Internal sanity check: exactly one prediction per input row.
    assert m == predictions.shape[0]
    return predictions

### NN Model1

In [None]:
def nn_model_nn1(hyperparams, train_df, train_index, cval_index, features, verbose=True):
    """Build, train and evaluate the dense network with a stock_id embedding.

    Args:
        hyperparams: dict with 'layers', 'units', 'activation', 'dropout',
            'l2_regularization', 'learning_rate' and 'epochs'.
        train_df: frame holding the feature columns and 'target'.
        train_index, cval_index: row labels of the training / validation split.
        features: feature column names, including 'stock_id'.
        verbose: forwarded to Keras `fit`; when False a progress line is
            printed every 50 epochs instead.

    Returns:
        Tuple `(model, eval_rmspe, history)`.
    """
    class EpochProgress(Callback):
        """Print a short progress line every 50 epochs when enabled."""
        def __init__(self, verbose=False):
            super(EpochProgress, self).__init__()
            self.verbose = verbose
        def on_epoch_end(self, epoch, logs=None):
            if self.verbose and (epoch+1)%50 == 0: print(f'  --> epoch {epoch+1} completed')

    def pick(rows, columns):
        return train_df.loc[rows, columns]

    categorical_features = ['stock_id']
    numerical_features = features.copy()
    numerical_features.remove('stock_id')

    X_train_cat, X_train_num = pick(train_index, categorical_features), pick(train_index, numerical_features)
    X_cval_cat,  X_cval_num  = pick(cval_index, categorical_features), pick(cval_index, numerical_features)
    Y_train = pick(train_index, 'target').values
    Y_cval  = pick(cval_index, 'target').values
    if verbose:
        print('X_train_cat.shape:', X_train_cat.shape, 'X_cval_cat.shape:', X_cval_cat.shape, 'Y_train.shape:', Y_train.shape)
        print('X_train_num.shape:', X_train_num.shape, 'X_cval_num.shape:', X_cval_num.shape, 'Y_cval.shape :', Y_cval.shape)

    # Embedding branch for stock_id, concatenated with the numerical inputs.
    cat_input = Input(shape=(1,), name='stock_id')
    embedded = Embedding(128, 24, input_length=1, name='embedding_inp')(cat_input)
    embedded = Flatten()(embedded)
    num_input = Input(shape=X_train_num.shape[1], name='numerical_input')
    hidden = Concatenate()([embedded, num_input])

    use_dropout = hyperparams['dropout'] > 0
    for _ in range(hyperparams['layers']):
        hidden = Dense(units=hyperparams['units'], activation=hyperparams['activation'],
                       kernel_regularizer=regularizers.l2(hyperparams['l2_regularization']))(hidden)
        if use_dropout:
            hidden = Dropout(hyperparams['dropout'], seed=GLOBAL_SEED_VALUE)(hidden)
    hidden = Dense(128, activation=hyperparams['activation'])(hidden)
    output = Dense(1)(hidden)

    model = Model(inputs=[cat_input, num_input], outputs=output)
    model.compile(Adam(lr=hyperparams['learning_rate'], decay=5e-4), loss=my_rmspe)
    escb = EarlyStopping(monitor='val_loss', mode='auto', restore_best_weights=True, patience=20, verbose=True)

    train_inputs = [X_train_cat, X_train_num]
    cval_inputs  = [X_cval_cat, X_cval_num]
    history = model.fit(train_inputs, Y_train, validation_data=(cval_inputs, Y_cval),
                        batch_size=BATCH_SIZE, shuffle=True, verbose=verbose, epochs=hyperparams['epochs'],
                        callbacks=[escb, EpochProgress(not verbose)])
    eval_rmspe = model.evaluate(cval_inputs, Y_cval, verbose=0, batch_size=BATCH_SIZE)
    if verbose: print(' eval_rmspe:', round(eval_rmspe, 3))
    return model, eval_rmspe, history

### NN model LSTM

In [None]:
def create_lstm_model(hyperparams, features, verbose):
    """Build the (uncompiled) Keras model: a chain of LSTM layers plus a stock_id embedding.

    The numerical input is cut into `len(TIMEID_SUBWINS)` consecutive slices of
    equal width; each slice goes through its own LSTM layer, which receives the
    state produced by the previous one. The last LSTM output is concatenated
    with the embedding and passed through the dense head.

    Returns:
        Keras `Model` taking `[stock_id, numerical, a0, c0]` inputs.
    """
    lstm_units = hyperparams['lstm_units']
    n_lstm_cells = len(TIMEID_SUBWINS)
    n_features   = int((len(features) - 1)/n_lstm_cells)

    cat_input = Input(shape=(1,), name='I1')
    emb = Embedding(128, 24, input_length=1, name='E1')(cat_input)
    emb = Flatten(name='F1')(emb)

    X  = Input(shape=(len(features)-1), name='i_num') # for now
    a0 = Input(shape=(lstm_units,), name='a0')
    c0 = Input(shape=(lstm_units,), name='c0')

    hidden_state, cell_state = a0, c0
    last_out = None
    for t in range(n_lstm_cells):
        first_col = int(t*n_features)
        end_col   = int((t+1)*n_features)
        window = Reshape((1, n_features), name=f'R{t}')(X[:, first_col:end_col])
        hidden_state, _, cell_state = LSTM(lstm_units, return_state = True, name=f'LSTM{t}')(
            inputs=window, initial_state=[hidden_state, cell_state])
        last_out = hidden_state

    O = Concatenate(name='concat')([emb,last_out])
    for k in (1, 2):
        O = Dense(units=hyperparams['units'], activation=hyperparams['activation'],
                  kernel_regularizer=regularizers.l2(hyperparams['l2_regularization']), name=f'dense{k}')(O)
        if hyperparams['dropout'] > 0:
            O = Dropout(hyperparams['dropout'], seed=GLOBAL_SEED_VALUE, name=f'dropout{k}')(O)

    O = Dense(128, activation=hyperparams['activation'], name='dense3')(O)
    O = Dense(1, activation='linear', name='output')(O)

    model = Model(inputs=[cat_input, X, a0, c0], outputs=O)
    print(' Using LSTM model: lstm_units: {}, n_features: {} --> {} layers, {} params'.format(
        lstm_units, n_features, len(model.layers), model.count_params()))
    if verbose: model.summary()
    return model

def nn_model_lstm(hyperparams, train_df, train_index, cval_index, features, verbose=True):
    """Build, train and evaluate the LSTM model on one train/validation split.

    Args:
        hyperparams: dict read by `create_lstm_model`, plus 'learning_rate',
            'epochs' and 'lstm_units'.
        train_df: frame holding the feature columns and 'target'.
        train_index, cval_index: row labels of the training / validation split.
        features: feature column names, including 'stock_id'.
        verbose: forwarded to Keras `fit`; when False a progress line is
            printed every 50 epochs instead.

    Returns:
        Tuple `(model, eval_rmspe, history)`.
    """
    lstm_units = hyperparams['lstm_units']
    categorical_features = ['stock_id']
    numerical_features = features.copy()
    numerical_features.remove('stock_id')

    X_train_cat, X_train_num = train_df.loc[train_index, categorical_features], train_df.loc[train_index, numerical_features]
    X_cval_cat,  X_cval_num  = train_df.loc[cval_index, categorical_features], train_df.loc[cval_index, numerical_features]
    Y_train, Y_cval = train_df.loc[train_index, 'target'].values, train_df.loc[cval_index, 'target'].values
    if verbose: print('X_train_cat.shape:', X_train_cat.shape, 'X_cval_cat.shape:', X_cval_cat.shape, 'Y_train.shape:', Y_train.shape)
    if verbose: print('X_train_num.shape:', X_train_num.shape, 'X_cval_num.shape:', X_cval_num.shape, 'Y_cval.shape :', Y_cval.shape)

    model = create_lstm_model(hyperparams, features, verbose)
    # NOTE(review): `lr` and `decay` are legacy Adam argument names — confirm the installed
    # TensorFlow version still accepts them.
    model.compile(Adam(lr=hyperparams['learning_rate'], decay=5e-4), loss=my_rmspe)
    escb = EarlyStopping(monitor='val_loss', mode='auto', restore_best_weights=True, patience=20, verbose=True)
    class CustomCallback(Callback):
        """Print a short progress line every 50 epochs when enabled."""
        def __init__(self, verbose=False):
            super(CustomCallback, self).__init__()
            self.verbose = verbose
        def on_epoch_end(self, epoch, logs=None):
            if self.verbose and (epoch+1)%50 == 0: print(f'  Epoch {epoch+1} completed')

    # Zero initial hidden (a0) and cell (c0) states, one row per example.
    a0_train = np.zeros((X_train_cat.shape[0], lstm_units))
    c0_train = np.zeros((X_train_cat.shape[0], lstm_units))
    a0_cval  = np.zeros((X_cval_cat.shape[0],  lstm_units))
    c0_cval  = np.zeros((X_cval_cat.shape[0],  lstm_units))
    # The validation inputs pass c0_cval as the cell state (a0_cval used to be passed twice;
    # both arrays are all zeros, so the numbers are unchanged).
    train_inputs = [X_train_cat, X_train_num, a0_train, c0_train]
    cval_inputs  = [X_cval_cat, X_cval_num, a0_cval, c0_cval]
    history = model.fit(train_inputs, Y_train, validation_data=(cval_inputs, Y_cval),
                        batch_size=BATCH_SIZE, shuffle=True, verbose=verbose, epochs=hyperparams['epochs'], callbacks=[escb, CustomCallback(not verbose)])
    eval_rmspe = model.evaluate(cval_inputs, Y_cval, verbose=0, batch_size=BATCH_SIZE)
    if verbose: print(' eval_rmspe:', round(eval_rmspe, 3))
    return model, eval_rmspe, history

### Common

In [None]:
def create_train_model_and_predict(model_name, model_type, submission_mode, train_df, test_df, features, n_kfolds, verbose=False):
    """Train one model per K-fold split and return fold-averaged test predictions.

    For every fold a model is fitted on the fold's training rows, its
    out-of-fold predictions are written to ``train_df['prediction_<model_name>']``
    and its test predictions (clipped to ``[0, 1e10]``) are accumulated into
    ``test_df['target']``, which is divided by ``n_kfolds`` at the end.

    Args:
        model_name: ``'NN1'`` or ``'LSTM'`` when ``model_type`` is ``'NN'``;
            for ``'LGB'`` the name is forwarded to ``lgb_model_1``.
        model_type: ``'NN'`` or ``'LGB'``.
        submission_mode: When falsy, ``test_df`` must already carry the true
            ``target`` (plus ``row_id``, ``stock_id`` and ``time_id``); the
            averaged predictions are then scored against it and dumped to
            ``err_df-<model_name>.csv``.
        train_df: Training frame holding ``target`` and all ``features``.
            Modified in place (the prediction column is added).
        test_df: Test frame holding ``row_id`` and all ``features``.
            Modified in place (``target`` is overwritten with predictions).
        features: Column names used as model inputs.
        n_kfolds: Number of ``KFold`` splits.
        verbose: Forwarded to the NN training helpers.

    Returns:
        ``(model_results, model_test_predictions)``. ``model_results`` maps the
        1-based fold number to the history returned by the NN training helper,
        or to a Feature/Importance frame for LGB. ``model_test_predictions`` is
        ``test_df[['row_id', 'target']]``.

    Raises:
        ValueError: If ``model_type`` or (for NN) ``model_name`` is unsupported.
    """
    nn_hyperparams = {
        'NN1':  {'layers':3, 'units':1024, 'dropout':0.4, 'l2_regularization':0.005, 'learning_rate':0.0005, 'activation':'relu', 'epochs':1000},
        'LSTM': {'layers':3, 'units':1024, 'dropout':0.3, 'l2_regularization':0.005, 'learning_rate':0.0005, 'activation':'relu', 'epochs':1000, 'lstm_units':128},
    }
    # Fail fast: an unknown type/name would otherwise only surface inside the
    # first fold as a NameError on a variable that was never assigned.
    if model_type not in ('NN', 'LGB'):
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'NN' or 'LGB'")
    if model_type == 'NN' and model_name not in nn_hyperparams:
        raise ValueError(f"Unsupported NN model_name {model_name!r}; expected one of {sorted(nn_hyperparams)}")
    is_nn = model_type == 'NN'

    print(f'\nRunning {model_type} model _{model_name}_ with {n_kfolds} folds ...')
    print(f' Inputs: train_df.shape: {train_df.shape}, test_df.shape: {test_df.shape}, {len(features)} features')
    print(f' Submission mode {submission_mode}')

    if not submission_mode:
        # Keep the ground truth aside: test_df['target'] is reused below as the
        # accumulator for the predictions.
        saved_y_true = test_df['target'].values.copy()
        syt = test_df[['row_id', 'stock_id', 'time_id', 'target']].rename(columns={'target':'y_true'})

    kfold_scores = {model_name: []}
    model_results = {}
    prediction_label = f'prediction_{model_name}'
    # Float initialisation so the float predictions fit the column dtype.
    train_df[prediction_label] = 0.0
    test_df['target'] = 0.0

    # Only the LSTM variant needs its state size when predicting.
    nn_predict_kwargs = {}
    if is_nn and model_name == 'LSTM':
        nn_predict_kwargs['lstm_units'] = nn_hyperparams['LSTM']['lstm_units']

    kf = KFold(n_splits=n_kfolds, shuffle=True, random_state=GLOBAL_SEED_VALUE)
    print(f'\nKFold: {kf}\n')

    index_labels = train_df.index.to_numpy()
    for kfold_iter, (train_pos, cval_pos) in enumerate(kf.split(train_df), start=1):
        time_start = timer()
        # KFold yields row positions while the lookups below are label based
        # (.loc); translate so a non-default index selects the right rows.
        train_index, cval_index = index_labels[train_pos], index_labels[cval_pos]
        print(f'KFold Iteration {kfold_iter}: train size {len(train_index)}, cval size {len(cval_index)} ...')
        if is_nn:
            # Fresh copy per fold, as the helpers receive the dict itself.
            hyperparams = dict(nn_hyperparams[model_name])
            train_fn = nn_model_nn1 if model_name == 'NN1' else nn_model_lstm
            model, _, history = train_fn(hyperparams, train_df, train_index, cval_index, features, verbose=verbose)
            cval_predictions = nn_make_predictions(model, train_df.loc[cval_index, features], features, model_name=model_name,
                                                   verbose=False, **nn_predict_kwargs).clip(0,1e10)
            model_results[kfold_iter] = history
        else:
            X_train, X_cval = train_df.loc[train_index, features], train_df.loc[cval_index, features]
            Y_train, Y_cval = train_df.loc[train_index, 'target'].values, train_df.loc[cval_index, 'target'].values
            print(' X_train.shape:', X_train.shape, 'X_cval.shape:', X_cval.shape, 'Y_train.shape:', Y_train.shape, 'Y_cval.shape:', Y_cval.shape)
            model = lgb_model_1(model_name, X_train, Y_train, X_cval, Y_cval, features)
            cval_predictions = model.predict(X_cval)
            model_results[kfold_iter] = pd.DataFrame({'Feature': model.feature_name(), 'Importance': model.feature_importance(importance_type='gain')})

        train_df.loc[cval_index, prediction_label] = cval_predictions
        cval_rmspe = round(rmspe(y_true=train_df.loc[cval_index, 'target'].values, y_pred=cval_predictions), 3)
        print(f' KFold iteration {kfold_iter} score = {cval_rmspe}')
        kfold_scores[model_name].append(cval_rmspe)

        if is_nn:
            fold_test_predictions = nn_make_predictions(model, test_df, features, model_name=model_name,
                                                        verbose=False, **nn_predict_kwargs)
        else:
            fold_test_predictions = model.predict(test_df[features])
        test_df['target'] = test_df['target'] + fold_test_predictions.clip(0,1e10)
        del model
        if is_nn:
            K.clear_session()
        time_end = timer()
        print(f'KFold iteration {kfold_iter} completed!!!, Took {round(time_end-time_start, 0)} seconds\n')

    # Overall score computed from the out-of-fold predictions of all folds.
    avg_cval_rmspe = round(rmspe(y_true = train_df['target'].values, y_pred = train_df[prediction_label].values), 3)
    avg_eval_r2    = round(r2_score(y_true = train_df['target'].values, y_pred = train_df[prediction_label].values), 3)
    print(f'After training, avg_cval_rmspe = {avg_cval_rmspe}, Fold Scores: {kfold_scores[model_name]}, R2 score = {avg_eval_r2}')

    # Turn the per-fold sum into the fold average.
    test_df['target'] = test_df['target'] / n_kfolds
    print('\nFew test predictions ...')
    display(test_df[['row_id', 'target']].head(3))
    model_test_predictions = test_df[['row_id', 'target']]

    if not submission_mode:
        # Persist truth vs. prediction per row for offline error analysis.
        err_df = pd.merge(syt, model_test_predictions, how='left', on='row_id')
        err_df = err_df.rename(columns={'target':'y_pred'})
        err_df['key'] = 'k-' + err_df['row_id']
        err_df = err_df[['key', 'row_id', 'stock_id', 'time_id', 'y_true', 'y_pred']]
        err_df.to_csv(f'err_df-{model_name}.csv', index=False)

        test_rmspe = round(rmspe(y_true = saved_y_true, y_pred = model_test_predictions['target'].values), 3)
        test_r2    = round(r2_score(y_true = saved_y_true, y_pred = model_test_predictions['target'].values), 3)
        print(f'On test data, RMSPE = {test_rmspe}, R2 score = {test_r2}')

    return model_results, model_test_predictions

### Run all -->

In [None]:
# Force a garbage-collection pass before the main run cell.
gc.collect()

In [None]:
%%time

# --- Run configuration -------------------------------------------------------
# submission_mode: when False, create_train_model_and_predict additionally
# scores the predictions against test_df['target'] and writes an err_df CSV.
submission_mode = True
verbose = False
# to_plot: enables the per-fold plots drawn in the plotting cell.
to_plot = True
# Passed to prepare_training_and_test_data; presumably the share of rows held
# out as test data when not in submission mode -- TODO confirm.
test_df_fraction = 0.15
n_kfolds = 4
model_type = 'NN' # LGB / NN
model_name = 'LSTM' # LGB1 / NN1 / LSTM

# Build the train/test frames and the list of feature columns.
train_df, test_df, features = prepare_training_and_test_data(submission_mode=submission_mode, verbose=verbose, test_df_fraction=test_df_fraction)
gc.collect()
# The two #''' markers below bracket the train + predict + export step.
#'''
model_results, submission = create_train_model_and_predict(model_name=model_name, model_type=model_type, submission_mode=submission_mode, n_kfolds=n_kfolds,
                                                           train_df=train_df, test_df=test_df, features=features, verbose=verbose)
# Write the row_id/target predictions without the frame index.
submission.to_csv('submission.csv', index=False)
#'''

# Alternative entry point (disabled): hyper-parameter search.
#optimal_params = run_bayes_optimization(train_df, test_df, features, n_calls=50)

In [None]:
def plot_lgb(axs, model_perf):
    """Draw the 25 largest feature importances as a horizontal bar chart.

    Args:
        axs: Axes object handed to ``DataFrame.plot`` as ``ax``.
        model_perf: Frame with ``Feature`` and ``Importance`` columns. It is
            not modified.
    """
    # Work on a derived frame only. The former in-place pre-sort reordered the
    # caller's frame and was redundant, since nlargest already selects the top
    # rows (only the order among exactly tied importances can differ).
    top_features = model_perf.nlargest(25, 'Importance', keep='first')
    # Ascending order puts the most important feature at the top of the chart.
    top_features = top_features.sort_values(by='Importance', ascending=True)
    top_features[['Importance', 'Feature']].plot(kind = 'barh', x = 'Feature', ax=axs, color = 'blue', fontsize=11)

def plot_nn(axs, history, fig_title):
    """Plot the training (red) and validation (blue) loss curves on `axs`.

    `history` must expose a `.history` mapping with 'loss' and 'val_loss'
    entries and an `.epoch` sequence. The first recorded epoch is left out
    of the plot.
    """
    curves = pd.DataFrame(history.history).assign(epoch=history.epoch).iloc[1:]
    for column, colour in (('loss', 'red'), ('val_loss', 'blue')):
        axs.plot(curves['epoch'], curves[column], color=colour)
    axs.set_title(fig_title)
    axs.grid(True)

# Global matplotlib styling for the per-fold figures.
plt.rcParams["figure.figsize"] = (20,6)
plt.rcParams["font.size"] = 9
plt.rcParams["font.weight"] = "bold"

# Draw two folds per figure for however many folds were actually trained,
# instead of assuming exactly four entries in model_results.
if to_plot and model_type in ('LGB', 'NN'):
    fold_ids = sorted(model_results)
    for start in range(0, len(fold_ids), 2):
        pair = fold_ids[start:start + 2]
        fig, axs = plt.subplots(nrows=1, ncols=2)
        for ax, fold_id in zip(axs, pair):
            if model_type == 'LGB':
                plot_lgb(ax, model_results[fold_id])
            else:
                plot_nn(ax, model_results[fold_id], f'KFold Model {fold_id}')
        # With an odd number of folds the last figure has one unused panel.
        for ax in axs[len(pair):]:
            ax.set_visible(False)
        fig.tight_layout(); plt.show()
print('Done')