In this notebook, I will show you a baseline of what you can do with a neural network and K-fold cross-validation.  It is demonstrated on training data with 3 numerical features (log_return1, log_return2, trade_log_return1) and 1 categorical feature (stock_id).

In [None]:
# Standard python libraries
import io, os, time, re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from sklearn.model_selection import KFold
import tensorflow as tf
from joblib import Parallel, delayed

path_data = '../input/optiver-realized-volatility-prediction'

In [None]:
# Helper functions for calculations commonly used in this competition
def log_return(list_stock_prices):
    """Return the element-to-element difference of the natural log of the prices.

    The input must support ``np.log(...).diff()``; for a pandas Series the
    first element of the result has no predecessor and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# Create functions for extracting the data.  
# Thank you Manel for your insights on this from your original notebook https://www.kaggle.com/manels/lgb-starter
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute the per-stock feature frames in parallel and stack them.

    ``get_stock_stat(stock_id, dataType)`` is dispatched once per entry of
    ``stock_ids`` through joblib (``n_jobs=-1``); the resulting frames are
    concatenated row-wise with a fresh integer index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build realized-volatility features per (stock_id, time_id) for one stock.

    Reads the stock's order-book and trade parquet partitions under
    ``path_data``, derives two weighted average prices (wap1, wap2) from the
    book, converts the WAPs and the trade price into log returns within each
    ``time_id``, and aggregates every log-return column with
    ``realized_volatility``.

    Parameters
    ----------
    stock_id : int
        Stock whose ``stock_id=<id>`` partition is read.
    dataType : str
        Dataset name used in the parquet directory names
        (``book_<dataType>.parquet`` and ``trade_<dataType>.parquet``).

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) found in the book data, with columns
        ``log_return1``, ``log_return2`` and ``trade_log_return1``. Values
        still missing after the left join with the trade features are
        replaced by -999.
    """
    key = ['stock_id', 'time_id', 'seconds_in_bucket']
    time_order = ['time_id', 'seconds_in_bucket']

    #Book features
    df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
    # Log returns are differences between consecutive rows, so the rows must be
    # in time order within each time_id (the trade data below is sorted the same way).
    df_book = df_book.sort_values(by=time_order).reset_index(drop=True)
    df_book['stock_id'] = stock_id
    cols = key + [col for col in df_book.columns if col not in key]
    df_book = df_book[cols]

    df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] +
                                    df_book['ask_price1'] * df_book['bid_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] +
                                    df_book['ask_price2'] * df_book['bid_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])

    features_to_apply_realized_volatility = ['log_return'+str(i+1) for i in range(2)]
    # transform() keeps the frame's own index, so the result can be assigned
    # straight back as a column. groupby().apply() prepends the group key to
    # the index on pandas >= 2.0, which makes that assignment fail.
    for i, feature in enumerate(features_to_apply_realized_volatility, start=1):
        df_book[feature] = df_book.groupby(by = ['time_id'])['wap'+str(i)].transform(log_return)
    # Drop rows only after BOTH returns exist. Dropping between the two
    # computations would make the second diff lose one more row per time_id
    # and discard a valid log_return1 with it.
    df_book = df_book.dropna(subset=features_to_apply_realized_volatility)

    stock_stat = df_book.groupby(by = ['stock_id', 'time_id'])[features_to_apply_realized_volatility]\
                        .agg(realized_volatility).reset_index()

    #Trade features
    trade_stat =  pd.read_parquet(os.path.join(path_data,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
    trade_stat = trade_stat.sort_values(by=time_order).reset_index(drop=True)
    trade_stat['stock_id'] = stock_id
    cols = key + [col for col in trade_stat.columns if col not in key]
    trade_stat = trade_stat[cols]

    trade_stat['trade_log_return1'] = trade_stat.groupby(by = ['time_id'])['price'].transform(log_return)
    trade_stat = trade_stat.dropna(subset=['trade_log_return1'])

    trade_stat = trade_stat.groupby(by = ['stock_id', 'time_id'])[['trade_log_return1']]\
                           .agg(realized_volatility).reset_index()

    #Joining book and trade features; time_ids without usable trades get the -999 sentinel
    stock_stat = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left').fillna(-999)

    return stock_stat

In [None]:
# Build the train dataframe: train.csv with the per-stock features joined on (stock_id, time_id)
train_csv_path = os.path.join(path_data, 'train.csv')
train = pd.read_csv(train_csv_path)
train_stock_ids = train['stock_id'].unique()
train_stock_stat_df = get_dataSet(stock_ids = train_stock_ids, dataType = 'train')
train = train.merge(train_stock_stat_df, how = 'left', on = ['stock_id', 'time_id'])
print('Train shape: {}'.format(train.shape))
print(train)

In [None]:
# Build the test dataframe: test.csv with the per-stock features joined on (stock_id, time_id)
test_csv_path = os.path.join(path_data, 'test.csv')
test = pd.read_csv(test_csv_path)
test_stock_ids = test['stock_id'].unique()
test_stock_stat_df = get_dataSet(stock_ids = test_stock_ids, dataType = 'test')
test = test.merge(test_stock_stat_df, how = 'left', on = ['stock_id', 'time_id'])
# Rows that found no matching features are filled with 0
test = test.fillna(0)
print('Test shape: {}'.format(test.shape))
print(test)

In [None]:
# Create a new dataframe called train_data with one-hot encoding for the categorical stock_id
train_data = train.sort_index(axis=1) # Sort the columns by name first

# Make stock_id a one-hot encoded categorical feature.
# dtype is pinned to uint8 because pandas >= 2.0 creates bool dummies by default,
# and a frame mixing bool and float columns converts to an object-dtype numpy
# array, which cannot be fed to the network as numeric input.
train_stock_dummies = pd.get_dummies(train_data['stock_id'], prefix="stock_id", dtype=np.uint8)
train_data = pd.concat([train_data, train_stock_dummies], axis=1)
train_data = train_data.drop(['time_id', 'stock_id'], axis=1)
train_data

In [None]:
# Separate the training features (X) from the targets (Y) and convert both to numpy arrays.
# A float dtype is requested explicitly so the arrays are numeric even when
# train_data holds bool one-hot columns (a plain .values on a frame mixing
# bool and float columns yields an object-dtype array).
X = train_data.drop(['target'], axis=1).to_numpy(dtype=np.float64)
Y = train_data['target'].to_numpy(dtype=np.float64)

In [None]:
# Function for constructing a NN model
def build_model():
    """Return a new, uncompiled Sequential network: Dense 84 -> 48 -> 12 (ReLU) -> 1.

    The input width is read from the module-level feature matrix ``X``.
    """
    n_features = X.shape[1]
    return Sequential([
        Dense(84, activation='relu', input_shape=(n_features,)),
        Dense(48, activation='relu'),
        Dense(12, activation='relu'),
        Dense(1),
    ])

In [None]:
# K-fold cross-validation: fit a fresh model on each fold and score it with MAPE on the held-out rows
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cvscores = []

# The index arrays are named train_idx / val_idx so the loop does not overwrite
# the `train` dataframe built earlier in the notebook.
for fold_no, (train_idx, val_idx) in enumerate(kfold.split(X, Y), start=1):
    model = build_model()
    model.compile(optimizer='adam', loss='mean_absolute_percentage_error')
    print(f'Training for Fold {fold_no} ...')
    history = model.fit(X[train_idx], Y[train_idx],
              batch_size=128,
              epochs=100)
    # No extra metrics are compiled, so evaluate() returns the loss (MAPE) only
    scores = model.evaluate(X[val_idx], Y[val_idx], verbose=0)
    print(f'Fold {fold_no} CV Score: {scores}')
    cvscores.append(scores)
print(f'Overall Average of {len(cvscores)} folds: {sum(cvscores) / len(cvscores)}')

In [None]:
# Process the test set just like train_data so all columns are available & in the same order
test_data = test.sort_index(axis=1) # Sort the columns by name first

# Make stock_id a one-hot encoded categorical feature.
# dtype is pinned to uint8 because pandas >= 2.0 creates bool dummies by default,
# which would turn the final numpy array into object dtype.
test_stock_dummies = pd.get_dummies(test['stock_id'], prefix="stock_id", dtype=np.uint8)
test_data = pd.concat([test_data, test_stock_dummies], axis=1)
test_data = test_data.drop(['time_id', 'stock_id', 'row_id'], axis=1)
# Align to the training columns: columns missing from test (e.g. stock ids
# that only occur in train) are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns).fillna(0)
test_data = test_data.drop(['target'], axis=1)
test_data

In [None]:
# Predict on the test features (as a numpy array) with `model`, i.e. the model
# object left bound by the cross-validation loop (the one fit on the last fold)
test_features = test_data.to_numpy()
predictions = model.predict(test_features)
predictions

In [None]:
# Attach the flattened predictions to test and write the row_id / target pairs to submission.csv
flat_predictions = predictions.reshape(len(predictions))
test['target'] = flat_predictions.tolist()
submission = test.loc[:, ['row_id', 'target']]
submission.to_csv('submission.csv', index=False)
submission

I hope this leaves you with lots of room to make adjustments and create your own neural network to predict the realized volatility in this competition.  Best of luck!  If this helps you, please kindly let me know in the comments and upvote :)