For the submission, I am going to use gradient boosting from sklearn. For study, please refer to this notebook: https://www.kaggle.com/pkang0831/multiple-model-tryouts

In [None]:
# import necessary libraries: this will get updated as I go along
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection, preprocessing
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Paths of the two CSV tables.
train_csv_route = '../input/optiver-realized-volatility-prediction/train.csv'
test_csv_route = '../input/optiver-realized-volatility-prediction/test.csv'

# CSV tables.
train = pd.read_csv(train_csv_route)
test = pd.read_csv(test_csv_route)

# Book and trade tables (parquet), one train/test pair each.
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet')
trade_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet')
book_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_test.parquet')
trade_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet')

# Name -> frame lookup for everything loaded above.
data = dict(
    book_train=book_train,
    book_test=book_test,
    trade_train=trade_train,
    trade_test=trade_test,
    train=train,
    test=test,
)

In [None]:
# configuring class that groups the functions
class Optiver_feature_engineered:
    """Namespace for the feature helpers used during preprocessing.

    None of the helpers read ``self.df`` or ``self.df_name``; the two
    attributes are only stored by the constructor.

    Helpers:
        BAS: ``ask / bid - 1`` for each ask/bid pair.
        WAP: average of two price columns, each weighted by a size column.
        log_return: first difference of the natural log of a price series.
        realized_volatility: square root of the sum of squared log returns.
    """

    def __init__(self, df=None, df_name=None):
        """Store an optional frame and a label for it."""
        self.df = df
        self.df_name = df_name

    def BAS(self, ask_price, bid_price):
        """Return ``ask / bid - 1`` for each pair, as a list.

        The two iterables are walked in lockstep with ``zip``, so the result
        has the length of the shorter one.
        """
        return [ask_p / bid_p - 1 for ask_p, bid_p in zip(ask_price, bid_price)]

    def WAP(self, df):
        """Return a weighted average price from a four-column frame.

        Columns are taken by position, not by name, and combined as
        ``(col0 * col1 + col2 * col3) / (col1 + col3)``; callers choose the
        column order to decide which size weights which price.
        """
        cols = df.columns
        first_price, first_weight = df[cols[0]], df[cols[1]]
        second_price, second_weight = df[cols[2]], df[cols[3]]
        wap = (first_price * first_weight + second_price * second_weight) / (first_weight + second_weight)
        return wap

    def log_return(self, list_stock_prices):
        """Return the element-to-element change of ``log(price)``.

        The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas
        Series); the first element of the result has no predecessor and is NaN.
        """
        return np.log(list_stock_prices).diff()

    def realized_volatility(self, series_log_return):
        """Return ``sqrt(sum(log_return ** 2))`` for a series of log returns."""
        return np.sqrt(np.sum(series_log_return**2))

In [None]:
# Features derived from the book dataset:
#   - BAS
#   - WAP
#   - log return
#   - calculated volatility

# Keep only the rows of stock_id 0 in every training table.
book_train = book_train.loc[book_train['stock_id'].eq(0)]
trade_train = trade_train.loc[trade_train['stock_id'].eq(0)]
train = train.loc[train['stock_id'].eq(0)]

# Helper instance shared by the preprocessing functions below.
fe = Optiver_feature_engineered()
def preprocessings_book(df):
    """Build one row of book features per ``time_id``.

    The input is copied first, so the caller's frame is left untouched and
    the function gives the same result when called again on the same frame.

    Args:
        df: book rows with the columns ``time_id``, ``seconds_in_bucket``,
            ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2`` and
            ``ask_size1/2``.

    Returns:
        DataFrame with a ``time_id`` column followed by: mean prices, mean
        BAS1/BAS2, mean WAP1/WAP2, mean logr1/logr2, size-weighted mean
        seconds (``seconds_bids``, ``seconds_asks``), summed sizes, and the
        realized volatilities ``vol_1``/``vol_2``.

    Note:
        Rows are grouped by ``time_id`` only. A frame holding several
        ``stock_id`` values is pooled across stocks; filter beforehand if
        that is not intended.
    """
    # Avoid writing new columns into the caller's frame (which may itself be
    # a filtered slice) and avoid shifting seconds_in_bucket twice on re-run.
    df = df.copy()

    # Shift by one so rows at second 0 still carry weight in the
    # size-weighted seconds computed below.
    df['seconds_in_bucket'] = df['seconds_in_bucket'] + 1
    df['seconds_bids'] = df['seconds_in_bucket']*(df['bid_size1']+df['bid_size2'])
    df['seconds_asks'] = df['seconds_in_bucket']*(df['ask_size1']+df['ask_size2'])
    df['BAS1'] = fe.BAS(df['ask_price1'],df['bid_price1'])
    df['BAS2'] = fe.BAS(df['ask_price2'],df['bid_price2'])
    # WAP reads columns by position: bid price weighted by ask size,
    # ask price weighted by bid size.
    df['WAP1'] = fe.WAP(df[['bid_price1','ask_size1','ask_price1','bid_size1']])
    df['WAP2'] = fe.WAP(df[['bid_price2','ask_size2','ask_price2','bid_size2']])
    # transform keeps the original row index, so the result can be assigned
    # back as a column; apply would prepend time_id to the index on
    # pandas >= 2.0 and the assignment would fail.
    df['logr1'] = df.groupby(['time_id'])['WAP1'].transform(fe.log_return)
    df['logr2'] = df.groupby(['time_id'])['WAP2'].transform(fe.log_return)
    apply_functions = {"seconds_in_bucket":"mean",
                       "bid_price1":"mean",
                       "bid_price2":"mean",
                       "ask_price1":"mean",
                       "ask_price2":"mean",
                       "BAS1":"mean",
                       "BAS2":"mean",
                       "WAP1":"mean", # null values to be ignored when taking mean
                       "WAP2":"mean", # null values to be ignored when taking mean
                       "logr1":"mean",
                       "logr2":"mean",
                       "seconds_bids":"sum",
                       "seconds_asks":"sum",
                       'bid_size1':"sum",
                       'bid_size2':"sum",
                       'ask_size1':"sum",
                       'ask_size2':"sum"
                      }
    # All per-row columns exist now, so one grouping serves every aggregate.
    grouped = df.groupby(['time_id'])
    df_feature = grouped.agg(apply_functions)
    df_feature['vol_1'] = grouped['logr1'].apply(fe.realized_volatility)
    df_feature['vol_2'] = grouped['logr2'].apply(fe.realized_volatility)
    # Turn the summed seconds*size products into size-weighted mean seconds.
    df_feature['seconds_bids'] = df_feature['seconds_bids']/(df_feature['bid_size1'] + df_feature['bid_size2'])
    df_feature['seconds_asks'] = df_feature['seconds_asks']/(df_feature['ask_size1'] + df_feature['ask_size2'])
    df_feature = df_feature.reset_index()
    # The plain mean of seconds_in_bucket was only a by-product of the agg.
    df_feature = df_feature.drop(columns='seconds_in_bucket')
    return df_feature

# Preprocessing for trading dataset:

def preprocessings_trade(df):
    """Build one row of trade features per ``time_id``.

    The input is copied first, so the caller's frame is left untouched and
    the function gives the same result when called again on the same frame.

    Args:
        df: trade rows with the columns ``time_id``, ``seconds_in_bucket``,
            ``price``, ``size`` and ``order_count``.

    Returns:
        DataFrame with a ``time_id`` column followed by: summed
        ``order_count`` and ``size``, size-weighted mean seconds
        (``seconds_size``), mean ``price``, mean ``logr_p``, the price range
        ``spread`` (max - min) and the realized volatility ``vol_p``.

    Note:
        Rows are grouped by ``time_id`` only. A frame holding several
        ``stock_id`` values is pooled across stocks; filter beforehand if
        that is not intended.
    """
    # Avoid writing new columns into the caller's frame (which may itself be
    # a filtered slice) and avoid shifting seconds_in_bucket twice on re-run.
    df = df.copy()

    # Shift by one so trades at second 0 still carry weight in seconds_size.
    df['seconds_in_bucket'] = df['seconds_in_bucket'] + 1
    df['seconds_size'] = df['seconds_in_bucket']*df['size']
    # transform keeps the original row index, so the result can be assigned
    # back as a column; apply would prepend time_id to the index on
    # pandas >= 2.0 and the assignment would fail.
    df['logr_p'] = df.groupby(['time_id'])['price'].transform(fe.log_return)
    apply_func = {
        'order_count':'sum',
        'seconds_in_bucket':'mean',
        'size':'sum',
        'seconds_size':'sum',
        'price':'mean',
        'logr_p':'mean'
    }

    # All per-row columns exist now, so one grouping serves every aggregate.
    grouped = df.groupby(['time_id'])
    df_feature = grouped.agg(apply_func)
    df_feature['spread'] = grouped['price'].max() - grouped['price'].min()
    df_feature['vol_p'] = grouped['logr_p'].apply(fe.realized_volatility)
    # Turn the summed seconds*size products into a size-weighted mean second.
    df_feature['seconds_size'] = df_feature['seconds_size']/df_feature['size']
    df_feature = df_feature.reset_index()
    # The plain mean of seconds_in_bucket was only a by-product of the agg.
    df_feature = df_feature.drop(columns='seconds_in_bucket')
    return df_feature

# Per-time_id feature tables for the train and test inputs.
book_train_feature = preprocessings_book(book_train)
book_test_feature = preprocessings_book(book_test)
trade_train_feature = preprocessings_trade(trade_train)
trade_test_feature = preprocessings_trade(trade_test)


# Book features joined with trade features, then with the rows of `train`;
# stock_id is dropped and only rows without missing values are kept.
dataset = book_train_feature.merge(trade_train_feature, how='left', on=['time_id'])
df = (
    dataset
    .merge(train, how='right', on=['time_id'])
    .drop(columns='stock_id')
    .dropna()
)

In [None]:
# Training frame: book + trade features joined onto the rows of `train`
# (this repeats the merge done at the end of the previous cell).
dataset = book_train_feature.merge(trade_train_feature, how='left', on=['time_id'])
df = (
    dataset
    .merge(train, how='right', on=['time_id'])
    .drop(columns='stock_id')
    .dropna()
)

# Frame to predict on: same joins against `test`; rows with missing features
# are kept here.
dataset_val = book_test_feature.merge(trade_test_feature, how='left', on=['time_id'])
df_val = (
    dataset_val
    .merge(test, how='right', on=['time_id'])
    .drop(columns=['stock_id','row_id'])
)

In [None]:
from sklearn import ensemble

# Settings of the gradient boosting regressor, collected in one mapping so
# they can be inspected or logged before the estimator is built.
gbr_params = dict(
    n_estimators=150,
    learning_rate=0.05,
    loss='huber',
    alpha=0.9,
    criterion='friedman_mse',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=20,
    subsample=1.0,
    validation_fraction=0.2,
)
alg = ensemble.GradientBoostingRegressor(**gbr_params)

In [None]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import preprocessing

# Separate the predictors from the target and hold out 20% of the rows.
predictors = df.drop(columns='target')
X_train, X_test, Y_train, Y_test = train_test_split(predictors, df['target'], test_size=.2)


# Standardise the predictors: the scaler is fitted on the training part only
# and then reused, unchanged, on the held-out part.
scaler = preprocessing.StandardScaler()
feature_names = X_train.columns
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Report the shapes of the four pieces.
print(f'Train dataset shape: {X_train.shape}')
print(f'Test dataset shape: {X_test.shape}')
print(f'Train target dataset shape: {Y_train.shape}')
print(f'Test target dataset shape: {Y_test.shape}')

def rmspe(y_true, y_pred):
    """Return the root mean square percentage error of ``y_pred``.

    Each error is taken relative to the matching ``y_true`` value, squared,
    averaged, and the square root of that average is returned.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(relative_error ** 2)
    return np.sqrt(mean_squared)

# RMSPE is an error (lower is better), so it is registered as a loss.
# scikit-learn then reports the sign-flipped value, which keeps
# "higher score is better" true for any scorer-driven model selection.
rmspe_loss = metrics.make_scorer(rmspe, greater_is_better=False)

In [None]:
scoring = {
    'r2': 'r2',
    'rmspe': rmspe_loss
}
cv_results = cross_validate(alg, X_train, Y_train, return_train_score=True, scoring=scoring)
# fitting
model = alg.fit(X_train,Y_train)

# The model was fitted on standardised predictors, so the rows to predict
# must pass through the same fitted scaler, with the same column order.
# df_val itself is left untouched so this cell can be re-run safely.
X_val = df_val[X_train.columns]
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
# The model cannot handle NaN. Filling after scaling puts missing features
# at 0, i.e. at the training mean of that column, instead of at a raw 0.
X_val = X_val.fillna(0)
# produce prediction
predicted = model.predict(X_val)

In [None]:
# Columns of `test` other than stock_id/time_id, placed side by side with the
# predictions; the two frames are aligned on their index.
row_ids = test.drop(columns=['stock_id','time_id'])
predictions = pd.DataFrame({'target': predicted})
submission = pd.concat([row_ids, predictions], axis=1)
submission

In [None]:
# Write the submission frame to submission.csv, leaving out the index column.
submission.to_csv('submission.csv',index = False)