In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from glob import glob

In [None]:
#features from book data

def calculate_book_features(df):
    """
    Build one row of order-book features per time_id.

    df: book_train data for each stock_id. Needs the columns time_id,
        bid_price1, bid_price2, ask_price1, ask_price2, bid_size1 and
        ask_size1. The frame is read only; no column is added to it.

    Returns a DataFrame indexed by time_id with the columns (in this order)
    vol, max_spread2, max_spread1, min_spread2, min_spread1, ret,
    ask_depth, bid_depth. NaN values inside a group are skipped by the
    aggregations.
    """

    #calculate price for features
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])

    # Row-level quantities are collected in a scratch frame so the caller's
    # frame stays untouched and every per-time_id statistic below can use a
    # built-in group aggregation instead of a Python-level apply.
    rows = pd.DataFrame({
        'time_id': df['time_id'],
        'wap': wap,
        'log_wap': np.log(wap),
        'spread1': df['ask_price1'] / df['bid_price1'] - 1,
        'spread2': df['ask_price2'] / df['bid_price2'] - 1,
        'ask_gap': df['ask_price1'] / df['ask_price2'] - 1,
        'bid_gap': df['bid_price1'] / df['bid_price2'] - 1,
    })

    # Squared log-returns; diff() runs inside each time_id, so the first row
    # of a group is NaN and is ignored by the sum below.
    rows['sq_log_ret'] = rows.groupby('time_id')['log_wap'].diff() ** 2

    grouped = rows.groupby('time_id')

    feat_df = pd.DataFrame({
        #calculate historical volatility
        'vol': np.sqrt(grouped['sq_log_ret'].sum()),
        #calculate max and min bid-ask spread
        'max_spread2': grouped['spread2'].max(),
        'max_spread1': grouped['spread1'].max(),
        'min_spread2': grouped['spread2'].min(),
        'min_spread1': grouped['spread1'].min(),
        #calculate 10min return (last wap over first wap of the time_id)
        'ret': grouped['wap'].last() / grouped['wap'].first() - 1,
        #calculate market depth (relative gap between level 1 and level 2 prices)
        'ask_depth': grouped['ask_gap'].min(),
        'bid_depth': grouped['bid_gap'].max(),
    })

    return feat_df

In [None]:
#features from trade data

def calculate_trade_features(df):
    """
    Build one row of trade features per time_id.

    df: trade_train data. Needs the columns time_id, price and size.
        The frame is read only; no column is added to it.

    Returns a DataFrame indexed by time_id with the columns
    price_range (max traded price / min traded price) and
    max_volume (largest size * price of a single row).
    """

    time_id = df['time_id']

    #calculate traded price range
    price_by_time = df['price'].groupby(time_id)
    price_range = price_by_time.max() / price_by_time.min()

    #calculate max volume
    # size * price is kept as a local Series instead of being written back
    # into the caller's frame.
    volume = df['size'] * df['price']
    max_volume = volume.groupby(time_id).max()

    feat_df = pd.DataFrame({'price_range': price_range, 'max_volume': max_volume})

    return feat_df
    

In [None]:
# calculate metrics
def rmspe(y_true, y_pred):
    """
    Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    y_true: array-like of true values (must not contain zeros).
    y_pred: array-like of predictions, same length as y_true.

    Both inputs are converted to plain float arrays first, so values are
    compared by position even when one of them is a pandas Series.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # No additive constant on the ratio: it would bias every score and make a
    # perfect prediction score non-zero.
    return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))

In [None]:
#get training data set
# glob() returns matches in arbitrary order; both lists are sorted so the
# per-stock book and trade paths line up position by position and the
# processing order is the same on every run.
book_data_list = sorted(glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))
trade_data_list = sorted(glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*'))

In [None]:
# Pair book and trade files by the stock id embedded in each path
# ("...stock_id=<N>") rather than by list position: glob() gives no ordering
# guarantee, so zipping the two lists could combine different stocks.
book_file_by_stock = {int(path.split("=")[1]): path for path in book_data_list}
trade_file_by_stock = {int(path.split("=")[1]): path for path in trade_data_list}

feature_frames = []
for stock_id in sorted(book_file_by_stock):
    book_df = pd.read_parquet(book_file_by_stock[stock_id])
    # A KeyError here means a stock has book data but no trade file.
    trade_df = pd.read_parquet(trade_file_by_stock[stock_id])
    book_feature = calculate_book_features(book_df)
    trade_feature = calculate_trade_features(trade_df)
    # Both feature frames are indexed by time_id; concat aligns them on it.
    features = pd.concat([book_feature, trade_feature], axis=1)
    features['stock_id'] = stock_id
    feature_frames.append(features)

# Concatenate once at the end; growing a DataFrame inside the loop re-copies
# all previously collected rows on every iteration.
df = pd.concat(feature_frames, axis=0) if feature_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined per-stock feature table.
df.head()

In [None]:
# Append stock_id to the existing index so each row is keyed by
# (time_id, stock_id). The guard makes the cell safe to re-run: once stock_id
# has moved into the index it is no longer a column, and an unguarded
# set_index would raise KeyError.
if 'stock_id' in df.columns:
    df = df.set_index('stock_id', append=True)

In [None]:
# Load the training targets and key them by (time_id, stock_id).
target = (
    pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/train.csv")
    .set_index(['time_id', 'stock_id'])
)

In [None]:
# Attach the target to the features by index; the inner join keeps only the
# keys present in both frames.
data = df.merge(target, how='inner', left_index=True, right_index=True)

In [None]:
# Two successive 80/20 shuffled splits with the same seed: the first holds out
# the test set, the second carves an evaluation set out of the remainder.
split_options = dict(test_size=0.2, shuffle=True, random_state=918)
all_features = data.drop(columns='target')
all_labels = data['target']
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, **split_options)
X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, **split_options)

In [None]:
# Wrap the training and evaluation splits as LightGBM datasets.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_eval, label=y_eval)

In [None]:
# LightGBM training parameters.
# NOTE(review): LightGBM reports that early stopping is not available in dart
# mode, so the early stopping requested in the training call may have no
# effect with this boosting type - confirm before relying on it.
lgb_params = dict(
    objective="l2",
    metric='rmse',
    learning_rate=0.01,
    boosting="dart",
    num_leaves=100,
    max_depth=-1,
    verbosity=-1,
    random_state=0,
)

In [None]:
# Train the booster, monitoring the evaluation set.
# Early stopping is requested through the callback API: the
# early_stopping_rounds / verbose_eval keyword arguments were removed from
# lgb.train in LightGBM 4.0 and raise TypeError there. No evaluation-logging
# callback is passed, matching the previous verbose_eval=-1 (no per-round log).
# NOTE(review): LightGBM disables early stopping when boosting is "dart"
# (it only emits a warning), so with the current parameters all
# num_boost_round rounds are expected to run - confirm this is intended.
bst = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

In [None]:
# Predict the target for the held-out test split with the trained booster.
y_pred = bst.predict(X_test)

In [None]:
# Score the held-out predictions with the rmspe metric defined above;
# as the last expression of the cell, the value is displayed.
rmspe(y_test, y_pred)

In [None]:
# Load the test order-book data of stock_id=0 for a quick manual inspection.
tmp = pd.read_parquet('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0')

In [None]:
# Show only the first rows; rendering the whole frame bloats the notebook output.
tmp.head()

In [None]:
#for submission
book_file_test = glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
trade_file_test = glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*')

# Pair book and trade files by the stock id embedded in each path
# ("...stock_id=<N>") rather than by list position: glob() gives no ordering
# guarantee, so zipping the two lists could combine different stocks.
trade_file_by_stock = {path.split("=")[1]: path for path in trade_file_test}

prediction_frames = []
for book_file in sorted(book_file_test):
    # Kept as a string: it is only used to look up the trade file and to
    # build the row_id.
    stock_id = book_file.split("=")[1]
    test_book_df = pd.read_parquet(book_file)
    # A KeyError here means a stock has book data but no trade file.
    test_trade_df = pd.read_parquet(trade_file_by_stock[stock_id])
    test_book_feature = calculate_book_features(test_book_df)
    test_trade_feature = calculate_trade_features(test_trade_df)
    # Same column layout as the training features (stock_id is not a feature).
    test_features = pd.concat([test_book_feature, test_trade_feature], axis=1)
    prediction = bst.predict(test_features)
    # row_id format is "<stock_id>-<time_id>"; the feature index holds time_id.
    row_id = ['{}-{}'.format(stock_id, time_id) for time_id in test_features.index]
    prediction_frames.append(pd.DataFrame({'row_id': row_id, 'target': prediction}, columns=['row_id', 'target']))

# Concatenate once at the end; with no input files, still write a file that
# carries the expected header.
# NOTE(review): only (stock_id, time_id) pairs present in the book/trade data
# get a row - confirm this covers every row_id the submission requires.
if prediction_frames:
    submit_df = pd.concat(prediction_frames, axis=0)
else:
    submit_df = pd.DataFrame(columns=['row_id', 'target'])
submit_df.to_csv('submission.csv', index = False)