In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import gc
from tqdm import tqdm

In [None]:


# Locations of the competition inputs.
#   trade_path / book_path : path prefixes; the split name and ".parquet"
#                            are appended when a stock file is looked up.
#   train_path / test_path : aggregate train / test CSV files.
direc = dict(
    trade_path="../input/optiver-realized-volatility-prediction/trade_",
    book_path="../input/optiver-realized-volatility-prediction/book_",
    train_path='../input/optiver-realized-volatility-prediction/train.csv',
    test_path='../input/optiver-realized-volatility-prediction/test.csv',
)


# General Functions

In [None]:
def RMSPE(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    The error of each element is taken relative to ``y_true``; the squared
    relative errors are averaged along axis 0 and the square root is returned.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error), axis=0))

def WAP1(df):
    """Weighted average price computed from the level-1 book columns.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def WAP2(df):
    """Weighted average price computed from the level-2 book columns.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def log_return(WAP):
    """Difference between consecutive natural logs of ``WAP``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_price = np.log(WAP)
    return log_price.diff()

def realized_volatility(log_r):
    """Square root of the sum of squared log returns."""
    squared_returns = log_r ** 2
    return np.sqrt(squared_returns.sum())

In [None]:
# Load the aggregate train and test tables from the CSV paths registered in `direc`.
train_df = pd.read_csv(direc['train_path'])
test_df = pd.read_csv(direc['test_path'])

In [None]:
def read_data(stock_id, data_path, data_type):
    """Read the parquet partition of one stock and return it as a DataFrame.

    Parameters
    ----------
    stock_id
        Value used in the ``stock_id=<id>`` partition folder name.
    data_path : str
        Key into ``direc`` selecting the path prefix ('trade_path' or
        'book_path' in this notebook).
    data_type : str
        Split name inserted before ``.parquet`` (the notebook passes
        'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        Contents of the first matching file (in sorted order).

    Raises
    ------
    FileNotFoundError
        If no file exists under the partition folder.
    """
    pattern = direc[data_path] + f'{data_type}.parquet/stock_id={stock_id}/*'
    # glob returns matches in arbitrary order; sort so the chosen file is deterministic.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f'No parquet file matches {pattern!r}')
    return pd.read_parquet(matches[0])

Check the number of training rows for each stock

In [None]:
# Column dtypes, non-null counts and memory usage of the aggregate train table.
train_df.info()

In [None]:
# Number of rows in train_df for each stock_id.
freq = train_df['stock_id'].value_counts()
# Stocks that appear in fewer than 3830 rows.
incomplete_stocks = freq.loc[freq < 3830].index.tolist()
incomplete_stocks

# Target Volatility Analysis

In [None]:
# Summary statistics of the target column.
train_df.target.describe()

In [None]:
# Histogram of the training target (100 bins).
fig, ax = plt.subplots(figsize=(8, 4))
train_df['target'].plot(kind='hist', bins=100, ax=ax)
ax.set_title('Target')
ax.grid()
plt.show()
# The target distribution is strongly right-skewed.

# Grouped Book Features

In [None]:
def consol_book_df(stock_id, data_type):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    stock_id
        Stock whose book parquet partition is read.
    data_type : str
        Split name forwarded to ``read_data`` (the notebook passes
        'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with the realized volatility of both
        WAP log-return series, WAP / log-return means, price-spread
        statistics, mean bid/ask level spreads and mean volume features.
    """
    # Read the stock's book file and tag every row with its stock id.
    df = read_data(stock_id=stock_id, data_path='book_path', data_type=data_type)
    df['stock_id'] = stock_id

    # Weighted average prices for book levels 1 and 2.
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    # Log returns are computed inside each time_id so no return spans two
    # buckets. `transform` keeps the original row index; a transform-style
    # `apply` gets the group key prepended to its index on pandas >= 2.0,
    # which makes the column assignment fail. The first row of every bucket
    # has no predecessor, hence the fillna(0).
    df['book_log_ret1'] = df.groupby('time_id')['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = df.groupby('time_id')['WAP2'].transform(log_return).fillna(0)

    # Spreads: level-1 spread relative to the mid price, plus the raw price
    # distance between level 1 and level 2 on each side.
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / mid_price
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']

    # Volume features over both levels.
    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = abs(ask_volume - bid_volume)

    # Collapse to one row per (stock_id, time_id).
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
        WAP1_mean=('WAP1', 'mean'),
        WAP2_mean=('WAP2', 'mean'),
        log_ret1_mean=('book_log_ret1', 'mean'),
        log_ret2_mean=('book_log_ret2', 'mean'),
        price_spread_mean=('price_spread', 'mean'),
        price_spread_var=('price_spread', 'var'),
        price_spread_min=('price_spread', 'min'),
        price_spread_max=('price_spread', 'max'),
        bid_spread=('bid_spread', 'mean'),
        ask_spread=('ask_spread', 'mean'),
        total_vol_mean=('total_volume', 'mean'),
        vol_imbal_mean=('volume_imbalance', 'mean'),
    ).reset_index()

    return final_book


# Grouped Trade Features

In [None]:
def consol_trade_df(stock_id, data_type):
    """Build per-``time_id`` trade features for one stock.

    Parameters
    ----------
    stock_id
        Stock whose trade parquet partition is read.
    data_type : str
        Split name forwarded to ``read_data`` (the notebook passes
        'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per (time_id, stock_id) with total traded size, position-size
        mean/variance, mean size per order, mean order count, first/last
        ``seconds_in_bucket`` and the realized volatility of trade-price
        log returns.
    """
    # Read the stock's trade file and tag every row with its stock id.
    df = read_data(stock_id=stock_id, data_path='trade_path', data_type=data_type)
    df['stock_id'] = stock_id

    # Position size = price * size.
    df['position_size'] = df['price'] * df['size']

    # Log returns of the trade price inside each time_id. `transform` keeps
    # the original row index; a transform-style `apply` gets the group key
    # prepended to its index on pandas >= 2.0, which makes the column
    # assignment fail. The first trade of every bucket has no predecessor,
    # hence the fillna(0).
    df['trade_log_ret'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    # Size traded per order.
    df['average_ord_size'] = df['size'] / df['order_count']

    # Collapse to one row per (time_id, stock_id).
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        total_size=('size', 'sum'),
        position_size_mean=('position_size', 'mean'),
        position_size_var=('position_size', 'var'),
        avg_ord_size_mean=('average_ord_size', 'mean'),
        mean_order=('order_count', 'mean'),
        max_sec_in_bucket=('seconds_in_bucket', 'max'),
        min_sec_in_bucket=('seconds_in_bucket', 'min'),
        real_vol_trade=('trade_log_ret', realized_volatility),
    ).reset_index()

    return final_trade

In [None]:
def create_dataSet(category):
    """Assemble the book + trade feature table for every stock of a split.

    Parameters
    ----------
    category : str
        'train' uses ``train_df``; any other value uses ``test_df``. The
        value is also forwarded as the split name when reading the per-stock
        parquet files.

    Returns
    -------
    pandas.DataFrame
        Book features left-joined with trade features and with the columns
        of the aggregate table, for all stocks, on a fresh 0..n-1 index.
        Empty if the aggregate table contains no stocks.
    """
    df = train_df if category == 'train' else test_df

    # Collect the per-stock frames and concatenate once at the end; growing
    # a DataFrame with concat inside the loop copies all previous rows on
    # every iteration.
    per_stock = []
    for stock_id in tqdm(df['stock_id'].unique().tolist()):
        book = consol_book_df(stock_id, data_type=category)
        trade = consol_trade_df(stock_id, data_type=category)
        # Left joins keep every book row; buckets without trades get NaN
        # trade features.
        merged_df = (pd.merge(book, trade, on=['stock_id', 'time_id'], how='left')
                     .merge(df, on=['stock_id', 'time_id'], how='left'))
        per_stock.append(merged_df)
        gc.collect()

    if not per_stock:
        return pd.DataFrame()
    # ignore_index avoids repeating each stock's 0..k-1 index in the result.
    return pd.concat(per_stock, ignore_index=True)

In [None]:
# Build the feature tables for both splits. reset_index() returns a new frame,
# so its result has to be assigned; drop=True discards the old index instead
# of adding it as an extra column.
train_data = create_dataSet(category='train').reset_index(drop=True)
test_data = create_dataSet(category='test').reset_index(drop=True)

test_data.head()

In [None]:
# Display only: reset_index() returns a new frame and the result is not
# assigned, so train_data itself is left unchanged.
train_data.reset_index()

In [None]:
# Capture the rows that contain at least one NaN *before* dropping them;
# running the check after dropna() would always show an empty frame.
nan_rows = train_data[train_data.isnull().any(axis=1)]
train_data = train_data.dropna()
nan_rows

# Correlation Plot of Features

In [None]:
import seaborn as sns

# Pairwise correlations of every column after the two leading id columns.
feature_corr = train_data.iloc[:, 2:].corr()

plt.figure(figsize=(25, 10))
heatmap = sns.heatmap(feature_corr, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap of Features', fontdict={'fontsize':18}, pad=12);

In [None]:
# Features: every column except the target and the two id columns.
X = train_data.drop(columns=['target', 'stock_id', 'time_id'])
# Regression target.
y = train_data['target']


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of the target on the engineered features.
model = LinearRegression()
model.fit(X, y)


In [None]:
# Keep exactly the columns the model was fitted on, selected by name so the
# result does not depend on column positions and the cell can be re-run
# without rebuilding the test set.
# Test rows cannot be dropped the way NaN training rows were, so missing
# features (e.g. buckets without trades after the left merge, or variances of
# a single observation) are filled with the training-set column means;
# LinearRegression.predict does not accept NaN.
test_data = test_data[X.columns].fillna(X.mean())



In [None]:
# test_data holds only feature columns at this point, so the (stock_id,
# time_id) keys of its rows are rebuilt by running the same pipeline again;
# the rows come back in the same order.
test_keys = create_dataSet(category='test')[['stock_id', 'time_id']]
if len(test_keys) != len(test_data):
    raise ValueError('test_data rows do not line up with the rebuilt test keys')

# NOTE(review): the "<stock_id>-<time_id>" format is inferred from the
# previously hardcoded row_id "0-4" - confirm against test.csv.
row_ids = test_keys['stock_id'].astype(str) + '-' + test_keys['time_id'].astype(str)

submission = pd.DataFrame({'row_id': row_ids.to_numpy(),
                           'target': model.predict(test_data)})
submission.to_csv('submission.csv', index=False)

pd.read_csv('submission.csv')