Join the Order Book and Trade data into a single table per stock, dropping Order Book rows that have no trade price available even after forward-filling within their time bucket.

In [None]:
import pandas as pd
import numpy as np
import os
import glob

In [None]:
# Root folders of the order book and trade parquet datasets.
ob_dir = '../input/optiver-realized-volatility-prediction/book_train.parquet/'
trade_dir = '../input/optiver-realized-volatility-prediction/trade_train.parquet/'

# List every entry sitting directly under each root folder.
ob_files = glob.glob(ob_dir + '*')
trade_files = glob.glob(trade_dir + '*')

In [None]:
parquet_dir = './optiver_parquet'
# Create the output folder. An already-existing folder is fine, but any other
# failure (e.g. permissions) is allowed to surface instead of being swallowed.
os.makedirs(parquet_dir, exist_ok=True)

# Columns that together identify one row within a stock's book/trade data.
key_cols = ['time_id', 'seconds_in_bucket']

for f in ob_files:
    # Entries are named 'stock_id=<id>'; keep only the id part.
    stock_id = os.path.basename(f).split('=')[-1]
    book_file = ob_dir + 'stock_id=' + stock_id
    trade_file = trade_dir + 'stock_id=' + stock_id
    book_df = pd.read_parquet(book_file)
    trade_df = pd.read_parquet(trade_file)

    # Left join on the key columns: every order book row is kept and gets the
    # trade columns of the matching (time_id, seconds_in_bucket) row, if any.
    full_data = book_df.merge(trade_df, on=key_cols, how='left')

    # Keep the established output layout: value columns first, followed by
    # the keys stored as strings under the names 'time_id' and 'sib'.
    value_cols = [c for c in full_data.columns if c not in key_cols]
    full_data = full_data[value_cols + key_cols].rename(
        columns={'seconds_in_bucket': 'sib'}
    )
    full_data['time_id'] = full_data['time_id'].astype(str)
    full_data['sib'] = full_data['sib'].astype(str)

    # Fill values that should be zero. Assign the result back rather than
    # calling fillna(inplace=True) on a column selection, which may act on a
    # temporary copy and leave the frame unchanged.
    full_data['size'] = full_data['size'].fillna(0)
    full_data['order_count'] = full_data['order_count'].fillna(0)

    # Fill the price forward within each time_id, then drop rows that still
    # contain missing values. GroupBy.ffill keeps the original row index, so
    # the result aligns with the frame on assignment.
    full_data['price'] = full_data.groupby('time_id', sort=False)['price'].ffill()
    full_data = full_data.dropna()

    # Alternative: fill in two passes (forward, then backward, within each
    # time_id), which introduces some bias while preserving data:
    #   full_data.groupby('time_id', sort=False)['price']
    #       .transform(lambda s: s.ffill().bfill())

    # Save parquet:
    parquet_name = os.path.join(parquet_dir, stock_id + '.parquet')
    full_data.to_parquet(parquet_name)

In [None]:
import shutil

# Pack the contents of parquet_dir into a zip archive named 'optiver_parquet'.
shutil.make_archive(base_name='optiver_parquet', format='zip', root_dir=parquet_dir)

In [None]:
# Bare expression for inspection: when run as a notebook cell this shows the
# frame left over from the final iteration of the loop above.
full_data