In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
# Silence library warnings so the output of long-running cells stays readable.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: on recent pandas releases the bare
# 'max_columns' pattern matches more than one option and raises OptionError.
pd.set_option('display.max_columns', 300)
from tqdm import tqdm
import time

In [None]:
# Small helpers and shared settings
# Directory that holds the competition input files (note the trailing slash:
# paths are built from it by plain string concatenation).
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
# Function to read our base train and test set
def read_train_test(input_dir=None):
    """Load the base train and test tables and add a ``row_id`` merge key.

    Parameters
    ----------
    input_dir : str or path-like, optional
        Directory containing ``train.csv`` and ``test.csv``. When omitted,
        the notebook-level ``data_dir`` is used, so the location is defined
        in a single place.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``; each frame gains a string column ``row_id`` of
        the form ``"<stock_id>-<time_id>"``.
    """
    if input_dir is None:
        input_dir = data_dir
    train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Read train and test; both frames carry the added 'row_id' key column
# ("<stock_id>-<time_id>") built inside read_train_test.
train, test = read_train_test()

In [None]:
# Distinct stock ids, in order of first appearance in the training table
stocklist = pd.unique(train['stock_id'])
# Cell output: how many distinct stocks there are
stocklist.shape[0]

In [None]:
# Current price calculation: first WAP
def calc_wap1(df):
    """Return the level-1 WAP computed from the book columns of ``df``.

    Each side's price is weighted by the opposite side's size:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return cross_weighted / total_size

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 WAP computed from the book columns of ``df``.

    Each side's price is weighted by the opposite side's size:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return cross_weighted / total_size

In [None]:
# Price series that get flattened into one column per second
pricelist = ['bid_price1','ask_price1','bid_price2','ask_price2','wap1','wap2']
# Wide layout: the two id columns, then '<price>_<sec>' for sec = 0..599,
# grouped by price series in the order given by `pricelist`.
columns = ['stock_id','time_id'] + [
    f'{price}_{sec}'
    for price in pricelist
    for sec in range(0,600)
]

In [None]:
def get_time_alldata(b_df,stock_id,time_id,columns,price_cols=None):
    """Flatten the book rows of one ``time_id`` into a single wide row.

    The rows of ``b_df`` whose ``time_id`` matches are aligned onto every
    second 0..599 of the bucket; missing seconds are filled from the previous
    observation and, for leading gaps, from the next one. The per-second
    values of each price series are then laid out side by side.

    Parameters
    ----------
    b_df : pandas.DataFrame
        Book data with at least ``time_id``, ``seconds_in_bucket`` and the
        columns named in ``price_cols``.
    stock_id, time_id
        Identifiers written into the first two cells of the output row;
        ``time_id`` also selects the rows of ``b_df`` to use.
    columns : list
        Labels of the output row: two id labels followed by 600 labels per
        price series, in the order of ``price_cols``.
    price_cols : list of str, optional
        Price series to flatten. Defaults to the notebook-level ``pricelist``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame whose column labels are ``columns``.

    Raises
    ------
    ValueError
        If the number of flattened values does not match ``len(columns)``.
    """
    if price_cols is None:
        # Fall back to the notebook-level list so existing callers keep working.
        price_cols = pricelist
    bucket = b_df[b_df['time_id'] == time_id].set_index('seconds_in_bucket')
    # One row per second of the bucket; seconds without a book update are NaN.
    per_second = bucket.reindex(range(0,600))
    # ffill()/bfill() replace fillna(method=...), which pandas has deprecated.
    per_second = per_second.ffill().bfill()
    tmp_row = [stock_id,time_id]
    for price in price_cols:
        tmp_row.extend(per_second[price].to_list())
    # Build a single column labelled by `columns`, then transpose it to a row.
    tmp_df = pd.DataFrame(tmp_row, index=columns).T
    return tmp_df

In [None]:
# Build the wide per-(stock_id, time_id) table for every stock and write it
# out as two parquet files.
#
# Resolve the stock id used in both file names up front, so that a `stocklist`
# with fewer than 61 entries fails here with IndexError instead of after every
# stock has already been processed.
split_stock_id = stocklist[60]

# Per-stock frames are collected in a list and concatenated once per output
# file; concatenating onto a growing frame inside the loop would re-copy all
# accumulated rows on every iteration.
stock_frames = []
for i,stock_id in enumerate(tqdm(stocklist)):
    if i == 60:
        # Memory may run out, so write the result out once around the halfway
        # point of the stock ids and release it.
        # Note: this first file holds the stocks at positions 0..59; the stock
        # at position 60, whose id appears in both file names, goes into the
        # second file.
        all_stock_df = pd.concat(stock_frames, ignore_index=True)
        stock_frames = []
        all_stock_df.to_parquet(f'all_stock_df_stock_id0to{split_stock_id}.parquet',index=False)
        del all_stock_df
    b_df = pd.read_parquet(data_dir+f'book_train.parquet/stock_id={stock_id}')
    b_df['wap1'] = calc_wap1(b_df)
    b_df['wap2'] = calc_wap2(b_df)
    # One task per time_id; each task returns a one-row wide frame.
    bucket_rows = Parallel(n_jobs = -1, verbose = 1)(delayed(get_time_alldata)(b_df,stock_id,time_id,columns) for time_id in b_df['time_id'].unique())
    stock_frames.append(pd.concat(bucket_rows, ignore_index = True))

# Remaining stocks (position 60 onwards); `all_stock_df` stays defined for any
# later cells.
all_stock_df = pd.concat(stock_frames, ignore_index=True)
all_stock_df.to_parquet(f'all_stock_df_stock_id_after{split_stock_id}.parquet',index=False)
del bucket_rows,b_df,stock_frames