In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
# Use the fully qualified option key: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on newer pandas and raises OptionError there.
pd.set_option('display.max_columns', 300)
from tqdm import tqdm
import time

In [None]:
# Small helpers and shared settings
# Directory holding the competition input files; the trailing slash is kept because
# paths are built from it by string concatenation.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
# Function to read our base train and test set
def read_train_test(input_dir=None):
    """Load ``train.csv`` and ``test.csv`` and add a ``row_id`` merge key.

    Parameters
    ----------
    input_dir : str or path-like, optional
        Directory that contains ``train.csv`` and ``test.csv``. When omitted,
        the notebook-level ``data_dir`` is used so the input location is
        defined in a single place.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; each frame gets an extra ``row_id`` column of the
        form ``"<stock_id>-<time_id>"``.
    """
    if input_dir is None:
        input_dir = data_dir
    train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Read train and test once; later cells filter `train` and merge features into it
train, test = read_train_test()

In [None]:
# Full run: take every stock id present in the training set
#stocklist = train['stock_id'].unique()
stocklist = [0,1] # only two stocks, to keep the run lightweight
len(stocklist)

In [None]:
# Current price estimate from the first book level
def calc_wap1(df):
    """Return the weighted average price of the level-1 quotes.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-1 size.
    """
    bid_leg = df['bid_price1'] * df['ask_size1']
    ask_leg = df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / total_size

# Function to calculate second WAP
def calc_wap2(df):
    """Return the weighted average price of the level-2 quotes.

    Same formula as the level-1 version, applied to the ``*2`` columns: each
    side's price is weighted by the opposite side's size.
    """
    bid_leg = df['bid_price2'] * df['ask_size2']
    ask_leg = df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / total_size

In [None]:
# Layout of the wide frame: the two key columns, then for every price series
# one column per second of the 600-second bucket, named '<series>_<second>'.
pricelist = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2', 'wap1', 'wap2']
columns = ['stock_id', 'time_id']
for price in pricelist:
    columns += [f'{price}_{sec}' for sec in range(600)]

In [None]:
def get_time_alldata(b_df, stock_id, time_id, columns, price_cols=None):
    """Flatten one time bucket of book data into a single wide row.

    The rows of ``b_df`` belonging to ``time_id`` are placed on a regular
    per-second grid; seconds without an update are forward-filled and any
    leading gap is back-filled. Each price series is then laid out
    second by second, one series after another.

    Parameters
    ----------
    b_df : pandas.DataFrame
        Book data with ``time_id``, ``seconds_in_bucket`` and the price columns.
    stock_id, time_id : scalar
        Keys written into the first two output columns.
    columns : list of str
        Output column names: two key names followed by the per-second names
        of every price series. The number of seconds per bucket is derived
        from its length.
    price_cols : sequence of str, optional
        Price series to flatten, in output order. Defaults to the
        notebook-level ``pricelist``.

    Returns
    -------
    pandas.DataFrame
        A one-row float frame with ``columns`` as its columns.

    Raises
    ------
    ValueError
        If ``columns`` cannot be split evenly between the price series.
    """
    if price_cols is None:
        price_cols = pricelist
    price_cols = list(price_cols)
    if not price_cols:
        raise ValueError('price_cols must not be empty')
    n_seconds, leftover = divmod(len(columns) - 2, len(price_cols))
    if leftover:
        raise ValueError('columns must hold 2 keys plus an equal number of entries per price series')

    # reindex pins the grid to exactly n_seconds rows; ffill/bfill replace the
    # deprecated fillna(method=...) spelling.
    bucket = (
        b_df.loc[b_df['time_id'] == time_id]
        .set_index('seconds_in_bucket')[price_cols]
        .reindex(range(n_seconds))
        .ffill()
        .bfill()
    )
    # Transpose before flattening so each series stays contiguous in the row.
    flat_prices = bucket.to_numpy(dtype=float).T.ravel()
    row = np.concatenate(([stock_id, time_id], flat_prices))
    return pd.DataFrame(row.reshape(1, -1), columns=columns)

In [None]:
# Build one wide row per (stock_id, time_id) for every selected stock.
# Per-stock frames are collected in a list and concatenated once at the end;
# growing a DataFrame inside the loop copies all earlier rows on every pass.
stock_frames = []
for stock_id in tqdm(stocklist):
    b_df = pd.read_parquet(data_dir+f'book_train.parquet/stock_id={stock_id}')
    b_df['wap1'] = calc_wap1(b_df)
    b_df['wap2'] = calc_wap2(b_df)
    # Hand each worker only the rows of its own bucket instead of the whole
    # book frame; sort=False keeps the buckets in order of first appearance.
    bucket_rows = Parallel(n_jobs = -1, verbose = 1)(
        delayed(get_time_alldata)(bucket_df, stock_id, time_id, columns)
        for time_id, bucket_df in b_df.groupby('time_id', sort=False)
    )
    stock_frames.append(pd.concat(bucket_rows, ignore_index = True))
    del b_df, bucket_rows
# An empty stocklist yields an empty frame rather than an error.
all_stock_df = pd.concat(stock_frames, ignore_index=True) if stock_frames else pd.DataFrame()
del stock_frames

## PCA

In [None]:
from sklearn.decomposition import PCA

# Compress the per-second price columns into a handful of principal components.
n_components = 20
pc_names = ["PC{}".format(i + 1) for i in range(n_components)]

# Standardising is optional; here every column is z-scored before the fit.
price_values = all_stock_df.drop(['stock_id', 'time_id'], axis=1)
# A constant column has zero standard deviation; dividing by it would put NaN
# into the matrix and make PCA.fit fail, so such columns are divided by 1 instead.
col_std = price_values.std().replace(0, 1)
pca_feats_std = (price_values - price_values.mean()) / col_std
del price_values, col_std

pca = PCA(n_components=n_components, random_state=57)
pca.fit(pca_feats_std)
feature = pd.DataFrame(pca.transform(pca_feats_std), columns=pc_names)
del pca_feats_std

# Explained variance ratio of the leading components
display(pd.DataFrame(pca.explained_variance_ratio_, index=pc_names).head())
# Eigenvalues (explained variance) of the leading components
display(pd.DataFrame(pca.explained_variance_, index=pc_names).head())

In [None]:
# Attach the key columns to the PCA features so they can be merged onto train.
# `feature` was computed row by row from `all_stock_df`, so the keys are copied
# by position instead of relying on index alignment of a multi-column .loc
# assignment to columns that do not exist yet.
for key_col in ['stock_id', 'time_id']:
    feature[key_col] = all_stock_df[key_col].to_numpy()

## Trainデータの作成

In [None]:
# Ideally the features would simply be joined onto train, but missing values
# currently appear, so a left join is used for now.
merge_keys = ['stock_id', 'time_id']
# Drop feature columns left over from an earlier run of this cell; without this,
# re-running the cell merges them a second time and produces *_x / *_y duplicates.
stale_cols = (set(all_stock_df.columns) | set(feature.columns)) - set(merge_keys)
train = train[train['stock_id'].isin(stocklist)]
train = train.drop(columns=[col for col in train.columns if col in stale_cols])
train = train.merge(all_stock_df,how='left',on=merge_keys)
train = train.merge(feature, how='left', on=merge_keys)

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    pct_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(pct_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Reads the true labels from the dataset object and returns the
    ``(name, value, is_higher_better)`` triple that LightGBM expects from a
    ``feval`` callable.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

## モデル作成

## PCAあり

In [None]:
# Hyperparameters (just basic)
params = {
  'objective': 'rmse',
  'boosting_type': 'gbdt',
  'num_leaves': 100,
  'n_jobs': -1,
  'learning_rate': 0.1,
  'feature_fraction': 0.8,
  'bagging_fraction': 0.8,
  # LightGBM ignores bagging_fraction unless bagging_freq is non-zero;
  # 1 re-samples the rows at every boosting iteration.
  'bagging_freq': 1,
  'verbose': -1
}

In [None]:
# Split features and target; row_id and time_id are identifiers, not features
x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']
# Cast stock id to int so it can be passed as a categorical feature
x['stock_id'] = x['stock_id'].astype(int)
# TODO: test-set inference is not wired up yet (no features are built for `test`)

# Out-of-fold predictions, filled in fold by fold
oof_predictions = np.zeros(x.shape[0])
# Create a KFold object
kfold = KFold(n_splits = 5, random_state = 66, shuffle = True)

# lgb.train no longer accepts early_stopping_rounds= / verbose_eval= (removed in
# LightGBM 4.0); the callback form works on old and new releases. The logging
# callback was named print_evaluation before log_evaluation was introduced.
log_eval_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Iterate through each fold
modellist = []
for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Weighting squared errors by 1 / y^2 turns the rmse objective into a
    # squared percentage error
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)
    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
    model = lgb.train(params = params,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      num_boost_round = 10000,
                      feval = feval_rmspe,
                      callbacks = [lgb.early_stopping(50), log_eval_cb(50)])
    modellist.append(model)
    # Add predictions to the out of folds array
    oof_predictions[val_ind] = model.predict(x_val)

rmspe_score = rmspe(y, oof_predictions)
print(f'Our out of folds RMSPE is {rmspe_score}')

## PCAなし

In [None]:
# Select exactly the PCA columns (PC1, PC2, ...); a plain substring match on 'PC'
# would also catch any other column that happens to contain those letters.
PC_cols = train.filter(regex=r'^PC\d+$', axis=1).columns.to_list()
# Split features and target; identifiers and the PCA columns are left out
x = train.drop(['row_id', 'target', 'time_id']+PC_cols, axis = 1)
y = train['target']
# Cast stock id to int so it can be passed as a categorical feature
x['stock_id'] = x['stock_id'].astype(int)
# TODO: test-set inference is not wired up yet (no features are built for `test`)

# Out-of-fold predictions, filled in fold by fold
oof_predictions = np.zeros(x.shape[0])
# Create a KFold object
kfold = KFold(n_splits = 5, random_state = 66, shuffle = True)

# lgb.train no longer accepts early_stopping_rounds= / verbose_eval= (removed in
# LightGBM 4.0); the callback form works on old and new releases. The logging
# callback was named print_evaluation before log_evaluation was introduced.
log_eval_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Iterate through each fold
modellist = []
for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Weighting squared errors by 1 / y^2 turns the rmse objective into a
    # squared percentage error
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)
    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
    model = lgb.train(params = params,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      num_boost_round = 10000,
                      feval = feval_rmspe,
                      callbacks = [lgb.early_stopping(50), log_eval_cb(50)])
    modellist.append(model)
    # Add predictions to the out of folds array
    oof_predictions[val_ind] = model.predict(x_val)

rmspe_score = rmspe(y, oof_predictions)
print(f'Our out of folds RMSPE is {rmspe_score}')