In [None]:
import pandas as pd
import glob
import pyarrow as pa
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the train/test index tables shipped with the competition data.
train_data = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
test_data = pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")

for frame in (train_data, test_data):
    print(frame.shape)

# Everything below works on stock 0 only.
is_stock_zero = train_data["stock_id"] == 0
train_data = train_data.loc[is_stock_zero]
display(train_data, test_data)

In [None]:
# Trade files for stock 0 (train split).
# glob.glob() returns matches in arbitrary order, so the result is sorted to
# make list[0] in the next cell pick the same file on every run.
# NOTE(review): `list` shadows the built-in; the name is kept because the next
# cell reads list[0]. Rename both cells together.
list = sorted(glob.glob('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0/*.parquet'))
list

In [None]:
# Load the first matched trade file into a DataFrame.
# pd.read_parquet is used instead of pa.parquet.read_table(...).to_pandas():
# `pa.parquet` only exists once the pyarrow.parquet submodule has been
# imported, which `import pyarrow as pa` alone does not do.
# NOTE(review): despite its name, `book` holds the rows globbed from
# trade_train.parquet in the previous cell.
book = pd.read_parquet(list[0])
book.head(305)

In [None]:
# Order-book files for stock 0 (train split).
# glob.glob() returns matches in arbitrary order, so the result is sorted to
# make list[0] in the next cell pick the same file on every run.
# NOTE(review): `list` shadows the built-in; the name is kept because the next
# cell reads list[0]. Rename both cells together.
list = sorted(glob.glob("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0/*.parquet"))
list

In [None]:
# Load the first matched order-book file into a DataFrame.
# pd.read_parquet is used instead of pa.parquet.read_table(...).to_pandas():
# `pa.parquet` only exists once the pyarrow.parquet submodule has been
# imported, which `import pyarrow as pa` alone does not do.
train = pd.read_parquet(list[0])
train.head(305)

In [None]:
# stock_id is constant after the stock-0 filter, so it is removed before merging.
train_data = train_data.drop(columns=["stock_id"])
train_data

In [None]:
# Attach the per-time_id rows of train_data to every order-book row of the same time_id.
train_input = train_data.merge(train, on="time_id", how="inner")
train_input

In [None]:
# Order-book files for stock 0 (test split).
# glob.glob() returns matches in arbitrary order, so the result is sorted to
# make list[0] in the next cell pick the same file on every run.
# NOTE(review): `list` shadows the built-in; the name is kept because the next
# cell reads list[0]. Rename both cells together.
list = sorted(glob.glob("../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0/*.parquet"))
list

In [None]:
# Load the first matched test order-book file into a DataFrame.
# pd.read_parquet is used instead of pa.parquet.read_table(...).to_pandas():
# `pa.parquet` only exists once the pyarrow.parquet submodule has been
# imported, which `import pyarrow as pa` alone does not do. The intermediate
# pyarrow table (`test`) is no longer created; nothing below reads it.
test_input = pd.read_parquet(list[0])

print(test_input.shape)
test_input

In [None]:
# Trade files for stock 0 (test split).
# glob.glob() returns matches in arbitrary order, so the result is sorted to
# make list[0] in the next cell pick the same file on every run.
# NOTE(review): `list` shadows the built-in; the name is kept because the next
# cell reads list[0]. Rename both cells together.
list = sorted(glob.glob("../input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id=0/*.parquet"))
list

In [None]:
# Load the first matched test trade file into a DataFrame.
# pd.read_parquet is used instead of pa.parquet.read_table(...).to_pandas():
# `pa.parquet` only exists once the pyarrow.parquet submodule has been
# imported, which `import pyarrow as pa` alone does not do.
# NOTE(review): despite its name, `test_book` holds the rows globbed from
# trade_test.parquet in the previous cell.
test_book = pd.read_parquet(list[0])
test_book

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def rv(series_log_return):
    """Return the square root of the sum of squared values (realized volatility)."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)


def rv2(series_log_return):
    """Return sqrt(sum(x**2)) of the input; currently the same formula as ``rv``."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)


# taken from https://www.kaggle.com/yus002/realized-volatility-prediction-lgbm-train
def my_metrics(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)
def rmspe(y_true, y_pred):
    """Return ``('rmspe', my_metrics(y_true, y_pred), False)``.

    NOTE(review): the tuple looks like a LightGBM custom-metric result
    (name, value, is_higher_better), but this function is not passed to
    lgb.train anywhere in this notebook — confirm the intended use.
    """
    return 'rmspe', my_metrics(y_true, y_pred), False

In [None]:
# Working copy of the merged training rows, shown for inspection only;
# df_book is built again from train_input in a later cell.
df_book = train_input.copy()
df_book

In [None]:
#df_book.sort_values(by=["time_id", "seconds_in_bucket"])


In [None]:
# Per-row order-book features for the training rows.
# sort_values() returns a new frame, so its result must be kept (the original
# call discarded it). The index is rebuilt so row positions follow
# (time_id, seconds_in_bucket) order.
df_book = (
    train_input
    .sort_values(by=['time_id', 'seconds_in_bucket'])
    .reset_index(drop=True)
)

# Cross-weighted quote value (bid price * ask size + ask price * bid size) and
# total quoted size for each of the two book levels.
cross_value_l1 = df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']
cross_value_l2 = df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']
size_l1 = df_book['bid_size1'] + df_book['ask_size1']
size_l2 = df_book['bid_size2'] + df_book['ask_size2']

# Weighted average prices: level 1, level 2, both levels pooled, and the
# plain mean of the level-1 and level-2 prices.
df_book['wap1'] = cross_value_l1 / size_l1
df_book['wap2'] = cross_value_l2 / size_l2
df_book['wap3'] = (cross_value_l1 + cross_value_l2) / (size_l1 + size_l2)
df_book['wap4'] = (df_book['wap1'] + df_book['wap2']) / 2

# Log returns of each WAP inside its time_id. transform() keeps the original
# row index, so the assignment is aligned by label. The previous
# apply(...).reset_index(drop=True) relied on row position and misaligns
# whenever apply returns its output keyed by group.
for wap_col in ['wap1', 'wap2', 'wap3', 'wap4']:
    df_book['vol_' + wap_col] = (
        df_book.groupby('time_id')[wap_col].transform(log_return).fillna(0)
    )

# Relative gap between the lowest ask and the highest bid of the two levels.
df_book['bas'] = (df_book[['ask_price1', 'ask_price2']].min(axis = 1) / df_book[['bid_price1', 'bid_price2']].max(axis = 1) - 1)

# different spreads
df_book['h_spread_l1'] = df_book['ask_price1'] - df_book['bid_price1']
df_book['h_spread_l2'] = df_book['ask_price2'] - df_book['bid_price2']
df_book['v_spread_b'] = df_book['bid_price1'] - df_book['bid_price2']
# NOTE(review): v_spread_a subtracts bid_price2, whereas v_spread_b stays on one
# side of the book; ask_price2 may have been intended. Left unchanged so the
# train and test features stay identical — confirm and change both cells together.
df_book['v_spread_a'] = df_book['ask_price1'] - df_book['bid_price2']

In [None]:
# Quick look at the engineered per-row features and the frame size.
display(df_book.head())
print(df_book.shape)

In [None]:
# Collapse the per-row features into one row per time_id.
# The keyword order below reproduces the column order the chained merges used
# to produce (time_id, v_spread_a, v_spread_b, h_spread_l2, h_spread_l1,
# vol_wap4 .. vol_wap1, bas, target), so the feature columns line up
# positionally with the test-side frame.
stock_stat = (
    df_book.groupby('time_id')
    .agg(
        # spread summaries: largest spread seen inside the time_id
        v_spread_a=('v_spread_a', 'max'),
        v_spread_b=('v_spread_b', 'max'),
        h_spread_l2=('h_spread_l2', 'max'),
        h_spread_l1=('h_spread_l1', 'max'),
        # realized volatility of each WAP's log returns
        vol_wap4=('vol_wap4', rv),
        vol_wap3=('vol_wap3', rv),
        vol_wap2=('vol_wap2', rv),
        vol_wap1=('vol_wap1', rv),
        bas=('bas', 'mean'),
        # The target is taken per time_id. The previous
        # `stock_stat['target'] = train_input.target` aligned on row index,
        # pairing each aggregated row with the target of an unrelated
        # order-book row. Assumes one target value per time_id (train.csv is
        # filtered to a single stock before the merge).
        target=('target', 'first'),
    )
    .reset_index()
)

In [None]:
# Quick look at the per-time_id training table and its size.
display(stock_stat.head())
print(stock_stat.shape)

In [None]:
# Split the aggregated table into the feature matrix and the label vector.
train_data = stock_stat.drop(columns=["target"])
target = stock_stat.loc[:, "target"]
train_data

In [None]:
# Per-row order-book features for the test rows (same recipe as the training cell).
# sort_values() returns a new frame, so its result must be kept (the original
# call discarded it). The index is rebuilt so row positions follow
# (time_id, seconds_in_bucket) order.
df_book = (
    test_input
    .sort_values(by=["time_id", "seconds_in_bucket"])
    .reset_index(drop=True)
)

# Cross-weighted quote value (bid price * ask size + ask price * bid size) and
# total quoted size for each of the two book levels.
cross_value_l1 = df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']
cross_value_l2 = df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']
size_l1 = df_book['bid_size1'] + df_book['ask_size1']
size_l2 = df_book['bid_size2'] + df_book['ask_size2']

# Weighted average prices: level 1, level 2, both levels pooled, and the
# plain mean of the level-1 and level-2 prices.
df_book['wap1'] = cross_value_l1 / size_l1
df_book['wap2'] = cross_value_l2 / size_l2
df_book['wap3'] = (cross_value_l1 + cross_value_l2) / (size_l1 + size_l2)
df_book['wap4'] = (df_book['wap1'] + df_book['wap2']) / 2

# Log returns of each WAP inside its time_id. transform() keeps the original
# row index, so the assignment is aligned by label. The previous
# apply(...).reset_index(drop=True) relied on row position and misaligns
# whenever apply returns its output keyed by group.
for wap_col in ['wap1', 'wap2', 'wap3', 'wap4']:
    df_book['vol_' + wap_col] = (
        df_book.groupby('time_id')[wap_col].transform(log_return).fillna(0)
    )

# Relative gap between the lowest ask and the highest bid of the two levels.
df_book['bas'] = (df_book[['ask_price1', 'ask_price2']].min(axis = 1) / df_book[['bid_price1', 'bid_price2']].max(axis = 1) - 1)

# different spreads
df_book['h_spread_l1'] = df_book['ask_price1'] - df_book['bid_price1']
df_book['h_spread_l2'] = df_book['ask_price2'] - df_book['bid_price2']
df_book['v_spread_b'] = df_book['bid_price1'] - df_book['bid_price2']
# NOTE(review): v_spread_a subtracts bid_price2, whereas v_spread_b stays on one
# side of the book; ask_price2 may have been intended. Left unchanged so the
# train and test features stay identical — confirm and change both cells together.
df_book['v_spread_a'] = df_book['ask_price1'] - df_book['bid_price2']
# The element-wise step that used to sit here
# (df_book['vol_wapN'] = df_book['vol_wapN'].apply(rv).reset_index()) was removed.
# Series.apply(rv) runs rv on each scalar, which only yields its absolute value,
# and reset_index() turns the result into a two-column frame that cannot be
# assigned to a single column. The per-time_id rv() below squares the values
# anyway, so the aggregated numbers do not depend on that step.

# Collapse the per-row features into one row per time_id. This result IS used:
# the next cell assigns stock_stat to test_data (an earlier comment here said
# the logic was unused).
# The keyword order below reproduces the column order the chained merges used
# to produce (time_id, v_spread_a, v_spread_b, h_spread_l2, h_spread_l1,
# vol_wap4 .. vol_wap1, bas), matching the training feature table.
stock_stat = (
    df_book.groupby('time_id')
    .agg(
        # spread summaries: largest spread seen inside the time_id
        v_spread_a=('v_spread_a', 'max'),
        v_spread_b=('v_spread_b', 'max'),
        h_spread_l2=('h_spread_l2', 'max'),
        h_spread_l1=('h_spread_l1', 'max'),
        # realized volatility of each WAP's log returns
        vol_wap4=('vol_wap4', rv),
        vol_wap3=('vol_wap3', rv),
        vol_wap2=('vol_wap2', rv),
        vol_wap1=('vol_wap1', rv),
        bas=('bas', 'mean'),
    )
    .reset_index()
)

#stock_stat['stock_id'] = 0

In [None]:
# test_data now refers to the same object as stock_stat (no copy is made).
test_data = stock_stat
display(test_data)
print(test_data.shape)

2021/07/26
time_id情報を消して学習してみる
lgbのimportanceを確認してみて調整予定

In [None]:
# Remove the time_id key from both feature tables so it is not used as a feature.
train_data = train_data.drop(columns=["time_id"])
test_data = test_data.drop(columns=["time_id"])

display(train_data, test_data)
print("train_data:", train_data.shape)
print("test_data:", test_data.shape)

# **LightGBM**

In [None]:
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Parameters handed to lgb.train below, written as one literal.
# Settings that are commented out in this notebook and therefore not applied:
#   'feature_pre_filter': False
#   'lambda_l1': 1.263018256839349e-07
#   'lambda_l2': 0.002578740827596048
#   'min_child_samples': 200
#   'num_leaves': 131
best_lgb_params = {
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'feature_fraction': 0.7,
    'metric': 'l2',
    'objective': 'mse',
    'learning_rate': 0.5,
    'early_stopping_round': 100,
    'num_iterations': 10000,
}

In [None]:
# Hold out 30% of the per-time_id rows for validation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Wrap both splits as LightGBM datasets and fit with the held-out split as the validation set.
# NOTE(review): `verbose_eval` is not accepted by LightGBM >= 4.0
# (callbacks=[lgb.log_evaluation(100)] is the replacement) — confirm the installed version.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_valid = lgb.Dataset(data=x_test, label=y_test)
model = lgb.train(
    params=best_lgb_params,
    train_set=lgb_train,
    valid_sets=[lgb_valid],
    verbose_eval=100,
)

In [None]:
# Plot the fitted model's feature importances.
lgb.plot_importance(booster=model, figsize=(12, 6))
plt.show()

In [None]:
# Sample submission file; used as the template for submission.csv.
sub = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')
sub

In [None]:
# Predict on the feature columns only. A 'target' column left behind by an
# earlier run of this cell is dropped first, so re-running the cell does not
# hand the previous predictions to the model as an extra feature.
feature_frame = test_data.drop(columns=['target'], errors='ignore')
test_data['target'] = model.predict(feature_frame)
display(test_data)
print(test_data.shape)

In [None]:
# x = test_data[test_data.time_id==4]
# sub['target'][0] = np.round(x['target'].mean(),decimals=9)

# x = test_data[test_data.time_id==32]
# sub['target'][1] = np.round(x['target'].mean(),decimals=9)

# x = test_data[test_data.time_id==34]
# sub['target'][2] = np.round(x['target'].mean(),decimals=9)
# sub = sub.dropna()
# sub

In [None]:
# Copy the model's predictions into the submission before writing it. Previously
# `sub` was saved exactly as read from sample_submission.csv, so the predictions
# never reached the output file.
# test_data was derived from stock_stat by dropping columns, so both share the
# same row order; stock_stat still carries the time_id of each prediction.
# Assumes row_id is formatted "<stock_id>-<time_id>" and that only stock 0 is
# modelled — TODO confirm against sample_submission.csv. Rows without a
# matching prediction keep the value from the sample file.
predicted_by_row_id = pd.Series(
    test_data['target'].to_numpy(),
    index='0-' + stock_stat['time_id'].astype(str),
)
sub['target'] = sub['row_id'].map(predicted_by_row_id).fillna(sub['target'])
sub.to_csv('./submission.csv',index=False)