In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from glob import glob
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm
from sklearn.metrics import *

In [None]:
# Load the three competition tables: training targets, test rows and the submission template.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test  = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
sample_sub = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')

# Render all three frames, in loading order, with the notebook's rich display.
display(train, test, sample_sub)

In [None]:
def file_path_to_volatility(path,info=False):
    """Compute the realized volatility of every time bucket in one book parquet file.

    A weighted average price (WAP) is derived from the level-1 bid/ask quotes,
    log returns of the WAP are taken inside each ``time_id`` group, and the
    realized volatility is the square root of the sum of squared log returns.

    Parameters
    ----------
    path : str
        '/'-separated path of the parquet file. One component must look like
        ``stock_id=<id>``; the last such component is used, so the result does
        not depend on how deeply the file is nested.
    info : bool, default False
        When True, print the volatility of each time bucket as it is computed.

    Returns
    -------
    list
        One ``[stock_id, time_id, realized_volatility]`` entry per time bucket,
        where ``stock_id`` is the string parsed from ``path``.

    Raises
    ------
    ValueError
        If ``path`` contains no ``stock_id=<id>`` component.
    """
    # Find the partition folder by name instead of by a fixed position in the path.
    stock_parts = [part for part in path.split('/') if part.startswith('stock_id=')]
    if not stock_parts:
        raise ValueError(f"no 'stock_id=<id>' component found in path: {path!r}")
    stock_id = stock_parts[-1].split('=')[1]

    sample_book = pd.read_parquet(path)
    sample_book['wap'] = (
        sample_book['bid_price1'] * sample_book['ask_size1']
        + sample_book['ask_price1'] * sample_book['bid_size1']
    ) / (sample_book['bid_size1'] + sample_book['ask_size1'])

    part_data = []
    for time_id, bucket in sample_book.groupby('time_id'):
        # Vectorised log return; kept in a local Series so the group is never mutated.
        log_return = np.log(bucket['wap']).diff()
        # The first diff of a bucket is always NaN, so it is skipped explicitly.
        realized_vol = np.sqrt((log_return.iloc[1:] ** 2).sum())
        if info :
            print(f'Realized Volatiliy for time id {time_id} is ' ,realized_vol)
        part_data.append([stock_id,time_id,realized_vol])
    return part_data
def read_all_files(path):
    """Load every entry found directly under ``path`` (i.e. ``path/*``).

    Each entry is passed to ``path_to_data``; the result is then reduced with
    ``groupby(['stock_id','time_id']).first()`` so that a single row remains
    per stock and time id, and ``stock_id`` is cast to int64.

    Returns a list holding one DataFrame per entry; use ``pd.concat`` to join
    them back together.
    """
    per_stock_frames = []
    stock_folders = glob(os.path.join(path,'*'))
    for stock_folder in tqdm(stock_folders):
        merged = path_to_data(stock_folder)
        first_rows = merged.groupby(['stock_id','time_id']).first().reset_index()
        first_rows['stock_id'] = first_rows['stock_id'].astype('int64')
        per_stock_frames.append(first_rows)
    return per_stock_frames

def path_to_data(path):
    """Join one stock's order-book rows with its trades.

    Parameters
    ----------
    path : str
        '/'-separated path of the book data whose last component has the form
        ``stock_id=<id>``. The trade data is read from the same path with
        'book' replaced by 'trade'.

    Returns
    -------
    pandas.DataFrame
        Book and trade columns joined on (time_id, seconds_in_bucket), plus a
        ``stock_id`` column holding the id string parsed from ``path``. Rows
        containing NaN are dropped and the index is renumbered from 0.

        If no trade shares an exact (time_id, seconds_in_bucket) with a book
        row, each trade is paired instead with the book row(s) whose
        ``seconds_in_bucket`` is closest.
    """
    stock_id = path.split('/')[-1].split('=')[1]
    curr_book = pd.read_parquet(path)
    # NOTE(review): str.replace rewrites every 'book' in the path, not only the
    # dataset folder name - confirm that no other path component contains it.
    curr_trade = pd.read_parquet(path.replace('book','trade'))
    merged_data = pd.merge(curr_book,curr_trade,on=['time_id','seconds_in_bucket'])
    if len(merged_data) == 0:
        # No exact timestamp match: pair every trade with every book row and keep
        # the pairing(s) with the smallest gap in seconds.
        # NOTE(review): the gap ignores time_id, so a trade can be paired with a
        # book row from another time bucket - confirm this is intended.
        paired = curr_trade.merge(curr_book, how='cross',suffixes=['','_y'])
        paired['diff'] = abs(paired.seconds_in_bucket-paired.seconds_in_bucket_y)
        closest = paired.groupby(['time_id','seconds_in_bucket'])['diff'].min().reset_index()
        merged_data = pd.merge(closest,paired,how="left")
        merged_data = merged_data.drop(columns=['time_id_y','seconds_in_bucket_y','diff'])
    merged_data['stock_id'] = stock_id
    # reset_index returns a new frame; the original discarded that result and so
    # returned a frame whose index still had gaps left by dropna.
    return merged_data.dropna().reset_index(drop=True)

def files_to_numbers(demo_all,vol_calculated,csv_path = '../input/optiver-realized-volatility-prediction/train.csv'):
    """Combine per-stock frames with the rows of a CSV file and pre-computed volatilities.

    The frames in ``demo_all`` are concatenated, joined to the CSV read from
    ``csv_path`` on (stock_id, time_id), and the result is joined to
    ``vol_calculated`` on whatever columns the two frames share. Both joins use
    pandas' default (inner) join, so only rows present on both sides survive.
    """
    labels = pd.read_csv(csv_path)
    stacked = pd.concat(demo_all).reset_index(drop=True)
    labelled = labels.merge(stacked, on=['stock_id','time_id'])
    return labelled.merge(vol_calculated)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    The error is divided by ``y_true``, so the two arguments are not interchangeable.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)

In [None]:
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
SEED = 42  # fixed so that the splits, and therefore the printed scores, are reproducible
ID_COLUMNS = ['stock_id','time_id']  # join keys, not model features


def _print_fit_report(model, labelled_sets):
    """Print the MSE and RMSPE of ``model`` for each ``(features, target)`` pair.

    The target is passed first to both metrics: ``rmspe`` divides by its first
    argument, so passing the prediction first would normalise by the prediction
    instead of the true value.
    """
    print('Mean Squared Error \t RMSPE METRIC(COMP)')
    for features, target in labelled_sets:
        predicted = model.predict(features)
        print(mean_squared_error(target, predicted), rmspe(target, predicted))


book_dir = glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*/*')
# The per-bucket volatility can be recomputed from the book files with the code below;
# NOTE(review): train_data.csv is presumably a cached copy of that result - confirm.
# train_data_vol = []
# for i in tqdm(book_dir):
#     train_data_vol.append(file_path_to_volatility(i))
# past_data = pd.concat([pd.DataFrame(i,columns=['stock_id','time_id','vol']) for i in train_data_vol])
# past_data.stock_id = past_data.stock_id.astype('int64')
past_data = pd.read_csv('../input/starter/train_data.csv')
demo_all = read_all_files('../input/optiver-realized-volatility-prediction/book_train.parquet')
data = files_to_numbers(demo_all,past_data,'../input/optiver-realized-volatility-prediction/train.csv')
data_df = data.copy()

# --- Hold-out evaluation: train / validation / test, each stratified by stock ---
df_traval,df_test = train_test_split(data_df,stratify=data_df['stock_id'],random_state=SEED)
df_train,df_val = train_test_split(df_traval,stratify=df_traval['stock_id'],random_state=SEED)
# drop() returns new frames, so the split results are never modified in place.
df_train = df_train.drop(columns=ID_COLUMNS)
df_val = df_val.drop(columns=ID_COLUMNS)
df_test = df_test.drop(columns=ID_COLUMNS)
feature = list(df_train.columns)
feature.remove('target')
train_x,train_y = df_train[feature],df_train['target']
val_x,val_y = df_val[feature],df_val['target']
test_x,test_y = df_test[feature],df_test['target']
# The scaler is fitted on the training part only, then applied to the other parts.
std_scaler = StandardScaler()
train_x_transform = std_scaler.fit_transform(train_x)
val_x_transform = std_scaler.transform(val_x)
test_x_transform = std_scaler.transform(test_x)

lr = LinearRegression()
lr.fit(train_x_transform,train_y)
print(lr.score(train_x_transform,train_y),lr.score(val_x_transform,val_y),lr.score(test_x_transform,test_y))
_print_fit_report(lr, [
    (train_x_transform,train_y),
    (val_x_transform,val_y),
    (test_x_transform,test_y),
])
print(lr.predict(test_x_transform[:5]),test_y[:5].values)

# --- Final model: refit scaler and regression on every labelled row ---
# `std_scaler` and `lr` are rebound here; these are the objects left in scope
# for later cells. The scores below are computed on data the model was fitted on.
data_df = data_df.drop(columns=ID_COLUMNS)
train_data,train_label = data_df[feature],data_df['target']
std_scaler = StandardScaler()
train_data_transform = std_scaler.fit_transform(train_data)
train_x_transform = std_scaler.transform(train_x)
val_x_transform = std_scaler.transform(val_x)
test_x_transform = std_scaler.transform(test_x)

lr = LinearRegression()
lr.fit(train_data_transform,train_label)
print(lr.score(train_x_transform,train_y),lr.score(val_x_transform,val_y),lr.score(test_x_transform,test_y))
_print_fit_report(lr, [
    (train_data_transform,train_label),
    (train_x_transform,train_y),
    (val_x_transform,val_y),
    (test_x_transform,test_y),
])
print(lr.predict(test_x_transform[:5]),test_y[:5].values)


In [None]:
# book_dir_test = glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*/*')
# test_data_vol = []
# for i in tqdm(book_dir_test):
#     test_data_vol.append(file_path_to_volatility(i))
# test_data = pd.concat([pd.DataFrame(i,columns=['stock_id','time_id','vol']) for i in test_data_vol])
# display(test_data)
# test_data.stock_id = test_data.stock_id.astype('int64')
# Value written for every submission row that receives no model prediction.
# NOTE(review): presumably a typical target value from the training data - confirm.
DEFAULT_TARGET = 0.003048022

# Realized volatility of every time bucket in the test book files.
book_test_dir = glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*/*')
train_data_vol_pred = [file_path_to_volatility(i) for i in tqdm(book_test_dir)]
# Build the frame once from the flattened rows; this also copes with zero files.
past_data_pred = pd.DataFrame(
    [row for part in train_data_vol_pred for row in part],
    columns=['stock_id','time_id','vol'],
)
past_data_pred['stock_id'] = past_data_pred['stock_id'].astype('int64')

demo_all_pred =  read_all_files('../input/optiver-realized-volatility-prediction/book_test.parquet')

data_pred = files_to_numbers(demo_all_pred,past_data_pred,'../input/optiver-realized-volatility-prediction/test.csv')
display(data_pred)
# dropna/reset_index return new frames; the original discarded the reset_index result.
data_pred = data_pred.dropna().reset_index(drop=True)
row_orderings = data_pred.row_id
# The column order must match the order the scaler and the model were fitted with.
feature = ['seconds_in_bucket','bid_price1','ask_price1','bid_price2','ask_price2','bid_size1','ask_size1','bid_size2','ask_size2','price','size','order_count','vol']
data_pred = data_pred[feature]

# The scaler rejects an empty input, so with no usable rows every submission row
# falls back to DEFAULT_TARGET instead of the cell crashing.
if len(data_pred):
    pred = lr.predict(std_scaler.transform(data_pred))
else:
    pred = np.array([])

# Left-join onto the template so every expected row_id is present; the template's own
# target column gets the '_old' suffix and is dropped by the column selection.
predictions = pd.DataFrame({'row_id':row_orderings,'target':pred})
sub = (
    pd.merge(sample_sub,predictions,on='row_id',how='left',suffixes=['_old',''])[sample_sub.columns]
    .fillna({'target': DEFAULT_TARGET})  # returns a new frame; no chained in-place fill
)
display(sub)
sub.to_csv('submission.csv',index=False)

# Naive baseline kept for reference: submit the realized volatility itself.
# naive_pred = pd.merge(test,past_data_pred,on=['stock_id','time_id'],how='left')[['row_id','vol']]
# naive_pred.columns=sample_sub.columns
# naive_pred.target.fillna(0.003048022,inplace=True)
# naive_pred.to_csv('submission.csv',index = False)
# display(naive_pred)

In [None]:
# book_dir_test = glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*/*')
# test_data_vol = []
# for i in tqdm(book_dir_test):
#     test_data_vol.append(file_path_to_volatility(i))
# stock_id = '0'
# test_data = pd.concat([pd.DataFrame(i,columns=['stock_id','time_id','vol']) for i in test_data_vol])
# display(test_data)
# test_data.stock_id = test_data.stock_id.astype('int64')
# display(test_data.dtypes)
# display(test.dtypes)
# naive_pred = pd.merge(test,test_data,on=['stock_id','time_id'],how='left')[['row_id','vol']]
# naive_pred.columns=sample_sub.columns
# naive_pred.target.fillna(0.003048022,inplace=True)
# naive_pred.to_csv('submission.csv',index = False)
# display(naive_pred)