## This Notebook

In this notebook, we will draw and summarize candlestick charts and OHLC charts from the data in trade_train.parquet.

For trade data, we can obtain not only the minimum and maximum prices at each time_id, but also the opening and closing prices (the first and last trade prices in that time_id). By drawing a chart from these four values, we can visualize the pattern of price movement within each time_id.


I have also posted in the Discussion forum about using these charts, so I would appreciate it if you could comment there as well.

[How about using a candlestick charts and OHLC charts?](https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/270276)

In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc
import plotly.graph_objects as go

from joblib import Parallel, delayed
from sklearn import preprocessing, model_selection
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm


# Root directory of the competition dataset; preprocessor() builds the
# per-stock trade parquet paths from it.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
def read_train_test():
    """Load the competition's train.csv and tag every row with a merge key.

    Despite the name, only the training file is read. A ``row_id`` column of
    the form ``"<stock_id>-<time_id>"`` is added so the frame can later be
    merged with book and trade data. The row count is printed as a side
    effect.

    Returns:
        pandas.DataFrame: the contents of train.csv plus the ``row_id`` column.
    """
    frame = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

    # Build the "<stock_id>-<time_id>" key from the two id columns
    stock_part = frame['stock_id'].astype(str)
    time_part = frame['time_id'].astype(str)
    frame['row_id'] = stock_part + '-' + time_part

    n_rows = frame.shape[0]
    print(f'Our training set has {n_rows} rows')
    return frame

def trade_preprocessor(file_path):
    """Load one stock's trade parquet and key every trade by ``row_id``.

    Args:
        file_path: Path string ending in ``stock_id=<id>``; the id is taken
            from the text after the last ``=`` in the path.

    Returns:
        pandas.DataFrame: the parquet contents with the ``time_id`` column
        replaced by a ``row_id`` column of the form ``"<stock_id>-<time_id>"``.

    Raises:
        IndexError: if ``file_path`` contains no ``=``.
    """
    df = pd.read_parquet(file_path)
    # Split from the right so an '=' earlier in the path (for example another
    # partition directory) cannot be mistaken for the stock id.
    stock_id = file_path.rsplit('=', 1)[1]
    # Vectorised string concatenation instead of a per-row Python lambda.
    df['row_id'] = stock_id + '-' + df['time_id'].astype(str)
    df = df.drop('time_id', axis=1)
    return df

def preprocessor(list_stock_ids, is_train = True):
    """Load and stack the trade data of several stocks in parallel.

    Args:
        list_stock_ids: Iterable of stock ids; one parquet partition is read
            per id through ``trade_preprocessor``.
        is_train: Read from ``trade_train.parquet`` when True (the default),
            otherwise from ``trade_test.parquet``.

    Returns:
        pandas.DataFrame: the per-stock frames concatenated with a fresh
        integer index.
    """
    # Honour is_train so the same loader serves both splits.
    # NOTE(review): assumes the test data mirrors the train layout
    # (trade_test.parquet/stock_id=<id>) — confirm against the dataset.
    split_name = 'train' if is_train else 'test'

    # Worker run once per stock id by joblib
    def for_joblib(stock_id):
        file_path_trade = f'{data_dir}trade_{split_name}.parquet/stock_id={stock_id}'
        return trade_preprocessor(file_path_trade)

    # Use the parallel API to run the per-stock loads (n_jobs = -1)
    frames = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    # Concatenate all the dataframes returned by Parallel
    return pd.concat(frames, ignore_index = True)

In [None]:
# Read train.csv (read_train_test loads only the training file)
train = read_train_test()
train_stock_ids = train['stock_id'].unique()
print(train_stock_ids) 

# Load the trade records of every training stock and attach them to the train
# rows via row_id. how='left' keeps train rows that have no matching trade;
# a train row with several matching trades is repeated once per trade.
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

In [None]:
# Per (stock_id, time_id): min, max, first and last trade price, used below
# as low, high, open and close.
# NOTE(review): 'first'/'last' follow the row order of `train`, so open/close
# presumably rely on trades being stored chronologically — confirm.
_train = train.loc[:, ['stock_id', 'time_id', 'price']]
ohlc_df = (
    _train
    .groupby(['stock_id', 'time_id'])
    .agg(['min', 'max', 'first', 'last'])
    .reset_index()
)
ohlc_df

## CandleStick Charts

In [None]:

# Draw one candlestick figure per stock, limited to the first two stock ids
for stock_id in tqdm(np.unique(ohlc_df['stock_id'])[:2]):
    # Select this stock's rows once and reuse them for the x and price data
    stock_rows = ohlc_df[ohlc_df['stock_id'] == stock_id]
    df_plot = stock_rows['price']
    time_id_title = stock_rows['time_id']

    # open/close come from the first/last aggregates, high/low from max/min
    candles = go.Candlestick(
        x=time_id_title,
        open=df_plot['first'],
        high=df_plot['max'],
        low=df_plot['min'],
        close=df_plot['last'],
    )

    fig = go.Figure()
    fig.add_trace(candles)
    fig.update_layout(
        title=f'stock_id :{stock_id}',
        xaxis_title='time_id',
        yaxis_title='price',
    )
    fig.show()
    

## OHLC Charts

In [None]:
# Draw one OHLC figure per stock, limited to the first two stock ids
for stock_id in tqdm(np.unique(ohlc_df['stock_id'])[:2]):
    # Select this stock's rows once and reuse them for the x and price data
    stock_rows = ohlc_df[ohlc_df['stock_id'] == stock_id]
    df_plot = stock_rows['price']
    time_id_title = stock_rows['time_id']

    # open/close come from the first/last aggregates, high/low from max/min
    bars = go.Ohlc(
        x=time_id_title,
        open=df_plot['first'],
        high=df_plot['max'],
        low=df_plot['min'],
        close=df_plot['last'],
    )

    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(
        title=f'stock_id :{stock_id}',
        xaxis_title='time_id',
        yaxis_title='price',
    )
    fig.show()
    