**Copyright (c) 2021 Risklab Middle East - All Rights Reserved**

---


**Author: Mehrdad Moghimi**



# Imports libraries

In [1]:
%%capture
!pip install plotly -U

In [2]:
import pandas as pd
import numpy as np 
import datetime
import time
import sys
from scipy import stats
from statsmodels.stats import stattools

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pd.options.plotting.backend = "plotly"
np.seterr(divide='ignore', invalid='ignore')

  import pandas.util.testing as tm


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

# Import Tick data

In [3]:
dir = "https://raw.githubusercontent.com/risk-labratory/data/main/"
url = dir + "IVE_2020.csv"
df = pd.read_csv(url, header=0)
df['dates'] = pd.to_datetime(df['dates'])
df.set_index('dates', inplace=True, drop=True)
df.drop_duplicates(inplace=True)
df = df[(df.index.hour>=9) & (df.index.hour<16)]
df.head()

Unnamed: 0_level_0,price,bid,ask,size
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02 09:30:00,130.68,130.59,130.6,20625
2020-01-02 09:30:01,130.5,130.5,130.77,200
2020-01-02 09:30:04,130.53,130.52,130.78,100
2020-01-02 09:30:04,130.55,130.52,130.78,100
2020-01-02 09:30:04,130.53,130.52,130.78,200


# Functions

In [4]:
def progressBar(value, end_value, start_time, bar_length=20):
    percent = float(value) / end_value
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    remaining = int(((time.time()-start_time)/value)*(end_value-value)/60)
    sys.stdout.write("\rCompleted: [{0}] {1}% - {2} minutes remaining.".format(arrow + spaces, int(round(percent * 100)), remaining))
    sys.stdout.flush()

In [5]:
def get_ewma(input, window_length):
    n = input.shape[0]
    ewma = np.empty(n, dtype='float64')
    w = 1
    alpha = 2 / float(window_length + 1)
    ewma_old = input[0]
    ewma[0] = ewma_old
    for i in range(1, n):
        w += (1-alpha)**i
        ewma_old = ewma_old*(1-alpha) + input[i]
        ewma[i] = ewma_old / w
    return ewma

In [6]:
def get_ohlcv(df_group):
  ohlc = df_group['price'].ohlc()
  ohlc['volume'] = df_group['size'].sum()
  ohlc['vwap'] = df_group.apply(lambda x: (x['price']*x['size']).sum()/x['size'].sum())
  ohlc['twap'] = df_group['price'].mean()
  ohlc['tick_count'] = df_group['price'].count()
  ohlc['twap_logr'] = np.log(ohlc['twap']) - np.log(ohlc['twap'].shift(1))
  return ohlc

def get_time_bar(df, freq="5Min"):
  df_group = df.groupby(pd.Grouper(freq=freq))
  ohlcv = get_ohlcv(df_group)
  return ohlcv

def get_tick_bar(df, tick_per_bar=10, num_of_bars=None):
  if not tick_per_bar:
    tick_per_bar = int(df.shape[0] / num_of_bars)
  tick_group = df.reset_index().assign(grpId=lambda x: x.index // tick_per_bar)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group =  tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

def get_volume_bar(df, volume_per_bar=10000, num_of_bars=None):
  df['cum_size'] = df['size'].cumsum() 
  if not volume_per_bar:
    total_vol = df['cum_size'].values[-1]
    volume_per_bar = total_vol / num_of_bars
    volume_per_bar = round(volume_per_bar, -2) # round to the nearest hundred
  tick_group = df.reset_index().assign(grpId=lambda x: x.cum_size // volume_per_bar)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group =  tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

def get_dollar_bar(df, dollar_per_bar=100000, num_of_bars=None):
  df['dollar'] = df['price']*df['size']
  df['cum_dv'] = df['dollar'].cumsum() 
  if not dollar_per_bar:
    total_dvol = df['cum_dv'].values[-1]
    dollar_per_bar = total_dvol / num_of_bars
    dollar_per_bar = round(dollar_per_bar, -2) # round to the nearest hundred
  tick_group = df.reset_index().assign(grpId=lambda x: x.cum_dv // dollar_per_bar)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group =  tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

In [17]:
def compute_grpId(bvs, ET_init, Ebv_init):
  Ts, i_s = [], []
  i_prev, ET, Ebv  = 0, ET_init, Ebv_init
  n = bvs.shape[0]
  bvs = bvs.values.astype(np.float64)
  abs_thetas, thresholds, thetas, grpId = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)
  abs_thetas[0], current_theta = np.abs(bvs[0]), bvs[0]
  t = time.time()
  current_grpId = 0
  for i in range(1, n):
    current_theta += bvs[i]
    thetas[i] = current_theta
    abs_theta = np.abs(current_theta)
    abs_thetas[i] = abs_theta  
    threshold = ET * Ebv
    thresholds[i] = threshold  
    grpId[i] = current_grpId
    if abs_theta >= threshold:
      current_grpId += 1
      current_theta = 0
      Ts.append(np.float64(i - i_prev))
      i_s.append(i)
      i_prev = i
      ET = get_ewma(np.array(Ts), window_length=np.int64(len(Ts)))[-1]
      Ebv = np.abs(get_ewma(bvs[:i], window_length=np.int64(ET_init * 1))[-1] ) # window of 3 bars
    progressBar(i,n,t)
  return Ts, abs_thetas, thresholds, i_s, thetas, grpId

def get_info_bar(df, mode="volume", ET_init=2000):
  # User can choose between "tick", "volume" or "dollar" imbalanced bars
  # The value of ET_init should be chosen very carefully as it affects the results significantly
  # 2000 ticks to warm up
  if mode=="volume":
    b = df['bv']
  elif mode=="tick":
    b = df['b']
  elif mode=="dollar":
    b = df['bd']
  else:
    print("Error")
  Ebv_init = np.abs(b.mean())
  Ts, abs_thetas, thresholds, i_s, thetas, grpId = compute_grpId(b, ET_init, Ebv_init)
  tick_group = df.reset_index().assign(grpId = grpId)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group = tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv, abs_thetas, thresholds

In [8]:
def plot_ohlcv(ohlcv):
  dt_all = pd.date_range(start=ohlcv.index[0],end=ohlcv.index[-1])
  dt_obs = [d.strftime("%Y-%m-%d") for d in ohlcv.index]
  dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if not d in dt_obs]
  fig = make_subplots(rows=3, cols=1,
                      shared_xaxes=True,
                      vertical_spacing=0.05, specs=[[{"rowspan": 2}], 
                                                  [{}], 
                                                  [{}]])
  fig.add_trace(go.Candlestick(x=ohlcv.index, 
                              open=ohlcv.open, 
                              high=ohlcv.high,
                              low=ohlcv.low, 
                              close=ohlcv.close, name='Candlestick'), row=1, col=1)
  fig.add_trace(go.Bar(x=ohlcv.index, y=ohlcv.volume, marker_color='rgba(255, 100, 100, 0.7)', name='volume'), row=3, col=1)
  fig.update_yaxes(title_text="Price", row=1, col=1)
  fig.update_yaxes(title_text="Volume", row=3, col=1)
  fig.update_xaxes(
          rangeslider_visible=False,
          rangebreaks=[
              dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
              dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
              dict(values=dt_breaks)  # hide empty dates
          ]
      )
  fig.update_layout(xaxis_rangeslider_visible=False)
  fig.show()

In [9]:
def plot_ohlc_list(ohlc_list, names_list):
  ohlc = ohlc_list[0]
  dt_all = pd.date_range(start=ohlc.index[0],end=ohlc.index[-1])
  dt_obs = [d.strftime("%Y-%m-%d") for d in ohlc.index]
  dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if not d in dt_obs]
  fig = make_subplots(rows=len(ohlc_list), cols=1,
                      shared_xaxes=True,
                      vertical_spacing=0.05)
  for i, ohlc in enumerate(ohlc_list):
    fig.add_trace(go.Candlestick(x=ohlc.index, 
                                open=ohlc.open, 
                                high=ohlc.high,
                                low=ohlc.low, 
                                close=ohlc.close, name=names_list[i]), row=i+1, col=1)
    fig.update_yaxes(title_text=names_list[i], row=i+1, col=1)
  fig.update_xaxes(
          rangeslider_visible=False,
          rangebreaks=[
              dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
              dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
              dict(values=dt_breaks)  # hide empty dates
          ]
      )
  fig.update_layout(xaxis_rangeslider_visible=False, height=200*len(ohlc_list))
  fig.show()

In [10]:
def plot_info_bar(df, ohlcv_info, abs_thetas, thresholds):
  dt_all = pd.date_range(start=df.index[0],end=df.index[-1])
  dt_obs = [d.strftime("%Y-%m-%d") for d in df.index]
  dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if not d in dt_obs]
  fig = make_subplots(rows=3, cols=1,
                        shared_xaxes=True,
                        vertical_spacing=0.02)

  fig.add_trace(go.Scatter(x=df.index, y=df.price, mode='lines', name="Price"), row=1, col=1)
  fig.add_trace(go.Scatter(x=df.index, y=abs_thetas, mode='lines', name="Absolute Tick Imbalance"), row=2, col=1)
  fig.add_trace(go.Scatter(x=df.index, y=thresholds, mode='lines', name="Threshold"), row=2, col=1)
  fig.add_trace(go.Candlestick(x=ohlcv_info.index, 
                              open=ohlcv_info.open, 
                              high=ohlcv_info.high,
                              low=ohlcv_info.low, 
                              close=ohlcv_info.close, name='Candlestick'), row=3, col=1)
  fig.update_xaxes(
          rangeslider_visible=False,
          rangebreaks=[
              dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
              dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
              dict(values=dt_breaks)  # hide empty dates
          ]
      )
  #fig.update_layout(height=600)
  fig.show()

# Analysis of various bars

In [11]:
ohlcv_time1 = get_time_bar(df, freq="30Min")
time_bar_len1 = ohlcv_time1.shape[0]
ohlcv_tick1 = get_tick_bar(df, tick_per_bar=None, num_of_bars=time_bar_len1)
ohlcv_volume1 = get_volume_bar(df, volume_per_bar=None, num_of_bars=time_bar_len1)
ohlcv_dollar1 = get_dollar_bar(df, dollar_per_bar=None, num_of_bars=time_bar_len1)

In [12]:
count_average = pd.DataFrame()
count_average['Time'] = ohlcv_time1.resample("1W")['tick_count'].mean()
count_average['Tick'] = ohlcv_tick1.resample("1W")['tick_count'].mean()
count_average['Volume'] = ohlcv_volume1.resample("1W")['tick_count'].mean()
count_average['Dollar'] = ohlcv_dollar1.resample("1W")['tick_count'].mean()
count_average.plot()

In [13]:
# Select a smaller range
start_date = datetime.datetime(2020, 3, 1)
end_date = datetime.datetime(2020, 3, 5)
df2 = df[((df.index>=start_date) & (df.index<=end_date))][['price', 'size']]
df2.drop_duplicates()
df2['r'] = np.log(df2['price']) - np.log(df2['price'].shift(1))
df2['b'] = np.sign(df2['r'])
df2['bv'] = df2['b']*df2['size']
df2['bd'] = df2['bv']*df2['price']
df2.dropna(inplace=True)
df2.head()

Unnamed: 0_level_0,price,size,r,b,bv,bd
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-02 09:00:12,114.7,123,8.7e-05,1.0,123.0,14108.1
2020-03-02 09:30:00,115.28,26670,0.005044,1.0,26670.0,3074518.0
2020-03-02 09:30:07,115.34,100,0.00052,1.0,100.0,11534.0
2020-03-02 09:30:16,115.22,100,-0.001041,-1.0,-100.0,-11522.0
2020-03-02 09:30:26,115.2637,172,0.000379,1.0,172.0,19825.36


In [14]:
ohlcv_time = get_time_bar(df2, freq="30Min")
ohlcv_tick = get_tick_bar(df2, tick_per_bar=1000, num_of_bars=None)
ohlcv_volume = get_volume_bar(df2, volume_per_bar=10000, num_of_bars=None)
ohlcv_dollar = get_dollar_bar(df2, dollar_per_bar=500000, num_of_bars=None)

In [15]:
plot_ohlcv(ohlcv_time)

In [18]:
ohlcv_info, abs_thetas, thresholds = get_info_bar(df2, mode="dollar", ET_init=500)

Completed: [------------------->] 100% - 0 minutes remaining.

In [19]:
bar_list = [ohlcv_time, ohlcv_tick, ohlcv_volume, ohlcv_dollar, ohlcv_info]
bar_labels = ["Time Bar", "Tick Bar", "Volume Bar", "Dollar Bar", "Info Bar"]
plot_ohlc_list(bar_list, bar_labels)

In [20]:
plot_info_bar(df2, ohlcv_info, abs_thetas, thresholds)

# Statistical Analysis of various bars

In [21]:
def standardize_return(s):
  return s.sub(s.mean()).div(s.std())

normal_sample = np.random.normal(size=2000)
bar_list = [ohlcv_time, ohlcv_tick, ohlcv_volume, ohlcv_dollar, ohlcv_info]
bar_returns = [standardize_return(x.twap_logr.dropna()) for x in bar_list]
bar_returns.append(normal_sample)
bar_labels = ["Time Bar", "Tick Bar", "Volume Bar", "Dollar Bar", "Info Bar", "Normal Distribution"]
fig = ff.create_distplot(bar_returns, bar_labels, show_hist=False)
fig.update_xaxes(range=[-4,4])
fig.show()

In [22]:
bar_stats = pd.DataFrame(index=bar_labels[:-1]) 
bar_stats["Jarque Bera"] = [np.round(stats.jarque_bera(bar_returns[i].values)[1], 4) for i in range(len(bar_labels)-1)]
bar_stats["Shapiro"] = [np.round(stats.shapiro(bar_returns[i].values)[1], 4) for i in range(len(bar_labels)-1)]
bar_stats["Durbin Watson"] = [np.round(stattools.durbin_watson(bar_returns[i]), 4) for i in range(len(bar_labels)-1)]
bar_stats["Autocorr"] = [np.round(bar_returns[i].autocorr(), 4) for i in range(len(bar_labels)-1)]
bar_stats["Count Mean"] = [np.round(bar_list[i]['tick_count'].mean(), 1) for i in range(len(bar_labels)-1)]
bar_stats["Count Var"] = [np.round(bar_list[i]['tick_count'].var(), 1) for i in range(len(bar_labels)-1)]
bar_stats

Unnamed: 0,Jarque Bera,Shapiro,Durbin Watson,Autocorr,Count Mean,Count Var
Time Bar,0.9909,0.9965,1.4044,0.2843,82.6,18420.0
Tick Bar,0.6514,0.3405,1.5898,0.1601,908.4,83905.6
Volume Bar,0.0,0.0,1.5587,0.2141,36.3,314.5
Dollar Bar,0.0,0.0,1.5986,0.1937,16.7,66.9
Info Bar,0.734,0.7247,2.4864,-0.3082,757.0,1712912.4


In [30]:
months = range(1, 13)
minutes = range(1, 60, 20)
DW_df = pd.DataFrame(index=minutes, columns=months)
AC_df = pd.DataFrame(index=minutes, columns=months)
t = time.time()
for i, month in enumerate(months):
  for j, minute in enumerate(minutes):
    df_temp = df[(df.index.year==2020) & (df.index.month==month)][['price', 'size']]
    ohlcv_temp = get_time_bar(df_temp, freq="{}Min".format(minute))
    return_temp = standardize_return(ohlcv_temp.twap_logr.dropna())
    DW_df.iloc[j, i] = np.round(stattools.durbin_watson(return_temp), 4)
    AC_df.iloc[j, i] = np.round(return_temp.autocorr(), 4)
  progressBar(i+1, len(months), t)

Completed: [------------------->] 100% - 0 minutes remaining.

In [32]:
fig = px.imshow(DW_df, 
                labels=dict(x="Month", y="Frequency", color="RdBu_r"),
                y=list(minutes),
                x=list(months),
                color_continuous_scale='RdBu_r',
                aspect='square')
fig.show()

In [33]:
fig = px.imshow(AC_df, 
                labels=dict(x="Month", y="Frequency", color="RdBu_r"),
                x=list(months),
                y=list(minutes),
                color_continuous_scale='RdBu_r',
                aspect='square')
fig.show()