**Copyright (c) 2021 Risklab Middle East - All Rights Reserved**

---


**Author: Mehrdad Moghimi**



# Import libraries

In [None]:
%%capture
!pip install plotly -U

In [None]:
import pandas as pd
import numpy as np 
import datetime
import time
import sys
from scipy import stats
from statsmodels.stats import stattools

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import multiprocessing as mp

pd.options.plotting.backend = "plotly"
np.seterr(divide='ignore', invalid='ignore')

  import pandas.util.testing as tm


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

# Import Tick data

In [None]:
# Base URL of the repository hosting the sample data.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = "https://raw.githubusercontent.com/risk-labratory/data/main/"
url = dir + "IVE_2020.csv"
df = pd.read_csv(url, header=0)
# Parse the 'dates' column and use it as the (datetime) index
df['dates'] = pd.to_datetime(df['dates'])
df.set_index('dates', inplace=True, drop=True)
# NOTE(review): drop_duplicates compares column values only, not the index, so
# ticks at different timestamps with identical columns are also dropped here --
# confirm that this is intended.
df.drop_duplicates(inplace=True)
# Keep ticks whose hour is 9..15, i.e. from 09:00 up to (excluding) 16:00
df = df[(df.index.hour>=9) & (df.index.hour<16)]
df.head()

Unnamed: 0_level_0,price,bid,ask,size
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02 09:30:00,130.68,130.59,130.6,20625
2020-01-02 09:30:01,130.5,130.5,130.77,200
2020-01-02 09:30:04,130.53,130.52,130.78,100
2020-01-02 09:30:04,130.55,130.52,130.78,100
2020-01-02 09:30:04,130.53,130.52,130.78,200


# Functions

In [None]:
# SNIPPET 20.5 THE linParts FUNCTION
def linParts(numAtoms,numThreads):
  """
  Split numAtoms items into min(numThreads,numAtoms) equally spaced parts.
  + numAtoms: number of items to partition
  + numThreads: requested number of parts
  Returns an integer array of part boundaries, from 0 to numAtoms.
  """
  numParts=min(numThreads,numAtoms)
  edges=np.linspace(0,numAtoms,numParts+1)
  # fractional boundaries are rounded up to whole positions
  return np.ceil(edges).astype(int)

# SNIPPET 20.6 THE nestedParts FUNCTION
def nestedParts(numAtoms,numThreads,upperTriang=False):
  """
  Partition numAtoms items into min(numThreads,numAtoms) parts of growing
  or shrinking width.
  Each boundary x is the positive root of
  x**2+x = prev**2+prev+numAtoms*(numAtoms+1)/numParts, rounded to the
  nearest integer.
  + upperTriang: if True the part widths are reversed, so that the first
    part is the narrowest (the first rows are the heaviest)
  Returns an integer array of part boundaries starting at 0.
  """
  numParts=min(numThreads,numAtoms)
  bounds=[0]
  for _ in range(numParts):
    prev=bounds[-1]
    disc=1 + 4*(prev**2+prev+numAtoms*(numAtoms+1.)/numParts)
    bounds.append((-1+disc**.5)/2.)
  bounds=np.round(bounds).astype(int)
  if not upperTriang:
    return bounds
  # reverse the widths and rebuild the boundaries from them
  widths=np.diff(bounds)[::-1]
  return np.append(np.array([0]),np.cumsum(widths))

# SNIPPET 20.7 THE mpPandasObj, USED AT VARIOUS POINTS IN THE BOOK
def mpPandasObj(func,pdObj,numThreads=24,mpBatches=1,linMols=True,**kargs):
  """
  Parallelize jobs, return a DataFrame or Series
  + func: function to be parallelized. Returns a DataFrame or Series
  + pdObj[0]: Name of argument used to pass the molecule
  + pdObj[1]: List of atoms that will be grouped into molecules
  + numThreads: number of worker processes; 1 runs the jobs sequentially
  + mpBatches: multiplier applied to numThreads to get the number of molecules
  + linMols: partition with linParts if True, with nestedParts otherwise
  + kargs: any other argument needed by func
  Returns the sorted concatenation of the job outputs when they are
  DataFrames or Series, otherwise the raw list of outputs.
  Example: df1=mpPandasObj(func,('molecule',df0.index),24,**kargs)
  """
  # Size the partition by the atoms themselves. Sizing it by an arbitrary
  # keyword argument yields out-of-range molecules whenever that argument
  # has a different length than pdObj[1].
  numAtoms=len(pdObj[1])
  if linMols:
    parts=linParts(numAtoms,numThreads*mpBatches)
  else:
    parts=nestedParts(numAtoms,numThreads*mpBatches)
  jobs=[]
  for i in range(1,len(parts)):
    job={pdObj[0]:pdObj[1][parts[i-1]:parts[i]],'func':func}
    job.update(kargs)
    jobs.append(job)
  if numThreads==1:
    out=processJobs_(jobs)
  else:
    out=processJobs(jobs,numThreads=numThreads)
  if not isinstance(out[0],(pd.DataFrame,pd.Series)):
    return out
  # pd.concat replaces the repeated .append calls (removed in pandas 2.0)
  return pd.concat(out).sort_index()

# SNIPPET 20.8 SINGLE-THREAD EXECUTION, FOR DEBUGGING
def processJobs_(jobs):
  """
  Run the jobs one after another in the current process (for debugging).
  + jobs: iterable of job dicts, each accepted by expandCall
  Returns the list of job outputs, in job order.
  """
  return [expandCall(job) for job in jobs]

# SNIPPET 20.9 EXAMPLE OF ASYNCHRONOUS CALL TO PYTHON’S MULTIPROCESSING LIBRARY
def reportProgress(jobNum,numJobs,time0,task):
  """
  Write a one-line progress message to stderr as asynch jobs are completed.
  + jobNum: number of jobs completed so far (must be > 0)
  + numJobs: total number of jobs
  + time0: time.time() value taken when the run started
  + task: label shown in the message
  The line ends with a carriage return until the last job, then a newline.
  """
  fracDone=float(jobNum)/numJobs
  elapsedMin=(time.time()-time0)/60.
  # linear extrapolation of the time already spent
  remainingMin=elapsedMin*(1/fracDone-1)
  stamp=str(datetime.datetime.fromtimestamp(time.time()))
  text=stamp+' '+str(round(fracDone*100,2))+'% '+task+' done after '+ \
    str(round(elapsedMin,2))+' minutes. Remaining '+str(round(remainingMin,2))+' minutes.'
  lineEnd='\r' if jobNum<numJobs else '\n'
  sys.stderr.write(text+lineEnd)

def processJobs(jobs,task=None,numThreads=24):
  """
  Run jobs in parallel in a multiprocessing pool, reporting progress on stderr.
  + jobs: list of job dicts; each must contain a 'func' callback, for expandCall
  + task: label used in progress messages (default: name of the first job's func)
  + numThreads: number of worker processes
  Returns the list of job outputs in completion order (not job order).
  An empty job list returns [] without starting a pool.
  """
  if not jobs:
    return []
  if task is None:task=jobs[0]['func'].__name__
  pool=mp.Pool(processes=numThreads)
  out,time0=[],time.time()
  try:
    outputs=pool.imap_unordered(expandCall,jobs)
    # Process asynchronous output, report progress
    for i,out_ in enumerate(outputs,1):
      out.append(out_)
      reportProgress(i,len(jobs),time0,task)
    pool.close()
  except BaseException:
    # a failing job must not leave worker processes behind
    pool.terminate()
    raise
  finally:
    pool.join() # this is needed to prevent memory leaks
  return out

# SNIPPET 20.10 PASSING THE JOB (MOLECULE) TO THE CALLBACK FUNCTION
def expandCall(kargs):
  """
  Expand the arguments of a callback function, kargs['func'].
  + kargs: job dict holding the callback under 'func' plus its keyword arguments
  Returns whatever the callback returns.
  """
  # Work on a shallow copy so the caller's job dict keeps its 'func' entry
  # and the same job can be executed again.
  kargs_=dict(kargs)
  func=kargs_.pop('func')
  return func(**kargs_)

In [None]:
def progressBar(value, end_value, start_time, bar_length=20):
    """Redraw a one-line text progress bar on stdout.

    value: number of steps completed so far.
    end_value: total number of steps.
    start_time: time.time() value taken when the work started.
    bar_length: width of the bar in characters.
    """
    percent = float(value) / end_value
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    if value:
        # extrapolate the average time per completed step to the remaining steps
        remaining = int(((time.time()-start_time)/value)*(end_value-value)/60)
    else:
        # nothing completed yet: there is no rate to extrapolate from
        remaining = '?'
    sys.stdout.write("\rCompleted: [{0}] {1}% - {2} minutes remaining.".format(arrow + spaces, int(round(percent * 100)), remaining))
    sys.stdout.flush()

In [None]:
def get_ohlcv(df_group):
  """Build one bar per group from grouped tick data.

  df_group: a DataFrame groupby whose groups have 'price' and 'size' columns.
  Returns a DataFrame with open/high/low/close of price, summed size
  ('volume'), size-weighted and plain mean price ('vwap', 'twap'), the number
  of prices ('tick_count') and the log change of twap from the previous bar
  ('twap_logr').
  """
  def _vwap(chunk):
    return (chunk['price']*chunk['size']).sum()/chunk['size'].sum()

  prices = df_group['price']
  bars = prices.ohlc()
  bars['volume'] = df_group['size'].sum()
  bars['vwap'] = df_group.apply(_vwap)
  bars['twap'] = prices.mean()
  bars['tick_count'] = prices.count()
  log_twap = np.log(bars['twap'])
  bars['twap_logr'] = log_twap - log_twap.shift(1)
  return bars

def get_time_bar(df, freq="5Min"):
  """Aggregate ticks into bars of a fixed time length.

  df: tick DataFrame whose index is grouped with pd.Grouper(freq=freq).
  freq: pandas frequency string giving the bar length.
  Returns the bar DataFrame produced by get_ohlcv.
  """
  return get_ohlcv(df.groupby(pd.Grouper(freq=freq)))

def get_tick_bar(df, tick_per_bar=10, num_of_bars=None):
  """Aggregate ticks into bars holding a fixed number of ticks.

  df: tick DataFrame indexed by 'dates'.
  tick_per_bar: ticks per bar; if falsy it is derived from num_of_bars.
  num_of_bars: target number of bars, only used when tick_per_bar is falsy.
  Returns the bar DataFrame from get_ohlcv, indexed by the date of the first
  tick of each bar.
  """
  if not tick_per_bar:
    tick_per_bar = int(df.shape[0] / num_of_bars)
  ticks = df.reset_index()
  # consecutive runs of tick_per_bar rows share one group id
  ticks = ticks.assign(grpId=ticks.index // tick_per_bar)
  bars = get_ohlcv(ticks.groupby('grpId'))
  bar_start = ticks.groupby('grpId', as_index=False).first()['dates']
  return bars.set_index(bar_start, drop=True)

def get_volume_bar(df, volume_per_bar=10000, num_of_bars=None):
  """Aggregate ticks into bars that each cover about the same traded size.

  df: tick DataFrame indexed by 'dates' with 'price' and 'size' columns.
      It is not modified.
  volume_per_bar: cumulative size per bar; if falsy it is derived from
      num_of_bars and rounded to the nearest hundred.
  num_of_bars: target number of bars, only used when volume_per_bar is falsy.
  Returns the bar DataFrame from get_ohlcv, indexed by the date of the first
  tick of each bar.
  """
  # reset_index returns a new frame, so the scratch column below does not
  # leak into the caller's DataFrame
  ticks = df.reset_index()
  ticks['cum_size'] = ticks['size'].cumsum()
  if not volume_per_bar:
    total_vol = ticks['cum_size'].values[-1]
    volume_per_bar = total_vol / num_of_bars
    volume_per_bar = round(volume_per_bar, -2) # round to the nearest hundred
  # a new bar starts each time the running size crosses a multiple of volume_per_bar
  tick_group = ticks.assign(grpId=lambda x: x.cum_size // volume_per_bar)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group =  tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

def get_dollar_bar(df, dollar_per_bar=100000, num_of_bars=None):
  """Aggregate ticks into bars that each cover about the same traded value.

  df: tick DataFrame indexed by 'dates' with 'price' and 'size' columns.
      It is not modified.
  dollar_per_bar: cumulative price*size per bar; if falsy it is derived from
      num_of_bars and rounded to the nearest hundred.
  num_of_bars: target number of bars, only used when dollar_per_bar is falsy.
  Returns the bar DataFrame from get_ohlcv, indexed by the date of the first
  tick of each bar.
  """
  # reset_index returns a new frame, so the scratch columns below do not
  # leak into the caller's DataFrame
  ticks = df.reset_index()
  ticks['dollar'] = ticks['price']*ticks['size']
  ticks['cum_dv'] = ticks['dollar'].cumsum()
  if not dollar_per_bar:
    total_dvol = ticks['cum_dv'].values[-1]
    dollar_per_bar = total_dvol / num_of_bars
    dollar_per_bar = round(dollar_per_bar, -2) # round to the nearest hundred
  # a new bar starts each time the running value crosses a multiple of dollar_per_bar
  tick_group = ticks.assign(grpId=lambda x: x.cum_dv // dollar_per_bar)
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group =  tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

In [None]:
def plot_ohlcv(ohlcv):
  """Show a candlestick chart of the bars with a volume bar chart underneath.

  ohlcv: DataFrame with a datetime index and open/high/low/close/volume
  columns. The candlestick spans the two upper rows of a three-row grid and
  the volume occupies the third row. Weekends, hours outside 9:30-16:00 and
  calendar days without any bar are cut out of the x axis.
  """
  dt_all = pd.date_range(start=ohlcv.index[0],end=ohlcv.index[-1])
  # a set makes the "is this day observed?" test below O(1) per day
  dt_obs = {d.strftime("%Y-%m-%d") for d in ohlcv.index}
  dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if d not in dt_obs]
  # The cell covered by the row span must be None; an empty spec there would
  # create an extra subplot on top of the lower half of the candlestick.
  fig = make_subplots(rows=3, cols=1,
                      shared_xaxes=True,
                      vertical_spacing=0.05, specs=[[{"rowspan": 2}],
                                                  [None],
                                                  [{}]])
  fig.add_trace(go.Candlestick(x=ohlcv.index,
                              open=ohlcv.open,
                              high=ohlcv.high,
                              low=ohlcv.low,
                              close=ohlcv.close, name='Candlestick'), row=1, col=1)
  fig.add_trace(go.Bar(x=ohlcv.index, y=ohlcv.volume, marker_color='rgba(255, 100, 100, 0.7)', name='volume'), row=3, col=1)
  fig.update_yaxes(title_text="Price", row=1, col=1)
  fig.update_yaxes(title_text="Volume", row=3, col=1)
  fig.update_xaxes(
          rangeslider_visible=False,
          rangebreaks=[
              dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
              dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
              dict(values=dt_breaks)  # hide empty dates
          ]
      )
  fig.update_layout(xaxis_rangeslider_visible=False)
  fig.show()

# Code Snippets

In [None]:
# Build bars at business-day frequency ("1B") from the tick data
ohlcv = get_time_bar(df, freq="1B")
# Drop bars containing NaN (this includes the first bar, whose twap_logr has no predecessor)
ohlcv.dropna(inplace=True)
# Close series used by the snippets below
close = ohlcv.close
ohlcv.head()

Unnamed: 0_level_0,open,high,low,close,volume,vwap,twap,tick_count,twap_logr
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-03,129.33,129.9874,129.2912,129.64,526340,129.751774,129.719157,922,-0.003845
2020-01-06,129.0,129.8952,128.93,129.8952,655431,129.548003,129.493223,770,-0.001743
2020-01-07,129.52,129.58,129.1405,129.38,413423,129.376731,129.357347,908,-0.00105
2020-01-08,129.38,130.2999,129.24,129.76,449383,129.881903,129.858126,1028,0.003864
2020-01-09,130.3,130.38,129.92,130.3168,376142,130.161216,130.161563,614,0.002334


SNIPPET 2.4 THE SYMMETRIC CUSUM FILTER



In [None]:
def getTEvents(gRaw, h):
  """Symmetric CUSUM filter.

  gRaw: series to filter, indexed by timestamps.
  h: threshold on the cumulated up / down moves.

  The positive and negative cumulative sums of the first differences are
  tracked separately (floored / capped at 0). A timestamp is recorded, and
  the corresponding sum reset to 0, when the negative sum drops below -h or
  the positive sum rises above h.
  Returns a pd.DatetimeIndex of the recorded timestamps.
  """
  tEvents, sPos, sNeg = [], 0, 0
  diff = gRaw.diff()
  # Walk labels and values in lockstep instead of looking each label up with
  # .loc, which returns a Series (and breaks max/min) on duplicate timestamps.
  for i, d in zip(diff.index[1:], diff.values[1:]):
    sPos, sNeg = max(0, sPos+d), min(0, sNeg+d)
    if sNeg<-h:
      sNeg=0
      tEvents.append(i)
    elif sPos>h:
      sPos=0
      tEvents.append(i)
  return pd.DatetimeIndex(tEvents)

SNIPPET 3.1 DAILY VOLATILITY ESTIMATES

In [None]:
def getDailyVol(close, span0=63):
    """Estimate volatility as an exponentially weighted std of lagged returns.

    close: price series with a sorted datetime index.
    span0: span of the exponentially weighted standard deviation.

    Each bar is paired with the bar just before the first bar located at or
    after "one calendar day earlier"; bars without such a predecessor are
    skipped. Returns (rets, stds): the returns between the paired bars and
    their exponentially weighted standard deviation.
    """
    idx = close.index
    prev_pos = idx.searchsorted(idx - pd.Timedelta(days=1))
    prev_pos = prev_pos[prev_pos > 0]
    # the skipped bars are the leading ones, so align on the tail of the index
    first_kept = close.shape[0] - prev_pos.shape[0]
    prev_stamp = pd.Series(idx[prev_pos - 1], index=idx[first_kept:])
    current = close.loc[prev_stamp.index]
    previous = close.loc[prev_stamp.values].values
    rets = (current / previous - 1).rename("rets")
    stds = rets.ewm(span=span0).std().rename("std")
    return rets, stds

SNIPPET 3.2 TRIPLE-BARRIER LABELING METHOD

In [None]:
def applyPtSlOnT1(close, events, ptSl, molecule):
    """Find the first profit-taking / stop-loss touch of each event before t1.

    close: price series.
    events: DataFrame indexed by event start with columns 't1' (end of the
        event), 'trgt' (unit width of the horizontal barriers) and 'side'.
    ptSl: pair of non-negative multipliers of 'trgt' for the profit-taking
        (ptSl[0]) and stop-loss (ptSl[1]) barriers; a value <= 0 disables
        that barrier.
    molecule: the subset of event start times handled by this call.

    Returns a DataFrame indexed like events.loc[molecule] with columns 't1',
    'sl' and 'pt'; 'sl' / 'pt' hold the earliest touch time or NaT.
    """
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)
    if ptSl[0] > 0:
        pt = ptSl[0] * events_['trgt']
    else:
        pt = pd.Series(index=events.index, dtype="float64")  # NaNs
    if ptSl[1] > 0:
        sl = -ptSl[1] * events_['trgt']
    else:
        sl = pd.Series(index=events.index, dtype="float64")  # NaNs
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    # Events without t1 are followed until the last available bar.
    for loc, t1 in events_['t1'].fillna(close.index[-1]).items():
        df0 = close[loc:t1]  # path prices
        df0 = (df0 / close[loc] - 1) * events_.at[loc, 'side']  # path returns
        out.loc[loc, 'sl'] = df0[df0 < sl[loc]].index.min()  # earliest stop loss.
        out.loc[loc, 'pt'] = df0[df0 > pt[loc]].index.min()  # earliest profit taking.
    return out

SNIPPET 3.3 GETTING THE TIME OF FIRST TOUCH

In [None]:
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False):
    """Find the time of the first barrier touch for each seed event.

    close: price series.
    tEvents: timestamps that seed the events.
    ptSl: non-negative multiplier of the target used for both horizontal barriers.
    trgt: series of targets (unit barrier width), looked up at tEvents.
    minRet: events whose target is not above this value are discarded.
    numThreads: forwarded to mpPandasObj.
    t1: series of vertical barriers, or False to disable them.

    Returns a DataFrame indexed by event start with 't1' (time of the first
    touch among the barriers) and 'trgt'.
    """
    # 1) targets at the seed timestamps, filtered by minRet
    target = trgt.loc[tEvents]
    target = target[target > minRet]
    # 2) max holding period; all-NaT when no vertical barrier was supplied
    vertical = pd.Series(pd.NaT, index=tEvents) if t1 is False else t1
    # 3) events frame; the side is fixed to 1. for every event
    side_ = pd.Series(1., index=target.index)
    events = pd.concat({'t1': vertical, 'trgt': target, 'side': side_}, axis=1)
    events = events.dropna(subset=['trgt'])
    touches = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index),
                          numThreads=numThreads, close=close, events=events, ptSl=[ptSl, ptSl])
    # earliest of t1 / sl / pt; min skips the missing ones
    events['t1'] = touches.dropna(how='all').min(axis=1)
    return events.drop('side', axis=1)

Plotting the overlap

In [None]:
# Horizon of the vertical barrier, in calendar days
numDays = 21
# CUSUM events on the close series with threshold h=5
tEvents = getTEvents(close, h=5)
# Position of the first bar at or after each event time + numDays
t1 = close.index.searchsorted(tEvents+pd.Timedelta(days=numDays))
# Drop barriers that would fall beyond the last bar
t1 = t1[t1<close.shape[0]]
t1 = pd.Series(close.index[t1],index=tEvents[:t1.shape[0]]) # NaNs at end
rets, stds = getDailyVol(close, span0=32)
# Events using stds as target, a barrier multiplier of 1 and the vertical barriers t1
events = getEvents(close, tEvents, ptSl=1, trgt=stds, minRet=0.0, numThreads=1, t1=t1)

def add_box(close, events, fig, i):
  """Draw the barrier box of the i-th event on fig and mark its first touch.

  close: price series.
  events: DataFrame indexed by event start with 'trgt' and 't1' columns.
  fig: plotly figure to draw on.
  i: position of the event in events.
  The right edge of the box is read from the module-level series `t1`,
  which is not a parameter of this function.
  Returns (fig, event start time, first-touch time).
  """
  width = events['trgt']
  anchor = close[events.index]
  u = (anchor*(1+width)).iloc[i]
  l = (anchor*(1-width)).iloc[i]
  tt1 = events.index[i]
  tt2 = t1.loc[events.index].iloc[i]
  ft = events['t1'].iloc[i]
  pft = close.loc[ft]
  # left, top, bottom and right edges of the box
  edges = [(tt1, u, tt1, l), (tt1, u, tt2, u), (tt1, l, tt2, l), (tt2, u, tt2, l)]
  for x0, y0, x1, y1 in edges:
    fig.add_shape(type="line", x0=x0, y0=y0, x1=x1, y1=y1, line=dict(color="red", width=2))
  # dashed vertical line at the first touch
  fig.add_shape(type="line", x0=ft, y0=u, x1=ft, y1=l, line=dict(color="red", width=2, dash='dash'))
  fig.add_trace(go.Scatter(x=[ft], y=[pft], marker=dict(size=[10], color=['green']), name="First Touch"))
  return fig, tt1, ft

In [None]:
# Close series (plotly backend) with the CUSUM events marked on it
fig = close.plot()
fig.add_trace(go.Scatter(x=tEvents, y=close[tEvents], mode="markers", name="tEvents"))
# Barrier boxes of two consecutive events; each call returns the event start and its first touch
fig, t11, t12 = add_box(close, events, fig, i=20)
fig, t21, t22 = add_box(close, events, fig, i=21)
# Shade the span from the second event's start to the earlier of the two first touches, if any
if t21<np.min([t12, t22]):
  fig.add_vrect(x0=t21, x1=np.min([t12, t22]), annotation_text="overlap", fillcolor="green", opacity=0.25, line_width=0)
fig.show()

SNIPPET 4.1 ESTIMATING THE UNIQUENESS OF A LABEL

Compute the number of concurrent events per bar
* closeIdx: pd.Index, the index of the close price series
* t1: pd series, timestamps of the vertical barriers. (index: eventStart, value: eventEnd).
* molecule: the date of the event on which the weight will be computed
  - molecule[0] is the date of the first event on which the weight will be computed
  - molecule[-1] is the date of the last event on which the weight will be computed
    
Any event that starts before t1[molecule].max() impacts the count

The function returns:
* count: pd.Series, the number of concurrent event per bar

In [None]:
def mpNumCoEvents(closeIdx, t1, molecule):
    """Count the number of concurrent events per bar.

    closeIdx: index of the bars.
    t1: series of event ends, indexed by event start; NaT marks an unclosed event.
    molecule: event start times handled by this call.
    Returns a series indexed by bar holding the number of events spanning
    each bar, from molecule[0] to the furthest event end in the molecule.
    """
    # 1) Find events that span the period [molecule[0], molecule[-1]]
    # unclosed events still impact other weights
    # fill the unclosed events with the last available (index) date
    t1 = t1.fillna(closeIdx[-1])
    # events that end at or after molecule[0] (the first event date)
    t1 = t1[t1 >= molecule[0]]
    # events that start at or before t1[molecule].max() which is the furthest stop date in the batch
    t1 = t1.loc[ : t1[molecule].max()]

    # 2) Count events spanning a bar
    # find the positions of the first start date (t1.index[0]) and the furthest stop date (t1.max())
    iloc = closeIdx.searchsorted(np.array([t1.index[0], t1.max()]))
    # form a 0-array, index: from the first start date to the furthest stop date
    count = pd.Series(0, index = closeIdx[iloc[0] : iloc[1] + 1])
    # for each event (index: eventStart, value: eventEnd);
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for tIn, tOut in t1.items():
        count.loc[tIn : tOut] += 1 # every bar between tIn and tOut, both included
    # only return the timespan of the molecule
    return count.loc[molecule[0] : t1[molecule].max()]

In [None]:
# Concurrency of the events, computed in one batch over all CUSUM event dates
count = mpNumCoEvents(closeIdx=close.index, t1=events['t1'], molecule=tEvents)

# Price and event markers on the primary y axis, concurrency count on the secondary one
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=close.index, y=close.values, mode="lines", marker=dict(color='blue'), name="close"), secondary_y=False) 
fig.add_trace(go.Scatter(x=tEvents, y=close[tEvents], mode="markers", marker=dict(color='red'), name="tEvents 10"), secondary_y=False)
fig.add_trace(go.Scatter(x=count.index, y=count.values, mode="lines", marker=dict(color='red'), name="t10 counts"), secondary_y=True) 
#fig, t21, t22 = add_box(close, events, fig, i=21)
fig.update_yaxes(title_text="Price", secondary_y=False)
fig.update_yaxes(title_text="Count", secondary_y=True)
fig.show()

SNIPPET 4.2 ESTIMATING THE AVERAGE UNIQUENESS OF A LABEL

mpSampleTW:
* t1: pd series, timestamps of the vertical barriers. (index: eventStart, value: eventEnd).
* numCoEvents: pd.Series, the number of concurrent events per bar (as returned by mpNumCoEvents)
* molecule: the date of the event on which the weight will be computed
  - molecule[0] is the date of the first event on which the weight will be computed
  - molecule[-1] is the date of the last event on which the weight will be computed


SampleTW:
* close: A pd series of prices
* events: A Pd dataframe
  -   t1: the timestamp of vertical barrier. if the value is np.nan, no vertical barrier
  -   trgt: the unit width of the horizontal barriers, e.g. standard deviation
* numThreads: constant, The no. of threads concurrently used by the function

The functions returns:
 * wght: pd.Series, the sample weight of each (volume) bar

In [None]:
def mpSampleTW(t1, numCoEvents, molecule):
    """Derive the average uniqueness of each event over its lifespan.

    t1: series of event ends, indexed by event start.
    numCoEvents: series with the number of concurrent events per bar.
    molecule: event start times handled by this call.
    Returns a series indexed by molecule holding, for each event, the mean of
    1/numCoEvents over the bars from its start to its end.
    """
    wght = pd.Series(index = molecule, dtype='float64')
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for tIn, tOut in t1.loc[wght.index].items():
        # tIn: start of the event, tOut: end of the event
        # the more concurrent events, the lower the weight
        wght.loc[tIn] = (1. / numCoEvents.loc[tIn : tOut]).mean()
    return wght

def SampleTW(close, events, numThreads):
    """Compute the concurrency per bar and the average uniqueness per event.

    close: price series.
    events: DataFrame indexed by event start with a 't1' column. Missing t1
        values are replaced by the last bar, and this replacement is also
        written back into the caller's events frame.
    numThreads: forwarded to mpPandasObj.
    Returns (out, numCoEvents): out has columns 't1' and 'tW' (average
    uniqueness); numCoEvents is the concurrency reindexed on close.index,
    with 0 where no event is active.
    """
    lastBar = close.index[-1]
    out = events[['t1']].copy(deep = True)
    out['t1'] = out['t1'].fillna(lastBar)
    events['t1'] = events['t1'].fillna(lastBar)
    atoms = ('molecule', events.index)
    coEvents = mpPandasObj(mpNumCoEvents, atoms, numThreads, closeIdx = close.index, t1 = out['t1'])
    # molecules overlap at their borders: keep the last count of each bar
    isRepeated = coEvents.index.duplicated(keep = 'last')
    coEvents = coEvents.loc[~isRepeated].reindex(close.index).fillna(0)
    out['tW'] = mpPandasObj(mpSampleTW, atoms, numThreads, t1 = out['t1'], numCoEvents = coEvents)
    return out, coEvents

In [None]:
# Average uniqueness (tW) per event and concurrency per bar
out, numCoEvents = SampleTW(close, events, numThreads=1)
# Concurrency on the primary y axis, average uniqueness on the secondary one
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=numCoEvents.index, y=numCoEvents.values, mode="lines", marker=dict(color='blue'), name="Count"), secondary_y=False) 
fig.add_trace(go.Scatter(x=out.index, y=out['tW'], mode="lines", marker=dict(color='green'), name="Weight"), secondary_y=True) 

In [None]:
# Use every bar except the first as an event
tEvents2 = close.index[1:]
# Vertical barriers numDays after each event, dropping those beyond the last bar
t2 = close.index.searchsorted(tEvents2+pd.Timedelta(days=numDays))
t2 = t2[t2<close.shape[0]]
t2 = pd.Series(close.index[t2],index=tEvents2[:t2.shape[0]]) # NaNs at end
events2 = getEvents(close, tEvents2, ptSl=1, trgt=stds, minRet=0.0, numThreads=1, t1=t2)
out2, numCoEvents2 = SampleTW(close, events2, numThreads=1)
# Histogram of the average uniqueness of these events
fig = px.histogram(out2['tW'], nbins=100)
fig.show()

SNIPPET 4.3 BUILD AN INDICATOR MATRIX

Get Indicator matrix
* barIx: the index of bars
* t1: pd series, timestamps of the vertical barriers. (index: eventStart, value: eventEnd).

The function returns:
* indM: binary matrix, indicate what (price) bars influence the label for each observation

In [None]:
def getIndMatrix(barIx, t1):
    """Build the indicator matrix of the observations.

    barIx: the index of the bars (rows of the matrix).
    t1: series of event ends, indexed by event start.
    Returns a DataFrame with one row per bar and one column per observation
    (numbered 0..len(t1)-1 in the order of t1), holding 1 on the bars between
    the start and the end of the observation (both included) and 0 elsewhere.
    """
    indM = pd.DataFrame(0, index = barIx, columns = range(t1.shape[0]))
    # Series.items() replaces iteritems(), which was removed in pandas 2.0;
    # the loop variables no longer shadow the t1 parameter
    for i, (tIn, tOut) in enumerate(t1.items()):
        indM.loc[tIn : tOut, i] = 1.
    return indM

In [None]:
# Indicator matrix of the events: one row per bar, one column per event
indM = getIndMatrix(barIx=close.index, t1=out['t1'])
# Label each column with the start of its event. The matrix was built from
# out['t1'], so out.index always has the matching length and order.
indM.columns = out.index
indM.iloc[30:40,:]

Unnamed: 0_level_0,2020-01-31,2020-02-12,2020-02-24,2020-02-26,2020-02-27,2020-03-02,2020-03-06,2020-03-09,2020-03-11,2020-03-12,2020-03-13,2020-03-16,2020-03-17,2020-03-18,2020-03-23,2020-03-24,2020-03-26,2020-04-01,2020-04-06,2020-04-09,2020-04-27,2020-05-01,2020-05-13,2020-05-18,2020-05-27,2020-06-05,2020-06-11,2020-06-24,2020-07-14,2020-08-10,2020-09-21,2020-10-05,2020-10-28,2020-11-05,2020-11-10,2020-11-24
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2020-02-20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-21,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-24,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-25,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-26,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-27,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-02-28,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-03-02,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-03-03,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-03-04,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


SNIPPET 4.4 COMPUTE AVERAGE UNIQUENESS

* indM: binary matrix, indicate what (price) bars influence the label for each observation

The function returns:
* avgU: average uniqueness of each observed feature

In [None]:
def getAvgUniqueness(indM):
    """Average uniqueness of each observation from the indicator matrix.

    indM: indicator matrix (rows: bars, columns: observations).
    Returns a series with, for each column, the mean over the bars where it is
    active of 1 / (number of observations active on that bar).
    """
    concurrency = indM.sum(axis = 1) # observations sharing each bar
    uniqueness = indM.div(concurrency, axis = 0)
    # only bars where the observation is active enter the mean
    return uniqueness.where(uniqueness > 0).mean()

In [None]:
# Average uniqueness of every event column of indM, plotted with the plotly backend
avgU = getAvgUniqueness(indM)
avgU.plot()

In [None]:
# t0: t1.index; t1: t1.values
t3 = pd.Series([2, 3, 5], index = [0,2,4])
# index of bars
barIx3 = range(t3.max() + 1)
# get indicator matrix
indM3 = getIndMatrix(barIx3, t3)
u3 = indM3.div(indM3.sum(axis = 1), axis = 0)
print("Indicator Matrix:\n", indM3)
print("Uniqueness Matrix:\n", u3)
print(getAvgUniqueness(indM3))

Indicator Matrix:
    0  1  2
0  1  0  0
1  1  0  0
2  1  1  0
3  0  1  0
4  0  0  1
5  0  0  1
Uniqueness Matrix:
      0    1    2
0  1.0  0.0  0.0
1  1.0  0.0  0.0
2  0.5  0.5  0.0
3  0.0  1.0  0.0
4  0.0  0.0  1.0
5  0.0  0.0  1.0
0    0.833333
1    0.750000
2    1.000000
dtype: float64


SNIPPET 4.5 RETURN SAMPLE FROM SEQUENTIAL BOOTSTRAP
Give the index of the features sampled by the sequential bootstrap
* indM: binary matrix, indicate what (price) bars influence the label for each observation
* sLength: optional, sample length, default: as many draws as columns (observations) in indM

In [None]:
def seqBootstrap(indM, sLength = None, log=False):
    """Generate a sample via sequential bootstrap.

    indM: indicator matrix (rows: bars, columns: observations).
    sLength: number of draws; defaults to the number of columns of indM.
    log: if True, print the intermediate matrices and probabilities.
    Returns the list of drawn column labels. At each draw, every column is
    picked with probability proportional to the average uniqueness it would
    have if appended to the columns drawn so far.
    """
    if sLength is None:
        sLength = indM.shape[1] # one draw per column of indM
    phi = [] # sequence of the draws
    if log: print("indM:\n", indM, "\n")
    while len(phi) < sLength:
        avgU = pd.Series(dtype="float64") # uniqueness of each candidate
        for col in indM:
            trial = indM[phi + [col]] # current sample plus the candidate
            if log: print("indM_ {}:\n".format(col), trial, "\n")
            trialU = getAvgUniqueness(trial)
            if log: print("getAvgUniqueness(indM_):\n", trialU, "\n")
            avgU.loc[col] = trialU.iloc[-1] # the last entry belongs to the candidate
            if log: print("Average Uniqeness:\n", avgU, "\n")
        prob = avgU / avgU.sum() # normalise into draw probabilities
        if log: print("Probabilities:\n", prob, "\n")
        phi.append(np.random.choice(indM.columns, p = prob))
        if log: print("Phi:", phi, "\n")
    return phi

SNIPPET 4.6 EXAMPLE OF SEQUENTIAL BOOTSTRAP

In [None]:
# Sequential bootstrap on the toy indicator matrix, printing every intermediate step
phi = seqBootstrap(indM3, log=True)

In [None]:
# Standard bootstrap: draw columns uniformly at random, with replacement
phi = np.random.choice(indM3.columns, size = indM3.shape[1])
print(phi)
print ('Standard uniqueness:', getAvgUniqueness(indM3[phi]).mean())
# Sequential bootstrap on the same matrix, for comparison
phi = seqBootstrap(indM3)
print(phi)
print ('Sequential uniqueness:', getAvgUniqueness(indM3[phi]).mean())

[0 2 1]
Standard uniqueness: 0.8611111111111112
[0, 2, 0]
Sequential uniqueness: 0.6666666666666666


SNIPPET 4.7 GENERATING A RANDOM T1 SERIES

In [None]:
def getRndT1(numObs, numBars, maxH):
    """Generate a random t1 series.

    numObs: number of draws.
    numBars: start bars are drawn from [0, numBars).
    maxH: each observation ends 1 to maxH-1 bars after its start.
    Returns a series of end bars indexed by start bar, sorted by start. A
    start bar drawn more than once keeps only its last end, so the series can
    hold fewer than numObs entries.
    """
    t1 = pd.Series(dtype="float64")
    drawn = 0
    while drawn < numObs:
        start = np.random.randint(0, numBars)
        t1.loc[start] = start + np.random.randint(1, maxH)
        drawn += 1
    return t1.sort_index()

In [None]:
# Example: 10 draws over 100 bars, each observation ending 1 to 4 bars after its start
getRndT1(numObs = 10, numBars = 100, maxH = 5)

3      5
9     11
12    14
19    20
21    25
54    58
68    69
76    78
91    93
dtype: int64

SNIPPET 4.8 UNIQUENESS FROM STANDARD AND SEQUENTIAL BOOTSTRAPS

In [None]:
def auxMC(numObs, numBars, maxH):
    """Run one Monte Carlo trial comparing standard and sequential bootstraps.

    numObs, numBars, maxH: forwarded to getRndT1.
    Returns a dict with the mean average uniqueness of a standard bootstrap
    sample ('stdU') and of a sequential bootstrap sample ('seqU'), both drawn
    from the same random indicator matrix.
    """
    t1 = getRndT1(numObs, numBars, maxH)
    indM = getIndMatrix(range(t1.max() + 1), t1)
    # standard bootstrap: uniform draws with replacement
    stdSample = np.random.choice(indM.columns, size = indM.shape[1])
    result = {'stdU': getAvgUniqueness(indM[stdSample]).mean()}
    # sequential bootstrap on the same matrix
    seqSample = seqBootstrap(indM)
    result['seqU'] = getAvgUniqueness(indM[seqSample]).mean()
    return result

SNIPPET 4.9 MULTI-THREADED MONTE CARLO

In [None]:
def mainMC(numObs = 10, numBars = 100, maxH = 5, numIters = 1E6, numThreads = 24):
    """Run the Monte Carlo experiment of auxMC numIters times.

    numObs, numBars, maxH: forwarded to auxMC.
    numIters: number of trials (converted with int()).
    numThreads: 1 runs the trials sequentially, otherwise in a process pool.
    Prints the summary statistics and returns a DataFrame with one row per
    trial and columns 'stdU' and 'seqU'.
    """
    jobs = [{'func': auxMC, 'numObs': numObs, 'numBars': numBars, 'maxH': maxH}
            for _ in range(int(numIters))]
    if numThreads != 1:
        results = processJobs(jobs, numThreads = numThreads)
    else:
        results = processJobs_(jobs)
    summary = pd.DataFrame(results)
    print(summary.describe())
    return summary

In [None]:
# 1000 Monte Carlo trials spread over 8 worker processes
out_mc = mainMC(numObs = 10, numBars = 100, maxH = 5, numIters = 1E3, numThreads = 8)

2021-04-14 16:03:01.475790 100.0% auxMC done after 4.76 minutes. Remaining 0.0 minutes.


              stdU         seqU
count  1000.000000  1000.000000
mean      0.606231     0.701142
std       0.103916     0.085671
min       0.250000     0.500000
25%       0.533333     0.656667
50%       0.600000     0.700000
75%       0.673333     0.750000
max       0.817500     0.966667


In [None]:
# NOTE(review): this empty figure is replaced by the create_distplot result below
fig = go.Figure()

# Group data together: first column is stdU, second column is seqU
hist_data = [out_mc.iloc[:,0], out_mc.iloc[:,1]]
group_labels = ["stdU", "seqU"]

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.05)
fig.show()

SNIPPET 4.10 DETERMINATION OF SAMPLE WEIGHT BY ABSOLUTE RETURN ATTRIBUTION

SampleW:
* close: A pd series of prices
* events: A Pd dataframe
  -   t1: the timestamp of vertical barrier. if the value is np.nan, no vertical barrier
  -   trgt: the unit width of the horizontal barriers, e.g. standard deviation
* numThreads: constant, The no. of threads concurrently used by the function

In [None]:
def mpSampleW(t1, numCoEvents, close, molecule):
    """Derive sample weights by absolute return attribution.

    t1: series of event ends, indexed by event start.
    numCoEvents: series with the number of concurrent events per bar.
    close: price series.
    molecule: event start times handled by this call.
    Returns a series indexed by molecule with, for each event, the absolute
    value of the sum over its bars of log-return / concurrency.
    """
    ret = np.log(close).diff() # log-returns, so that they are additive
    wght = pd.Series(index = molecule, dtype="float64")
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for tIn, tOut in t1.loc[wght.index].items():
        wght.loc[tIn] = (ret.loc[tIn : tOut] / numCoEvents.loc[tIn : tOut]).sum()
    return wght.abs()

def SampleW(close, events, numThreads):
    """Compute sample weights by absolute return attribution.

    close: price series.
    events: DataFrame indexed by event start with a 't1' column; it is not
        modified.
    numThreads: forwarded to mpPandasObj.
    Returns (out, numCoEvents): out has columns 't1' and 'w' (weights scaled
    to sum to the number of events); numCoEvents is the concurrency reindexed
    on close.index, with 0 where no event is active.
    """
    out = events[['t1']].copy(deep = True)
    # Unclosed events run until the last bar, as in SampleTW; without this a
    # missing t1 would reach the label slices in mpSampleW.
    out['t1'] = out['t1'].fillna(close.index[-1])
    numCoEvents = mpPandasObj(mpNumCoEvents,('molecule', events.index),numThreads, closeIdx = close.index, t1 = out['t1'])
    # molecules overlap at their borders: keep the last count of each bar
    numCoEvents = numCoEvents.loc[~numCoEvents.index.duplicated(keep = 'last')]
    numCoEvents = numCoEvents.reindex(close.index).fillna(0)
    out['w'] = mpPandasObj(mpSampleW, ('molecule', events.index), numThreads, t1 = out['t1'], numCoEvents = numCoEvents, close = close)
    out['w'] *= out.shape[0] / out['w'].sum() # normalised, sum up to sample size
    return out, numCoEvents

In [None]:
# Return-attribution weights of the events
out_w, numCoEvents_w = SampleW(close, events, numThreads=1)
# Concurrency on the primary y axis; both weightings on the secondary one
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=numCoEvents_w.index, y=numCoEvents_w.values, mode="lines", marker=dict(color='blue'), name="Count"), secondary_y=False) 
fig.add_trace(go.Scatter(x=out_w.index, y=out_w['w'], mode="lines", marker=dict(color='green'), name="Weight_W"), secondary_y=True) 
fig.add_trace(go.Scatter(x=out.index, y=out['tW'], mode="lines", marker=dict(color='red'), name="Weight_TW"), secondary_y=True) 

SNIPPET 4.11 IMPLEMENTATION OF TIME-DECAY FACTORS

apply piecewise-linear decay to observed uniqueness (tW)
* clfLastW = 1: no time decay
* 0 < clfLastW < 1: weights decay linearly over time, but every observation still receives a strictly positive weight
  - clfLastW = 0: weights converge linearly to 0 as they become older
  - clfLastW < 0: the oldest portion cT of the observations receives 0 weight (c = -clfLastW, T = total cumulative uniqueness)
  - clfLastW > 1: weights increase as they get older

In [None]:
def getTimeDecay(tW, clfLastW = 1.):
    """Apply a piecewise-linear time decay to the observed uniqueness.

    tW: series of observed uniqueness, indexed by time.
    clfLastW: for values >= 0, the decay line runs from clfLastW at zero
        cumulative uniqueness up to 1 at the newest observation; for negative
        values the line is steeper and the weights that would fall below 0
        are set to 0.
    Prints the intercept and slope of the line and returns the decayed
    weights, indexed like tW sorted by index.
    """
    cumU = tW.sort_index().cumsum() # cumulative sum of the observed uniqueness
    total = cumU.iloc[-1]
    if clfLastW < 0:
        slope = 1. / ((clfLastW + 1) * total)
    else:
        slope = (1. - clfLastW) / total
    # the line passes through weight 1 at the newest observation
    const = 1. - slope * total
    decayed = (const + slope * cumU).clip(lower = 0) # neg weight -> 0
    print(const, slope)
    return decayed

In [None]:
# One decay curve per clfLastW value from -0.9 to 1.0 in steps of 0.1
fig = go.Figure()
for i in range(-9,11):
  temp = getTimeDecay(out['tW'], clfLastW = i/10)
  fig.add_trace(go.Scatter(x=temp.index, y=temp, name="clfLastW_{}".format(i/10)))
fig.show()

-9.000000000000002 0.3618676310177278
-4.000000000000001 0.1809338155088639
-2.333333333333333 0.1206225436725759
-1.5 0.09046690775443192
-0.9999999999999998 0.07237352620354554
-0.6666666666666665 0.06031127183628795
-0.4285714285714286 0.05169537585967539
-0.25 0.04523345387721596
-0.11111111111111094 0.040207514557525297
1.1102230246251565e-16 0.03618676310177277
0.09999999999999998 0.032568086791595494
0.19999999999999996 0.02894941048141822
0.30000000000000004 0.02533073417124094
0.4 0.02171205786106366
0.5 0.018093381550886384
0.6 0.01447470524070911
0.7 0.010856028930531832
0.8 0.007237352620354552
0.9 0.003618676310177276
1.0 0.0


In [None]:
def get_Concur_Uniqueness(close, events, numThreads):
    """Compute average uniqueness and return-attribution weights in one pass.

    close: price series.
    events: DataFrame indexed by event start with a 't1' column. Missing t1
        values are replaced by the last bar, and this replacement is also
        written back into the caller's events frame.
    numThreads: forwarded to mpPandasObj.
    Returns a DataFrame with columns 't1', 'tW' (average uniqueness) and 'w'
    (return-attribution weights scaled to sum to the number of events).
    """
    lastBar = close.index[-1]
    out = events[['t1']].copy(deep = True)
    out['t1'] = out['t1'].fillna(lastBar)
    events['t1'] = events['t1'].fillna(lastBar)
    atoms = ('molecule', events.index)
    coEvents = mpPandasObj(mpNumCoEvents, atoms, numThreads, closeIdx = close.index, t1 = out['t1'])
    # molecules overlap at their borders: keep the last count of each bar
    isRepeated = coEvents.index.duplicated(keep = 'last')
    coEvents = coEvents.loc[~isRepeated].reindex(close.index).fillna(0)
    out['tW'] = mpPandasObj(mpSampleTW, atoms, numThreads, t1 = out['t1'], numCoEvents = coEvents)
    out['w'] = mpPandasObj(mpSampleW, atoms, numThreads, t1 = events['t1'], numCoEvents = coEvents, close = close)
    scale = out.shape[0] / out['w'].sum() # normalised, sum up to sample size
    out['w'] *= scale
    return out