**Copyright (c) 2021 Risklab Middle East - All Rights Reserved**

---


**Author: Mehrdad Moghimi**



# Import libraries

In [None]:
%%capture
!pip install plotly -U

In [None]:
import pandas as pd
import numpy as np 
import datetime
import time
import sys
from scipy import stats
from statsmodels.stats import stattools

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import multiprocessing as mp

pd.options.plotting.backend = "plotly"
np.seterr(divide='ignore', invalid='ignore')

  import pandas.util.testing as tm


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

# Import Tick data

In [None]:
# Download the 2020 IVE tick file and index it by timestamp.
dir = "https://raw.githubusercontent.com/risk-labratory/data/main/"
url = dir + "IVE_2020.csv"
df = pd.read_csv(url, header=0)
df['dates'] = pd.to_datetime(df['dates'])
# De-duplicate while 'dates' is still a regular column: drop_duplicates ignores
# the index, so running it after set_index would also discard distinct ticks
# that merely share price/bid/ask/size with an earlier tick at another time.
df.drop_duplicates(inplace=True)
df.set_index('dates', inplace=True, drop=True)
# Keep ticks stamped from 09:00:00 up to (but excluding) 16:00:00.
df = df[(df.index.hour>=9) & (df.index.hour<16)]
df.head()

Unnamed: 0_level_0,price,bid,ask,size
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02 09:30:00,130.68,130.59,130.6,20625
2020-01-02 09:30:01,130.5,130.5,130.77,200
2020-01-02 09:30:04,130.53,130.52,130.78,100
2020-01-02 09:30:04,130.55,130.52,130.78,100
2020-01-02 09:30:04,130.53,130.52,130.78,200


# Functions

In [None]:
# SNIPPET 20.5 THE linParts FUNCTION
def linParts(numAtoms, numThreads):
  """
  Split numAtoms atoms into contiguous, near-equal partitions.
  + numAtoms: number of items to partition
  + numThreads: maximum number of partitions (capped at numAtoms)
  Returns an integer array of partition boundaries, from 0 to numAtoms;
  partition i covers [edges[i], edges[i+1]).
  """
  numParts = min(numThreads, numAtoms)
  edges = np.linspace(0, numAtoms, numParts + 1)
  return np.ceil(edges).astype(int)

# SNIPPET 20.6 THE nestedParts FUNCTION
def nestedParts(numAtoms, numThreads, upperTriang=False):
  """
  Partition atoms for a task with an inner loop (row i costs about i units).
  + numAtoms: number of items to partition
  + numThreads: maximum number of partitions (capped at numAtoms)
  + upperTriang: if True, reverse the partition sizes so that the first
    partitions are the smallest (the first rows are the heaviest)
  Returns an integer array of partition boundaries starting at 0.
  """
  numParts = min(numThreads, numAtoms)
  edges = [0]
  for _ in range(numParts):
    last = edges[-1]
    # positive root of x**2 + x = last**2 + last + numAtoms*(numAtoms+1)/numParts
    discriminant = 1 + 4*(last**2 + last + numAtoms*(numAtoms + 1.)/numParts)
    edges.append((-1 + discriminant**.5)/2.)
  edges = np.round(edges).astype(int)
  if not upperTriang:
    return edges
  sizes = np.diff(edges)[::-1]
  return np.append(np.array([0]), np.cumsum(sizes))

# SNIPPET 20.7 THE mpPandasObj, USED AT VARIOUS POINTS IN THE BOOK
def mpPandasObj(func,pdObj,numThreads=24,mpBatches=1,linMols=True,**kargs):
  """
  Parallelize jobs, return a DataFrame or Series
  + func: function to be parallelized. Returns a DataFrame
  + pdObj[0]: Name of argument used to pass the molecule
  + pdObj[1]: List of atoms that will be grouped into molecules
  + numThreads: number of worker processes; 1 runs the jobs sequentially
  + mpBatches: number of molecules created per thread
  + linMols: True partitions with linParts, False with nestedParts
  + kargs: any other argument needed by func
  Returns the sorted concatenation of the job outputs when they are
  DataFrames or Series, the raw list of outputs otherwise, and an empty
  DataFrame when there are no atoms to process.
  Example: df1=mpPandasObj(func,('molecule',df0.index),24,**kargs)
  """
  atoms = pdObj[1]
  # Size the partitions from the atoms themselves, not from whichever keyword
  # argument happens to be second in kargs.
  if linMols:
    parts = linParts(len(atoms), numThreads*mpBatches)
  else:
    parts = nestedParts(len(atoms), numThreads*mpBatches)
  jobs = []
  for i in range(1, len(parts)):
    job = {pdObj[0]: atoms[parts[i-1]:parts[i]], 'func': func}
    job.update(kargs)
    jobs.append(job)
  if numThreads == 1:
    out = processJobs_(jobs)
  else:
    out = processJobs(jobs, numThreads=numThreads)
  if not out:
    # no atoms -> no jobs -> nothing to concatenate
    return pd.DataFrame()
  if not isinstance(out[0], (pd.DataFrame, pd.Series)):
    return out
  # pd.concat replaces the repeated .append calls (removed in pandas 2.0)
  return pd.concat(out).sort_index()

# SNIPPET 20.8 SINGLE-THREAD EXECUTION, FOR DEBUGGING
def processJobs_(jobs):
  """
  Run the jobs one after another in the current process (for debugging).
  + jobs: iterable of job dictionaries accepted by expandCall
  Returns the list of job outputs, in job order.
  """
  return [expandCall(job) for job in jobs]

# SNIPPET 20.9 EXAMPLE OF ASYNCHRONOUS CALL TO PYTHON’S MULTIPROCESSING LIBRARY
def reportProgress(jobNum, numJobs, time0, task):
  """
  Write a one-line progress report to stderr as asynchronous jobs complete.
  + jobNum: number of jobs completed so far (must be > 0)
  + numJobs: total number of jobs
  + time0: time.time() value taken when processing started
  + task: name of the task, included in the message
  The line ends with a carriage return until the last job, so successive
  reports overwrite each other; the final report ends with a newline.
  """
  fraction = float(jobNum)/numJobs
  elapsed = (time.time() - time0)/60.  # minutes
  remaining = elapsed*(1/fraction - 1)  # linear extrapolation, minutes
  timeStamp = str(datetime.datetime.fromtimestamp(time.time()))
  pieces = [timeStamp, ' ', str(round(fraction*100, 2)), '% ', task,
            ' done after ', str(round(elapsed, 2)),
            ' minutes. Remaining ', str(round(remaining, 2)), ' minutes.']
  lineEnd = '\r' if jobNum < numJobs else '\n'
  sys.stderr.write(''.join(pieces) + lineEnd)
  return

def processJobs(jobs,task=None,numThreads=24):
  """
  Run the jobs in parallel with a multiprocessing pool.
  + jobs: list of job dictionaries; each must contain a 'func' callback,
    for expandCall
  + task: label used in progress reports (defaults to the callback's name)
  + numThreads: number of worker processes
  Returns the list of job outputs, in completion order (not job order).
  An empty job list returns [] without starting a pool.
  """
  if not jobs:
    return []
  if task is None:
    task = jobs[0]['func'].__name__
  out, time0 = [], time.time()
  # The context manager terminates the workers even if a job raises.
  with mp.Pool(processes=numThreads) as pool:
    outputs = pool.imap_unordered(expandCall, jobs)
    # Process asynchronous output, report progress
    for i, out_ in enumerate(outputs, 1):
      out.append(out_)
      reportProgress(i, len(jobs), time0, task)
    pool.close()
    pool.join() # this is needed to prevent memory leaks
  return out

# SNIPPET 20.10 PASSING THE JOB (MOLECULE) TO THE CALLBACK FUNCTION
def expandCall(kargs):
  """
  Expand the arguments of a callback function, kargs['func'].
  + kargs: job dictionary holding the callback under 'func' and its keyword
    arguments under every other key
  Returns whatever the callback returns. The job dictionary is left
  untouched, so the same job can be run more than once.
  """
  kargs = dict(kargs)  # shallow copy: do not strip 'func' from the caller's job
  func = kargs.pop('func')
  return func(**kargs)

In [None]:
def progressBar(value, end_value, start_time, bar_length=20):
    """Redraw a single-line text progress bar on stdout.

    value: number of steps completed so far (must be non-zero).
    end_value: total number of steps.
    start_time: time.time() value taken when the work started.
    bar_length: width of the bar in characters.

    The remaining time is a linear extrapolation of the elapsed time,
    truncated to whole minutes.
    """
    fraction = float(value) / end_value
    dashes = int(round(fraction * bar_length) - 1)
    bar = '-' * dashes + '>'
    bar += ' ' * (bar_length - len(bar))
    seconds_per_step = (time.time() - start_time) / value
    minutes_left = int(seconds_per_step * (end_value - value) / 60)
    template = "\rCompleted: [{0}] {1}% - {2} minutes remaining."
    sys.stdout.write(template.format(bar, int(round(fraction * 100)), minutes_left))
    sys.stdout.flush()

In [None]:
def get_ohlcv(df_group):
  """
  Summarise grouped ticks into one bar per group.
  + df_group: a DataFrame groupby whose frames have 'price' and 'size' columns
  Returns a DataFrame with columns open, high, low, close, volume (sum of
  size), vwap (size-weighted mean price), twap (plain mean price),
  tick_count and twap_logr (log change of twap versus the previous bar).
  """
  def weighted_price(chunk):
    return (chunk['price']*chunk['size']).sum()/chunk['size'].sum()

  prices = df_group['price']
  bars = prices.ohlc()
  bars['volume'] = df_group['size'].sum()
  bars['vwap'] = df_group.apply(weighted_price)
  bars['twap'] = prices.mean()
  bars['tick_count'] = prices.count()
  bars['twap_logr'] = np.log(bars['twap']).diff()
  return bars

def get_time_bar(df, freq="5Min"):
  """
  Build time bars: one OHLCV row per `freq` interval of the index.
  + df: tick DataFrame accepted by get_ohlcv after grouping
  + freq: pandas frequency string passed to pd.Grouper
  """
  return get_ohlcv(df.groupby(pd.Grouper(freq=freq)))

def get_tick_bar(df, tick_per_bar=10, num_of_bars=None):
  """
  Build tick bars: one OHLCV row per `tick_per_bar` consecutive ticks.
  + df: tick DataFrame whose index becomes a 'dates' column on reset_index
  + tick_per_bar: ticks per bar; when falsy it is derived from num_of_bars
  + num_of_bars: target number of bars, used only if tick_per_bar is falsy
  Each bar is indexed by the 'dates' value of its first tick.
  """
  if not tick_per_bar:
    tick_per_bar = int(df.shape[0] / num_of_bars)
  ticks = df.reset_index()
  ticks = ticks.assign(grpId=ticks.index // tick_per_bar)
  bars = get_ohlcv(ticks.groupby('grpId'))
  bar_start = ticks.groupby('grpId', as_index=False).first()['dates']
  bars.set_index(bar_start, drop=True, inplace=True)
  return bars

def get_volume_bar(df, volume_per_bar=10000, num_of_bars=None):
  """
  Build volume bars: a new bar starts each time cumulative size crosses a
  multiple of `volume_per_bar`.
  + df: tick DataFrame with a 'size' column and an index that becomes a
    'dates' column on reset_index; it is not modified
  + volume_per_bar: traded size per bar; when falsy it is derived from
    num_of_bars and rounded to the nearest hundred
  + num_of_bars: target number of bars, used only if volume_per_bar is falsy
  Each bar is indexed by the 'dates' value of its first tick.
  Raises ValueError when neither volume_per_bar nor num_of_bars is given.
  """
  # Work on the copy returned by reset_index so the caller's frame does not
  # silently gain a 'cum_size' column.
  tick_group = df.reset_index()
  tick_group['cum_size'] = tick_group['size'].cumsum()
  if not volume_per_bar:
    if not num_of_bars:
      raise ValueError("either volume_per_bar or num_of_bars must be provided")
    total_vol = tick_group['cum_size'].values[-1]
    volume_per_bar = total_vol / num_of_bars
    volume_per_bar = round(volume_per_bar, -2) # round to the nearest hundred
  tick_group['grpId'] = tick_group['cum_size'] // volume_per_bar
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group = tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

def get_dollar_bar(df, dollar_per_bar=100000, num_of_bars=None):
  """
  Build dollar bars: a new bar starts each time cumulative price*size
  crosses a multiple of `dollar_per_bar`.
  + df: tick DataFrame with 'price' and 'size' columns and an index that
    becomes a 'dates' column on reset_index; it is not modified
  + dollar_per_bar: dollar value per bar; when falsy it is derived from
    num_of_bars and rounded to the nearest hundred
  + num_of_bars: target number of bars, used only if dollar_per_bar is falsy
  Each bar is indexed by the 'dates' value of its first tick.
  Raises ValueError when neither dollar_per_bar nor num_of_bars is given.
  """
  # Work on the copy returned by reset_index so the caller's frame does not
  # silently gain 'dollar' and 'cum_dv' columns.
  tick_group = df.reset_index()
  tick_group['dollar'] = tick_group['price']*tick_group['size']
  tick_group['cum_dv'] = tick_group['dollar'].cumsum()
  if not dollar_per_bar:
    if not num_of_bars:
      raise ValueError("either dollar_per_bar or num_of_bars must be provided")
    total_dvol = tick_group['cum_dv'].values[-1]
    dollar_per_bar = total_dvol / num_of_bars
    dollar_per_bar = round(dollar_per_bar, -2) # round to the nearest hundred
  tick_group['grpId'] = tick_group['cum_dv'] // dollar_per_bar
  dates = tick_group.groupby('grpId', as_index=False).first()['dates']
  df_group = tick_group.groupby('grpId')
  ohlcv = get_ohlcv(df_group)
  ohlcv.set_index(dates, drop=True, inplace=True)
  return ohlcv

In [None]:
def plot_ohlcv(ohlcv):
  """
  Show a candlestick chart of the bars with a volume bar chart underneath.
  + ohlcv: DataFrame indexed by timestamp with open, high, low, close and
    volume columns (as returned by get_ohlcv)
  Weekends, hours outside 9:30-16:00 and calendar days without any bar are
  removed from the x axis.
  """
  dt_all = pd.date_range(start=ohlcv.index[0], end=ohlcv.index[-1])
  # a set keeps the membership test below O(1) per calendar day
  dt_obs = {d.strftime("%Y-%m-%d") for d in ohlcv.index}
  dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d").tolist() if d not in dt_obs]
  # The candlestick spans rows 1-2; the spanned cell in row 2 must be None,
  # otherwise an extra empty subplot is created on top of the lower half.
  fig = make_subplots(rows=3, cols=1,
                      shared_xaxes=True,
                      vertical_spacing=0.05,
                      specs=[[{"rowspan": 2}],
                             [None],
                             [{}]])
  fig.add_trace(go.Candlestick(x=ohlcv.index,
                               open=ohlcv.open,
                               high=ohlcv.high,
                               low=ohlcv.low,
                               close=ohlcv.close, name='Candlestick'), row=1, col=1)
  fig.add_trace(go.Bar(x=ohlcv.index, y=ohlcv.volume, marker_color='rgba(255, 100, 100, 0.7)', name='volume'), row=3, col=1)
  fig.update_yaxes(title_text="Price", row=1, col=1)
  fig.update_yaxes(title_text="Volume", row=3, col=1)
  fig.update_xaxes(
          rangeslider_visible=False,
          rangebreaks=[
              dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
              dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
              dict(values=dt_breaks)  # hide empty dates
          ]
      )
  fig.update_layout(xaxis_rangeslider_visible=False)
  fig.show()

# Code Snippets

In [None]:
# Aggregate the ticks into one bar per business day ("1B").
ohlcv = get_time_bar(df, freq="1B")
# Drop bars containing any NaN; this includes the first bar, whose twap_logr
# has no previous bar to difference against.
ohlcv.dropna(inplace=True)
# Series of daily closing prices used by the snippets below.
close = ohlcv.close
ohlcv.head()

Unnamed: 0_level_0,open,high,low,close,volume,vwap,twap,tick_count,twap_logr
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-03,129.33,129.9874,129.2912,129.64,526340,129.751774,129.719157,922,-0.003845
2020-01-06,129.0,129.8952,128.93,129.8952,655431,129.548003,129.493223,770,-0.001743
2020-01-07,129.52,129.58,129.1405,129.38,413423,129.376731,129.357347,908,-0.00105
2020-01-08,129.38,130.2999,129.24,129.76,449383,129.881903,129.858126,1028,0.003864
2020-01-09,130.3,130.38,129.92,130.3168,376142,130.161216,130.161563,614,0.002334


SNIPPET 3.1 DAILY VOLATILITY ESTIMATES

In [None]:
def getDailyVol(close, span0=63):
    """Estimate volatility as an exponentially weighted std of returns.

    For every timestamp t, searchsorted locates the first bar at or after
    t - 1 day; the return is measured against the bar one position before
    that one. Timestamps with no such earlier bar are skipped.

    close: price Series indexed by timestamp.
    span0: span of the exponentially weighted window.

    Returns (rets, stds): the returns (named "rets") and their exponentially
    weighted standard deviation (named "std"), both indexed by the
    timestamps that were kept.
    """
    stamps = close.index
    prev_pos = stamps.searchsorted(stamps - pd.Timedelta(days=1))
    prev_pos = prev_pos[prev_pos > 0]
    first_kept = close.shape[0] - prev_pos.shape[0]
    # maps each kept timestamp to the timestamp its return is measured from
    prev_stamp = pd.Series(stamps[prev_pos - 1], index=stamps[first_kept:])
    base_prices = close.loc[prev_stamp.values].values
    rets = close.loc[prev_stamp.index] / base_prices - 1
    rets = rets.rename("rets")
    stds = rets.ewm(span=span0).std()
    stds = stds.rename("std")
    return rets, stds

In [None]:
# Returns and their exponentially weighted std (span of 32 observations).
rets, stds = getDailyVol(close, span0=32)
# Plot the returns together with a +/- one-std envelope.
fig = go.Figure()
fig.add_trace(go.Scatter(x=rets.index, y=rets.values, mode="lines", marker=dict(color='firebrick'), name="rets"))
fig.add_trace(go.Scatter(x=rets.index, y=stds.values, mode="lines", marker=dict(color='royalblue'),  name="stds"))
fig.add_trace(go.Scatter(x=rets.index, y=-stds.values, mode="lines", marker=dict(color='royalblue'),  name="-stds"))

SNIPPET 3.2 TRIPLE-BARRIER LABELING METHOD

Snippet 3.2 implements the triple-barrier method. The function receives four arguments:
* close: A pandas series of prices.
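* events: A pandas dataframe, with columns:
 - t1: The timestamp of the vertical barrier. When the value is np.nan, there will not be a vertical barrier.
 - trgt: The unit width of the horizontal barriers.
 - side: The side of the bet; the path returns are multiplied by this value before being compared with the barriers.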
* ptSl: A list of two non-negative float values:
 - ptSl[0]: The factor that multiplies trgt to set the width of the upper barrier.
If 0, there will not be an upper barrier.
 - ptSl[1]: The factor that multiplies trgt to set the width of the lower barrier.
If 0, there will not be a lower barrier.
* molecule: A list with the subset of event indices that will be processed by a
single thread. Its use will become clear later on in the chapter.

In [None]:
def applyPtSlOnT1(close, events, ptSl, molecule):
    """Find when each event first touches its stop-loss / profit-taking barrier.

    close: price Series indexed by timestamp.
    events: DataFrame indexed by event start with columns 't1' (vertical
        barrier, NaT for none), 'trgt' (unit barrier width) and 'side'
        (multiplier applied to the path returns).
    ptSl: [profit-taking factor, stop-loss factor]; a factor that is not
        positive disables that barrier.
    molecule: labels of the events to process.

    Returns a copy of the events' 't1' column with added 'sl' and 'pt'
    columns holding the earliest timestamp at which each barrier is crossed
    (NaT when it never is) between the event start and t1.
    """
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)
    if ptSl[0] > 0:
        pt = ptSl[0] * events_['trgt']
    else:
        # NaN thresholds: comparisons are always False, so the barrier is off
        pt = pd.Series(index=events.index, dtype=float)
    if ptSl[1] > 0:
        sl = -ptSl[1] * events_['trgt']
    else:
        sl = pd.Series(index=events.index, dtype=float)
    # Events without a vertical barrier are searched up to the last price.
    # .items() replaces .iteritems(), which no longer exists in pandas 2.
    for loc, t1 in events_['t1'].fillna(close.index[-1]).items():
        df0 = close[loc:t1]  # path prices
        df0 = (df0 / close[loc] - 1) * events_.at[loc, 'side']  # path returns
        out.loc[loc, 'sl'] = df0[df0 < sl[loc]].index.min()  # earliest stop loss.
        out.loc[loc, 'pt'] = df0[df0 > pt[loc]].index.min()  # earliest profit taking.
    return out

SNIPPET 2.4 THE SYMMETRIC CUSUM FILTER

The function getTEvents receives two arguments: the raw time series we wish
to filter (gRaw) and the threshold, h. One practical aspect that makes CUSUM filters
appealing is that multiple events are not triggered by gRaw hovering around a threshold
level, which is a flaw suffered by popular market signals such as Bollinger bands.
It will require a full run of length h for gRaw to trigger an event.



In [None]:
def getTEvents(gRaw, h):
  """
  Symmetric CUSUM filter.
  + gRaw: the raw time series to filter, indexed by timestamp
  + h: threshold the cumulative up-move or down-move must exceed
  Returns a DatetimeIndex with the timestamps at which an event fired.
  Duplicate timestamps in gRaw are handled row by row.
  """
  tEvents, sPos, sNeg = [], 0, 0
  # Iterate over (timestamp, change) pairs instead of looking each change up
  # with .loc: a label lookup returns a Series when the timestamp is repeated.
  for i, change in gRaw.diff().iloc[1:].items():
    sPos, sNeg = max(0, sPos + change), min(0, sNeg + change)
    if sNeg < -h:
      sNeg = 0
      tEvents.append(i)
    elif sPos > h:
      sPos = 0
      tEvents.append(i)
  return pd.DatetimeIndex(tEvents)

In [None]:
# Sample event timestamps from the closing prices with a CUSUM threshold of 5
# (in the units of `close`, since getTEvents differences the raw series).
tEvents = getTEvents(close, h=5)
#tEvents = close.index
# Overlay the sampled events on the close price line.
fig = close.plot()
fig.add_trace(go.Scatter(x=tEvents, y=close[tEvents], mode="markers", name="tEvents"))

In [None]:
# No vertical barrier, long side for every event, target = volatility estimate.
t1 = pd.Series(pd.NaT, index=tEvents)
side_ = pd.Series(1., index=tEvents)
trgt = stds.loc[tEvents]
ptSl = [1, 1]
events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])
# Use events.index rather than tEvents from here on: dropna may have removed
# events without a target, and .loc on a removed label raises a KeyError.
out = applyPtSlOnT1(close, events, ptSl, molecule=events.index)
# Barrier levels built from the surviving events so that upper/lower stay
# position-aligned with the rows of `out`.
pt, sl = ptSl[0]*events['trgt'], ptSl[1]*events['trgt']
upper, lower = close[events.index]*(1+pt), close[events.index]*(1-sl)
out.head()

Unnamed: 0,t1,sl,pt
2020-01-31,NaT,2020-02-25,2020-02-04
2020-02-12,NaT,2020-02-24,NaT
2020-02-24,NaT,2020-02-25,NaT
2020-02-26,NaT,2020-02-27,2020-11-16
2020-02-27,NaT,2020-03-09,2020-03-02


In [None]:
# Draw the horizontal barriers of the i-th event and mark both touches.
# NOTE(review): rows of `out` can hold NaT in 'sl' or 'pt' (see the table
# above); close.loc on NaT would presumably raise — confirm before changing i.
i = 4
# tt1: event start; tt2: last available timestamp (no vertical barrier here).
tt1, tt2 = out.index[i], close.index[-1]
u, l = upper.iloc[i], lower.iloc[i]
tsl, tpt = out['sl'].iloc[i], out['pt'].iloc[i]
# Prices at the stop-loss and profit-taking touches.
psl, ppt = close.loc[tsl], close.loc[tpt]
fig = close.plot()
# Vertical line at the event start, then the upper and lower barriers.
fig.add_shape(type="line", x0=tt1, y0=u, x1=tt1, y1=l, line=dict(color="red", width=2))
fig.add_shape(type="line", x0=tt1, y0=u, x1=tt2, y1=u, line=dict(color="red", width=2))
fig.add_shape(type="line", x0=tt1, y0=l, x1=tt2, y1=l, line=dict(color="red", width=2))
fig.add_trace(go.Scatter(x=[tsl], y=[psl], marker=dict(size=[10], color=['red']), name="Stop Loss"))
fig.add_trace(go.Scatter(x=[tpt], y=[ppt], marker=dict(size=[10], color=['green']), name="Take Profit"))
fig.show()

SNIPPET 3.3 GETTING THE TIME OF FIRST TOUCH

Snippet 3.3 implements the function getEvents, which finds the time of the first
barrier touch. The function receives the following arguments:
* close: A pandas series of prices.
* tEvents: The pandas timeindex containing the timestamps that will seed every
triple barrier. These are the timestamps selected by the sampling procedures
discussed in Chapter 2, Section 2.5.
* ptSl: A non-negative float that sets the width of the two barriers. A 0 value
means that the respective horizontal barrier (profit taking and/or stop loss) will
be disabled.
* t1: A pandas series with the timestamps of the vertical barriers. We pass a
False when we want to disable vertical barriers.
* trgt: A pandas series of targets, expressed in terms of absolute returns.
* minRet: The minimum target return required for running a triple barrier search.
* numThreads: The number of threads concurrently used by the function.

The output from this function is a pandas dataframe with columns:
* t1: The timestamp at which the first barrier is touched.
* trgt: The target that was used to generate the horizontal barriers.

In [None]:
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False):
    """Find the time of the first barrier touch for every seeded event.

    close: price Series.
    tEvents: timestamps that seed the barriers.
    ptSl: single factor applied to trgt for both horizontal barriers.
    trgt: Series of targets; only those above minRet are kept.
    minRet: minimum target required to keep an event.
    numThreads: passed to mpPandasObj.
    t1: Series of vertical-barrier timestamps, or False for none.

    Returns a DataFrame indexed by event start with columns 't1' (earliest
    of the barrier touches found by applyPtSlOnT1) and 'trgt'.
    """
    # 1) targets of the seeded events, filtered by the minimum return
    targets = trgt.loc[tEvents]
    targets = targets[targets > minRet]
    # 2) vertical barrier (max holding period); all NaT when disabled
    vertical = pd.Series(pd.NaT, index=tEvents) if t1 is False else t1
    # 3) assemble the events (always long here) and search the barriers
    long_side = pd.Series(1., index=targets.index)
    events = pd.concat({'t1': vertical, 'trgt': targets, 'side': long_side}, axis=1)
    events = events.dropna(subset=['trgt'])
    touches = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index),
                          numThreads=numThreads, close=close, events=events,
                          ptSl=[ptSl, ptSl])
    # row-wise min skips NaT, giving the earliest touch among t1, sl and pt
    events['t1'] = touches.dropna(how='all').min(axis=1)
    return events.drop('side', axis=1)

SNIPPET 3.4 ADDING A VERTICAL BARRIER


In [None]:
numDays = 21
t1 = close.index.searchsorted(tEvents+pd.Timedelta(days=numDays))
t1 = t1[t1<close.shape[0]]
t1 = pd.Series(close.index[t1],index=tEvents[:t1.shape[0]]) # NaNs at end

In [None]:
# Triple-barrier events: symmetric barriers of width 1 * trgt, a minimum target
# of 4%, the vertical barriers from the previous cell, run sequentially
# (numThreads=1).
events2 = getEvents(close, tEvents, ptSl=1, trgt=stds, minRet=0.04, numThreads=1, t1=t1)
# Barrier price levels for plotting (ptSl is 1, so the widths equal trgt).
pt, sl = events2['trgt'], events2['trgt']
upper, lower = close[events2.index]*(1+pt), close[events2.index]*(1-sl)
events2.head()

Unnamed: 0,t1,trgt
2020-03-12,2020-03-13,0.044984
2020-03-13,2020-03-16,0.043517
2020-03-16,2020-03-17,0.047672
2020-03-17,2020-03-18,0.046638
2020-03-18,2020-03-20,0.045596


In [None]:
# Draw the full barrier box of the i-th event and mark its first touch.
i = 2
# tt1: event start; tt2: its vertical barrier.
tt1, tt2 = events2.index[i], t1.loc[events2.index].iloc[i]
# ft / pft: time and price of the first barrier touch.
ft = events2['t1'].iloc[i]
pft = close.loc[ft]
u, l = upper.iloc[i], lower.iloc[i]
fig = close.plot()
# Left edge, upper barrier, lower barrier, right edge (vertical barrier).
fig.add_shape(type="line", x0=tt1, y0=u, x1=tt1, y1=l, line=dict(color="red", width=2))
fig.add_shape(type="line", x0=tt1, y0=u, x1=tt2, y1=u, line=dict(color="red", width=2))
fig.add_shape(type="line", x0=tt1, y0=l, x1=tt2, y1=l, line=dict(color="red", width=2))
fig.add_shape(type="line", x0=tt2, y0=u, x1=tt2, y1=l, line=dict(color="red", width=2))
fig.add_trace(go.Scatter(x=[ft], y=[pft], marker=dict(size=[10], color=['red']), name="First Touch"))
fig.show()

SNIPPET 3.5 LABELING FOR SIDE AND SIZE

In [None]:
def getBins(events, close):
    """Label each event by the sign of its realised return.

    events: DataFrame indexed by event start with a 't1' column (event end);
        rows without a t1 are ignored.
    close: price Series indexed by timestamp.

    Returns a DataFrame indexed by event start with columns 'ret' (price at
    t1 divided by price at the start, minus 1) and 'bin' (sign of ret).
    """
    # 1) prices at every start/end timestamp, back-filled where missing
    live = events.dropna(subset=['t1'])
    starts, ends = live.index, live['t1'].values
    stamps = starts.union(ends).drop_duplicates()
    prices = close.reindex(stamps, method='bfill')
    # 2) returns and labels
    out = pd.DataFrame(index=starts)
    out['ret'] = prices.loc[ends].values / prices.loc[starts] - 1
    out['bin'] = np.sign(out['ret'])
    return out

In [None]:
# Label the triple-barrier events by the sign of their realised return.
out3 = getBins(events2, close)
out3.head()

Unnamed: 0,ret,bin
2020-03-12,0.089471,1.0
2020-03-13,-0.112428,-1.0
2020-03-16,0.063362,1.0
2020-03-17,-0.063188,-1.0
2020-03-18,-0.050752,-1.0


SNIPPET 3.6 EXPANDING getEvents TO INCORPORATE
META-LABELING

In [None]:
def getEventsM(close,tEvents,ptSl,trgt,minRet,numThreads,t1=False,side=None):
  """
  Find the time of the first barrier touch, with optional meta-labeling.
  + close: price Series
  + tEvents: timestamps that seed the barriers
  + ptSl: list of factors; without side, ptSl[0] is used for both barriers,
    with side, ptSl[0] and ptSl[1] are used for profit taking and stop loss
  + trgt: Series of targets; only those above minRet are kept
  + minRet: minimum target required to keep an event
  + numThreads: passed to mpPandasObj
  + t1: Series of vertical-barrier timestamps, or False for none
  + side: optional Series with the side of each event
  Returns a DataFrame indexed by event start with columns 't1' and 'trgt',
  plus 'side' when a side was supplied.
  """
  #1) targets of the seeded events, filtered by the minimum return
  targets = trgt.loc[tEvents]
  targets = targets[targets > minRet]
  #2) vertical barrier (max holding period); all NaT when disabled
  vertical = pd.Series(pd.NaT, index=tEvents) if t1 is False else t1
  #3) sides and barrier factors, then the barrier search
  metaLabeling = side is not None
  if metaLabeling:
    sides, barriers = side.loc[targets.index], ptSl[:2]
  else:
    sides, barriers = pd.Series(1., index=targets.index), [ptSl[0], ptSl[0]]
  events = pd.concat({'t1': vertical, 'trgt': targets, 'side': sides}, axis=1)
  events = events.dropna(subset=['trgt'])
  touches = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index),
                        numThreads=numThreads, close=close, events=events,
                        ptSl=barriers)
  # row-wise min skips NaT, giving the earliest touch among t1, sl and pt
  events['t1'] = touches.dropna(how='all').min(axis=1)
  if not metaLabeling:
    events = events.drop('side', axis=1)
  return events

In [None]:
# Same events as before through the meta-labeling-aware function; no side is
# passed, so ptSl[0] is used for both horizontal barriers.
events3 = getEventsM(close, tEvents, ptSl=[1,1], trgt=stds, minRet=0.04, numThreads=1, t1=t1)
events3.head()

Unnamed: 0,t1,trgt
2020-03-12,2020-03-13,0.044984
2020-03-13,2020-03-16,0.043517
2020-03-16,2020-03-17,0.047672
2020-03-17,2020-03-18,0.046638
2020-03-18,2020-03-20,0.045596


SNIPPET 3.7 EXPANDING getBins TO INCORPORATE
META-LABELING

In [None]:
def getBinsM(events, close):
    """
    Compute each event's outcome, using side information when provided.

    events is a DataFrame where:
    - events.index is the event's start time
    - events['t1'] is the event's end time (rows without it are ignored)
    - events['side'] (optional) is the position side

    Without 'side': 'ret' is the price return and 'bin' is its sign.
    With 'side': 'ret' is multiplied by the side, and 'bin' is set to 0
    wherever that return is not positive (meta-labeling).
    """
    # 1) prices at every start/end timestamp, back-filled where missing
    live = events.dropna(subset=['t1'])
    starts, ends = live.index, live['t1'].values
    stamps = starts.union(ends).drop_duplicates()
    prices = close.reindex(stamps, method='bfill')
    # 2) returns and labels
    out = pd.DataFrame(index=starts)
    out['ret'] = prices.loc[ends].values / prices.loc[starts] - 1
    if 'side' in live:
        out['ret'] *= live['side']  # return of the position, not of the price
        out['bin'] = np.sign(out['ret'])
        out.loc[out['ret'] <= 0, 'bin'] = 0  # meta-labeling
    else:
        out['bin'] = np.sign(out['ret'])
    return out

In [None]:
# events3 has no 'side' column, so the labels are the sign of the price return.
out4 = getBinsM(events3, close)
out4.head()

Unnamed: 0,ret,bin
2020-03-12,0.089471,1.0
2020-03-13,-0.112428,-1.0
2020-03-16,0.063362,1.0
2020-03-17,-0.063188,-1.0
2020-03-18,-0.050752,-1.0


SNIPPET 3.8 DROPPING UNDER-POPULATED LABELS

In [None]:
def dropLabels(events,minPtc=.05,minPct=None):
  """
  Drop labels with insufficient examples.
  + events: DataFrame with a 'bin' column of labels
  + minPtc: minimum fraction of observations a label needs to be kept
  + minPct: same threshold under its conventional spelling; overrides
    minPtc when given
  The rarest label is dropped repeatedly until every remaining label exceeds
  the threshold or fewer than three labels are left.
  Returns the filtered events.
  """
  threshold = minPtc if minPct is None else minPct
  while True:
    df0 = events['bin'].value_counts(normalize=True)
    if df0.min() > threshold or df0.shape[0] < 3:
      break
    # idxmin gives the label itself; argmin would give its position in df0
    rarest = df0.idxmin()
    print('dropped label', rarest, df0.min())
    events = events[events['bin'] != rarest]
  return events