In [4]:
import numpy as np
import pandas as pd

In [9]:
# fixed-time horizon method
def fth_labeling(prices, h, tau):
    "Label each price point using the fixed-time horizon method."
    xy = []
    for i in range(len(prices) - h):
        r = (prices[i + h] / prices[i]) - 1

        if r < -tau: y = -1
        elif abs(r) <= tau: y = 0
        else: y=1 # r > tau

        xy.append((prices[i], y))
    return xy

In [11]:
df  =  pd.read_csv("../data/btcusdt/2025102617_tradebars.csv")
labels = fth_labeling(df['close'].values, h=10, tau=0.001)
label_df = pd.DataFrame(labels, columns=['price', 'label'])
label_df.head()

Unnamed: 0,price,label
0,113654.33,0
1,113654.33,0
2,113671.37,0
3,113672.91,0
4,113678.0,0


In [13]:
def get_daily_volatility(close: pd.Series, span=100):
    "Compute the exponentially weighted daily volatility aligned to intraday data."
    # Find previous day's timestamp for each bar
    prev_day_idx = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    prev_day_idx = prev_day_idx[prev_day_idx > 0]

    # Map index: current time → previous day's time
    prev = pd.Series(
        close.index[prev_day_idx - 1],
        index=close.index[-len(prev_day_idx):]
    )

    # Compute close-to-close daily returns
    daily_returns = close.loc[prev.index] / close.loc[prev.values].values - 1

    # Apply exponential weighting to smooth volatility
    daily_vol = daily_returns.ewm(span=span).std()

    return daily_vol

In [22]:
# df['close'].index - pd.Timedelta(days=1)

In [21]:
# daily_vol = get_daily_volatility(df['close'])
# daily_vol.head()

In [5]:
def triple_barrier_labeling(prices, h, daily_vol, pt_sl, min_ret):
    "Label each price point using the triple-barrier method."
    xy = []
    for i in range(len(prices) - h):
        if daily_vol[i] < min_ret:
            continue

        # Set the barriers
        pt = prices[i] * (1 + pt_sl[0] * daily_vol[i])
        sl = prices[i] * (1 - pt_sl[1] * daily_vol[i])
        y = 0 # default label is 0: no barrier touched within h

        for j in range(1, h + 1):
            if prices[i + j] > pt:
                y = 1 # profit-taking barrier touched
                break
            elif prices[i + j] < sl:
                y = -1 # stop-loss barrier touched
                break

        xy.append((prices[i], y))
    return xy

In [6]:
def triple_barrier_method(close, events, ptSl, molecule):
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)

    # Profit-taking and stop-loss barriers
    if ptSl[0] > 0:pt = ptSl[0] * events_['trgt']
    else: pt = pd.Series(index=events_.index, dtype=float)

    if ptSl[1] > 0:sl = -ptSl[1] * events_['trgt']
    else: sl = pd.Series(index=events_.index, dtype=float)

    for loc, t1 in zip(events_.index, events_['t1'].fillna(close.index[-1])):
        df0 = close.loc[loc:t1]  # path prices
        df0 = (df0 / close.loc[loc] - 1) * events_.at[loc, 'side']

        out.at[loc, 'sl'] = df0[df0 < sl[loc]].index.min()  
        out.at[loc, 'pt'] = df0[df0 > pt[loc]].index.min()  
    return out


In [7]:
# Simulated closing prices
dates = pd.date_range(start='2025-01-01', periods=10, freq='5min')
close = pd.DataFrame([100, 101, 99, 102, 98, 97, 99, 100, 103, 105], index=dates)


In [8]:
events = pd.DataFrame({
    't1': [dates[4], dates[5], dates[7]],  # when each event expires
    'trgt': [0.02, 0.015, 0.025],          # target (could be volatility or fixed)
    'side': [1, -1, 1]                     # long, short, long
}, index=[dates[0], dates[2], dates[4]])    # when each event starts

In [9]:
ptSl = [1, 1]  # 1×target for both profit-taking and stop-loss


In [10]:
molecule = events.index


In [11]:
out = triple_barrier_method(close, events, ptSl, molecule)

In [12]:
out

Unnamed: 0,t1,sl,pt
2025-01-01 00:00:00,2025-01-01 00:20:00,2025-01-01 00:00:00,2025-01-01 00:00:00
2025-01-01 00:10:00,2025-01-01 00:25:00,2025-01-01 00:10:00,2025-01-01 00:10:00
2025-01-01 00:20:00,2025-01-01 00:35:00,2025-01-01 00:20:00,2025-01-01 00:20:00


In [25]:
# Set random seed for reproducibility
np.random.seed(42)

# Parameters
n = 500              # number of time points
start_price = 100    # starting price
daily_vol = 0.02     # daily volatility (2%)
h = 10               # vertical barrier in bars
pt_sl = [1, 1]       # profit-taking and stop-loss multipliers
min_ret = 0.001      # minimum return to consider

# Generate synthetic prices using geometric Brownian motion
returns = np.random.normal(loc=0, scale=daily_vol, size=n)
prices = start_price * np.cumprod(1 + returns)
prices = pd.Series(prices, name='close')

# Generate synthetic daily_vol series
daily_vol_series = pd.Series([daily_vol] * n)

# Generate vertical barrier timestamps (every h bars)
t1 = pd.Series([i + h if i + h < n else n-1 for i in range(n)], name='t1')

# Targets (absolute returns, can be same as daily_vol * some factor)
trgt = daily_vol_series.copy()

# tEvents: take all indices as events for simplicity
tEvents = pd.Index(range(n))

# Display the first few rows
df = pd.DataFrame({
    'close': prices,
    'daily_vol': daily_vol_series,
    't1': t1,
    'trgt': trgt
})
print(df.head(15))

         close  daily_vol  t1  trgt
0   100.993428       0.02  10  0.02
1   100.714153       0.02  11  0.02
2   102.018781       0.02  12  0.02
3   105.126334       0.02  13  0.02
4   104.634020       0.02  14  0.02
5   104.144046       0.02  15  0.02
6   107.433358       0.02  16  0.02
7   109.082320       0.02  17  0.02
8   108.058093       0.02  18  0.02
9   109.230653       0.02  19  0.02
10  108.218265       0.02  20  0.02
11  107.210255       0.02  21  0.02
12  107.729072       0.02  22  0.02
13  103.606754       0.02  23  0.02
14  100.032491       0.02  24  0.02


In [14]:
def get_events(prices, tEvents, pt_sl, trgt, min_ret, t1=False):
    "Create events dataframe for triple-barrier method."
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > min_ret]

    if t1 is False: t1=pd.Series(pd.NaT, index=tEvents)

    side_ = pd.Series(1., index=trgt.index)
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])
    df0 = triple_barrier_method(prices, events, pt_sl, events.index)
    events['t1'] = df0.dropna(how='all').min(axis=1)
    events = events.drop('side', axis=1)
    return events, df0

In [15]:
events, df0 = get_events(prices, tEvents, pt_sl, trgt, min_ret, t1)
events

Unnamed: 0,t1,trgt
0,3.0,0.02
1,3.0,0.02
2,3.0,0.02
3,6.0,0.02
4,6.0,0.02
...,...,...
495,496.0,0.02
496,498.0,0.02
497,499.0,0.02
498,499.0,0.02


In [16]:
df0#.dropna()

Unnamed: 0,t1,sl,pt
0,10,,3.0
1,11,,3.0
2,12,,3.0
3,13,,6.0
4,14,14.0,6.0
...,...,...,...
495,499,496.0,
496,499,498.0,
497,499,499.0,
498,499,499.0,


In [17]:
def getBins(events, prices):
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = prices.reindex(px, method="bfill")
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index]-1
    out['bin'] = np.sign(out['ret'])
    return out

In [18]:
bins = getBins(events, prices)

In [19]:
bins

Unnamed: 0,ret,bin
0,0.040923,1.0
1,0.043809,1.0
2,0.030461,1.0
3,0.021945,1.0
4,0.026754,1.0
...,...,...
495,-0.020745,-1.0
496,-0.021252,-1.0
497,-0.044684,-1.0
498,-0.027656,-1.0


In [31]:
def get_events(prices, tEvents, pt_sl, trgt, min_ret, t1=False, side=None):
    "Create events dataframe for triple-barrier method."
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > min_ret]

    if t1 is False: t1=pd.Series(pd.NaT, index=tEvents)

    if side is None:side_,pt_sl_ = pd.Series(1., index=trgt.index), [pt_sl[0], pt_sl[0]]
    else:side_, pt_sl_ = side.loc[trgt.index], pt_sl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])
    df0 = triple_barrier_method(prices, events, pt_sl_, events.index)
    events['t1'] = df0.dropna(how='all').min(axis=1)
    if side is None: events = events.drop('side', axis=1)
    return events, df0

In [33]:
events, df0 = get_events(prices, tEvents, pt_sl, trgt, min_ret, t1)

In [34]:
events

Unnamed: 0,t1,trgt
0,3.0,0.02
1,3.0,0.02
2,3.0,0.02
3,6.0,0.02
4,6.0,0.02
...,...,...
495,496.0,0.02
496,498.0,0.02
497,499.0,0.02
498,499.0,0.02


In [35]:
df0

Unnamed: 0,t1,sl,pt
0,10,,3.0
1,11,,3.0
2,12,,3.0
3,13,,6.0
4,14,14.0,6.0
...,...,...,...
495,499,496.0,
496,499,498.0,
497,499,499.0,
498,499,499.0,


In [39]:
def getBins(events, prices):
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = prices.reindex(px, method='bfill')
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index]-1
    if 'side' in events_: out['ret'] *= events_['side']
    out['bin'] = np.sign(out['ret'])
    if 'side' in events_: out.loc[out['ret'] <= 0, 'bin'] = 0
    return out

In [40]:
bins = getBins(events, prices)

In [41]:
bins

Unnamed: 0,ret,bin
0,0.040923,1.0
1,0.043809,1.0
2,0.030461,1.0
3,0.021945,1.0
4,0.026754,1.0
...,...,...
495,-0.020745,-1.0
496,-0.021252,-1.0
497,-0.044684,-1.0
498,-0.027656,-1.0


In [1]:
def dropLabels(events,minPtc=.5):
    # apply weights, drop labels with insufficient examples
    while True:
        df0=events['bin'].value_counts(normalize=True)
        if df0.min()>minPtc or df0.shape[0]<3:break
        print ('dropped label',df0.argmin(),df0.min())
        events=events[events['bin']!=df0.argmin()]
    return events