In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from scipy.stats import skew, kurtosis

In [3]:
# Read the bars from data/btcusdt_5min.csv, parse the 'timestamp' column to
# datetimes and use it as the index, all in one chain so `df` is never left in
# a half-processed state.
df = (
    pd.read_csv('data/btcusdt_5min.csv')
    .assign(timestamp=lambda bars: pd.to_datetime(bars['timestamp']))
    .set_index('timestamp')
)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-03-31 12:35:00,82199.5,82249.9,82113.6,82186.9,11.545725
2025-03-31 12:40:00,82186.9,82528.0,82146.0,82490.9,54.304581
2025-03-31 12:45:00,82490.9,82490.9,82390.0,82472.6,54.241944
2025-03-31 12:50:00,82472.6,83155.8,82472.6,82864.3,210.304118
2025-03-31 12:55:00,82864.3,82911.7,82760.1,82779.0,59.860116


In [5]:
# Print the index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9990 entries, 2025-03-31 12:35:00 to 2025-05-05 05:45:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    9990 non-null   float64
 1   high    9990 non-null   float64
 2   low     9990 non-null   float64
 3   close   9990 non-null   float64
 4   volume  9990 non-null   float64
dtypes: float64(5)
memory usage: 468.3 KB


In [6]:
# Pre-create the result columns as all-NaN; later cells fill in selected rows.
for new_column in ('label', 'label_filtered', 'log_return', 'realized_pnl'):
    df[new_column] = np.nan

In [6]:
# Labelling parameters.
# window:    number of bars spanned by each labelled block; also the stride
#            between consecutive block starts.
# offset:    positional index of the first block start.
# threshold: minimum absolute close-price change per bar (in price units) for
#            a block to receive a +1 / -1 label.
window, offset, threshold = 24, 8, 0.03

In [7]:
# Label the first bar of every non-overlapping `window`-bar block with the
# direction of the close-to-close move across that block:
#   +1  if the average change per bar exceeds  `threshold`
#   -1  if it is below                        -`threshold`
#   NaN otherwise (and for every bar that is not a block start).
#
# NOTE: the label on a block's first bar is computed from the close `window`
# bars later, so it is only known in hindsight.
close_prices = df['close'].to_numpy()

# Same start positions the original loop visited: offset, offset + window, ...
# stopping while start + window is still a valid row position.
block_starts = np.arange(offset, len(df) - window, window)

# Average close-price change per bar over each block.
slopes = (close_prices[block_starts + window] - close_prices[block_starts]) / window

# Rebuild the whole column on every run so that re-executing this cell with a
# different window / offset / threshold cannot leave stale labels behind.
labels = np.full(len(df), np.nan)
labels[block_starts] = np.select(
    [slopes > threshold, slopes < -threshold],
    [1.0, -1.0],
    default=np.nan,
)
df['label'] = labels

In [8]:
# Count how many bars received each label value (NaN rows are excluded).
df['label'].value_counts()

label
 1.0    219
-1.0    195
Name: count, dtype: int64

In [9]:
# Keep only the bars that received a label, as an independent copy of df.
labeled = df.dropna(subset=['label']).copy()


In [10]:
# Display the labelled rows (pandas truncates the rendering of a long frame).
labeled

Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-03-31 13:15:00,82655.8,82745.1,82575.3,82575.3,14.837103,1.0,,,
2025-03-31 15:15:00,83442.6,83642.2,83440.2,83562.2,42.259176,-1.0,,,
2025-03-31 17:15:00,83465.3,83469.3,83243.8,83243.8,12.022286,-1.0,,,
2025-03-31 19:15:00,83358.2,83364.2,83203.1,83237.4,17.939385,-1.0,,,
2025-03-31 21:15:00,82548.9,82548.9,82494.4,82538.2,11.760208,-1.0,,,
...,...,...,...,...,...,...,...,...,...
2025-05-04 18:00:00,95400.4,95474.8,95400.4,95474.3,12.628353,1.0,,,
2025-05-04 20:00:00,95579.6,95653.7,95579.5,95653.6,7.135635,-1.0,,,
2025-05-04 22:00:00,95669.0,95679.2,95120.5,95167.0,129.742620,-1.0,,,
2025-05-05 00:00:00,94273.2,94387.6,94192.3,94278.2,66.833615,-1.0,,,


In [11]:
# Keep only the rows where the label differs from the previous labelled row,
# i.e. the first row of each run of identical labels. The very first row has
# no predecessor (prev_label is NaN) and is therefore kept as well.
labeled['prev_label'] = labeled['label'].shift(1)
is_flip = labeled['label'].ne(labeled['prev_label'])

# Take an explicit copy: columns are assigned on this frame later, and writing
# into a boolean-mask slice of `labeled` raises SettingWithCopyWarning.
labeled_filtered = labeled.loc[is_flip].copy()

In [12]:
# Display the rows kept after dropping consecutive repeats of the same label.
labeled_filtered

Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl,prev_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-03-31 13:15:00,82655.8,82745.1,82575.3,82575.3,14.837103,1.0,,,,
2025-03-31 15:15:00,83442.6,83642.2,83440.2,83562.2,42.259176,-1.0,,,,1.0
2025-03-31 23:15:00,82556.4,82876.2,82510.8,82515.9,46.357568,1.0,,,,-1.0
2025-04-01 09:15:00,84155.3,84293.7,84155.0,84252.8,31.467289,-1.0,,,,1.0
2025-04-01 13:15:00,83779.6,83779.6,83616.9,83700.0,17.758564,1.0,,,,-1.0
...,...,...,...,...,...,...,...,...,...,...
2025-05-04 14:00:00,95376.6,95389.2,95324.9,95389.2,14.867994,1.0,,,,-1.0
2025-05-04 16:00:00,95470.0,95541.2,95392.3,95524.5,15.542173,-1.0,,,,1.0
2025-05-04 18:00:00,95400.4,95474.8,95400.4,95474.3,12.628353,1.0,,,,-1.0
2025-05-04 20:00:00,95579.6,95653.7,95579.5,95653.6,7.135635,-1.0,,,,1.0


In [13]:
# Show the timestamps of the retained rows.
labeled_filtered.index

DatetimeIndex(['2025-03-31 13:15:00', '2025-03-31 15:15:00',
               '2025-03-31 23:15:00', '2025-04-01 09:15:00',
               '2025-04-01 13:15:00', '2025-04-01 21:15:00',
               '2025-04-02 07:15:00', '2025-04-02 09:15:00',
               '2025-04-02 13:15:00', '2025-04-02 17:15:00',
               ...
               '2025-05-04 02:00:00', '2025-05-04 04:00:00',
               '2025-05-04 06:00:00', '2025-05-04 10:00:00',
               '2025-05-04 12:00:00', '2025-05-04 14:00:00',
               '2025-05-04 16:00:00', '2025-05-04 18:00:00',
               '2025-05-04 20:00:00', '2025-05-05 02:00:00'],
              dtype='datetime64[ns]', name='timestamp', length=201, freq=None)

In [14]:
# Compute per-segment returns between consecutive retained rows.
#   log_return   : log of the close ratio versus the previous retained row.
#   realized_pnl : that return multiplied by the previous retained row's label,
#                  i.e. the direction assigned at the start of the segment.
# The first retained row has no predecessor, so both values are NaN there.
#
# `.assign` builds a new frame instead of writing through `.loc` into a frame
# that was derived from a slice, which avoids SettingWithCopyWarning.
#
# NOTE(review): labels are derived from the close `window` bars after each
# labelled bar, so this PnL is an in-hindsight figure -- confirm that is intended.
labeled_filtered = labeled_filtered.assign(
    log_return=lambda flips: np.log(flips['close'] / flips['close'].shift(1)),
    realized_pnl=lambda flips: flips['log_return'] * flips['label'].shift(1),
)

In [15]:
# Preview df again; at this point the computed values live only in labeled_filtered.
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-03-31 12:35:00,82199.5,82249.9,82113.6,82186.9,11.545725,,,,
2025-03-31 12:40:00,82186.9,82528.0,82146.0,82490.9,54.304581,,,,
2025-03-31 12:45:00,82490.9,82490.9,82390.0,82472.6,54.241944,,,,
2025-03-31 12:50:00,82472.6,83155.8,82472.6,82864.3,210.304118,,,,
2025-03-31 12:55:00,82864.3,82911.7,82760.1,82779.0,59.860116,,,,


In [16]:
# Write the retained rows' values back into df at the matching timestamps.
#
# Assigning a DataFrame through .loc aligns on column labels as well as on the
# index. The source column is named 'label' while the target is
# 'label_filtered', so without the rename the target column finds no matching
# source column and is filled with NaN.
df.loc[labeled_filtered.index, ['label_filtered', 'log_return', 'realized_pnl']] = (
    labeled_filtered[['label', 'log_return', 'realized_pnl']]
    .rename(columns={'label': 'label_filtered'})
)

In [17]:
# Build a frame whose column names match df's target columns, then let
# DataFrame.update copy its non-NaN values into df in place (aligned on the
# timestamp index and on column names).
df_update = labeled_filtered[['label', 'log_return', 'realized_pnl']].rename(
    columns={'label': 'label_filtered'}
)
df.update(df_update)

In [18]:
# Preview df after the write-back. With the current offset the first labelled
# bar comes after these five rows, so their result columns are still NaN.
df.head()


Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-03-31 12:35:00,82199.5,82249.9,82113.6,82186.9,11.545725,,,,
2025-03-31 12:40:00,82186.9,82528.0,82146.0,82490.9,54.304581,,,,
2025-03-31 12:45:00,82490.9,82490.9,82390.0,82472.6,54.241944,,,,
2025-03-31 12:50:00,82472.6,83155.8,82472.6,82864.3,210.304118,,,,
2025-03-31 12:55:00,82864.3,82911.7,82760.1,82779.0,59.860116,,,,


In [27]:
# Count the written-back labels per value in df (NaN rows are excluded).
df['label_filtered'].value_counts()

label_filtered
 1.0    101
-1.0    100
Name: count, dtype: int64

In [19]:
# Count labels per value in the filtered frame, for comparison with df['label_filtered'].
labeled_filtered['label'].value_counts()

label
 1.0    101
-1.0    100
Name: count, dtype: int64

In [20]:
# Show the first ten rows of df that carry a written-back label.
df.dropna(subset=['label_filtered']).head(10)

Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-03-31 13:15:00,82655.8,82745.1,82575.3,82575.3,14.837103,1.0,1.0,,
2025-03-31 15:15:00,83442.6,83642.2,83440.2,83562.2,42.259176,-1.0,-1.0,0.011881,0.011881
2025-03-31 23:15:00,82556.4,82876.2,82510.8,82515.9,46.357568,1.0,1.0,-0.0126,0.0126
2025-04-01 09:15:00,84155.3,84293.7,84155.0,84252.8,31.467289,-1.0,-1.0,0.020831,0.020831
2025-04-01 13:15:00,83779.6,83779.6,83616.9,83700.0,17.758564,1.0,1.0,-0.006583,0.006583
2025-04-01 21:15:00,85288.3,85288.3,85157.8,85249.0,9.897987,-1.0,-1.0,0.018337,0.018337
2025-04-02 07:15:00,84032.3,84214.8,84009.1,84181.5,13.341433,1.0,1.0,-0.012601,0.012601
2025-04-02 09:15:00,85307.0,85351.1,85074.5,85121.1,75.525684,-1.0,-1.0,0.0111,0.0111
2025-04-02 13:15:00,84841.5,84841.5,84716.6,84824.7,46.693023,1.0,1.0,-0.003488,0.003488
2025-04-02 17:15:00,87217.5,87245.1,87083.4,87159.1,49.67348,-1.0,-1.0,0.027148,0.027148


In [21]:
# Number of rows in df with a non-null label_filtered value.
print(df['label_filtered'].count())

201


In [22]:
# Re-check the unfiltered label counts in df after the write-back.
df['label'].value_counts()


label
 1.0    219
-1.0    195
Name: count, dtype: int64

In [23]:
# Sanity check: both frames must share the same index dtype for aligned assignment.
index_dtypes = [frame.index.dtype for frame in (df, labeled_filtered)]
print(*index_dtypes)

datetime64[ns] datetime64[ns]


In [24]:
# Sanity check: list the timestamps present in both df and labeled_filtered.
shared_index = df.index.intersection(labeled_filtered.index)
print(shared_index)

DatetimeIndex(['2025-03-31 13:15:00', '2025-03-31 15:15:00',
               '2025-03-31 23:15:00', '2025-04-01 09:15:00',
               '2025-04-01 13:15:00', '2025-04-01 21:15:00',
               '2025-04-02 07:15:00', '2025-04-02 09:15:00',
               '2025-04-02 13:15:00', '2025-04-02 17:15:00',
               ...
               '2025-05-04 02:00:00', '2025-05-04 04:00:00',
               '2025-05-04 06:00:00', '2025-05-04 10:00:00',
               '2025-05-04 12:00:00', '2025-05-04 14:00:00',
               '2025-05-04 16:00:00', '2025-05-04 18:00:00',
               '2025-05-04 20:00:00', '2025-05-05 02:00:00'],
              dtype='datetime64[ns]', name='timestamp', length=201, freq=None)


In [25]:
# Sanity check: number of timestamps present in both df and labeled_filtered.
print(len(df.index.intersection(labeled_filtered.index)))

201


In [26]:
# Display the full frame (pandas renders only the head and tail of a long frame).
df

Unnamed: 0_level_0,open,high,low,close,volume,label,label_filtered,log_return,realized_pnl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-03-31 12:35:00,82199.5,82249.9,82113.6,82186.9,11.545725,,,,
2025-03-31 12:40:00,82186.9,82528.0,82146.0,82490.9,54.304581,,,,
2025-03-31 12:45:00,82490.9,82490.9,82390.0,82472.6,54.241944,,,,
2025-03-31 12:50:00,82472.6,83155.8,82472.6,82864.3,210.304118,,,,
2025-03-31 12:55:00,82864.3,82911.7,82760.1,82779.0,59.860116,,,,
...,...,...,...,...,...,...,...,...,...
2025-05-05 05:25:00,94421.5,94446.0,94418.0,94433.3,6.693873,,,,
2025-05-05 05:30:00,94433.3,94478.1,94374.7,94424.1,25.270591,,,,
2025-05-05 05:35:00,94424.1,94529.4,94424.1,94518.7,21.342656,,,,
2025-05-05 05:40:00,94518.7,94554.7,94514.1,94514.2,22.352516,,,,
