In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

# ----------------------------
# Strategy 2: Bollinger Mean Reversion (1-min)
# - If price is far below rolling mean (z < -ENTER_Z) -> go LONG (expect reversion up)
# - If price is far above rolling mean (z > +ENTER_Z) -> go SHORT (expect reversion down)
# - Exit when z comes back inside +/- EXIT_Z (hysteresis reduces churn)
# ----------------------------

# --- Paths (project root = one level above /notebooks) ---
PROJECT_ROOT = Path(os.getcwd()).resolve().parents[0]
DATA_DIR = PROJECT_ROOT / "data_processed"
G1_TRADE_DIR = DATA_DIR / "group1" / "trade"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("G1_TRADE_DIR:", G1_TRADE_DIR, "exists:", G1_TRADE_DIR.exists())
assert G1_TRADE_DIR.exists(), f"Missing: {G1_TRADE_DIR}"

# --- Load one quarter first (end-to-end sanity check) ---
g1_files = sorted(G1_TRADE_DIR.glob("*.parquet"))
print("Num quarters (G1 trade):", len(g1_files))
assert len(g1_files) > 0, "No parquet files found in group1/trade"

sample_file = g1_files[0]
df = pd.read_parquet(sample_file)

ASSET = "SP"
POINT_VALUE = 50
TCOST_PER_TRADE = 12

price = df[ASSET].astype(float).dropna().sort_index()
print("Using file:", sample_file.name)
print("Price rows:", len(price), "from", price.index.min(), "to", price.index.max())

PROJECT_ROOT: C:\Users\LO79RS\HFD_Project
G1_TRADE_DIR: C:\Users\LO79RS\HFD_Project\data_processed\group1\trade exists: True
Num quarters (G1 trade): 7
Using file: data1_2023_Q1.parquet
Price rows: 23120 from 2023-01-03 09:56:00+00:00 to 2023-03-31 16:00:00+00:00


In [20]:
# --- Parameters (1-min data) ---
BOLL_WIN = 120      # rolling window in minutes (memory)
Z_ENTER  = 2.0      # entry threshold (far from mean)
Z_EXIT   = 0.5      # exit threshold (closer to mean)  -> hysteresis
MIN_HOLD = 5

Bollinger inputs (mid, sigma, z-score)

In [21]:
import numpy as np
mid = price.rolling(BOLL_WIN).mean()
sigma = price.rolling(BOLL_WIN).std(ddof=0)

# Avoid division by zero
sigma = sigma.replace(0, np.nan)

z = (price - mid) / sigma
z.name = "zscore"

print("BOLL_WIN:", BOLL_WIN)
print("z-score summary:")
z.describe()

BOLL_WIN: 120
z-score summary:


count    22770.000000
mean         0.141247
std          1.508879
min        -10.908712
25%         -0.896547
50%          0.221401
75%          1.129227
max         10.908712
Name: zscore, dtype: float64

Mean Reversion Entry/Exit

In [22]:
raw_pos = pd.Series(0, index=price.index, dtype=int)

# Entry
raw_pos[z > Z_ENTER] = -1
raw_pos[z < -Z_ENTER] = 1

# Exit (flat zone)
raw_pos[(z.abs() < Z_EXIT)] = 0

raw_pos.name = "raw_pos"

print("Z_ENTER:", Z_ENTER, "Z_EXIT:", Z_EXIT)
raw_pos.value_counts(dropna=False)

Z_ENTER: 2.0 Z_EXIT: 0.5


raw_pos
 0    19713
-1     1891
 1     1516
Name: count, dtype: int64

In [23]:
# Build persistent position from raw signal
pos = pd.Series(0, index=price.index, dtype=int)

state = 0
for t in range(len(price)):
    r = raw_pos.iloc[t]

    if state == 0:
        # enter only when signal appears
        if r != 0:
            state = r

    elif state == 1:
        # exit long only when flat signal appears
        if r == 0:
            state = 0

    elif state == -1:
        # exit short only when flat signal appears
        if r == 0:
            state = 0

    pos.iloc[t] = state

pos.name = "pos"

pos.value_counts(dropna=False)

pos
 0    19713
-1     1909
 1     1498
Name: count, dtype: int64

Trade counts (entries/exits/flips)

In [24]:
# Trade statistics based on position changes
prev_pos = pos.shift(1).fillna(0)

n_flips = (pos != prev_pos).sum()
n_entries = ((prev_pos == 0) & (pos != 0)).sum()
n_exits = ((prev_pos != 0) & (pos == 0)).sum()

print("Trade summary:")
print("Total position changes:", n_flips)
print("Entries (0 -> +/-1):", n_entries)
print("Exits (+/-1 -> 0):", n_exits)

Trade summary:
Total position changes: 986
Entries (0 -> +/-1): 493
Exits (+/-1 -> 0): 493


In [25]:
# Direct long <-> short flips (should be zero for clean MR logic)

long_to_short = ((prev_pos == 1) & (pos == -1)).sum()
short_to_long = ((prev_pos == -1) & (pos == 1)).sum()

print("Direct flips summary:")
print("Long -> Short:", long_to_short)
print("Short -> Long:", short_to_long)

Direct flips summary:
Long -> Short: 0
Short -> Long: 0


In [26]:
# PnL backbone (1-minute, position shifted to avoid look-ahead)

dP = price.diff()

# Trade next bar using previous bar's position
pos_lag = pos.shift(1).fillna(0).astype(int)

# Gross PnL in index points (no transaction costs)
pnl_points = pos_lag * dP

print("Gross PnL (points) summary:")
print(pnl_points.describe())

Gross PnL (points) summary:
count    23119.000000
mean         0.007137
std          0.895547
min        -26.031000
25%         -0.000000
50%          0.000000
75%          0.000000
max         50.982000
dtype: float64


In [27]:
# Aggregate 1-minute PnL to daily gross PnL

pnl_daily = pnl_points.resample("1D").sum()

print("Daily gross PnL summary:")
print(pnl_daily.describe())

Daily gross PnL summary:
count    88.000000
mean      1.874898
std      10.423467
min     -21.828000
25%      -2.613250
50%       0.000000
75%       7.655000
max      37.238000
dtype: float64


In [28]:
prev_pos = pos.shift(1).fillna(0).astype(int)

# each position change = 1 transaction "unit" in your implementation
tx_1m = (pos - prev_pos).abs()

# NOTE: if you ever allow direct +1 -> -1 flips, abs gives 2 (two transactions) which is fine.
tx_daily1 = tx_1m.resample("1D").sum().rename("tx_daily")

tx_daily1.describe(), tx_daily1.sum()

(count    88.000000
 mean     11.204545
 std       9.312159
 min       0.000000
 25%       0.000000
 50%      12.000000
 75%      16.250000
 max      36.000000
 Name: tx_daily, dtype: float64,
 np.int64(986))

In [29]:
pnl_daily_gross_usd1= (pnl_daily * POINT_VALUE).rename("pnl_daily_gross_usd")

cost_daily_usd1 = (tx_daily1 * TCOST_PER_TRADE).rename("cost_daily_usd")

pnl_daily_net_usd1 = (pnl_daily_gross_usd1 - cost_daily_usd1).rename("pnl_daily_net_usd")

pnl_daily_gross_usd1.describe(), pnl_daily_net_usd1.describe()

(count      88.000000
 mean       93.744886
 std       521.173372
 min     -1091.400000
 25%      -130.662500
 50%         0.000000
 75%       382.750000
 max      1861.900000
 Name: pnl_daily_gross_usd, dtype: float64,
 count      88.000000
 mean      -40.709659
 std       504.811080
 min     -1259.400000
 25%      -275.812500
 50%         0.000000
 75%       234.850000
 max      1549.900000
 Name: pnl_daily_net_usd, dtype: float64)

Strategy 2 (BOLL WIN 60, Z_ENTER 2.5, Z_EXIT 0.75)

In [None]:
# --- Parameters (1-min data) ---
BOLL_WIN_S2 = 60     # rolling window in minutes (memory)
Z_ENTER_S2  = 2     # entry threshold (far from mean)
Z_EXIT_S2   = 1      # exit threshold (closer to mean)  -> hysteresis
MIN_HOLD = 15

Bollinger inputs (mid, sigma, z-score)

In [31]:
mid = price.rolling(BOLL_WIN_S2).mean()
sigma = price.rolling(BOLL_WIN_S2).std(ddof=0)

# Avoid division by zero
sigma = sigma.replace(0, np.nan)

z = (price - mid) / sigma
z.name = "zscore"

print("BOLL_WIN_S2:", BOLL_WIN_S2)
print("z-score summary:")
z.describe()

BOLL_WIN_S2: 60
z-score summary:


count    21743.000000
mean         0.081112
std          1.491646
min         -7.681246
25%         -0.893079
50%          0.153867
75%          1.068419
max          7.681146
Name: zscore, dtype: float64

Mean Reversion Entry/Exit

In [32]:
raw_pos2 = pd.Series(0, index=price.index, dtype=int)

# Entry
raw_pos2[z > Z_ENTER_S2] = -1
raw_pos2[z < -Z_ENTER_S2] = 1

# Exit (flat zone)
raw_pos2[(z.abs() < Z_EXIT_S2)] = 0

raw_pos2.name = "raw_pos2"

print("Z_ENTER_S2:", Z_ENTER_S2, "Z_EXIT_S2:", Z_EXIT_S2)
raw_pos.value_counts(dropna=False)

NameError: name 'Z_ENTER_S2' is not defined

In [None]:
# Build persistent position from raw signal
pos2 = pd.Series(0, index=price.index, dtype=int)

state = 0
for t in range(len(price)):
    r = raw_pos2.iloc[t]

    if state == 0:
        # enter only when signal appears
        if r != 0:
            state = r

    elif state == 1:
        # exit long only when flat signal appears
        if r == 0:
            state = 0

    elif state == -1:
        # exit short only when flat signal appears
        if r == 0:
            state = 0

    pos2.iloc[t] = state

pos2.name = "pos2"

pos2.value_counts(dropna=False)

pos2
 0    16989
-1     3247
 1     2884
Name: count, dtype: int64

Trade counts (entries/exits/flips)

In [None]:
# Trade statistics based on position changes
prev_pos2 = pos2.shift(1).fillna(0)

n_flips2 = (pos2 != prev_pos2).sum()
n_entries2 = ((prev_pos2 == 0) & (pos2 != 0)).sum()
n_exits2 = ((prev_pos2 != 0) & (pos2 == 0)).sum()

print("Trade summary:")
print("Total position changes:", n_flips2)
print("Entries (0 -> +/-1):", n_entries2)
print("Exits (+/-1 -> 0):", n_exits2)

Trade summary:
Total position changes: 1954
Entries (0 -> +/-1): 977
Exits (+/-1 -> 0): 977


In [None]:
# Direct long <-> short flips (should be zero for clean MR logic)

long_to_short2 = ((prev_pos2 == 1) & (pos2 == -1)).sum()
short_to_long2 = ((prev_pos2 == -1) & (pos2 == 1)).sum()

print("Direct flips summary:")
print("Long -> Short:", long_to_short2)
print("Short -> Long:", short_to_long2)

Direct flips summary:
Long -> Short: 0
Short -> Long: 0


In [None]:
# PnL backbone (1-minute, position shifted to avoid look-ahead)

dP = price.diff()

# Trade next bar using previous bar's position
pos2_lag = pos2.shift(1).fillna(0).astype(int)

# Gross PnL in index points (no transaction costs)
pnl_points2 = pos2_lag * dP

print("Gross PnL (points) summary:")
print(pnl_points2.describe())

Gross PnL (points) summary:
count    23119.000000
mean         0.000844
std          1.110646
min        -26.031000
25%          0.000000
50%          0.000000
75%          0.000000
max         50.982000
dtype: float64


In [None]:
# Aggregate 1-minute PnL to daily gross PnL

pnl_daily2 = pnl_points2.resample("1D").sum()

print("Daily gross PnL summary:")
print(pnl_daily2.describe())

Daily gross PnL summary:
count    88.000000
mean      0.221693
std      14.209006
min     -43.249000
25%      -5.739750
50%       0.000000
75%       8.138250
max      29.818000
dtype: float64


In [None]:
prev_pos2 = pos2.shift(1).fillna(0).astype(int)

# each position change = 1 transaction "unit" in your implementation
tx_1m = (pos2 - prev_pos2).abs()

# NOTE: if you ever allow direct +1 -> -1 flips, abs gives 2 (two transactions) which is fine.
tx_daily = tx_1m.resample("1D").sum().rename("tx_daily")

tx_daily.describe(), tx_daily.sum()

(count    88.000000
 mean     22.204545
 std      17.625190
 min       0.000000
 25%       0.000000
 50%      21.500000
 75%      38.000000
 max      58.000000
 Name: tx_daily, dtype: float64,
 np.int64(1954))

In [None]:
pnl_daily_gross_usd= (pnl_daily2 * POINT_VALUE).rename("pnl_daily_gross_usd")

cost_daily_usd = (tx_daily * TCOST_PER_TRADE).rename("cost_daily_usd")

pnl_daily_net_usd = (pnl_daily_gross_usd - cost_daily_usd).rename("pnl_daily_net_usd")

pnl_daily_gross_usd.describe(), pnl_daily_net_usd.describe()

(count      88.000000
 mean       11.084659
 std       710.450324
 min     -2162.450000
 25%      -286.987500
 50%         0.000000
 75%       406.912500
 max      1490.900000
 Name: pnl_daily_gross_usd, dtype: float64,
 count      88.000000
 mean     -255.369886
 std       720.145917
 min     -2618.450000
 25%      -615.550000
 50%         0.000000
 75%        38.975000
 max      1008.300000
 Name: pnl_daily_net_usd, dtype: float64)

In [None]:
# --- Strategy comparison table (S1 vs S2) ---
# Compare daily PnL statistics for two parameter sets of the same strategy

def summarize_daily_pnl(pnl_daily, label):
    return pd.Series({
        "mean_daily_pnl": pnl_daily.mean(),
        "std_daily_pnl": pnl_daily.std(),
        "sharpe_daily": (
            pnl_daily.mean() / pnl_daily.std()
            if pnl_daily.std() != 0 else np.nan
        ),
        "min_daily_pnl": pnl_daily.min(),
        "max_daily_pnl": pnl_daily.max(),
        "positive_days_%": (pnl_daily > 0).mean() * 100,
        "num_days": pnl_daily.count()
    }, name=label)

# Build comparison table
summary_s1 = summarize_daily_pnl(pnl_daily, "Bollinger S1")
summary_s2 = summarize_daily_pnl(pnl_daily2, "Bollinger S2")

comparison_table = pd.concat([summary_s1, summary_s2], axis=1)

print("Strategy comparison (daily PnL based):")
comparison_table

Strategy comparison (daily PnL based):


Unnamed: 0,Bollinger S1,Bollinger S2
mean_daily_pnl,1.874898,0.221693
std_daily_pnl,10.423467,14.209006
sharpe_daily,0.179873,0.015602
min_daily_pnl,-21.828,-43.249
max_daily_pnl,37.238,29.818
positive_days_%,40.909091,37.5
num_days,88.0,88.0


In both strategies of Mean Reversion, we get negative daily net PnL even though Bollinger S1 performs better than Trend following in some of the basic statistics above. We decide to move forward with Trend following.