## 1. Data Loading

In [4]:
import pandas as pd

def load_data(path):
    """Read a parquet file and return it as a frame sorted by a datetime index.

    If the stored index is already a ``pd.DatetimeIndex`` it is used as-is.
    Otherwise the first column whose label contains ``"time"`` or ``"date"``
    (case-insensitive substring match) is parsed with ``pd.to_datetime`` and
    becomes the index.

    Parameters
    ----------
    path : str or path-like
        Location passed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        The loaded frame, sorted ascending by its index.

    Raises
    ------
    ValueError
        If the index is not datetime-like and no column label contains
        ``"time"`` or ``"date"``.
    """
    df = pd.read_parquet(path)

    if not isinstance(df.index, pd.DatetimeIndex):
        # Column labels are not guaranteed to be strings (e.g. integer labels),
        # so coerce with str() before the case-insensitive substring match.
        datetime_col = next(
            (
                col
                for col in df.columns
                if "time" in str(col).lower() or "date" in str(col).lower()
            ),
            None,
        )

        if datetime_col is None:
            raise ValueError("No datetime column found in data.")

        df[datetime_col] = pd.to_datetime(df[datetime_col])
        df = df.set_index(datetime_col)

    return df.sort_index()

In [5]:
# Raw string: "\g" is not a valid escape sequence and triggers a SyntaxWarning
# on recent Python versions. The value itself is unchanged.
# NOTE(review): the other cells read from "../data_raw/group1" and do not use
# this variable — confirm whether it is still needed.
path = r"data_raw\group1"

In [6]:
import os

# os.listdir returns entries in arbitrary order; sort so that files[0]
# (used below as the sample file) is the same on every machine and run.
files = sorted(f for f in os.listdir("../data_raw/group1") if f.endswith(".parquet"))
files

['data1_2023_Q1.parquet',
 'data1_2023_Q3.parquet',
 'data1_2023_Q4.parquet',
 'data1_2024_Q2.parquet',
 'data1_2024_Q4.parquet',
 'data1_2025_Q1.parquet',
 'data1_2025_Q2.parquet']

In [7]:
# Load the first listed group-1 file as a sample frame and preview its first rows
df = load_data(f"../data_raw/group1/{files[0]}")
df.head()

Unnamed: 0_level_0,NQ,SP
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-02 09:31:00+00:00,,
2023-01-02 09:32:00+00:00,,
2023-01-02 09:33:00+00:00,,
2023-01-02 09:34:00+00:00,,
2023-01-02 09:35:00+00:00,,


## 2. Common Settings (Groups, Costs, Session Rules)

In [8]:
# ----- Per-contract transaction cost and point value -----
# NOTE(review): units of "cost" and "point_value" are not stated here — confirm.
CONTRACTS = dict(
    SP=dict(cost=12, point_value=50),
    NQ=dict(cost=12, point_value=20),
    CAD=dict(cost=10, point_value=100000),
    AUD=dict(cost=10, point_value=100000),
    XAU=dict(cost=15, point_value=100),
    XAG=dict(cost=10, point_value=5000),
)

# ----- Asset groups -----
GROUP1_ASSETS = ["SP", "NQ"]  # NYSE session, 1-min data
GROUP2_ASSETS = ["CAD", "AUD", "XAU", "XAG"]  # 24h with break, 5-min data

# ----- Time rules, kept as 'HH:MM' strings and turned into masks later -----
GROUP1_RULES = dict(
    session_start="09:30",
    session_end="16:00",
    no_trade_start="09:31",
    no_trade_end="09:55",
    force_exit="15:40",
    drop_early_start="09:31",
    drop_early_end="09:40",
    drop_late_start="15:51",
    drop_late_end="16:00",
)

GROUP2_RULES = dict(
    break_start="17:00",
    break_end="18:00",
    force_exit="16:50",
    no_trade_after_break_end="18:10",
)

# Number of days per year used for annualization
ANNUALIZATION_DAYS = 252

In [9]:
# Quick sanity check of the settings defined above
for label, value in (
    ("Group1 assets:", GROUP1_ASSETS),
    ("SP cost/point:", CONTRACTS["SP"]),
    ("Group1 force_exit:", GROUP1_RULES["force_exit"]),
):
    print(label, value)

Group1 assets: ['SP', 'NQ']
SP cost/point: {'cost': 12, 'point_value': 50}
Group1 force_exit: 15:40


## 3. Time Masks (Trading & Data Filtering Rules)

In [10]:
import pandas as pd

def time_between(index, start, end):
    """Return a boolean mask of index entries whose time of day is in a window.

    Both bounds are inclusive. When ``start`` is later than ``end`` the window
    is treated as wrapping past midnight (e.g. '22:00'-'02:00'), instead of
    silently matching nothing.

    Parameters
    ----------
    index : pd.DatetimeIndex
        Timestamps to test; only the ``.time`` component is compared.
        NOTE(review): for a tz-aware index this is presumably the wall-clock
        time in the index's own timezone — confirm it matches the rule times.
    start, end : str
        Window bounds as 'HH:MM', parsed with ``pd.to_datetime``.

    Returns
    -------
    numpy.ndarray of bool
        One flag per entry of ``index``.
    """
    window_start = pd.to_datetime(start).time()
    window_end = pd.to_datetime(end).time()
    t = index.time

    if window_start <= window_end:
        return (t >= window_start) & (t <= window_end)

    # Window crosses midnight: keep times at/after start OR at/before end.
    return (t >= window_start) | (t <= window_end)

In [11]:
def group1_masks(df):
    """Build the group-1 time-of-day masks for ``df``'s index.

    Every mask comes from ``time_between`` applied to ``df.index`` with a pair
    of bounds taken from ``GROUP1_RULES``.

    Returns
    -------
    dict
        ``"in_session"``  session_start .. session_end
        ``"no_trade"``    no_trade_start .. no_trade_end
        ``"force_exit"``  force_exit .. session_end
        ``"drop_calc"``   drop_early window OR drop_late window
    """
    stamps = df.index
    rules = GROUP1_RULES

    def window(start_key, end_key):
        # Mask for the window bounded by two GROUP1_RULES entries
        return time_between(stamps, rules[start_key], rules[end_key])

    early = window("drop_early_start", "drop_early_end")
    late = window("drop_late_start", "drop_late_end")

    return {
        "in_session": window("session_start", "session_end"),
        "no_trade": window("no_trade_start", "no_trade_end"),
        "force_exit": window("force_exit", "session_end"),
        "drop_calc": early | late,
    }

In [12]:
masks = group1_masks(df)

# Number of rows flagged by each mask
for name, flags in masks.items():
    print(name, flags.sum())

in_session 25109
no_trade 1625
force_exit 1343
drop_calc 1289


In [13]:
import os

# os.listdir returns entries in arbitrary order; sort so that files2[0]
# (used below as the sample file) is the same on every machine and run.
files2 = sorted(f for f in os.listdir("../data_raw/group2") if f.endswith(".parquet"))
files2

['data2_2023_Q1.parquet',
 'data2_2023_Q3.parquet',
 'data2_2023_Q4.parquet',
 'data2_2024_Q2.parquet',
 'data2_2024_Q4.parquet',
 'data2_2025_Q1.parquet',
 'data2_2025_Q2.parquet']

In [14]:
# Load the first listed group-2 file as a sample frame and preview its first rows
df2 = load_data(f"../data_raw/group2/{files2[0]}")
df2.head()

Unnamed: 0_level_0,AUD,CAD,XAG,XAU
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01 17:35:00+00:00,,,,
2023-01-01 18:00:00+00:00,,,,
2023-01-01 18:05:00+00:00,0.68142,,,
2023-01-01 18:15:00+00:00,0.68123,0.739557,,
2023-01-01 18:20:00+00:00,0.68083,0.739552,,


In [15]:
def group2_masks(df):
    """Build the group-2 time-of-day masks for ``df``'s index.

    Every mask comes from ``time_between`` applied to ``df.index`` with a pair
    of bounds taken from ``GROUP2_RULES``.

    Returns
    -------
    dict
        ``"break_time"``  break_start .. break_end (positions must not be held)
        ``"no_trade"``    break_end .. no_trade_after_break_end
        ``"force_exit"``  force_exit .. break_start (close positions before the break)
    """
    stamps = df.index
    rules = GROUP2_RULES

    return {
        "break_time": time_between(stamps, rules["break_start"], rules["break_end"]),
        "no_trade": time_between(stamps, rules["break_end"], rules["no_trade_after_break_end"]),
        "force_exit": time_between(stamps, rules["force_exit"], rules["break_start"]),
    }

In [16]:
m2 = group2_masks(df2)

# Number of rows flagged by each group-2 mask
for name, flags in m2.items():
    print("G2", name, flags.sum())

G2 break_time 696
G2 no_trade 187
G2 force_exit 182


Apply the masks to the first data file of each group to test the mask functions

In [17]:
m1 = group1_masks(df)

# Tradable rows: inside the session and outside the no-trade window
df1_trade = df.loc[m1["in_session"] & ~m1["no_trade"]].copy()

# Calculation rows: tradable rows minus the early/late drop windows
m1_trade = group1_masks(df1_trade)
df1_calc = df1_trade.loc[~m1_trade["drop_calc"]].copy()

for label, frame in (
    ("G1 original:", df),
    ("G1 trade   :", df1_trade),
    ("G1 calc    :", df1_calc),
):
    print(label, frame.shape)

G1 original: (25109, 2)
G1 trade   : (23484, 2)
G1 calc    : (22845, 2)


In [18]:
m2 = group2_masks(df2)

# Keep a row only if it is in neither the break nor the post-break no-trade window
excluded = m2["break_time"] | m2["no_trade"]
df2_trade = df2.loc[~excluded].copy()

for label, frame in (("G2 original:", df2), ("G2 trade   :", df2_trade)):
    print(label, frame.shape)

G2 original: (18398, 4)
G2 trade   : (17573, 4)


In [19]:
# Count rows of the filtered frame that still fall inside the break window
in_break = time_between(
    df2_trade.index,
    GROUP2_RULES["break_start"],
    GROUP2_RULES["break_end"],
)
check_break = len(df2_trade[in_break])

print("Rows during break in df2_trade:", check_break)

Rows during break in df2_trade: 0


Apply the masks to all data files and create the processed versions

In [20]:
from pathlib import Path

# Raw input folders, one per asset group
RAW_G1 = Path("../data_raw/group1")
RAW_G2 = Path("../data_raw/group2")

# Output folders for the filtered frames
OUT_G1_TRADE = Path("../data_processed/group1/trade")
OUT_G1_CALC  = Path("../data_processed/group1/calc")
OUT_G2_TRADE = Path("../data_processed/group2/trade")

# Parquet files found in each raw folder, in sorted order
g1_files = sorted(RAW_G1.glob("*.parquet"))
g2_files = sorted(RAW_G2.glob("*.parquet"))

for label, found in (("G1 files:", g1_files), ("G2 files:", g2_files)):
    print(label, len(found))

# First file name per group, or None when the folder has no parquet files
for label, found in (("First G1:", g1_files), ("First G2:", g2_files)):
    print(label, found[0].name if found else None)

G1 files: 7
G2 files: 7
First G1: data1_2023_Q1.parquet
First G2: data2_2023_Q1.parquet


In [21]:
# Filter every group-1 file and write the "trade" and "calc" versions.
# g1_log collects (file name, raw rows, trade rows, calc rows) per file.
g1_log = []

# to_parquet does not create missing parent folders, so make sure they exist.
OUT_G1_TRADE.mkdir(parents=True, exist_ok=True)
OUT_G1_CALC.mkdir(parents=True, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `df`; re-running earlier cells
# afterwards will see the last file processed here.
for p in g1_files:
    df = load_data(str(p))
    m = group1_masks(df)

    # Tradable rows: inside the session and outside the no-trade window
    df_trade = df[m["in_session"] & (~m["no_trade"])].copy()

    # Masks are rebuilt because df_trade has a shorter index than df
    m_trade = group1_masks(df_trade)
    df_calc = df_trade[(~m_trade["drop_calc"])].copy()

    df_trade.to_parquet(OUT_G1_TRADE / p.name)
    df_calc.to_parquet(OUT_G1_CALC / p.name)

    g1_log.append((p.name, df.shape[0], df_trade.shape[0], df_calc.shape[0]))

print("Saved G1 trade+calc for files:", len(g1_log))
g1_log[:3]

Saved G1 trade+calc for files: 7


[('data1_2023_Q1.parquet', 25109, 23484, 22845),
 ('data1_2023_Q3.parquet', 25148, 23553, 22913),
 ('data1_2023_Q4.parquet', 24952, 23353, 22713)]

In [22]:
# Filter every group-2 file and write the "trade" version.
# g2_log collects (file name, raw rows, trade rows) per file.
g2_log = []

# to_parquet does not create missing parent folders, so make sure it exists.
OUT_G2_TRADE.mkdir(parents=True, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `df`; re-running earlier cells
# afterwards will see the last file processed here.
for p in g2_files:
    df = load_data(str(p))
    m = group2_masks(df)

    # Drop rows in the break window and in the post-break no-trade window
    df_trade = df[(~m["break_time"]) & (~m["no_trade"])].copy()
    df_trade.to_parquet(OUT_G2_TRADE / p.name)

    g2_log.append((p.name, df.shape[0], df_trade.shape[0]))

print("Saved G2 trade for files:", len(g2_log))
g2_log[:3]

Saved G2 trade for files: 7


[('data2_2023_Q1.parquet', 18398, 17573),
 ('data2_2023_Q3.parquet', 18539, 17650),
 ('data2_2023_Q4.parquet', 18517, 17558)]

In [23]:
from pathlib import Path

# Point the group-1 input at the out-of-sample folder.
# NOTE: this rebinds RAW_G1 and g1_files, replacing the in-sample values.
RAW_G1 = Path("../data_raw/group1/outofsample_data")

OUT_G1_TRADE = Path("../data_processed/group1/trade")

g1_files = list(RAW_G1.glob("*.parquet"))
g1_files.sort()

print("G1 files:", len(g1_files))
if g1_files:
    print("First G1:", g1_files[0].name)
else:
    print("First G1:", None)

G1 files: 5
First G1: data1_2023_Q2.parquet


In [24]:
# Filter every file currently in g1_files and write the "trade" and "calc" versions.
# g1_log collects (file name, raw rows, trade rows, calc rows) per file.
g1_log = []

# to_parquet does not create missing parent folders, so make sure they exist.
OUT_G1_TRADE.mkdir(parents=True, exist_ok=True)
OUT_G1_CALC.mkdir(parents=True, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `df`; re-running earlier cells
# afterwards will see the last file processed here.
for p in g1_files:
    df = load_data(str(p))
    m = group1_masks(df)

    # Tradable rows: inside the session and outside the no-trade window
    df_trade = df[m["in_session"] & (~m["no_trade"])].copy()

    # Masks are rebuilt because df_trade has a shorter index than df
    m_trade = group1_masks(df_trade)
    df_calc = df_trade[(~m_trade["drop_calc"])].copy()

    df_trade.to_parquet(OUT_G1_TRADE / p.name)
    df_calc.to_parquet(OUT_G1_CALC / p.name)

    g1_log.append((p.name, df.shape[0], df_trade.shape[0], df_calc.shape[0]))

# Both the trade and the calc frames are written above, so say so.
print("Saved G1 trade+calc for files:", len(g1_log))
g1_log[:3]

Saved G1 trade for files: 5


[('data1_2023_Q2.parquet', 24448, 22898, 22280),
 ('data1_2024_Q1.parquet', 24924, 23324, 22685),
 ('data1_2024_Q3.parquet', 25727, 24077, 23418)]

In [28]:
# Point the group-2 input at the out-of-sample folder.
# NOTE: this rebinds RAW_G2 and g2_files, replacing the in-sample values.
RAW_G2 = Path("../data_raw/group2/outofsample_data")

# This must be the group-2 output name: assigning the group-2 folder to
# OUT_G1_TRADE would redirect any later group-1 write into group2/trade.
OUT_G2_TRADE = Path("../data_processed/group2/trade")

g2_files = sorted(RAW_G2.glob("*.parquet"))


print("G2 files:", len(g2_files))
print("First G2:", g2_files[0].name if g2_files else None)

G2 files: 5
First G2: data2_2023_Q2.parquet


In [30]:
# Filter every file currently in g2_files and write the "trade" version.
# g2_log collects (file name, raw rows, trade rows) per file.
g2_log = []

# to_parquet does not create missing parent folders, so make sure it exists.
OUT_G2_TRADE.mkdir(parents=True, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `df`; re-running earlier cells
# afterwards will see the last file processed here.
for p in g2_files:
    df = load_data(str(p))
    m = group2_masks(df)

    # Drop rows in the break window and in the post-break no-trade window
    df_trade = df[(~m["break_time"]) & (~m["no_trade"])].copy()
    df_trade.to_parquet(OUT_G2_TRADE / p.name)

    g2_log.append((p.name, df.shape[0], df_trade.shape[0]))

print("Saved G2 trade for files:", len(g2_log))
g2_log[:3]

Saved G2 trade for files: 5


[('data2_2023_Q2.parquet', 17911, 17276),
 ('data2_2024_Q1.parquet', 18496, 17522),
 ('data2_2024_Q3.parquet', 19001, 18013)]