# Import Libraries

In [1]:
import pandas as pd
import glob
import os
import re
from datetime import time
import numpy as np

In [2]:
# ---------------------------------------------------------
# 1. Configuration: Project definitions
# ---------------------------------------------------------

DATA_DIR = '../../data'

# List of quarters designated as "In-Sample" for strategy selection
IN_SAMPLE_QUARTERS = [
    "2023_Q1", "2023_Q3", "2023_Q4",
    "2024_Q2", "2024_Q4",
    "2025_Q1", "2025_Q2"
]

In [3]:
# ---------------------------------------------------------
# 2. Data Loading Function
# ---------------------------------------------------------

def load_project_data(data_dir):
    """
    Loads Parquet files, sets 'datetime' as index, and removes timezone info (+00:00).
    """
    file_paths = glob.glob(os.path.join(data_dir, "*.parquet"))

    data_store = {
        "group1": {},
        "group2": {}
    }

    print(f"Found {len(file_paths)} files. Loading and processing...")

    for path in file_paths:
        filename = os.path.basename(path)
        match = re.match(r"(data[12])_(\d{4})_(Q\d)\.parquet", filename)

        if match:
            raw_group = match.group(1)
            group_id = "group" + raw_group[-1]
            year = match.group(2)
            quarter = match.group(3)
            quarter_id = f"{year}_{quarter}"

            try:
                # 1. Load Data
                df = pd.read_parquet(path)

                # 2. Set 'datetime' column as Index
                if 'datetime' in df.columns:
                    df = df.set_index('datetime')

                # 3. Ensure Index is Datetime Object
                df.index = pd.to_datetime(df.index)

                # 4. Remove Timezone Information (+00:00)
                # tz_localize(None) strips the timezone, keeping the local time
                if df.index.tz is not None:
                    df.index = df.index.tz_localize(None)

                # 5. Sort by Time
                df = df.sort_index()

                # 6. Add Metadata
                df['Quarter_ID'] = quarter_id
                df['Is_In_Sample'] = quarter_id in IN_SAMPLE_QUARTERS

                # Store
                data_store[group_id][quarter_id] = df

                status = "In-Sample" if df['Is_In_Sample'].iloc[0] else "Out-of-Sample"
                print(f"Loaded {filename} -> {group_id} [{status}] | Timezone Removed.")

            except Exception as e:
                print(f"Error loading {filename}: {e}")

    return data_store

In [4]:
# ---------------------------------------------------------
# 3. Helper Function to Combine Data
# ---------------------------------------------------------

def combine_quarters(data_store, group_id, only_in_sample=True):
    if group_id not in data_store:
        return pd.DataFrame()

    quarters_dict = data_store[group_id]
    df_list = []

    for q_id, df in quarters_dict.items():
        if only_in_sample and not df['Is_In_Sample'].iloc[0]:
            continue
        df_list.append(df)

    if df_list:
        combined_df = pd.concat(df_list)
        combined_df = combined_df.sort_index()
        return combined_df
    else:
        return pd.DataFrame()

In [5]:
# ---------------------------------------------------------
# 4. Execution Example
# ---------------------------------------------------------

if __name__ == "__main__":
    raw_data = load_project_data(DATA_DIR)

    # Create combined training set
    df_g1_train = combine_quarters(raw_data, "group1", only_in_sample=True)

    print("-" * 40)
    if not df_g1_train.empty:
        print("Verifying Index (Should be simple Datetime without +00:00):")
        print(df_g1_train.index)

        print("\nHead Sample:")
        print(df_g1_train.head(5))
        print(df_g1_train.tail(5))

if __name__ == "__main__":
    # 1. Load all data (if not already loaded)
    # raw_data = load_project_data(DATA_DIR)

    # ---------------------------------------------------------
    # Create Group 2 Data (In-Sample)
    # ---------------------------------------------------------
    print("Processing Group 2 (CAD, AUD, XAU, XAG)...")

    df_g2_train = combine_quarters(raw_data, "group2", only_in_sample=True)

    print("-" * 40)
    if not df_g2_train.empty:
        print("Group 2 Shape:")
        print(df_g2_train.shape)

        print("\nGroup 2 Columns (Should be CAD, AUD, XAU, XAG, etc.):")
        print(df_g2_train.columns)

        print("\nGroup 2 Head Sample (5-min freq, No Timezone):")
        print(df_g2_train.head(3))

        print("\nVerifying Index Type:")
        print(df_g2_train.index.dtype)
    else:
        print("Group 2 data is empty. Please check the file names (data2_...).")

Found 14 files. Loading and processing...
Loaded data2_2023_Q4.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data1_2023_Q3.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data2_2025_Q1.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data1_2023_Q1.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data2_2025_Q2.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data2_2024_Q4.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data1_2024_Q2.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data2_2023_Q3.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data1_2025_Q1.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data1_2023_Q4.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data1_2024_Q4.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data2_2024_Q2.parquet -> group2 [In-Sample] | Timezone Removed.
Loaded data1_2025_Q2.parquet -> group1 [In-Sample] | Timezone Removed.
Loaded data2_2023_Q1.parquet -> gro

In [6]:
df_g1_train.info()
df_g2_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175812 entries, 2023-01-02 09:31:00 to 2025-06-30 16:00:00
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   NQ            175423 non-null  float64
 1   SP            175423 non-null  float64
 2   Quarter_ID    175812 non-null  object 
 3   Is_In_Sample  175812 non-null  bool   
dtypes: bool(1), float64(2), object(1)
memory usage: 5.5+ MB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 129770 entries, 2023-01-01 17:35:00 to 2025-06-30 23:55:00
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AUD           124578 non-null  float64
 1   CAD           124577 non-null  float64
 2   XAG           124320 non-null  float64
 3   XAU           124320 non-null  float64
 4   Quarter_ID    129770 non-null  object 
 5   Is_In_Sample  129770 non-null  bool   
dtypes: bool(1), float64(4), object

In [7]:
# ---------------------------------------------------------
# Configuration: Time Rules
# ---------------------------------------------------------

# Group 1 (1-min) Rules
G1_CALC_EXCLUDE_START = time(9, 31)
G1_CALC_EXCLUDE_END   = time(9, 40)  # First 10 mins
G1_TRADE_START        = time(9, 56)  # Start trading AFTER 9:55
G1_EXIT_TIME          = time(15, 40) # Exit 20 mins before session end
G1_SESSION_END_START  = time(15, 51)
G1_SESSION_END_FINISH = time(16, 00) # Last 10 mins

# Group 2 (5-min) Rules
G2_EXIT_TIME          = time(16, 50) # 10 mins before break
G2_BREAK_START        = time(17, 00)
G2_BREAK_END          = time(18, 00)
G2_TRADE_RESTART      = time(18, 10) # 10 mins after break

In [8]:
# ---------------------------------------------------------
# Processing Function
# ---------------------------------------------------------
def apply_common_assumptions(df, group_type):
    if df.empty: return df

    # Defaults
    df['can_trade'] = False
    df['force_exit'] = False
    t = df.index.time

    if group_type == "group2":
        # === CRITICAL FIX: ROW REMOVAL ===
        # Remove 16:50 <= t < 18:10
        # Keep: t < 16:50 OR t >= 18:10
        keep_mask = (t < time(16, 50)) | (t >= time(18, 10))
        df = df.loc[keep_mask].copy()

        # Re-index time after removal
        t = df.index.time

        # Set flags
        df['can_trade'] = True

        # Force exit at the last bar before removal (16:45)
        df.loc[t == time(16, 45), 'force_exit'] = True

    return df

In [9]:
# ==========================================
# 2. Execution & Verification (Run this part!)
# ==========================================

# 1. Reload original data (to be sure)
# df_g2_train = pd.read_pickle('df_g2_processed.pkl') # If loading from file
# OR use your existing variable if it's the raw one

print(f"Rows BEFORE cleanup: {len(df_g2_train)}")

# 2. APPLY THE FIX and ASSIGN to a new variable
df_g2_clean = apply_common_assumptions(df_g2_train.copy(), "group2")

print(f"Rows AFTER cleanup:  {len(df_g2_clean)}")

# 3. VERIFY
print("\n" + "="*40)
print("FINAL CHECK")
print("="*40)

# Check A: Bar count (Target: 272)
daily_counts = df_g2_clean.groupby(df_g2_clean.index.date).size()
mode_count = daily_counts.mode()[0]
print(f"Bars per day (Mode): {mode_count}  <-- Should be 272")

# Check B: Gap Check
# Is there any data at 16:50 or 18:05?
gap_check = df_g2_clean[
    (df_g2_clean.index.time == time(16, 50)) |
    (df_g2_clean.index.time == time(18, 5))
]

if gap_check.empty:
    print("Gap Check: ✅ OK (Gap rows removed)")
else:
    print("Gap Check: ❌ FAILED (Rows still exist!)")
    print(gap_check.head())

# Check C: Start Time
# The first bar after the gap should be 18:10
last_session_start = df_g2_clean.between_time('18:00', '18:20').index.time
if len(last_session_start) > 0:
    print(f"Sample time around restart: {last_session_start[0]} <-- Should be 18:10")

Rows BEFORE cleanup: 129770
Rows AFTER cleanup:  122791

FINAL CHECK
Bars per day (Mode): 272  <-- Should be 272
Gap Check: ✅ OK (Gap rows removed)
Sample time around restart: 18:15:00 <-- Should be 18:10


In [10]:
# ---------------------------------------------------------
# Execution
# ---------------------------------------------------------

if __name__ == "__main__":
    # Assume df_g1_train and df_g2_train are already loaded from previous step

    print("Applying Common Assumptions to Group 1...")
    df_g1_clean = apply_common_assumptions(df_g1_train.copy(), "group1")

    print("Applying Common Assumptions to Group 2...")
    df_g2_clean = apply_common_assumptions(df_g2_train.copy(), "group2")

    # --- Verification ---
    print("\n" + "="*40)
    print("VERIFICATION GROUP 1 (SP/NQ)")
    print("="*40)

    # Check Morning NaN Mask (09:31 - 09:40)
    test_time_nan = time(9, 35)
    row_nan = df_g1_clean[df_g1_clean.index.time == test_time_nan]
    if not row_nan.empty:
        print(f"Time {test_time_nan} (Should be NaN):")
        print(row_nan[['SP', 'NQ']].head(1))

    # Check Trading Start (09:56)
    test_time_trade = time(9, 56)
    row_trade = df_g1_clean[df_g1_clean.index.time == test_time_trade]
    if not row_trade.empty:
        print(f"\nTime {test_time_trade} (can_trade should be True):")
        print(row_trade[['can_trade']].head(1))

    # Check Exit (15:40)
    test_time_exit = time(15, 40)
    row_exit = df_g1_clean[df_g1_clean.index.time == test_time_exit]
    if not row_exit.empty:
        print(f"\nTime {test_time_exit} (force_exit should be True):")
        print(row_exit[['force_exit', 'can_trade']].head(1))

    print("\n" + "="*40)
    print("VERIFICATION GROUP 2 (CAD/AUD...)")
    print("="*40)

    # Check Post-Break Block (18:05 - Should be False)
    test_g2_block = time(18, 5) # 5 min freq
    row_g2_block = df_g2_clean[df_g2_clean.index.time == test_g2_block]
    if not row_g2_block.empty:
        print(f"Time {test_g2_block} (can_trade should be False):")
        print(row_g2_block[['can_trade']].head(1))

    # Check Restart (18:10 - Should be True)
    test_g2_start = time(18, 10)
    row_g2_start = df_g2_clean[df_g2_clean.index.time == test_g2_start]
    if not row_g2_start.empty:
        print(f"Time {test_g2_start} (can_trade should be True):")
        print(row_g2_start[['can_trade']].head(1))

Applying Common Assumptions to Group 1...
Applying Common Assumptions to Group 2...

VERIFICATION GROUP 1 (SP/NQ)
Time 09:35:00 (Should be NaN):
                     SP  NQ
datetime                   
2023-01-02 09:35:00 NaN NaN

Time 09:56:00 (can_trade should be True):
                     can_trade
datetime                      
2023-01-02 09:56:00      False

Time 15:40:00 (force_exit should be True):
                     force_exit  can_trade
datetime                                  
2023-01-02 15:40:00       False      False

VERIFICATION GROUP 2 (CAD/AUD...)
Time 18:10:00 (can_trade should be True):
                     can_trade
datetime                      
2023-01-02 18:10:00       True


In [11]:
# ==========================================
# Function to Add Returns and Volatility
# ==========================================
def add_return_and_volatility(df, assets, interval_min, hours_per_day, vol_window=20):
    """
    Adds Log Returns and Annualized Volatility columns to the DataFrame.

    Parameters:
    - df: Input DataFrame
    - assets: List of asset column names (e.g., ['SP', 'NQ'])
    - interval_min: Data frequency in minutes (1 or 5)
    - hours_per_day: Trading hours per day for annualization
    - vol_window: Rolling window size for volatility calculation (default: 20 bars)
    """
    df = df.copy()

    # 1. Calculate Annualization Factor
    # Calculate how many bars exist in one trading day
    bars_per_day = (hours_per_day * 60) / interval_min

    # Standard assumption: 252 trading days per year
    # Annualization Factor = sqrt(Daily Bars * 252)
    annual_factor = np.sqrt(bars_per_day * 252)

    print(f"Processing assets: {assets}")
    print(f"  - Frequency: {interval_min} min")
    print(f"  - Trading Hours/Day: {hours_per_day}")
    print(f"  - Annualization Factor: {annual_factor:.2f}")

    # 2. Loop through assets to create new columns
    for asset in assets:
        if asset in df.columns:
            # Log Return: ln(Current / Previous)
            df[f'{asset}_rtn'] = np.log(df[asset] / df[asset].shift(1))

            # Annualized Volatility: Rolling Std Dev * Annualization Factor
            df[f'{asset}_vol'] = df[f'{asset}_rtn'].rolling(window=vol_window).std() * annual_factor
        else:
            print(f"  [Warning] Asset '{asset}' not found in DataFrame.")

    return df

In [12]:
# ==========================================
# Execution
# ==========================================

# --- Group 1 Processing ---
# Assets: SP, NQ
# Frequency: 1 min
# Trading Hours: 9:30 - 16:00 CET = 6.5 hours
df_g1_train = add_return_and_volatility(
    df_g1_train,
    assets=['SP', 'NQ'],
    interval_min=1,
    hours_per_day=6.5
)

print("-" * 30)

# --- Group 2 Processing ---
# Assets: AUD, CAD, XAG, XAU
# Frequency: 5 min
# Trading Hours: Almost 24h with 1h break = 23 hours
# (Note: Using list comprehension to ensure we only process columns that exist in the dataframe)
g2_assets = ['AUD', 'CAD', 'XAG', 'XAU']
existing_g2_assets = [col for col in g2_assets if col in df_g2_train.columns]

df_g2_train = add_return_and_volatility(
    df_g2_train,
    assets=existing_g2_assets,
    interval_min=5,
    hours_per_day=23
)

Processing assets: ['SP', 'NQ']
  - Frequency: 1 min
  - Trading Hours/Day: 6.5
  - Annualization Factor: 313.50
------------------------------
Processing assets: ['AUD', 'CAD', 'XAG', 'XAU']
  - Frequency: 5 min
  - Trading Hours/Day: 23
  - Annualization Factor: 263.73


In [13]:
# ==========================================
# Verification
# ==========================================
print("\n--- Group 1: SP Data Check (Last 5 rows) ---")
print(df_g1_train[['SP', 'SP_rtn', 'SP_vol']].tail())

print("\n--- Group 2: First Asset Data Check (Last 5 rows) ---")
if existing_g2_assets:
    first_asset = existing_g2_assets[0]
    print(df_g2_train[[first_asset, f'{first_asset}_rtn', f'{first_asset}_vol']].tail())


--- Group 1: SP Data Check (Last 5 rows) ---
                           SP    SP_rtn    SP_vol
datetime                                         
2025-06-30 15:56:00  6208.279 -0.000739  0.109098
2025-06-30 15:57:00  6206.774 -0.000242  0.108644
2025-06-30 15:58:00  6211.059  0.000690  0.112746
2025-06-30 15:59:00  6208.543 -0.000405  0.118266
2025-06-30 16:00:00  6200.242 -0.001338  0.154946

--- Group 2: First Asset Data Check (Last 5 rows) ---
                         AUD   AUD_rtn   AUD_vol
datetime                                        
2025-06-30 23:35:00  0.65672 -0.000152  0.068006
2025-06-30 23:40:00  0.65691  0.000289  0.068987
2025-06-30 23:45:00  0.65681 -0.000152  0.069758
2025-06-30 23:50:00  0.65696  0.000228  0.070068
2025-06-30 23:55:00  0.65679 -0.000259  0.072833


In [14]:
# Save processed dataframes to pickle files for the next step (EDA)
# This preserves column types and indices perfectly.
df_g1_clean.to_pickle('df_g1_processed.pkl')
df_g2_clean.to_pickle('df_g2_processed.pkl')

print("Data saved successfully to .pkl files.")

Data saved successfully to .pkl files.
