In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import pytz
from datetime import datetime, time, timedelta

In [2]:
print("Loading and preparing data...")
# --- Load and Pre-process Data (Similar to feature_engineering.py) ---
df_raw = pd.read_parquet('../data/raw/xauusd_h1_2018_present.parquet')
df_raw.set_index('time', inplace=True)
df = df_raw.tz_localize('UTC')

# We'll use more features for the DL model to learn from
features_to_use = ['open', 'high', 'low', 'close', 'tick_volume']
data = df[features_to_use]
# data.head(10)

# Use StandardScaler for DL models. It centers the data around zero.
scaler = StandardScaler()
scaled_data_df = pd.DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)
# scaled_data_df.head(10)
joblib.dump(scaler, '../models/hyp_a_xauusd_h1_2018_pytorch_session_scaler.joblib')
print("Data scaled and scaler saved.")

Loading and preparing data...
Data scaled and scaler saved.


In [3]:
scaled_data_df.tail(200)

Unnamed: 0_level_0,open,high,low,close,tick_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-09-11 16:00:00+00:00,3.172117,3.176248,3.161831,3.173310,0.104919
2025-09-11 17:00:00+00:00,3.173752,3.177861,3.176290,3.171657,0.021388
2025-09-11 18:00:00+00:00,3.172448,3.179878,3.182471,3.178708,-0.124383
2025-09-11 19:00:00+00:00,3.178930,3.185011,3.188964,3.184161,-0.156672
2025-09-11 20:00:00+00:00,3.184623,3.175350,3.186683,3.182105,-0.250967
...,...,...,...,...,...
2025-09-24 04:00:00+00:00,3.421847,3.423341,3.413243,3.405236,-0.005052
2025-09-24 05:00:00+00:00,3.405540,3.400335,3.405388,3.406173,-0.158076
2025-09-24 06:00:00+00:00,3.406678,3.413699,3.408387,3.421523,-0.208382
2025-09-24 07:00:00+00:00,3.421939,3.416833,3.428383,3.426628,-0.390420


In [4]:
df.tail(200)

Unnamed: 0_level_0,open,high,low,close,tick_volume,spread,real_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-09-11 16:00:00+00:00,3629.38,3637.06,3618.34,3630.27,5273,5,0
2025-09-11 17:00:00+00:00,3630.27,3637.94,3626.20,3629.37,4916,5,0
2025-09-11 18:00:00+00:00,3629.56,3639.04,3629.56,3633.21,4293,11,0
2025-09-11 19:00:00+00:00,3633.09,3641.84,3633.09,3636.18,4155,5,0
2025-09-11 20:00:00+00:00,3636.19,3636.57,3631.85,3635.06,3752,5,0
...,...,...,...,...,...,...,...
2025-09-24 04:00:00+00:00,3765.37,3771.85,3755.01,3756.58,4803,5,0
2025-09-24 05:00:00+00:00,3756.49,3759.30,3750.74,3757.09,4149,12,0
2025-09-24 06:00:00+00:00,3757.11,3766.59,3752.37,3765.45,3934,14,0
2025-09-24 07:00:00+00:00,3765.42,3768.30,3763.24,3768.23,3156,6,0


In [13]:
# --- Create Session-based Sequences ---
print("Creating session-based sequences...")
london_tz = pytz.timezone('Europe/London')
X_sequences = []
y_class_targets = []
y_reg_targets = []

# DEFINE OUR FIXED SEQUENCE LENGTH
# From observation, a full session is about 10 hours.
SEQUENCE_LENGTH = 10
NUM_FEATURES = len(features_to_use)

# This loop is very similar to your feature_engineering script
for day in scaled_data_df.index.normalize().unique():
    try:
        # --- Define DYNAMIC London Session ---
        london_open_local = london_tz.localize(datetime.combine(day, time(8, 0)))
        london_close_local = london_tz.localize(datetime.combine(day, time(17, 0)))
        london_open_utc = london_open_local.astimezone(pytz.utc)
        london_close_utc = london_close_local.astimezone(pytz.utc)

        # --- Define Asian Session ---
        previous_day = day - timedelta(days=1)
        asia_part1 = scaled_data_df.loc[str(previous_day.date())].between_time('22:00', '23:59')
        asia_part2 = scaled_data_df.loc[str(day.date())].between_time('00:00', '07:59')
        asia_session_df = pd.concat([asia_part1, asia_part2])
        # print("\nasia ===== ",len(asia_session_df))
        # print("==================ASIA============================\n", asia_session_df)

        london_session_df = df[(df.index >= london_open_utc) & (df.index < london_close_utc)]
        # print("\nlondon ===== ",len(asia_session_df))
        # print("==================LONDON============================\n", london_session_df)

        
        # --- PADDING LOGIC ---
        # Instead of a rigid check, we handle all non-empty sessions
        if not asia_session_df.empty and not london_session_df.empty:
            
            sequence_data = asia_session_df.values
            current_length = len(sequence_data)
            
            # Create a "canvas" of zeros with our desired final shape
            padded_sequence = np.zeros((SEQUENCE_LENGTH, NUM_FEATURES))
            
            # Copy the actual data into the END of the canvas
            # This is called "pre-padding" and is the standard method.
            padded_sequence[-current_length:] = sequence_data
            
            # Now, `padded_sequence` is guaranteed to be shape (10, 5)
            X_sequences.append(padded_sequence)
            
            # --- Calculate Targets (this logic is unchanged) ---
            london_open = london_session_df['open'].iloc[0]
            london_close = london_session_df['close'].iloc[-1]
            
            london_direction = 1 if london_close > london_open else 0
            y_class_targets.append(london_direction)
            
            london_return = (london_close - london_open) / london_open
            y_reg_targets.append(london_return)

    except Exception as e:
        continue

# print(X_sequences)
# print("==================================")
# print(y_class_targets)
# print("==================================")
# print(y_reg_targets)


# Convert lists to numpy arrays
X = np.array(X_sequences)
y_class = np.array(y_class_targets)
y_reg = np.array(y_reg_targets)

print(f"Session sequences created successfully.")
print(f"Shape of X: {X.shape}") # Should be (num_days, 10, 5)
print(f"Shape of y_class: {y_class.shape}")
print(f"Shape of y_reg: {y_reg.shape}")

# --- Save the new sequence data ---
np.save('../data/processed/hyp_a_xauusd_h1_2018_session_sequences_X_padded.npy', X)
np.save('../data/processed/hyp_a_xauusd_h1_2018_session_targets_y_class.npy', y_class)
np.save('../data/processed/hyp_a_xauusd_h1_2018_session_targets_y_reg.npy', y_reg)

print("Padded session sequence data saved.")

Creating session-based sequences...
Session sequences created successfully.
Shape of X: (1995, 10, 5)
Shape of y_class: (1995,)
Shape of y_reg: (1995,)
Padded session sequence data saved.
