![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [None]:
# Cell 1: QuantBook Initialization 
qb = QuantBook() # Assuming QuantBook is initialized in the environment
print("QuantBook Initialized.")

QuantBook Initialized.


In [14]:
# Cell 2: Imports
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
# Updated Imports: Added Dropout
from tensorflow.keras.layers import Dense, SimpleRNN, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
# Re-adding QuantBook import explicitly for clarity within the script if run standalone
from QuantConnect.Research import QuantBook
print("Imports loaded.")

Imports loaded.


In [15]:
# Cell 3: Function Definitions (Modified get_data_for_period)

# MODIFIED: Parameterized get_training_data (renamed for clarity)
#           Added robust DataFrame processing for columns/index
def get_data_for_period(start_date, end_date):
    """
    Uses QuantConnect's QuantBook to fetch historical SPY data (Daily)
    for the SPECIFIED period.
    Returns a DataFrame with 'open', 'high', 'low', 'close', 'volume'.
    """
    # Ensure QuantBook is available and initialized
    if 'QuantBook' not in globals():
        # Attempt initialization if not found
        try:
            from QuantConnect import QuantBook
            global qb
            qb = QuantBook()
            print("QuantBook initialized within get_data_for_period.")
        except ImportError:
             raise EnvironmentError("QuantBook required but not initialized and cannot import.")

    spy_symbol = qb.AddEquity("SPY", Resolution.Daily).Symbol

    print(f"Fetching history from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        history = qb.History(
            spy_symbol,
            start=start_date, # Use passed start_date
            end=end_date,     # Use passed end_date
            resolution=Resolution.Daily
        )
    except Exception as e:
        print(f"Error during qb.History call: {e}")
        return pd.DataFrame() # Return empty DataFrame on error

    if history.empty:
        print(f"Warning: No history data returned for SPY between {start_date} and {end_date}.")
        return pd.DataFrame() # Return empty DataFrame explicitly

    # --- Robust DataFrame Processing ---
    # Check if index is MultiIndex (Symbol, Time)
    if isinstance(history.index, pd.MultiIndex):
        # If MultiIndex, drop the 'symbol' level to make time the primary index
        df = history.droplevel(0)
    else:
        # If single index (Time), just use the history directly
        df = history

    # Check if COLUMNS are MultiIndex (less common for single symbol, but possible)
    if isinstance(df.columns, pd.MultiIndex):
         # If columns are MultiIndex, assume top level is symbol, keep second level
         df.columns = df.columns.get_level_values(1)

    # Ensure standard column names exist before selecting
    required_cols = ['open', 'high', 'low', 'close', 'volume']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: DataFrame missing required columns. Found: {df.columns.tolist()}")
        return pd.DataFrame()

    # Select only the required columns
    df = df[required_cols].copy()

    # Ensure index is DatetimeIndex or similar, convert to date if needed
    if isinstance(df.index, pd.DatetimeIndex):
        df.index = df.index.date
    elif isinstance(df.index, pd.Index) and pd.api.types.is_datetime64_any_dtype(df.index):
         df.index = pd.to_datetime(df.index).date
    # If index is already date objects, do nothing

    print(f"Processed {len(df)} rows.")
    return df

# MODIFIED: extract_and_scale_features now fits scaler ONLY on current window data
def extract_and_scale_features(df: pd.DataFrame, lookback: int):
    """
    Extracts features, target, creates sequences, and scales features.
    Fits a NEW scaler on the provided DataFrame (current training window).
    Returns features array, targets array, and the FITTED scaler.
    """
    if df.empty or len(df) <= lookback + 1:
         print(f"Insufficient data for feature extraction (requires > {lookback + 1} days, got {len(df)}).")
         return None, None, None # Return None for X, y, scaler

    features_list = []
    targets = []

    # Calculate raw features first
    df = df.copy() # Work on a copy to avoid modifying original slice
    df['price_changes'] = df['close'] - df['open']
    df['overnight_gaps'] = df['open'] - df['close'].shift(1)

    # Target: next day's price change (close - open)
    df['next_open'] = df['open'].shift(-1)
    df['next_close'] = df['close'].shift(-1)
    df['target'] = df['next_close'] - df['next_open']

    # Drop NaNs created by shifts/calculations
    df.dropna(inplace=True)

    if len(df) < lookback:
        print(f"Insufficient data after NaN drop for sequence creation (requires {lookback} days, got {len(df)}).")
        return None, None, None

    feature_cols = ['price_changes', 'overnight_gaps']
    # Ensure feature columns exist after calculations and dropna
    if not all(col in df.columns for col in feature_cols):
        print(f"Error: Missing feature columns after calculations. Found: {df.columns.tolist()}")
        return None, None, None

    features = df[feature_cols].values
    target_values = df['target'].values

    # --- Scale the features for THIS window ---
    scaler = StandardScaler()
    print("Fitting new scaler for the current window.")
    scaled_features = scaler.fit_transform(features) # Fit and transform ONLY on current window

    # Create sequences
    X, y = [], []
    for i in range(len(scaled_features) - lookback):
        X.append(scaled_features[i:(i + lookback)])
        y.append(target_values[i + lookback])

    if not X:
        print("Could not generate feature sequences.")
        return None, None, scaler # Return scaler even if no sequences

    return np.array(X), np.array(y), scaler


# --- Assume build_rnn_model and extract_numpy_weights are defined as before ---
def build_rnn_model(input_shape, hidden_size=16, dropout_rate=0.25, l2_reg=0.01):
    # (Implementation as provided previously)
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.SimpleRNN(hidden_size, activation='tanh',
                  kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
                  recurrent_regularizer=tf.keras.regularizers.l2(l2_reg),
                  bias_regularizer=tf.keras.regularizers.l2(l2_reg),
                  name='simple_rnn'),
        tf.keras.layers.Dropout(dropout_rate, name='dropout'),
        tf.keras.layers.Dense(1, name='dense_output')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
    return model

def extract_numpy_weights(keras_model):
     # (Implementation as provided previously)
    weights = {}
    try:
        rnn_layer = keras_model.get_layer('simple_rnn')
        dense_layer = keras_model.get_layer('dense_output')
        rnn_weights = rnn_layer.get_weights()
        dense_weights = dense_layer.get_weights()
        weights['Wxh'] = rnn_weights[0]
        weights['Whh'] = rnn_weights[1]
        weights['bh'] = rnn_weights[2]
        weights['Why'] = dense_weights[0]
        weights['by'] = dense_weights[1]
        print("Weights extracted successfully.")
    except Exception as e:
        print(f"Error extracting weights: {e}")
        return None
    return weights

# Cell 4: Walk-Forward Training Execution Loop

# --- Walk-Forward Configuration ---
wf_config = {
    "initial_train_start_date": datetime(2000, 1, 1),
    "overall_end_date": datetime(2025, 3, 5), # Corresponds to algo end date
    "initial_training_years": 10,
    "retrain_frequency_days": 365, # Retrain approximately annually
}

# --- Model Hyperparameters ---
model_config = {
    "lookback": 10,
    "feature_count": 2,
    "hidden_size": 16,
    "dropout_rate": 0.25,
    "l2_reg": 0.01,
    "epochs": 50,
    "batch_size": 32,
    "early_stopping_patience": 10
}

# --- Execution Loop ---
print(f"\n=== Starting Walk-Forward Training Process ===")
# Ensure qb is initialized before the loop if not done globally
if 'QuantBook' not in globals():
    try:
        from QuantConnect import QuantBook
        qb = QuantBook()
        print("QuantBook initialized before loop.")
    except ImportError:
         print("ERROR: QuantBook cannot be initialized. Stopping.")
         # Or raise an error depending on desired behavior
         exit() # Exit if QB is essential and cannot be loaded


current_train_end_date = wf_config["initial_train_start_date"] + timedelta(days=wf_config["initial_training_years"] * 365)
successful_periods = 0
total_periods = 0

while current_train_end_date <= wf_config["overall_end_date"]:
    total_periods += 1
    # Define training window for this iteration (Anchored)
    train_start = wf_config["initial_train_start_date"]
    train_end = min(current_train_end_date, wf_config["overall_end_date"]) # Don't go past overall end

    print(f"\n--- Processing Training Period: {train_start.strftime('%Y-%m-%d')} to {train_end.strftime('%Y-%m-%d')} ---")

    # 1. Get Data for THIS period using the modified function
    df_period = get_data_for_period(train_start, train_end)
    if df_period.empty:
        print("Skipping period due to lack of data.")
        current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
        continue # Move to next period

    # 2. Extract Features and Scale (Scaler is fitted here)
    X_period, y_period, scaler_period = extract_and_scale_features(df_period, model_config['lookback'])
    if X_period is None or y_period is None or scaler_period is None:
        print("Skipping period due to feature extraction/scaling failure.")
        current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
        continue

    # 3. Train/Validation Split
    try:
        # Ensure sufficient data for split
        if len(X_period) < 5: # Need at least a few samples for train/val
             print(f"Skipping period: Insufficient samples ({len(X_period)}) for train/validation split.")
             current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
             continue
        X_train, X_val, y_train, y_val = train_test_split(X_period, y_period, test_size=0.2, shuffle=False)
        print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")
        # Add check for empty validation set which can cause issues
        if X_val.shape[0] == 0:
             print("Warning: Validation set is empty after split. Early stopping may not work as expected.")

    except ValueError as e:
         print(f"Error during train/validation split (likely insufficient data): {e}")
         current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
         continue

    # 4. Build and Train Model
    try:
        model = build_rnn_model(input_shape=(model_config['lookback'], model_config['feature_count']),
                                  hidden_size=model_config['hidden_size'],
                                  dropout_rate=model_config['dropout_rate'],
                                  l2_reg=model_config['l2_reg'])
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=model_config['early_stopping_patience'],
                                                           restore_best_weights=True)
        print("Starting Keras model training...")
        # Avoid issues if validation set was empty
        callbacks_list = [early_stopping] if X_val.shape[0] > 0 else []
        validation_data_tuple = (X_val, y_val) if X_val.shape[0] > 0 else None

        model.fit(X_train, y_train,
                  epochs=model_config['epochs'],
                  batch_size=model_config['batch_size'],
                  validation_data=validation_data_tuple,
                  callbacks=callbacks_list,
                  verbose=0) # Use verbose=1 or 2 for detailed logs
        print("Training finished for period.")
    except Exception as e:
        print(f"Error during model building or training: {e}")
        current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
        continue

    # 5. Extract Weights
    numpy_weights_period = extract_numpy_weights(model)
    if numpy_weights_period is None:
        print("Skipping period due to weight extraction failure.")
        current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])
        continue

    # 6. Save Scaler and Weights with Period-Specific Keys
    period_suffix = train_end.strftime('%Y%m%d') # Use the END date of training
    scaler_key = f"rnn_strategy/scaler_{period_suffix}.pkl"
    weights_key = f"rnn_strategy/weights_{period_suffix}.pkl"

    try:
        scaler_bytes = pickle.dumps(scaler_period)
        if qb.ObjectStore.SaveBytes(scaler_key, scaler_bytes):
            print(f"Scaler saved to ObjectStore: '{scaler_key}'")
        else:
            print(f"Error: Failed to save scaler to ObjectStore: '{scaler_key}'")
            # Decide how to handle partial failure (maybe skip saving weights too)

        weights_bytes = pickle.dumps(numpy_weights_period)
        if qb.ObjectStore.SaveBytes(weights_key, weights_bytes):
            print(f"Weights saved to ObjectStore: '{weights_key}'")
            successful_periods += 1 # Count success only if both save
        else:
            print(f"Error: Failed to save weights to ObjectStore: '{weights_key}'")

    except Exception as e:
        print(f"Error pickling or saving scaler/weights: {e}")

    # Move to the next training period end date
    current_train_end_date += timedelta(days=wf_config["retrain_frequency_days"])

    # Break if the last processed period already reached the overall end date
    if train_end == wf_config["overall_end_date"]:
         break


print(f"\n=== Walk-Forward Training Process Finished ===")
print(f"Total Training Periods Attempted: {total_periods}")
print(f"Successful Periods Completed: {successful_periods}")


=== Starting Walk-Forward Training Process ===

--- Processing Training Period: 2000-01-01 to 2009-12-29 ---
Fetching history from 2000-01-01 to 2009-12-29
Processed 2512 rows.
Fitting new scaler for the current window.
Training data shape: (2000, 10, 2), Validation data shape: (500, 10, 2)
Starting Keras model training...
Training finished for period.
Weights extracted successfully.
Scaler saved to ObjectStore: 'rnn_strategy/scaler_20091229.pkl'
Weights saved to ObjectStore: 'rnn_strategy/weights_20091229.pkl'

--- Processing Training Period: 2000-01-01 to 2010-12-29 ---
Fetching history from 2000-01-01 to 2010-12-29
Processed 2764 rows.
Fitting new scaler for the current window.
Training data shape: (2201, 10, 2), Validation data shape: (551, 10, 2)
Starting Keras model training...
Training finished for period.
Weights extracted successfully.
Scaler saved to ObjectStore: 'rnn_strategy/scaler_20101229.pkl'
Weights saved to ObjectStore: 'rnn_strategy/weights_20101229.pkl'

--- Process