![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [15]:
# Cell 1: QuantBook Initialization
# Creates the notebook-level QuantBook instance and binds it to `qb`.
# NOTE(review): the explicit `QuantBook` import only happens in Cell 2, so this
# cell presumably relies on the research kernel pre-loading the name - confirm
# before running the notebook outside that environment.
qb = QuantBook()
print("QuantBook Initialized.")

QuantBook Initialized.


In [16]:
# Cell 2: Imports
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
# Updated Imports: Added Dropout
from tensorflow.keras.layers import Dense, SimpleRNN, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
# Re-adding QuantBook import explicitly for clarity within the script if run standalone
from QuantConnect.Research import QuantBook
print("Imports loaded.")

Imports loaded.


In [17]:
# Cell 3: Function Definitions

# Data Retrieval (Keep original function, ensures modularity)
def get_training_data():
    """
    Fetch daily SPY bars for 2000-01-01 through 2010-01-01 via QuantBook.

    Returns:
        DataFrame indexed by 'date' holding exactly the columns
        'open', 'high', 'low', 'close' and 'volume'.

    Raises:
        EnvironmentError: If the name 'QuantBook' is missing from module globals.
        ValueError: If the history request comes back empty.
    """
    # Guard: the research API must have been imported/injected before this runs.
    if 'QuantBook' not in globals():
        raise EnvironmentError("QuantBook required but not initialized.")

    research = QuantBook()  # A fresh instance local to this call
    symbol = research.AddEquity("SPY", Resolution.Daily).Symbol
    bars = research.History(
        symbol,
        start=datetime(2000, 1, 1),
        end=datetime(2010, 1, 1),  # Training data end date
        resolution=Resolution.Daily
    )
    if bars.empty:
        raise ValueError("Failed to fetch training data.")

    # History is assumed to be multi-indexed with the symbol at level 0; drop
    # that level, then re-key the frame on the timestamp under the name 'date'.
    frame = (
        bars.droplevel(0)
        .reset_index()
        .rename(columns={'time': 'date'})
        .set_index('date')
    )
    # Keep only the OHLCV columns, discarding anything else History returned.
    return frame[['open', 'high', 'low', 'close', 'volume']]


# Feature Extraction (Refactored: Removed volume_changes, updated features, added comment)
def extract_and_scale_features(df: pd.DataFrame, lookback: int):
    """
    Build scaled fixed-length feature windows and their regression targets.

    Two features are computed per row: the intraday move (close - open) and the
    overnight gap (open - previous close). Both are standardized with a single
    StandardScaler fitted on every row that survives NaN removal.

    Args:
        df: Bars with at least 'open' and 'close' columns, assumed to be in
            chronological order. The frame is NOT modified; all work happens
            on a copy.
        lookback: Number of consecutive rows per window. Must be >= 1.

    Returns:
        Tuple ``(features, targets, scaler)`` where
        features has shape (num_samples, lookback, 2),
        targets has shape (num_samples,) and holds unscaled price changes,
        scaler is the fitted StandardScaler.

    Raises:
        ValueError: If ``lookback`` is smaller than 1, or if there are not
            enough rows to build at least one window.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be a positive integer, got {lookback}")

    # Work on a copy so the caller's frame keeps its columns and rows. This also
    # avoids chained-assignment warnings when df is itself a slice.
    data = df.copy()

    # Raw features
    data['price_changes'] = data['close'] - data['open']
    data['overnight_gaps'] = data['open'] - data['close'].shift(1)

    # Target stored on each row: the NEXT row's price change (close - open)
    data['target'] = data['close'].shift(-1) - data['open'].shift(-1)

    # Drop rows with NaN in ANY column: at minimum the first row (no previous
    # close for the gap) and the last row (no next day for the target).
    data = data.dropna()

    feature_cols = ['price_changes', 'overnight_gaps']

    # WARNING: Global scaling assumes feature statistics (mean, std dev) are
    # stationary over the entire period passed in. The scaler is fitted on all
    # rows, i.e. before any train/validation split done by the caller.
    scaler = StandardScaler()
    scaled = np.asarray(scaler.fit_transform(data[feature_cols]))
    target_column = data['target'].to_numpy()

    num_samples = len(data) - lookback
    if num_samples <= 0:
        raise ValueError("Could not generate feature sequences.")

    # Window i covers rows [i, i + lookback); its target is read from row
    # i + lookback.
    # NOTE(review): because 'target' is already shifted by -1, the value read
    # from row i + lookback is the price change of row i + lookback + 1, so one
    # day lies between the end of the window and the predicted day. Confirm
    # this matches how the consuming strategy aligns its inputs.
    features = np.stack([scaled[i:i + lookback] for i in range(num_samples)])
    targets = target_column[lookback:].copy()

    # features shape: (num_samples, lookback, num_features=2)
    # targets shape: (num_samples,)
    return features, targets, scaler


# RNN Pretraining Function (Refactored: hyperparameters, model structure, weight extraction indices)
def pretrain_and_save_rnn_keras():
    """
    Train a small Keras SimpleRNN regressor and persist its scaler and weights.

    Steps: fetch training data, build scaled lookback windows, fit a
    SimpleRNN -> Dropout -> Dense(1) model with early stopping on an unshuffled
    validation split, then pickle the fitted scaler and the extracted weights
    into the QuantBook ObjectStore under "rnn_strategy/scaler.pkl" and
    "rnn_strategy/weights.pkl".

    The weights are stored as a dict of NumPy arrays in column-vector
    convention, i.e. for a forward pass of the form
    ``h = tanh(Wxh @ x + Whh @ h_prev + bh)`` and ``y = Why @ h + by``.

    Returns:
        None. Save outcomes are reported via print; a failed save does not raise.

    Raises:
        ValueError: If the extracted feature dimension differs from the
            configured feature count.
        RuntimeError: If the trained model does not expose the expected layers
            or weight arrays.
    """
    # Load training data
    df = get_training_data()

    # Hyperparameters
    lookback = 10           # RNN sequence length (should match main.py)
    feature_count = 2       # Number of features per time step
    hidden_size = 16        # Units in the SimpleRNN layer
    learning_rate = 0.001
    l2_lambda = 5e-4        # L2 penalty applied to kernel, recurrent kernel and bias
    epochs = 50
    batch_size = 32
    validation_split = 0.2
    patience = 10           # Early-stopping patience in epochs

    # Extract features, targets, and the FITTED scaler
    X, y, scaler = extract_and_scale_features(df, lookback)

    # Ensure feature count matches the model's input definition
    if X.shape[2] != feature_count:
        raise ValueError(f"Feature dimension mismatch: Expected {feature_count}, got {X.shape[2]}")

    # Split data into training and validation sets (shuffle=False keeps time order)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=validation_split, random_state=42, shuffle=False
    )

    print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")

    # --- Keras Model Definition ---
    model = Sequential([
        Input(shape=(lookback, feature_count)),
        SimpleRNN(hidden_size,
                  activation='tanh',
                  kernel_regularizer=l2(l2_lambda),
                  recurrent_regularizer=l2(l2_lambda),
                  bias_regularizer=l2(l2_lambda)),
        Dropout(0.25),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    print("Starting Keras model training...")
    model.fit(X_train, y_train,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(X_val, y_val),
              callbacks=[early_stopping],
              verbose=2)
    print("Training finished.")

    # --- Extract weights ---
    # Layer indices: 0=SimpleRNN, 1=Dropout, 2=Dense
    if len(model.layers) < 3:
        raise RuntimeError(f"Model structure doesn't match expected layers after training. Found layers: {len(model.layers)}")

    rnn_weights = model.layers[0].get_weights()
    dense_weights = model.layers[2].get_weights()

    # Expect [kernel, recurrent_kernel, bias] for the RNN and [kernel, bias] for Dense
    if len(rnn_weights) != 3 or len(dense_weights) != 2:
        raise RuntimeError(f"Unexpected number of weight arrays retrieved. RNN: {len(rnn_weights)}, Dense: {len(dense_weights)}")

    # Keras uses row-vector convention:
    #   h_t = tanh(x_t @ kernel + h_{t-1} @ recurrent_kernel + bias)
    # The stored dict uses column vectors, so EVERY matrix must be transposed.
    # The recurrent kernel is square, so skipping its transpose raises no shape
    # error but silently yields different hidden-state dynamics.
    # NOTE(review): main.py is not visible from here; this assumes its forward
    # pass multiplies as `Whh @ h` - confirm against the consumer.
    best_weights = {
        "Wxh": rnn_weights[0].T,                # Kernel (input-to-hidden)
        "Whh": rnn_weights[1].T,                # Recurrent kernel (hidden-to-hidden)
        "bh": rnn_weights[2].reshape(-1, 1),    # Bias (hidden)
        "Why": dense_weights[0].T,              # Kernel (hidden-to-output)
        "by": dense_weights[1].reshape(-1, 1),  # Bias (output)
    }
    print("Weights extracted from trained model.")

    # --- Save to ObjectStore ---
    if 'QuantBook' not in globals():
        print("Error: QuantBook not available. Cannot save to ObjectStore.")
        return

    qb = QuantBook()
    _save_pickled_to_object_store(qb, "rnn_strategy/scaler.pkl", scaler, "scaler")
    _save_pickled_to_object_store(qb, "rnn_strategy/weights.pkl", best_weights, "weights")


def _save_pickled_to_object_store(qb, key, obj, label):
    """Pickle obj into qb.ObjectStore under key and print the outcome; returns True on success."""
    try:
        payload = pickle.dumps(obj)
        saved = qb.ObjectStore.SaveBytes(key, payload)
    except Exception as e:
        # Best-effort persistence: report the problem and let the caller continue.
        print(f"Error saving {label} to ObjectStore: {e}")
        return False

    if saved:
        print(f"{label.capitalize()} successfully saved to ObjectStore with key: '{key}'")
    else:
        print(f"Failed to save {label} to ObjectStore.")
    return bool(saved)

print("All necessary functions defined.")

All necessary functions defined.


In [18]:
# Cell 4: Run the pretraining pipeline defined in Cell 3 and report the outcome
print(f"Starting RNN Pretraining and Saving at {datetime.now()}...")
try:
    # Entry point from the previous cell: trains the model, then stores the
    # scaler and weights.
    pretrain_and_save_rnn_keras()
    print(f"RNN Pretraining and Saving finished at {datetime.now()}. Check logs above for status.")
except Exception as exc:
    # Notebook-level boundary: report the failure together with the full
    # traceback rather than letting the cell abort.
    print(f"An error occurred during pretraining execution: {exc}")
    import traceback
    traceback.print_exc()

Starting RNN Pretraining and Saving at 2025-04-15 18:03:46.997540...
Training data shape: (2002, 10, 2), Validation data shape: (501, 10, 2)
Starting Keras model training...
Epoch 1/50
63/63 - 1s - 15ms/step - loss: 1.0470 - val_loss: 1.9239
Epoch 2/50
63/63 - 0s - 2ms/step - loss: 0.8503 - val_loss: 1.7680
Epoch 3/50
63/63 - 0s - 3ms/step - loss: 0.7906 - val_loss: 1.6979
Epoch 4/50
63/63 - 0s - 2ms/step - loss: 0.7511 - val_loss: 1.6767
Epoch 5/50
63/63 - 0s - 2ms/step - loss: 0.7362 - val_loss: 1.6747
Epoch 6/50
63/63 - 0s - 2ms/step - loss: 0.7184 - val_loss: 1.6677
Epoch 7/50
63/63 - 0s - 2ms/step - loss: 0.7176 - val_loss: 1.6679
Epoch 8/50
63/63 - 0s - 2ms/step - loss: 0.7216 - val_loss: 1.6650
Epoch 9/50
63/63 - 0s - 3ms/step - loss: 0.7185 - val_loss: 1.6661
Epoch 10/50
63/63 - 0s - 2ms/step - loss: 0.7080 - val_loss: 1.6655
Epoch 11/50
63/63 - 0s - 3ms/step - loss: 0.7174 - val_loss: 1.6672
Epoch 12/50
63/63 - 0s - 2ms/step - loss: 0.7033 - val_loss: 1.6648
Epoch 13/50
63/63 