In [17]:
df = pd.read_csv('qqq_merged_data_engineered.csv')

df['Date'].unique()

array(['2025-04-11', '2025-04-10', '2025-04-09', '2025-04-08',
       '2025-04-07', '2025-04-04', '2025-04-03', '2025-04-02',
       '2025-04-01', '2025-03-31', '2025-03-28', '2025-03-27',
       '2025-03-26', '2025-03-25', '2025-03-24', '2025-03-21',
       '2025-03-20', '2025-03-19', '2025-03-18', '2025-03-17',
       '2025-03-14'], dtype=object)

In [18]:
# Drop columns with NA values
print("Shape before dropping columns with NA values:", df.shape)

# Get columns with NA values
columns_with_na = df.columns[df.isna().any()].tolist()
print(f"Columns with NA values: {columns_with_na}")

# Drop columns with NA values
df = df.dropna(axis=1)

print("Shape after dropping columns with NA values:", df.shape)

# Display remaining columns
print("\nRemaining columns:")
print(df.columns.tolist())


Shape before dropping columns with NA values: (3921, 50)
Columns with NA values: ['previous_date', 'daily_return', 'sma_5', 'sma_10', 'rsi_14', 'atr_14']
Shape after dropping columns with NA values: (3921, 44)

Remaining columns:
['Date', 'Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'strike', 'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous', 'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume', 'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change', 'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange', 'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'midpoint', 'put', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'intraday_range_pct', 'options_to_ohlcv_volume_ratio']


In [19]:
# Count NA values for each column in the dataframe
na_counts = df.isna().sum()

# Display the count of NA values for each column
print("NA Values Count per Column:")
for column, count in na_counts.items():
    print(f"{column}: {count}")

# Calculate percentage of NA values
total_rows = len(df)
na_percentage = (na_counts / total_rows) * 100

# Display percentage of NA values
print("\nPercentage of NA Values per Column:")
for column, percentage in na_percentage.items():
    print(f"{column}: {percentage:.2f}%")


NA Values Count per Column:
Date: 0
Close/Last: 0
ohlcv_volume: 0
Open: 0
High: 0
Low: 0
strike: 0
open: 0
high: 0
low: 0
last: 0
last_size: 0
change: 0
pctchange: 0
previous: 0
bid: 0
bid_size: 0
ask: 0
ask_size: 0
moneyness: 0
option_volume: 0
volume_change: 0
volume_pctchange: 0
open_interest: 0
open_interest_change: 0
open_interest_pctchange: 0
volatility: 0
volatility_change: 0
volatility_pctchange: 0
theoretical: 0
delta: 0
gamma: 0
theta: 0
vega: 0
rho: 0
vol_oi_ratio: 0
midpoint: 0
put: 0
ema_12: 0
ema_26: 0
macd: 0
macd_signal: 0
intraday_range_pct: 0
options_to_ohlcv_volume_ratio: 0

Percentage of NA Values per Column:
Date: 0.00%
Close/Last: 0.00%
ohlcv_volume: 0.00%
Open: 0.00%
High: 0.00%
Low: 0.00%
strike: 0.00%
open: 0.00%
high: 0.00%
low: 0.00%
last: 0.00%
last_size: 0.00%
change: 0.00%
pctchange: 0.00%
previous: 0.00%
bid: 0.00%
bid_size: 0.00%
ask: 0.00%
ask_size: 0.00%
moneyness: 0.00%
option_volume: 0.00%
volume_change: 0.00%
volume_pctchange: 0.00%
open_interest: 0

In [20]:
unique_dates = df['Date'].unique()
print("Unique dates in cleaned data:", unique_dates)
print("Total number of unique dates:", len(unique_dates))

Unique dates in cleaned data: ['2025-04-11' '2025-04-10' '2025-04-09' '2025-04-08' '2025-04-07'
 '2025-04-04' '2025-04-03' '2025-04-02' '2025-04-01' '2025-03-31'
 '2025-03-28' '2025-03-27' '2025-03-26' '2025-03-25' '2025-03-24'
 '2025-03-21' '2025-03-20' '2025-03-19' '2025-03-18' '2025-03-17'
 '2025-03-14']
Total number of unique dates: 21


In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, GlobalAveragePooling1D, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler

# -----------------------------------
# 1. Load and Prepare the Data
# -----------------------------------

# Sort the dataframe by Date (oldest first)
df = df.sort_values('Date').reset_index(drop=True)

# Convert 'Date' column to datetime type
df['Date'] = pd.to_datetime(df['Date'])


# Define the list of feature columns. Here we include every column except "Date".
# (The "Close/Last" column will be used as a feature for the current day, but the target is the next day's value.)
feature_cols = ['Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'strike', 'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous', 'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume', 'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change', 'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange', 'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'midpoint', 'put', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'intraday_range_pct', 'options_to_ohlcv_volume_ratio']

# Group by each unique day so that each day’s options data is retained
grouped = df.groupby('Date')

# Create lists to hold each day's options data and corresponding date
days = []
dates = []
for date, group in grouped:
    # Extract the feature values from the group
    day_data = group[feature_cols].values.astype(np.float32)
    days.append(day_data)
    dates.append(date)

# Sort the days by date
sorted_indices = np.argsort(dates)
days = [days[i] for i in sorted_indices]
dates = [dates[i] for i in sorted_indices]

# Create target values:
# For each day, we want to predict the next day’s close.
# We assume that the day's "Close/Last" is the same for all rows of that day,
# so we take the first row's "Close/Last" from the next day.
targets = []
for i in range(len(days) - 1):
    next_day_close = days[i+1][0, feature_cols.index('Close/Last')]
    targets.append(next_day_close)
    
# Remove the last day as it has no following day's close to predict
days = days[:-1]
dates = dates[:-1]
targets = np.array(targets)

# Since each day can have a variable number of options records,
# we pad the data so that every day has the same shape.
max_options = max(day.shape[0] for day in days)
# days_padded will have shape (n_days, max_options, n_features)
days_padded = pad_sequences(days, maxlen=max_options, dtype='float32', 
                            padding='post', truncating='post')

# -----------------------------------
# 2. Create Lookback Sequences
# -----------------------------------
# Use a lookback window so that each training sample is
# built from the previous "lookback" days' options data.
lookback = 5  # Example: use the previous 5 days

X, y, X_dates = [], [], []
for i in range(lookback, len(targets)):
    # X: sequence of day data from the past "lookback" days
    X.append(days_padded[i-lookback:i])
    # y: the target for the current day (next day close)
    y.append(targets[i])
    X_dates.append(dates[i])
    
X = np.array(X)  # shape: (n_samples, lookback, max_options, n_features)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)

# -----------------------------------
# 3. Scale the Data
# -----------------------------------
# Scale X: reshape to 2D, scale, then reshape back.
n_samples, lb, max_opts, n_features = X.shape
X_reshaped = X.reshape(-1, n_features)
scaler_X = StandardScaler()
X_scaled_reshaped = scaler_X.fit_transform(X_reshaped)
X_scaled = X_scaled_reshaped.reshape(n_samples, lb, max_opts, n_features)

# Scale y:
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# -----------------------------------
# 4. Build the LSTM RNN Model with L2 Regularization
# -----------------------------------
# Use L2 regularization factor (adjust as needed)
l2_reg = 0.001

# Number of features remains the same.
n_features = len(feature_cols)

# Sub-model to process a single day’s options data.
# Input shape: (max_options, n_features)
option_input = Input(shape=(max_options, n_features), name='option_input')
# Mask padded rows (assumed to be zeros)
masked = Masking(mask_value=0.0)(option_input)
# Process each option row individually with a TimeDistributed Dense layer including L2 regularization.
option_dense = TimeDistributed(Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)))(masked)
# Aggregate the processed rows into a fixed-length vector using global average pooling.
day_embedding = GlobalAveragePooling1D()(option_dense)
# Create the day-level model that outputs a daily embedding.
day_model = Model(inputs=option_input, outputs=day_embedding, name='day_model')

# Define the sequence model.
# Input: sequence of days with shape (lookback, max_options, n_features)
seq_input = Input(shape=(lookback, max_options, n_features), name='seq_input')
# Apply the day_model to each day in the sequence using TimeDistributed.
day_embeddings = TimeDistributed(day_model)(seq_input)  # shape: (lookback, 64)
# Feed the sequence of day embeddings into an LSTM layer to capture temporal dynamics.
lstm_out = LSTM(50, activation='tanh')(day_embeddings)
# Final Dense layer to predict the next day’s closing price with L2 regularization.
output = Dense(1, kernel_regularizer=l2(l2_reg))(lstm_out)

# Build and compile the complete model.
model = Model(inputs=seq_input, outputs=output, name='advanced_options_model')
model.compile(optimizer='adam', loss='mse')
model.summary()

# -----------------------------------
# 5. Train the Model
# -----------------------------------
# Train using the scaled data.
history = model.fit(X_scaled, y_scaled, epochs=20, batch_size=16, validation_split=0.2)

# -----------------------------------
# 6. Evaluate and Save Predictions
# -----------------------------------
# Generate predictions on the entire dataset.
predictions_scaled = model.predict(X_scaled)
# Invert target scaling
predictions = scaler_y.inverse_transform(predictions_scaled)
results_df = pd.DataFrame({
    'Date': [d.strftime('%Y-%m-%d') for d in X_dates],
    'True_Close_next': y.flatten(),
    'Predicted_Close_next': predictions.flatten()
})
print(results_df.head())

# Save the trained model and the predictions for later analysis.
model.save('advanced_options_close_predictor.h5')
results_df.to_csv('advanced_expanding_window_predictions.csv', index=False)


X shape: (15, 5, 347, 43)
y shape: (15,)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 26s/step - loss: 1.3032 - val_loss: 0.2980
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 1.2364 - val_loss: 0.2104
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 1.1733 - val_loss: 0.1641
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 1.1119 - val_loss: 0.1644
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 1.0511 - val_loss: 0.2133
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 0.9903 - val_loss: 0.3110
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 0.9294 - val_loss: 0.4564
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 0.8678 - val_loss: 0.6490
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [



         Date  True_Close_next  Predicted_Close_next
0  2025-03-21       490.660004            497.987366
1  2025-03-24       493.459991            497.142334
2  2025-03-25       484.380005            491.547577
3  2025-03-26       481.619995            489.030579
4  2025-03-27       468.940002            478.375122
