In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, GlobalAveragePooling1D, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler

# -----------------------------------
# 1. Load and Prepare the Data
# -----------------------------------

# Load the engineered dataset and ensure dates are parsed
df = pd.read_csv('merged_data_engineered.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Sort the dataframe by Date (oldest first)
df = df.sort_values('Date').reset_index(drop=True)

# If there are categorical columns that are not numeric, encode them.
# For example, encode "type" (if present) as: call -> 1, put -> 0.
if df['type'].dtype == object:
    df['type'] = df['type'].map({'call': 1, 'put': 0})

# Define the list of feature columns. Here we include every column except "Date".
# (The "Close/Last" column will be used as a feature for the current day, but the target is the next day's value.)
feature_cols = [
    'Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'type', 'strike',
    'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous',
    'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume',
    'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change',
    'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange',
    'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'dte',
    'midpoint', 'daily_return', 'sma_5', 'sma_10', 'ema_12', 'ema_26',
    'macd', 'macd_signal', 'rsi_14', 'atr_14', 'intraday_range_pct',
    'options_to_ohlcv_volume_ratio'
]

# Drop rows with missing feature values in these columns
df = df.dropna(subset=feature_cols)

# Group by each unique day so that each day’s options data is retained
grouped = df.groupby('Date')

# Create lists to hold each day's options data and corresponding date
days = []
dates = []
for date, group in grouped:
    # Extract the feature values from the group
    day_data = group[feature_cols].values.astype(np.float32)
    days.append(day_data)
    dates.append(date)

# Sort the days by date
sorted_indices = np.argsort(dates)
days = [days[i] for i in sorted_indices]
dates = [dates[i] for i in sorted_indices]

# Create target values:
# For each day, we want to predict the next day’s close.
# We assume that the day's "Close/Last" is the same for all rows of that day,
# so we take the first row's "Close/Last" from the next day.
targets = []
for i in range(len(days) - 1):
    next_day_close = days[i+1][0, feature_cols.index('Close/Last')]
    targets.append(next_day_close)
    
# Remove the last day as it has no following day's close to predict
days = days[:-1]
dates = dates[:-1]
targets = np.array(targets)

# Since each day can have a variable number of options records,
# we pad the data so that every day has the same shape.
max_options = max(day.shape[0] for day in days)
# days_padded will have shape (n_days, max_options, n_features)
days_padded = pad_sequences(days, maxlen=max_options, dtype='float32', 
                            padding='post', truncating='post')

# -----------------------------------
# 2. Create Lookback Sequences
# -----------------------------------
# Use a lookback window so that each training sample is
# built from the previous "lookback" days' options data.
lookback = 5  # Example: use the previous 5 days

X, y, X_dates = [], [], []
for i in range(lookback, len(targets)):
    # X: sequence of day data from the past "lookback" days
    X.append(days_padded[i-lookback:i])
    # y: the target for the current day (next day close)
    y.append(targets[i])
    X_dates.append(dates[i])
    
X = np.array(X)  # shape: (n_samples, lookback, max_options, n_features)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)

# -----------------------------------
# 3. Scale the Data
# -----------------------------------
# Scale X: reshape to 2D, scale, then reshape back.
n_samples, lb, max_opts, n_features = X.shape
X_reshaped = X.reshape(-1, n_features)
scaler_X = StandardScaler()
X_scaled_reshaped = scaler_X.fit_transform(X_reshaped)
X_scaled = X_scaled_reshaped.reshape(n_samples, lb, max_opts, n_features)

# Scale y:
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# -----------------------------------
# 4. Build the LSTM RNN Model with L2 Regularization
# -----------------------------------
# Use L2 regularization factor (adjust as needed)
l2_reg = 0.001

# Number of features remains the same.
n_features = len(feature_cols)

# Sub-model to process a single day’s options data.
# Input shape: (max_options, n_features)
option_input = Input(shape=(max_options, n_features), name='option_input')
# Mask padded rows (assumed to be zeros)
masked = Masking(mask_value=0.0)(option_input)
# Process each option row individually with a TimeDistributed Dense layer including L2 regularization.
option_dense = TimeDistributed(Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)))(masked)
# Aggregate the processed rows into a fixed-length vector using global average pooling.
day_embedding = GlobalAveragePooling1D()(option_dense)
# Create the day-level model that outputs a daily embedding.
day_model = Model(inputs=option_input, outputs=day_embedding, name='day_model')

# Define the sequence model.
# Input: sequence of days with shape (lookback, max_options, n_features)
seq_input = Input(shape=(lookback, max_options, n_features), name='seq_input')
# Apply the day_model to each day in the sequence using TimeDistributed.
day_embeddings = TimeDistributed(day_model)(seq_input)  # shape: (lookback, 64)
# Feed the sequence of day embeddings into an LSTM layer to capture temporal dynamics.
lstm_out = LSTM(50, activation='tanh')(day_embeddings)
# Final Dense layer to predict the next day’s closing price with L2 regularization.
output = Dense(1, kernel_regularizer=l2(l2_reg))(lstm_out)

# Build and compile the complete model.
model = Model(inputs=seq_input, outputs=output, name='advanced_options_model')
model.compile(optimizer='adam', loss='mse')
model.summary()

# -----------------------------------
# 5. Train the Model
# -----------------------------------
# Train using the scaled data.
history = model.fit(X_scaled, y_scaled, epochs=20, batch_size=16, validation_split=0.2)

# -----------------------------------
# 6. Evaluate and Save Predictions
# -----------------------------------
# Generate predictions on the entire dataset.
predictions_scaled = model.predict(X_scaled)
# Invert target scaling
predictions = scaler_y.inverse_transform(predictions_scaled)
results_df = pd.DataFrame({
    'Date': [d.strftime('%Y-%m-%d') for d in X_dates],
    'True_Close_next': y.flatten(),
    'Predicted_Close_next': predictions.flatten()
})
print(results_df.head())

# Save the trained model and the predictions for later analysis.
model.save('advanced_options_close_predictor.h5')
results_df.to_csv('advanced_expanding_window_predictions.csv', index=False)


X shape: (8, 5, 22, 49)
y shape: (8,)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.2239 - val_loss: 0.2946
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 1.0961 - val_loss: 0.3538
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.9799 - val_loss: 0.4570
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.8728 - val_loss: 0.6053
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.7728 - val_loss: 0.8003
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.6786 - val_loss: 1.0459
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.5916 - val_loss: 1.3423
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.5120 - val_loss: 1.6869
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m



         Date  True_Close_next  Predicted_Close_next
0  2025-03-31            15.95             16.798996
1  2025-04-01            16.26             14.844572
2  2025-04-02            11.41             12.732158
3  2025-04-03             8.73              9.318368
4  2025-04-04             9.15              7.552315


In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, GlobalAveragePooling1D, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler

# -----------------------------------
# 1. Load and Prepare the Data
# -----------------------------------

df = pd.read_csv('merged_data_engineered.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Encode 'type' as a binary variable if needed.
if df['type'].dtype == object:
    df['type'] = df['type'].map({'call': 1, 'put': 0})

feature_cols = [
    'Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'type', 'strike',
    'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous',
    'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume',
    'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change',
    'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange',
    'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'dte',
    'midpoint', 'daily_return', 'sma_5', 'sma_10', 'ema_12', 'ema_26',
    'macd', 'macd_signal', 'rsi_14', 'atr_14', 'intraday_range_pct',
    'options_to_ohlcv_volume_ratio'
]

df = df.dropna(subset=feature_cols)

# Group by Date so that each day's options data is retained.
grouped = df.groupby('Date')
days, dates = [], []
for date, group in grouped:
    day_data = group[feature_cols].values.astype(np.float32)
    days.append(day_data)
    dates.append(date)

# Ensure days are sorted by date.
sorted_idx = np.argsort(dates)
days = [days[i] for i in sorted_idx]
dates = [dates[i] for i in sorted_idx]

# Create target values: for each day, predict the next day's Close/Last.
targets = []
for i in range(len(days) - 1):
    next_day_close = days[i+1][0, feature_cols.index('Close/Last')]
    targets.append(next_day_close)

# Remove the last day (no target)
days = days[:-1]
dates = dates[:-1]
targets = np.array(targets)

# Pad each day’s options data so every day has shape (max_options, n_features)
max_options = max(day.shape[0] for day in days)
days_padded = pad_sequences(days, maxlen=max_options, dtype='float32', padding='post', truncating='post')

# -----------------------------------
# 2. Create Lookback Sequences
# -----------------------------------
# Here lookback=5 means each sample uses the previous 5 days.
lookback = 5
X, y, X_dates = [], [], []
for i in range(lookback, len(targets)):
    X.append(days_padded[i-lookback:i])
    y.append(targets[i])
    X_dates.append(dates[i])
    
X = np.array(X)  # shape: (n_samples, lookback, max_options, n_features)
y = np.array(y)

print("X shape:", X.shape)  # e.g., (n_samples, 5, max_options, n_features)
print("y shape:", y.shape)

# -----------------------------------
# 3. Scale the Data
# -----------------------------------
n_samples, lb, max_opts, n_features = X.shape
X_reshaped = X.reshape(-1, n_features)
scaler_X = StandardScaler()
X_scaled_reshaped = scaler_X.fit_transform(X_reshaped)
X_scaled = X_scaled_reshaped.reshape(n_samples, lb, max_opts, n_features)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# -----------------------------------
# 4. Build the LSTM RNN Model with L2 Regularization
# -----------------------------------
l2_reg = 0.001

# Sub-model for a single day's options data.
option_input = Input(shape=(max_options, n_features), name='option_input')
masked = Masking(mask_value=0.0)(option_input)
option_dense = TimeDistributed(Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)))(masked)
day_embedding = GlobalAveragePooling1D()(option_dense)
day_model = Model(inputs=option_input, outputs=day_embedding, name='day_model')

seq_input = Input(shape=(lookback, max_options, n_features), name='seq_input')
day_embeddings = TimeDistributed(day_model)(seq_input)  # shape: (lookback, 64)
lstm_out = LSTM(50, activation='tanh')(day_embeddings)
output = Dense(1, kernel_regularizer=l2(l2_reg))(lstm_out)

model = Model(inputs=seq_input, outputs=output, name='advanced_options_model')
model.compile(optimizer='adam', loss='mse')
model.summary()

# -----------------------------------
# 5. Train the Model
# -----------------------------------
history = model.fit(X_scaled, y_scaled, epochs=20, batch_size=16, validation_split=0.2)

# -----------------------------------
# 6. Generate Predictions for Every Date
# -----------------------------------
# Our current in-sample predictions correspond to dates in X_dates.
# These predictions use a full lookback window. Note that with lookback=5,
# the earliest prediction we get is for the 6th date in our data.
# To obtain a prediction for every date from 2025-03-05 to 2025-04-10
# (assuming 2025-03-04 is the first date), you could:
#   - either reduce lookback to 1 so each day (except the first) is predicted,
#   - or accept that with lookback=5 predictions only start when a full window is available.
#
# Here, we show how to get the in-sample predictions and then add one out-of-sample forecast
# for the day after the last day in our dataset (2025-04-11).

# In-sample predictions:
preds_scaled = model.predict(X_scaled)
preds = scaler_y.inverse_transform(preds_scaled)

# Assemble a DataFrame for in-sample predicted dates (these correspond to X_dates).
pred_dates = [d.strftime('%Y-%m-%d') for d in X_dates]
results_df = pd.DataFrame({
    'Date': pred_dates,
    'True_Close_next': y.flatten(),
    'Predicted_Close_next': preds.flatten()
})

print("In-sample predictions:")
print(results_df.head())

# For an out-of-sample forecast for 2025-04-11:
# Use the last available lookback window from the padded data.
last_window = days_padded[-lookback:]  # shape: (lookback, max_options, n_features)
# Scale this window using the same scaler_X.
last_window_scaled = scaler_X.transform(last_window.reshape(-1, n_features)).reshape(lookback, max_opts, n_features)
# Expand dims to match model input: (1, lookback, max_options, n_features)
last_window_scaled = np.expand_dims(last_window_scaled, axis=0)
future_pred_scaled = model.predict(last_window_scaled)
future_pred = scaler_y.inverse_transform(future_pred_scaled)[0, 0]
future_date = (dates[-1] + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

# Append the out-of-sample forecast to the results.
future_df = pd.DataFrame({
    'Date': [future_date],
    'True_Close_next': [np.nan],   # No true value available yet.
    'Predicted_Close_next': [future_pred]
})

all_preds_df = pd.concat([results_df, future_df], ignore_index=True)

print("\nAll predictions from {} to {}:".format(pred_dates[0], future_date))
print(all_preds_df)

# Save the predictions.
all_preds_df.to_csv('advanced_expanding_window_predictions.csv', index=False)
model.save('advanced_options_close_predictor.h5')


X shape: (8, 5, 22, 49)
y shape: (8,)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.4281 - val_loss: 0.4024
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1.2173 - val_loss: 0.3957
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1.0636 - val_loss: 0.4218
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.9430 - val_loss: 0.4898
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.8379 - val_loss: 0.6140
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.7391 - val_loss: 0.8100
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.6436 - val_loss: 1.0905
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.5525 - val_loss: 1.4632
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

  all_preds_df = pd.concat([results_df, future_df], ignore_index=True)



All predictions from 2025-03-31 to 2025-04-10:
         Date  True_Close_next  Predicted_Close_next
0  2025-03-31            15.95             16.835896
1  2025-04-01            16.26             15.033982
2  2025-04-02            11.41             13.124949
3  2025-04-03             8.73              9.762660
4  2025-04-04             9.15              7.800424
5  2025-04-07             8.25              6.850358
6  2025-04-08            12.77              5.565059
7  2025-04-09             9.63              4.924136
8  2025-04-10              NaN              4.396750
