In [8]:
import pandas as pd

df = pd.read_csv('qqq_merged_data_engineered.csv')

df['Date'].unique()

array(['2025-04-11', '2025-04-10', '2025-04-09', '2025-04-08',
       '2025-04-07', '2025-04-04', '2025-04-03', '2025-04-02',
       '2025-04-01', '2025-03-31', '2025-03-28', '2025-03-27',
       '2025-03-26', '2025-03-25', '2025-03-24', '2025-03-21',
       '2025-03-20', '2025-03-19', '2025-03-18', '2025-03-17',
       '2025-03-14'], dtype=object)

In [9]:
# Drop columns with NA values
print("Shape before dropping columns with NA values:", df.shape)

# Get columns with NA values
columns_with_na = df.columns[df.isna().any()].tolist()
print(f"Columns with NA values: {columns_with_na}")

# Drop columns with NA values
df = df.dropna(axis=1)

print("Shape after dropping columns with NA values:", df.shape)

# Display remaining columns
print("\nRemaining columns:")
print(df.columns.tolist())


Shape before dropping columns with NA values: (3921, 50)
Columns with NA values: ['previous_date', 'daily_return', 'sma_5', 'sma_10', 'rsi_14', 'atr_14']
Shape after dropping columns with NA values: (3921, 44)

Remaining columns:
['Date', 'Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'strike', 'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous', 'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume', 'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change', 'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange', 'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'midpoint', 'put', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'intraday_range_pct', 'options_to_ohlcv_volume_ratio']


In [10]:
# Count NA values for each column in the dataframe
na_counts = df.isna().sum()

# Display the count of NA values for each column
print("NA Values Count per Column:")
for column, count in na_counts.items():
    print(f"{column}: {count}")

# Calculate percentage of NA values
total_rows = len(df)
na_percentage = (na_counts / total_rows) * 100

# Display percentage of NA values
print("\nPercentage of NA Values per Column:")
for column, percentage in na_percentage.items():
    print(f"{column}: {percentage:.2f}%")


NA Values Count per Column:
Date: 0
Close/Last: 0
ohlcv_volume: 0
Open: 0
High: 0
Low: 0
strike: 0
open: 0
high: 0
low: 0
last: 0
last_size: 0
change: 0
pctchange: 0
previous: 0
bid: 0
bid_size: 0
ask: 0
ask_size: 0
moneyness: 0
option_volume: 0
volume_change: 0
volume_pctchange: 0
open_interest: 0
open_interest_change: 0
open_interest_pctchange: 0
volatility: 0
volatility_change: 0
volatility_pctchange: 0
theoretical: 0
delta: 0
gamma: 0
theta: 0
vega: 0
rho: 0
vol_oi_ratio: 0
midpoint: 0
put: 0
ema_12: 0
ema_26: 0
macd: 0
macd_signal: 0
intraday_range_pct: 0
options_to_ohlcv_volume_ratio: 0

Percentage of NA Values per Column:
Date: 0.00%
Close/Last: 0.00%
ohlcv_volume: 0.00%
Open: 0.00%
High: 0.00%
Low: 0.00%
strike: 0.00%
open: 0.00%
high: 0.00%
low: 0.00%
last: 0.00%
last_size: 0.00%
change: 0.00%
pctchange: 0.00%
previous: 0.00%
bid: 0.00%
bid_size: 0.00%
ask: 0.00%
ask_size: 0.00%
moneyness: 0.00%
option_volume: 0.00%
volume_change: 0.00%
volume_pctchange: 0.00%
open_interest: 0

In [11]:
unique_dates = df['Date'].unique()
print("Unique dates in cleaned data:", unique_dates)
print("Total number of unique dates:", len(unique_dates))

Unique dates in cleaned data: ['2025-04-11' '2025-04-10' '2025-04-09' '2025-04-08' '2025-04-07'
 '2025-04-04' '2025-04-03' '2025-04-02' '2025-04-01' '2025-03-31'
 '2025-03-28' '2025-03-27' '2025-03-26' '2025-03-25' '2025-03-24'
 '2025-03-21' '2025-03-20' '2025-03-19' '2025-03-18' '2025-03-17'
 '2025-03-14']
Total number of unique dates: 21


In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, GlobalAveragePooling1D, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler

# -----------------------------------
# 1. Load and Prepare the Data
# -----------------------------------

df['Date'] = pd.to_datetime(df['Date'])

# Sort the dataframe by Date (oldest first)
df = df.sort_values('Date').reset_index(drop=True)

# Define the list of feature columns. Here we include every column except "Date".
# (The "Close/Last" column will be used to derive the target.)
feature_cols = ['Close/Last', 'ohlcv_volume', 'Open', 'High', 'Low', 'strike', 'open', 'high', 'low', 'last', 'last_size', 'change', 'pctchange', 'previous', 'bid', 'bid_size', 'ask', 'ask_size', 'moneyness', 'option_volume', 'volume_change', 'volume_pctchange', 'open_interest', 'open_interest_change', 'open_interest_pctchange', 'volatility', 'volatility_change', 'volatility_pctchange', 'theoretical', 'delta', 'gamma', 'theta', 'vega', 'rho', 'vol_oi_ratio', 'midpoint', 'put', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'intraday_range_pct', 'options_to_ohlcv_volume_ratio']

# Group by each unique day so that each day’s options data is retained
grouped = df.groupby('Date')
days = []
dates = []
for date, group in grouped:
    # Extract the feature values from the group
    day_data = group[feature_cols].values.astype(np.float32)
    days.append(day_data)
    dates.append(date)

# Sort the days by date
sorted_idx = np.argsort(dates)
days = [days[i] for i in sorted_idx]
dates = [dates[i] for i in sorted_idx]

# Create binary target values:
# For each day, we compare its "Close/Last" (assumed constant for that day)
# with the next day’s "Close/Last". If next day > current day, label 1; else 0.
direction_targets = []
for i in range(len(days) - 1):
    current_close = days[i][0, feature_cols.index('Close/Last')]
    next_close = days[i+1][0, feature_cols.index('Close/Last')]
    direction = 1 if next_close > current_close else 0
    direction_targets.append(direction)
    
# Remove the last day as it has no following day's target
days = days[:-1]
dates = dates[:-1]
y = np.array(direction_targets)  # Binary labels (0 or 1)

# Since each day may have a variable number of option records,
# we pad the data so that every day has the same shape.
max_options = max(day.shape[0] for day in days)
days_padded = pad_sequences(days, maxlen=max_options, dtype='float32', 
                            padding='post', truncating='post')

# -----------------------------------
# 2. Create Lookback Sequences
# -----------------------------------
# Each sample is built from the previous "lookback" days' options data.
lookback = 5  # For example: use the previous 5 days
X, X_dates = [], []
for i in range(lookback, len(y)):
    X.append(days_padded[i-lookback:i])
    X_dates.append(dates[i])
X = np.array(X)  # shape: (n_samples, lookback, max_options, n_features)
y = y[lookback:]  # Align targets with the samples

print("X shape:", X.shape)
print("y shape:", y.shape)

# -----------------------------------
# 3. Scale the Data (features only)
# -----------------------------------
# Reshape X to 2D for scaling, then reshape back.
n_samples, lb, max_opts, n_features = X.shape
X_reshaped = X.reshape(-1, n_features)
scaler_X = StandardScaler()
X_scaled_reshaped = scaler_X.fit_transform(X_reshaped)
X_scaled = X_scaled_reshaped.reshape(n_samples, lb, max_opts, n_features)
# No scaling for y as they are binary.

# -----------------------------------
# 4. Build the LSTM RNN Model for Classification with L2 Regularization
# -----------------------------------
l2_reg = 0.001  # Regularization factor

# Sub-model to process a single day’s options data.
option_input = Input(shape=(max_options, n_features), name='option_input')
masked = Masking(mask_value=0.0)(option_input)
option_dense = TimeDistributed(Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)))(masked)
day_embedding = GlobalAveragePooling1D()(option_dense)
day_model = Model(inputs=option_input, outputs=day_embedding, name='day_model')

# Sequence model: process a sequence of days.
seq_input = Input(shape=(lookback, max_options, n_features), name='seq_input')
day_embeddings = TimeDistributed(day_model)(seq_input)  # shape: (lookback, 64)
lstm_out = LSTM(50, activation='tanh')(day_embeddings)
# For classification: use a sigmoid activation.
output = Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_reg))(lstm_out)

model = Model(inputs=seq_input, outputs=output, name='advanced_direction_model')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# -----------------------------------
# 5. Train the Model
# -----------------------------------
history = model.fit(X_scaled, y, epochs=20, batch_size=16, validation_split=0.2)

# -----------------------------------
# 6. Evaluate and Save Predictions
# -----------------------------------
# Generate predictions on the entire dataset.
predictions_probs = model.predict(X_scaled)
# Convert probabilities to binary predictions using 0.5 threshold.
predictions = (predictions_probs >= 0.5).astype(int).flatten()

# Create a DataFrame with the results.
results_df = pd.DataFrame({
    'Date': [d.strftime('%Y-%m-%d') for d in X_dates],
    'True_Direction': y,
    'Predicted_Direction': predictions
})
print(results_df.head())

# Save the trained model and predictions.
model.save('advanced_direction_predictor.h5')
results_df.to_csv('advanced_direction_predictions.csv', index=False)


X shape: (15, 5, 347, 43)
y shape: (15,)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25s/step - accuracy: 0.4167 - loss: 0.7979 - val_accuracy: 0.6667 - val_loss: 0.6981
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.4167 - loss: 0.7731 - val_accuracy: 0.6667 - val_loss: 0.7213
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.4167 - loss: 0.7549 - val_accuracy: 0.3333 - val_loss: 0.7500
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.5833 - loss: 0.7427 - val_accuracy: 0.3333 - val_loss: 0.7827
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.5833 - loss: 0.7357 - val_accuracy: 0.3333 - val_loss: 0.8177
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.5833 - loss: 0.7327 - val_accuracy: 0.3333 - val_loss: 0.8523
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━



         Date  True_Direction  Predicted_Direction
0  2025-03-21               1                    0
1  2025-03-24               1                    0
2  2025-03-25               0                    0
3  2025-03-26               0                    0
4  2025-03-27               0                    0
