In [14]:
import pandas as pd

In [15]:
# Map the long news column names onto short, readable ones.
short_names = {
    'MERGED_GDELT_STOOQ_ALIGNED_News_Sentiment': 'Sentiment_Tone',
    'MERGED_GDELT_STOOQ_ALIGNED_News_Disagreement': 'Sentiment_Dispersion',
    'MERGED_GDELT_STOOQ_ALIGNED_News_Volume': 'News_Volume'
}

# Load the merged table (Date parsed as datetime) and apply the renames.
df = (
    pd.read_csv('results/stooq_merged.csv', parse_dates=['Date'])
    .rename(columns=short_names)
)

# Columns used by the first model: the S&P 500 index series plus the three
# news signals. Everything else in the file is left out for now.
cols_to_keep = [
    'Date',
    'SPX_Close', 'SPX_Volume',                               # index price / volume
    'Sentiment_Tone', 'Sentiment_Dispersion', 'News_Volume'  # news signals
]

# Date-indexed, chronologically sorted working table.
df_clean = (
    df.loc[:, cols_to_keep]
    .set_index('Date')
    .sort_index()
)

In [16]:
import numpy as np

# Build the return series and both prediction targets in one chain:
#   Return_Daily          - log return of SPX_Close versus the previous row
#   Target_NextDay_Return - Return_Daily shifted up one row, so row t holds
#                           the return realised at t+1
#   Target_Direction      - 1 when the next-day return is positive, else 0
# The shifts leave NaNs on the first and last rows; dropna() removes every
# row that has a NaN in any column.
df_clean = (
    df_clean
    .assign(Return_Daily=lambda f: np.log(f['SPX_Close'] / f['SPX_Close'].shift(1)))
    .assign(Target_NextDay_Return=lambda f: f['Return_Daily'].shift(-1))
    .assign(Target_Direction=lambda f: (f['Target_NextDay_Return'] > 0).astype(int))
    .dropna()
)

print("Ready for Modeling. Columns available:")
print(df_clean.columns.tolist())

Ready for Modeling. Columns available:
['SPX_Close', 'SPX_Volume', 'Sentiment_Tone', 'Sentiment_Dispersion', 'News_Volume', 'Return_Daily', 'Target_NextDay_Return', 'Target_Direction']


In [17]:
# Pairwise correlation between the raw news signals and the
# next-day return target.
corr_cols = ['Sentiment_Tone', 'News_Volume', 'Target_NextDay_Return']
correlation = df_clean.loc[:, corr_cols].corr()
print(correlation)

                       Sentiment_Tone  News_Volume  Target_NextDay_Return
Sentiment_Tone               1.000000     0.312139              -0.000960
News_Volume                  0.312139     1.000000              -0.006388
Target_NextDay_Return       -0.000960    -0.006388               1.000000


In [None]:
# 20-row rolling mean of news volume, used as the baseline for Relative_Vol.
vol_mean = df_clean['News_Volume'].rolling(window=20).mean()

# .assign() returns a new frame, so df_clean itself is left untouched.
df_enhanced = (
    df_clean
    .assign(
        # --- 1. Smoothing: 3-row and 7-row rolling means of sentiment ---
        Sent_MA_3=lambda f: f['Sentiment_Tone'].rolling(window=3).mean(),
        Sent_MA_7=lambda f: f['Sentiment_Tone'].rolling(window=7).mean(),
        # --- 2. Momentum: row-over-row change in sentiment ---
        Sent_Momentum=lambda f: f['Sentiment_Tone'].diff(),
        # --- 3. Interaction: sentiment scaled by volume relative to its
        #        20-row average (keeps the magnitudes small) ---
        Relative_Vol=lambda f: f['News_Volume'] / vol_mean,
        Weighted_Sentiment=lambda f: f['Sentiment_Tone'] * f['Relative_Vol'],
        # --- 4. Disagreement regime: 3-row rolling mean of dispersion ---
        Dispersion_MA_3=lambda f: f['Sentiment_Dispersion'].rolling(window=3).mean(),
    )
    # The rolling windows leave NaNs at the start; the 20-row window is the
    # longest, so the first 19 rows are removed here.
    .dropna()
)

print("New Features Created:")
print(df_enhanced[['Sent_MA_7', 'Sent_Momentum', 'Weighted_Sentiment']].tail())

In [20]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# === Configuration ===
SEQ_LENGTH = 30  # Look back 30 days to predict tomorrow
BATCH_SIZE = 32
HIDDEN_SIZE = 64
EPOCHS = 20
LEARNING_RATE = 0.001
# Leading share of rows the scalers are allowed to see. Chosen to match the
# 80/20 chronological train/test split used when training the model.
TRAIN_FRACTION = 0.8

# Select features (Ensure these exist in df_enhanced)
# We exclude the Target from the input features to prevent leakage
feature_cols = ['Return_Daily', 'Sent_MA_7', 'Sent_Momentum', 'Weighted_Sentiment', 'Dispersion_MA_3']
target_col = 'Target_NextDay_Return'

# Drop NaNs just in case
data = df_enhanced.dropna().copy()

# === 1. Scaling (Crucial for LSTM) ===
# The scalers are FITTED on the leading (training) rows only and then applied
# to the whole series. Fitting on every row would let the min/max of the
# held-out test period shape the training inputs (look-ahead leakage).
# Values in the later rows may therefore fall slightly outside [0, 1].
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

feature_values = data[feature_cols].values
target_values = data[[target_col]].values  # 2-D (n_rows, 1), as the scaler expects

# At least one row is always used so that fitting cannot receive an empty slice
# when the table itself is non-empty.
n_fit_rows = max(1, int(len(data) * TRAIN_FRACTION))
scaler_X.fit(feature_values[:n_fit_rows])
scaler_y.fit(target_values[:n_fit_rows])

# Scale Features
X_scaled = scaler_X.transform(feature_values)
# Scale Target
y_scaled = scaler_y.transform(target_values)

# === 2. Create Sequences (Sliding Window) ===
def create_sequences(input_data, target_data, seq_length):
    """Build sliding-window samples for a sequence model.

    Window ``i`` consists of the ``seq_length`` consecutive rows
    ``input_data[i : i + seq_length]`` and is labelled with the target stored
    on the window's LAST row, ``target_data[i + seq_length - 1]``.

    The target passed in by this notebook is already shifted one step ahead
    (row t holds the return realised at t+1), so the label on the last row of
    the window is exactly "tomorrow's" value. Indexing the target one row
    further (``i + seq_length``) would skip a day and train the model on the
    return two days after the last observed row.

    Args:
        input_data: indexable, row-major feature array of length n.
        target_data: indexable target array of length n, aligned row-by-row
            with ``input_data``.
        seq_length: number of rows per window; must be >= 1.

    Returns:
        Tuple ``(xs, ys)`` of numpy arrays holding
        ``max(n - seq_length + 1, 0)`` windows and their labels.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be >= 1")

    n_windows = max(len(input_data) - seq_length + 1, 0)
    xs, ys = [], []
    for start in range(n_windows):
        end = start + seq_length
        xs.append(input_data[start:end])
        ys.append(target_data[end - 1])  # label that belongs to the last row of the window
    return np.array(xs), np.array(ys)

# Slice the scaled arrays into overlapping windows of SEQ_LENGTH rows.
X_seq, y_seq = create_sequences(X_scaled, y_scaled, SEQ_LENGTH)

# Convert to float32 PyTorch tensors.
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y_seq, dtype=torch.float32)

print(f"Input Shape: {X_tensor.shape}")  # (Samples, 30, Features)
print(f"Target Shape: {y_tensor.shape}") # (Samples, 1)

ModuleNotFoundError: No module named 'torch'

In [None]:
class SentimentLSTM(nn.Module):
    """LSTM followed by a linear head that reads the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values produced by the linear head (default 1).
        num_layers: number of stacked LSTM layers (default 1).
        dropout: dropout probability between stacked LSTM layers
            (default 0.2). ``nn.LSTM`` applies dropout after every layer
            except the last, so it is only passed through when
            ``num_layers > 1``; with a single layer it would have no effect
            and PyTorch would emit a warning.
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Dropout is meaningful only between stacked layers.
        lstm_dropout = dropout if num_layers > 1 else 0.0

        # batch_first=True means input is (Batch, Seq, Feature)
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

        # Linear head mapping the final hidden output to the prediction.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch, output_size) for x of shape (batch, seq, input_size)."""
        # When no initial state is given, nn.LSTM starts from zero hidden and
        # cell states on the input's device, so none has to be built by hand.
        # out shape: (batch_size, seq_length, hidden_size)
        out, _ = self.lstm(x)

        # Decode the output of the last time step only.
        return self.fc(out[:, -1, :])

# Instantiate Model
# Seed PyTorch's RNG first: the layer weights are initialised randomly, so
# without a fixed seed the training results differ on every
# Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model = SentimentLSTM(input_size=len(feature_cols), hidden_size=HIDDEN_SIZE)
print(model)

In [None]:
# === Chronological split ===
# The first 80% of the windows are used for training and the remainder for
# testing; order is preserved, nothing is shuffled.
train_size = int(len(X_tensor) * 0.8)

X_train, y_train = X_tensor[:train_size], y_tensor[:train_size]
X_test, y_test = X_tensor[train_size:], y_tensor[train_size:]

# Mini-batch iterators over both partitions (sequential order).
train_loader = DataLoader(
    TensorDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    TensorDataset(X_test, y_test),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Mean squared error objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# === Training loop ===
print("Starting Training...")
for epoch in range(EPOCHS):
    model.train()
    batch_losses = []
    for X_batch, y_batch in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch.
        loss = criterion(model(X_batch), y_batch)

        # Back-propagate and update the weights.
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    train_loss = sum(batch_losses)

    # Report the mean batch loss every fifth epoch.
    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {train_loss/len(train_loader):.6f}')

print("Training Complete.")

In [None]:
import matplotlib.pyplot as plt

# Inference only: switch to eval mode and disable gradient tracking for the
# forward pass over the whole test partition.
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)

# === Back to real return units ===
# Undo the target scaling so errors are expressed as returns again.
y_test_real = scaler_y.inverse_transform(y_test.numpy())
preds_real = scaler_y.inverse_transform(test_predictions.numpy())

# === Metrics ===
# 1. RMSE between predicted and actual returns.
residuals = preds_real - y_test_real
rmse = np.sqrt(np.mean(np.square(residuals)))
print(f"Root Mean Squared Error: {rmse:.6f}")

# 2. Directional accuracy: share of test samples where the predicted and
#    the actual return have the same sign (np.sign gives -1, 0 or 1, so an
#    exactly-zero value only matches another exactly-zero value).
correct_direction = np.sign(preds_real) == np.sign(y_test_real)
accuracy = correct_direction.mean() * 100
print(f"Directional Accuracy: {accuracy:.2f}%")

# === Visualization ===
# Overlay actual and predicted returns for the first 100 test samples.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test_real[:100], label='Actual Return', color='black', alpha=0.7)
ax.plot(preds_real[:100], label='LSTM Predicted', color='blue', linestyle='--')
ax.set_title("LSTM Forecast vs Actual (First 100 Test Days)")
ax.legend()
plt.show()