In [11]:
import pandas as pd

In [None]:
# Load the merged price/news table and shorten the verbose news column names
# in a single pass. 'Date' is parsed to datetime so it can serve as the index.
df = pd.read_csv(
    'results/stooq_merged.csv',
    parse_dates=['Date'],
).rename(
    columns={
        'MERGED_GDELT_STOOQ_ALIGNED_News_Sentiment': 'Sentiment_Tone',
        'MERGED_GDELT_STOOQ_ALIGNED_News_Disagreement': 'Sentiment_Dispersion',
        'MERGED_GDELT_STOOQ_ALIGNED_News_Volume': 'News_Volume',
    }
)

# Columns used by the first (S&P 500-focused) model:
# the date key, the index price/volume, and the three news signals.
cols_to_keep = [
    'Date',
    'SPX_Close',
    'SPX_Volume',
    'Sentiment_Tone',
    'Sentiment_Dispersion',
    'News_Volume',
]

# .loc with a column list returns a new frame, so `df` itself is left untouched;
# the result is indexed by Date in ascending order.
df_clean = df.loc[:, cols_to_keep].set_index('Date').sort_index()

In [13]:
import numpy as np

# Build the return series and the prediction targets in one chained step.
#   Return_Daily          : log(close_t / close_{t-1}); the first row has no
#                           previous close and is therefore NaN.
#   Target_NextDay_Return : Return_Daily shifted up one row, so row t carries
#                           the return realised on row t+1; the last row is NaN.
#   Target_Direction      : 1 when the next-day return is strictly positive,
#                           otherwise 0 (a return of exactly zero is labelled 0).
# dropna() then removes every row that has a NaN in ANY column: the first and
# last rows created by the shifts, plus any row with a missing value elsewhere.
df_clean = (
    df_clean
    .assign(
        Return_Daily=lambda d: np.log(d['SPX_Close'].div(d['SPX_Close'].shift(1))),
        Target_NextDay_Return=lambda d: d['Return_Daily'].shift(-1),
        Target_Direction=lambda d: d['Target_NextDay_Return'].gt(0).astype(int),
    )
    .dropna()
)

print("Ready for Modeling. Columns available:")
print(df_clean.columns.tolist())

Ready for Modeling. Columns available:
['SPX_Close', 'SPX_Volume', 'Sentiment_Tone', 'Sentiment_Dispersion', 'News_Volume', 'Return_Daily', 'Target_NextDay_Return', 'Target_Direction']


In [14]:
# Pairwise correlation (DataFrame.corr default) between the two raw news
# signals and the next-day return target.
correlation = (
    df_clean
    .loc[:, ['Sentiment_Tone', 'News_Volume', 'Target_NextDay_Return']]
    .corr()
)
print(correlation)

                       Sentiment_Tone  News_Volume  Target_NextDay_Return
Sentiment_Tone               1.000000     0.312139              -0.000960
News_Volume                  0.312139     1.000000              -0.006388
Target_NextDay_Return       -0.000960    -0.006388               1.000000


In [15]:
# 20-row rolling mean of news volume, used as the baseline for relative volume.
# Rolling windows here count rows, not calendar days.
vol_mean = df_clean['News_Volume'].rolling(window=20).mean()

# assign() returns a new frame, so df_clean is not modified. Keyword order is
# the resulting column order, and later entries may use earlier ones.
df_enhanced = df_clean.assign(
    # Smoothing: 3-row and 7-row rolling averages of sentiment tone.
    Sent_MA_3=lambda d: d['Sentiment_Tone'].rolling(window=3).mean(),
    Sent_MA_7=lambda d: d['Sentiment_Tone'].rolling(window=7).mean(),
    # Momentum: row-over-row change in sentiment tone.
    Sent_Momentum=lambda d: d['Sentiment_Tone'].diff(),
    # Interaction: sentiment scaled by volume relative to its 20-row mean.
    Relative_Vol=lambda d: d['News_Volume'] / vol_mean,
    Weighted_Sentiment=lambda d: d['Sentiment_Tone'] * d['Relative_Vol'],
    # Disagreement regime: 3-row rolling average of sentiment dispersion.
    Dispersion_MA_3=lambda d: d['Sentiment_Dispersion'].rolling(window=3).mean(),
).dropna()
# The dropna() above discards the warm-up rows; the 20-row window is the
# longest, so it leaves the first 19 rows without a value.

print("New Features Created:")
print(df_enhanced[['Sent_MA_7', 'Sent_Momentum', 'Weighted_Sentiment']].tail())

New Features Created:
            Sent_MA_7  Sent_Momentum  Weighted_Sentiment
Date                                                    
2025-11-18  -0.507338       0.119248           -0.490013
2025-11-19  -0.498653       0.025994           -0.486453
2025-11-20  -0.511872       0.001138           -0.492954
2025-11-25  -0.488513       0.021033           -0.439336
2025-12-02  -0.515744      -0.257131           -0.671667
