In [2]:
# Libraries
# ==============================================================================
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from skforecast.datasets import fetch_dataset
from skforecast.sarimax import Sarimax
from skforecast.recursive import ForecasterSarimax
from skforecast.model_selection import TimeSeriesFold, backtesting_sarimax, grid_search_sarimax
from skforecast.plot import set_dark_theme
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import glob
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Per-stock SARIMAX comparison
# ==============================================================================
# For every stock that has a news TSV, fit one SARIMAX model per sentiment
# alignment (same day, and news from 1/2/3 calendar days ahead), forecast the
# hold-out window, store the MAE/predictions in `results`, and plot the four
# forecasts against the actual close.

# Get all stock files
news_files = glob.glob('data_collection/data/news/tsv/*.tsv')
stocks = [os.path.splitext(os.path.basename(f))[0] for f in news_files]

# Columns used below
sentiment_labels = ['positive', 'neutral', 'negative']
sentiment_features = sentiment_labels + ['culmination']
exog_columns = sentiment_features + [
    'prev_week_mean', 'prev_week_std', 'prev_week_max', 'prev_week_min'
]

# Store results for comparison
results = {}

for stock in stocks:
    # Read data
    news_df = pd.read_csv(f'data_collection/data/news/tsv/{stock}.tsv', sep='\t')
    trades_df = pd.read_csv(f'data_collection/data/trades/tsv/{stock}.tsv', sep='\t')

    # Drop trades after 2025-04-19 and news after 2025-04-25.
    # NOTE(review): the two cutoffs differ; confirm this is intended (news is
    # shifted up to 3 days back below, so it may need the later cutoff).
    trades_df = trades_df[trades_df['date'] <= '2025-04-19']
    news_df = news_df[news_df['published_utc'].str[:10] <= '2025-04-25']

    # Daily count of articles per sentiment label. `reindex` guarantees all
    # three columns exist even when a label never occurs for this stock
    # (plain column selection would raise KeyError in that case).
    sentiment_counts = (
        news_df.groupby(news_df['published_utc'].str[:10])['sentiment']
        .value_counts()
        .unstack(fill_value=0)
        .reindex(columns=sentiment_labels, fill_value=0)
    )
    # Composite score: each negative article weighs 10x a positive one.
    sentiment_counts['culmination'] = (
        sentiment_counts['positive'] * 1 + sentiment_counts['negative'] * -10
    )

    # Sentiment frames keyed by variant name. A shift of -k days moves the
    # counts of day d+k onto day d, i.e. the model sees news published k
    # calendar days *after* the trading day it is aligned with.
    sentiment_utc = sentiment_counts.copy()
    sentiment_utc.index = pd.to_datetime(sentiment_utc.index, utc=True)
    sentiment_by_variant = {'same_day': sentiment_utc}
    for shift in [1, 2, 3]:
        sentiment_by_variant[f'{shift}day'] = sentiment_utc.shift(-shift, freq='D')

    # Trades indexed by UTC date; shared starting point for every variant
    trades_indexed = trades_df.copy()
    trades_indexed['date_dt'] = pd.to_datetime(trades_indexed['date'], utc=True)
    trades_indexed = trades_indexed.set_index('date_dt')

    # Build one feature frame per variant
    data_variants = {}
    for variant, sentiment_data in sentiment_by_variant.items():
        df = trades_indexed.join(sentiment_data, how='left')
        # Trading days without any news get zero counts
        df[sentiment_features] = df[sentiment_features].fillna(0).astype(int)

        # Statistics of the previous 5 rows' close; shift(1) excludes the
        # current row from its own window.
        # NOTE(review): for hold-out rows these are computed from actual
        # closes inside the forecast horizon, so the multi-step forecast
        # sees information it would not have in production.
        close_window = df['close'].rolling(window=5, min_periods=1)
        df['prev_week_mean'] = close_window.mean().shift(1)
        df['prev_week_std'] = close_window.std().shift(1)
        df['prev_week_max'] = close_window.max().shift(1)
        df['prev_week_min'] = close_window.min().shift(1)

        data_variants[variant] = df

    # Split data: everything before 2025-04-12 is training, the rest is test
    train_mask = data_variants['same_day'].index < pd.to_datetime('2025-04-12', utc=True)
    if not train_mask.any() or train_mask.all():
        # Nothing to fit on, or nothing to evaluate against
        print(f'Skipping {stock}: empty train or test split')
        continue

    predictions = {}
    maes = {}

    # Fit and evaluate one model per variant
    for variant, df in data_variants.items():
        train_data = df[train_mask]
        test_data = df[~train_mask]

        forecaster = ForecasterSarimax(
            regressor=Sarimax(
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, 5),
                maxiter=200
            )
        )

        # The leading rows of the rolling features are NaN; fill them (and
        # any gap in the test exog, for consistency) with 0.
        train_y = pd.Series(train_data['close'].values, index=train_data.index)
        train_exog = train_data[exog_columns].fillna(0)
        test_exog = test_data[exog_columns].fillna(0)

        forecaster.fit(
            y=train_y,
            exog=train_exog
        )

        pred = forecaster.predict(
            steps=len(test_data),
            exog=test_exog
        )

        predictions[variant] = pred
        maes[variant] = mean_absolute_error(test_data['close'], pred)

    # Store results once every variant has been fitted (doing this inside
    # the variant loop raises KeyError on the not-yet-computed variants).
    holdout = data_variants['same_day'][~train_mask]
    results[stock] = {
        'mae': maes['same_day'],
        'mae_1day': maes['1day'],
        'mae_2day': maes['2day'],
        'mae_3day': maes['3day'],
        'actual': holdout['close'],
        'predicted': predictions['same_day'],
        'predicted_1day': predictions['1day'],
        'predicted_2day': predictions['2day'],
        'predicted_3day': predictions['3day'],
        'dates': holdout.index
    }

    # Plot predictions: one figure per stock
    stock_result = results[stock]
    plot_series = [
        ('predicted', 'same_day', 'Same-day Sentiment'),
        ('predicted_1day', '1day', '1-day Forward Sentiment'),
        ('predicted_2day', '2day', '2-day Forward Sentiment'),
        ('predicted_3day', '3day', '3-day Forward Sentiment'),
    ]
    plt.figure(figsize=(12,6))
    plt.plot(stock_result['dates'], stock_result['actual'], label='Actual')
    for result_key, variant, series_label in plot_series:
        plt.plot(stock_result['dates'], stock_result[result_key],
                 label=f'{series_label} (MAE: ${maes[variant]:.2f})')
    plt.title(f'{stock} Stock Price: Actual vs Predicted with Shifted Sentiment (SARIMAX)')
    plt.xlabel('Date')
    plt.ylabel('Stock Price ($)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
