## Sentiment Strategy Model

In [157]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

In [158]:
# Load Processed Data
df_prices = pd.read_csv('../data/prices.csv', parse_dates=['date'])
price_matrix = df_prices.pivot(index='date', columns='ticker', values='close').sort_index()
df_sentiment = pd.read_csv('../data/sentiment.csv', parse_dates=['date'])
df_sentiment['date'] = pd.to_datetime(df_sentiment['date'])

In [159]:
# Generates a feature set by merging sentiment statistics with the stock’s daily price index, 
# forward-filling to align quarterly sentiment data with daily observations.
def create_features(prices, sentiment_df, ticker):
    df = pd.DataFrame(index=prices.index)
    sent_cols = [
        'mean_positive', 'mean_neutral', 'mean_negative',
        'std_positive', 'std_neutral', 'std_negative',
        'polarity_score', 'polarity_std',
        'max_positive', 'min_negative', 'num_paragraphs'
    ]
    sent = sentiment_df[sentiment_df['ticker'] == ticker].copy()
    sent = sent.set_index('date')[sent_cols]
    sent = sent.sort_index().reindex(df.index, method='ffill')
    df = df.join(sent)

    return df.dropna()

In [160]:
# Creates classification labels based on future price movement
# - Computes the return over a forward-looking window (lookahead)
# - Assigns label 1 if the return exceeds the given threshold, -1 if the return is below 
#   the -threshold, else 0

def create_labels(prices, threshold=0.001, lookahead=5):
    future_return = prices.shift(-lookahead) / prices - 1
    labels = pd.Series(0, index=prices.index)
    labels[future_return > threshold] = 1
    labels[future_return < -threshold] = -1
    return labels

## Model Training and Signal Generation

Trains a separate Randome Forest classifier for each stock using only sentiment features. Data is split into training (pre-2018) and testing (2018 onward). A RandomForestClassifier is trained to predict whether the stock will go up (label 1), down (label -1), or not (label 0).

Position Assignment:

- Long (1000) if predicted probability for class 1 > 0.4
- Short (-1000) if probability for class -1 > 0.4
- Otherwise, hold (0)

In [161]:
positions_df = pd.DataFrame()
metrics = {}

for symbol in price_matrix.columns:
    prices = price_matrix[symbol].dropna()
    features = create_features(prices, df_sentiment, symbol)
    labels = create_labels(prices).reindex(features.index)
    data = features.copy()
    data['label'] = labels
    data = data[data['label'].isin([-1, 1])].dropna()

    train = data
    test = data[data.index >= '2018-06-01']
    X_train, y_train = train.drop('label', axis=1), train['label']
    X_test, y_test = test.drop('label', axis=1), test['label']

    model = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=4, random_state=42)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)
    predictions = []
    for row in probs:
        prob_map = dict(zip(model.classes_, row))
        if prob_map.get(1, 0) > 0.4:
            predictions.append(1000)
        elif prob_map.get(-1, 0) > 0.4:
            predictions.append(-1000)
        else:
            predictions.append(0)

    positions_df[symbol] = pd.Series(predictions, index=X_test.index)
    
    y_pred = pd.Series(predictions, index=X_test.index).map({-1000: -1, 0: 0, 1000: 1})
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    metrics[symbol] = {'accuracy': acc, 'f1_score': f1}

    importances = model.feature_importances_
    feature_names = X_train.columns
    sorted_idx = np.argsort(importances)[::-1]

metrics_df = pd.DataFrame(metrics).T
print("\nSentiment Model Performance Summary:")
print(metrics_df.round(3))


Sentiment Model Performance Summary:
       accuracy  f1_score
AAPL      0.661     0.563
AMD       0.598     0.374
AMZN      0.584     0.579
ASML      0.612     0.534
CSCO      0.591     0.507
GOOGL     0.605     0.526
INTC      0.582     0.504
MSFT      0.621     0.521
MU        0.634     0.634
NVDA      0.599     0.578


In [162]:
trades_df = positions_df.fillna(0).diff()
trades_df.to_csv('../data/sentiment_strategy.csv', index=True)