# Imports

In [259]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import seaborn  as sns
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Tests

## Using OpenAI

In [4]:
# import openai
# openai.api_key = api_key
# def analyze_sentiment(text):
#     prompt = f"Analyze the sentiment of the following text and classify it as Positive, Negative, or Neutral:\n\n\"{text}\""
    
#     response = openai.ChatCompletion.create(
#         model="gpt-4o-mini",
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=50
#     )
    
#     sentiment = response['choices'][0]['message']['content'].strip()
#     return sentiment

# text = news['content'].values[0]
# sentiment = analyze_sentiment(text)
# print(f"Sentiment: {sentiment}")

**Problem:** No premium API key available for testing.

# Utils

In [267]:
def sentiment_labelling(df):
    """
    Score every news item with the sentiment model and aggregate the scores per day.

    Relies on the module-level ``tokenizer`` and ``model`` (a sequence
    classification model whose output exposes ``.logits``).

    Args:
        df (pd.DataFrame): News items with at least 'date' and 'content' columns.
            The frame passed in is left untouched.

    Returns:
        pd.DataFrame: One row per calendar day between the first and the last
        news date, newest first, holding the mean 'negative', 'neutral' and
        'positive' probabilities (plus the mean of any other numeric column).
        Days without news carry the values of the previous day.
    """
    sentiment_columns = ['negative', 'neutral', 'positive']

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Resolve which logit belongs to which sentiment from the model config when
    # it is available; otherwise fall back to the order the original code
    # assumed (positive, negative, neutral).
    fallback_positions = {'positive': 0, 'negative': 1, 'neutral': 2}
    configured = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    configured = {str(label).lower(): position for label, position in configured.items()}
    positions = [configured.get(name, fallback_positions[name]) for name in sentiment_columns]

    # One forward pass per article; only the text column is needed here.
    scores = []
    for text in tqdm(df['content'], total=len(df), desc="Processing Sentiment Analysis", dynamic_ncols=True):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()
        scores.append([probs[position] for position in positions])

    # Positional assignment: immune to duplicated or non-contiguous index labels.
    for offset, column in enumerate(sentiment_columns):
        df[column] = [row[offset] for row in scores]

    # Text columns cannot be averaged; drop whichever of them are present.
    df = df.drop(columns=['url', 'title', 'content'], errors='ignore')

    # Convert before grouping so equal days written differently end up together.
    df['date'] = pd.to_datetime(df['date'])

    # numeric_only keeps the aggregation from raising on leftover text columns.
    df = df.groupby("date").mean(numeric_only=True).reset_index()

    # Insert the days without news and carry the last known scores forward.
    date_range = pd.date_range(start=df["date"].min(), end=df["date"].max())
    df = df.set_index("date").reindex(date_range).ffill()
    df = df.rename_axis("date").reset_index()

    # Newest day first.
    return df.sort_values(by="date", ascending=False).reset_index(drop=True)

In [268]:
def dates_filtering(news, df):
    """
    Keep only the rows of each frame whose date also occurs in the other frame.

    Args:
        news (pd.DataFrame): Frame with a 'date' column (anything
            ``pd.to_datetime`` accepts).
        df (pd.DataFrame): Frame with a 'date' column (same requirement).

    Returns:
        tuple: (filtered_news, filtered_df). Both are independent copies whose
        'date' column is datetime; the frames passed in are not modified.
    """
    # assign() builds new frames, so the callers' 'date' columns keep their dtype.
    news = news.assign(date=pd.to_datetime(news['date']))
    df = df.assign(date=pd.to_datetime(df['date']))

    # A row survives when its date appears on the other side, which is exactly
    # the intersection of the two date sets.
    filtered_news = news[news['date'].isin(df['date'])].copy()
    filtered_df = df[df['date'].isin(news['date'])].copy()
    return filtered_news, filtered_df

In [269]:
def mixing_data(news, df, include_news=True, train_size=0.8, random_state=None, max_companies=400):
    """
    Build a (timesteps, features) matrix of returns, optionally extended with
    sentiment scores, and split it chronologically into train and test sets.

    Args:
        news (pd.DataFrame): News sentiment data with 'date', 'negative', 'positive', and 'neutral' columns.
        df (pd.DataFrame): Company stock data with 'Company', 'High', and 'date' columns.
        include_news (bool): Whether to append the sentiment columns.
        train_size (float): Proportion of timesteps to use for training.
        random_state (int, optional): Forwarded to ``train_test_split``.
        max_companies (int): Maximum number of companies to use, taken in order
            of first appearance in ``df``.

    Returns:
        tuple: (train_data, test_data), NumPy arrays of shape
        (timesteps, features) with rows kept in their original order. The
        company columns come first; with ``include_news`` they are followed by
        the 'negative', 'positive' and 'neutral' columns.

    Raises:
        ValueError: If the companies do not all have the same number of price
            rows, or if the sentiment rows do not line up with the returns.
    """
    companies = df['Company'].drop_duplicates().values[:max_companies]

    # Relative change of 'High' between consecutive rows of each company.
    returns = []
    for company in tqdm(companies, desc="Processing company data"):
        prices = df.loc[df['Company'] == company, 'High'].values
        returns.append((prices[1:] - prices[:-1]) / prices[:-1])

    # A ragged list cannot form a rectangular matrix; fail with a clear message.
    if len({len(series) for series in returns}) > 1:
        raise ValueError("Companies do not all have the same number of price rows")

    # Transpose unconditionally: the split below must cut along time whether or
    # not the sentiment columns are appended.
    data = np.array(returns).T  # Shape: (timesteps, companies)

    if include_news:
        dates = df['date'].drop_duplicates().values

        # The first date has no return (it only serves as the base), so skip it.
        sentiment_rows = [
            news.loc[news['date'] == date, ['negative', 'positive', 'neutral']].values.flatten()
            for date in tqdm(dates[1:], desc="Processing sentiment data")
        ]
        sentiment_data = np.array(sentiment_rows)  # Shape: (timesteps, sentiment_features)

        if data.shape[0] != sentiment_data.shape[0]:
            raise ValueError(f"Mismatch in timesteps: company_data ({data.shape[0]}) vs sentiment_data ({sentiment_data.shape[0]})")

        # Shape: (timesteps, companies + sentiment_features)
        data = np.concatenate((data, sentiment_data), axis=1)

    # shuffle=False keeps the temporal order: the test set is the most recent part.
    train_data, test_data = train_test_split(
        data,
        train_size=train_size,
        shuffle=False,
        random_state=random_state
    )

    return train_data, test_data

In [270]:
class MultivariateTimeSeriesDataset(Dataset):
    """
    Sliding-window dataset over a multivariate series.

    Sample ``idx`` pairs the window ``data[idx:idx + sequence_length]`` (all
    features) with the values of the target features at the row that follows
    the window.
    """

    def __init__(self, data, target_indices, sequence_length):
        """
        Args:
            data: 2-D array indexed as (time, feature).
            target_indices: Column indices of the features to predict.
            sequence_length (int): Number of consecutive rows per input window.
        """
        self.data = data  # Shape must be: (data_length, num_features)
        self.target_indices = target_indices  # Subset of features to predict
        self.sequence_length = sequence_length

    def __len__(self):
        # A series shorter than one window holds no sample; never report a
        # negative length (``len()`` would raise on it).
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, idx):
        window_end = idx + self.sequence_length
        x = self.data[idx:window_end, :]  # Input: all features over the window
        y = self.data[window_end, self.target_indices]  # Target: next row, selected features
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [271]:
def load_multivariate_data(train_data, test_data, target_index, sequence_length):
    """
    Standardize every feature column and wrap both splits in window datasets.

    Each scaler is fitted on the training column only and then applied to the
    matching test column, so no test statistics leak into the scaling.

    Args:
        train_data (np.ndarray): Training split, shape (timesteps, features).
        test_data (np.ndarray): Test split with the same number of columns.
        target_index: Column indices to predict, forwarded to the datasets.
        sequence_length (int): Window length, forwarded to the datasets.

    Returns:
        tuple: (train_dataset, test_dataset, scalers) where ``scalers[i]`` is
        the fitted ``StandardScaler`` of column ``i``.
    """
    scalers = [StandardScaler() for _ in range(train_data.shape[1])]

    # Allocate floating-point outputs explicitly: mirroring an integer input
    # dtype would truncate the standardized values when they are stored.
    train_scaled = np.zeros(train_data.shape, dtype=np.result_type(train_data.dtype, np.float32))
    test_scaled = np.zeros(test_data.shape, dtype=np.result_type(test_data.dtype, np.float32))

    for column, scaler in enumerate(scalers):
        train_scaled[:, column] = scaler.fit_transform(train_data[:, column].reshape(-1, 1)).flatten()
        test_scaled[:, column] = scaler.transform(test_data[:, column].reshape(-1, 1)).flatten()

    train_dataset = MultivariateTimeSeriesDataset(train_scaled, target_index, sequence_length)
    test_dataset = MultivariateTimeSeriesDataset(test_scaled, target_index, sequence_length)

    return train_dataset, test_dataset, scalers

In [272]:
def create_data_loaders(train_dataset, test_dataset, batch_size):
    """
    Wrap both datasets in DataLoaders that share one batch size.

    Returns:
        tuple: (train_loader, test_loader). Training batches are drawn in
        shuffled order; test batches follow the dataset order.
    """
    splits = ((train_dataset, True), (test_dataset, False))
    return tuple(
        DataLoader(dataset, batch_size=batch_size, shuffle=reshuffle)
        for dataset, reshuffle in splits
    )

In [273]:
def evaluate_model(model, test_loader, scalers, target_indices):
    """
    Compute one R2 score per target feature, measured in original (unscaled) units.

    Args:
        model: Network called as ``model(batch_x)``; switched to eval mode here.
        test_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        scalers: Sequence indexed by feature index; each entry provides
            ``inverse_transform``.
        target_indices: Feature index of each output column, in column order.

    Returns:
        list: R2 score of every target, in the order of ``target_indices``.
        Each score is also printed.
    """
    model.eval()

    truth_batches = []
    prediction_batches = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            predictions = model(inputs)
            truth_batches.append(targets.numpy())
            prediction_batches.append(predictions.numpy())

    # One row per sample, one column per target
    truth = np.concatenate(truth_batches, axis=0)
    predicted = np.concatenate(prediction_batches, axis=0)

    def to_original_units(values, column, scaler):
        # Scalers work on 2-D input, hence the round trip through a column vector
        return scaler.inverse_transform(values[:, column].reshape(-1, 1)).flatten()

    # Undo the scaling column by column with the scaler of the matching feature
    truth_columns = []
    predicted_columns = []
    for column, feature in enumerate(target_indices):
        feature_scaler = scalers[feature]
        truth_columns.append(to_original_units(truth, column, feature_scaler))
        predicted_columns.append(to_original_units(predicted, column, feature_scaler))

    truth_rescaled = np.stack(truth_columns, axis=1)
    predicted_rescaled = np.stack(predicted_columns, axis=1)

    # Score every target separately
    r2_scores = [
        r2_score(truth_rescaled[:, column], predicted_rescaled[:, column])
        for column in range(len(target_indices))
    ]
    for column, score in enumerate(r2_scores):
        print(f"R2 Score for Feature {target_indices[column]}: {score:.4f}")
    return r2_scores

In [282]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, early_stopping=None):
    """
    Run the optimisation loop: one training pass and one validation pass per epoch.

    The mean batch loss of both passes is printed after every epoch. When an
    early-stopping object is supplied it is called with the validation loss and
    the model, and the loop ends as soon as its ``early_stop`` flag is set.

    Args:
        model (torch.nn.Module): The model to train.
        train_loader (DataLoader): Batches of ``(inputs, targets)`` used for the updates.
        val_loader (DataLoader): Batches of ``(inputs, targets)`` used for monitoring only.
        criterion (torch.nn.Module): Loss function.
        optimizer (torch.optim.Optimizer): Optimizer bound to the model parameters.
        epochs (int): Maximum number of epochs.
        early_stopping (EarlyStopping, optional): Monitor of the validation loss.
    """
    for epoch_index in range(epochs):
        # ---- training pass ----
        model.train()
        running_train = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            running_train += batch_loss.item()
        train_loss = running_train / len(train_loader)

        # ---- validation pass (no gradients, no parameter updates) ----
        model.eval()
        running_val = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                running_val += criterion(model(inputs), targets).item()
        val_loss = running_val / len(val_loader)

        print(f"\nEpoch {epoch_index+1}, Train Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}")

        # Nothing else to do for this epoch without an early-stopping monitor
        if not early_stopping:
            continue

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("\n     ###################     \n Early stopping triggered \n     ###################     ")
            break

In [285]:
def evaluate_model(model, test_loader, scalers, target_indices):
    """
    Compute one R2 score per target feature, measured in original (unscaled) units.

    Args:
        model: Network called as ``model(batch_x)``; switched to eval mode here.
        test_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        scalers: Sequence indexed by feature index; each entry provides
            ``inverse_transform``.
        target_indices: Feature index of each output column, in column order.

    Returns:
        list: R2 score of every target, in the order of ``target_indices``.
        Each score is also printed.
    """
    model.eval()

    truth_batches = []
    prediction_batches = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            predictions = model(inputs)
            truth_batches.append(targets.numpy())
            prediction_batches.append(predictions.numpy())

    # One row per sample, one column per target
    truth = np.concatenate(truth_batches, axis=0)
    predicted = np.concatenate(prediction_batches, axis=0)

    def to_original_units(values, column, scaler):
        # Scalers work on 2-D input, hence the round trip through a column vector
        return scaler.inverse_transform(values[:, column].reshape(-1, 1)).flatten()

    # Undo the scaling column by column with the scaler of the matching feature
    truth_columns = []
    predicted_columns = []
    for column, feature in enumerate(target_indices):
        feature_scaler = scalers[feature]
        truth_columns.append(to_original_units(truth, column, feature_scaler))
        predicted_columns.append(to_original_units(predicted, column, feature_scaler))

    truth_rescaled = np.stack(truth_columns, axis=1)
    predicted_rescaled = np.stack(predicted_columns, axis=1)

    # Score every target separately
    r2_scores = [
        r2_score(truth_rescaled[:, column], predicted_rescaled[:, column])
        for column in range(len(target_indices))
    ]
    for column, score in enumerate(r2_scores):
        print(f"R2 Score for Feature {target_indices[column]}: {score:.4f}")
    return r2_scores

# Global Methodology

We trained different models using two types of data:
- 1: without including news sentiment analysis
- 2: including news sentiment analysis

---> Perhaps a naive assumption: the more data you include, the more precise the model should be

---> Strongly depends on the quality of the news dataset and on the quality of the FinBERT model's predictions

## Structure of the data used

When we include sentiment data, we add three columns (positive, negative, neutral) that describe three different sentiments about the stock movement.

# Loading data

In [9]:
# Read the price table and swap the raw 'Date' column for a parsed 'date' column
df = pd.read_csv('data/sp500_prices.csv')
df = df.assign(date=pd.to_datetime(df['Date'])).drop(columns=['Date'])

In [10]:
# Read the press releases, discard incomplete rows, and parse dates such as "05 Jan 2021"
news = pd.read_csv('data/bis_press_releases.csv').dropna()
news['date'] = pd.to_datetime(news['date'], format='%d %b %Y')

# Financial Time Series Forecasting using Sentiment Analysis with FinBERT and LSTM

## Loading datasets

In [78]:
# Read the press releases, discard incomplete rows, and parse dates such as "05 Jan 2021"
news = pd.read_csv('data/bis_press_releases.csv').dropna()
news['date'] = pd.to_datetime(news['date'], format='%d %b %Y')

In [79]:
# Read the price table and swap the raw 'Date' column for a parsed 'date' column
df = pd.read_csv('data/sp500_prices.csv')
df = df.assign(date=pd.to_datetime(df['Date'])).drop(columns=['Date'])

## Processing news data

In [80]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# FinBERT tokenizer/classifier pair, plus both directions of its label mapping
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
class_labels = model.config.id2label
prob_labels = model.config.label2id

# Smoke test on the first press release
text1 = news['content'].values[0]
inputs = tokenizer(text1, return_tensors="pt", truncation=True, padding=True)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=-1)
predicted_class = probs.argmax(dim=-1).item()

# Show the winning label, the full distribution, and the label -> index mapping
print("Predicted class:", class_labels[predicted_class])
print("Probabilities:", probs)
print('labels assignation:', prob_labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicted class: neutral
Probabilities: tensor([[0.1161, 0.2905, 0.5935]])
labels assignation: {'positive': 0, 'negative': 1, 'neutral': 2}


In [81]:
# Replace the raw articles by per-day mean sentiment probabilities
# (slow: sentiment_labelling runs one model forward pass per article).
news = sentiment_labelling(news)

Processing Sentiment Analysis: 100%|██████████| 7057/7057 [16:48<00:00,  7.00it/s]


## Preprocessing data

In [274]:
# Keep only the dates that occur in both the sentiment table and the price table.
news, df = dates_filtering(news, df)

In [275]:
# Build the returns + sentiment matrix and split it in time order
# (mixing_data defaults: train_size=0.8, no shuffling).
train_data, test_data = mixing_data(news, df, include_news=True)

Processing company data: 100%|██████████| 400/400 [00:08<00:00, 49.33it/s]
Processing sentiment data: 100%|██████████| 1992/1992 [00:00<00:00, 5758.81it/s]


In [276]:
# Each sample is a window of 20 consecutive rows
sequence_length = 20

# Predict columns 0..400.
# NOTE(review): with the 400 company columns produced by mixing_data this also
# targets the first sentiment column (index 400) — confirm that this is intended.
target_indices = list(range(401))

train_dataset, test_dataset, scalers = load_multivariate_data(
    train_data,
    test_data,
    target_indices,
    sequence_length,
)

## Model

### LSTM

In [256]:
class LSTM(nn.Module):
    """
    Stacked LSTM followed by a three-layer fully connected head.

    The head reads the LSTM output of the last time step only:
    Linear -> Sigmoid -> Dropout -> Linear -> Tanh -> Dropout -> Linear.

    Args:
        input_size (int): Number of features per time step.
        hidden_size (int): Size of the LSTM hidden state.
        output_size (int): Number of values predicted per sample.
        num_layers (int): Number of stacked LSTM layers.
        intermediate_size (int): Width of the two hidden linear layers.
        dropout (float): Dropout probability used in the head and between
            stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, intermediate_size=256, dropout=0.2):
        super().__init__()

        # nn.LSTM applies dropout between stacked layers only and warns when a
        # non-zero value is requested for a single layer, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)

        # Fully connected head
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.sigmoid = nn.Sigmoid()
        self.dropout1 = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(intermediate_size, intermediate_size)
        self.tanh = nn.Tanh()
        self.dropout2 = nn.Dropout(p=dropout)
        self.fc3 = nn.Linear(intermediate_size, output_size)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input of shape (batch_size, sequence_length, input_size).

        Returns:
            torch.Tensor: Predictions of shape (batch_size, output_size).
        """
        # Without explicit states nn.LSTM starts from zero hidden/cell states that
        # already match the dtype and device of the input; hand-built float32
        # zeros would break a model converted to another dtype.
        out, _ = self.lstm(x)

        # Keep the output of the last time step only
        out = out[:, -1, :]  # Shape: (batch_size, hidden_size)

        # Fully connected head
        out = self.fc1(out)
        out = self.sigmoid(out)
        out = self.dropout1(out)
        out = self.fc2(out)
        out = self.tanh(out)
        out = self.dropout2(out)
        out = self.fc3(out)  # Shape: (batch_size, output_size)

        return out

### Early Stopping

In [257]:
class EarlyStopping:
    """
    Track the best validation loss, checkpoint the model on every improvement,
    and raise the ``early_stop`` flag after ``patience`` calls without one.
    """

    def __init__(self, patience=5, delta=0, path='checkpoint.pt', verbose=False):
        """
        Args:
            patience (int): How many epochs to wait after last improvement.
            delta (float): Minimum change to qualify as an improvement.
            path (str): Path to save the best model.
            verbose (bool): Print messages about early stopping.
        """
        self.patience = patience
        self.delta = delta
        self.path = path
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        """
        Record the validation loss of one epoch.

        Args:
            val_loss (float): Validation loss of the epoch that just finished.
            model (torch.nn.Module): Model to checkpoint when the loss improved.
        """
        import math

        # NaN compares False against everything; without this guard a diverged
        # epoch would count as an improvement, overwrite the best checkpoint and
        # keep the stop flag from ever being raised.
        if math.isnan(val_loss):
            improved = False
        elif self.best_loss is None:
            improved = True
        else:
            improved = val_loss <= self.best_loss - self.delta

        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Save the model when validation loss improves."""
        if self.verbose:
            print(f"Validation loss decreased to {val_loss:.6f}. Saving model...")
        torch.save(model.state_dict(), self.path)

### Model Definition

In [279]:
# The network reads every column of the prepared data and produces one output
# per entry of `target_indices`.
input_size = train_data.shape[1]
output_size = len(target_indices)

# Hyperparameters for the structure of the model
hidden_size = 128
num_layers = 6
dropout = 0.0

# Training hyperparameters
learning_rate = 0.02
num_epochs = 200

# Initialize the LSTM model
# NOTE(review): this rebinds the global `model` that `sentiment_labelling` reads;
# calling that function after this cell would no longer use FinBERT.
model = LSTM(input_size, hidden_size, output_size, num_layers, dropout=dropout)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# `verbose` is deprecated for learning-rate schedulers in recent PyTorch
# releases, so it is no longer passed.
# NOTE(review): `train_model` never calls `scheduler.step(...)`, so the learning
# rate is not actually reduced unless the scheduler is stepped elsewhere.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
early_stopping = EarlyStopping(patience=20, verbose=True)



## Training Loop

In [284]:
epochs = 200
batch_size = 64

# Shuffled training batches, ordered evaluation batches
train_loader, test_loader = create_data_loaders(train_dataset, test_dataset, batch_size)

# NOTE(review): the test loader doubles as the validation set that drives early
# stopping — confirm that no separate validation split is wanted.
train_model(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    epochs,
    early_stopping=early_stopping,
)


Epoch 1, Train Loss: 1.619245, Validation Loss: 0.800944
Validation loss decreased to 0.800944. Saving model...

Epoch 2, Train Loss: 1.002031, Validation Loss: 0.799706
Validation loss decreased to 0.799706. Saving model...

Epoch 3, Train Loss: 1.009764, Validation Loss: 0.799965
EarlyStopping counter: 1 out of 20

Epoch 4, Train Loss: 1.000138, Validation Loss: 0.799462
Validation loss decreased to 0.799462. Saving model...

Epoch 5, Train Loss: 1.007853, Validation Loss: 0.799663
EarlyStopping counter: 1 out of 20

Epoch 6, Train Loss: 1.002212, Validation Loss: 0.799739
EarlyStopping counter: 2 out of 20

Epoch 7, Train Loss: 1.000102, Validation Loss: 0.799702
EarlyStopping counter: 3 out of 20

Epoch 8, Train Loss: 1.006043, Validation Loss: 0.799523
EarlyStopping counter: 4 out of 20

Epoch 9, Train Loss: 1.007469, Validation Loss: 0.799409
Validation loss decreased to 0.799409. Saving model...

Epoch 10, Train Loss: 1.014396, Validation Loss: 0.799625
EarlyStopping counter: 1

## Evaluation

In [286]:
# Per-target R2 on the test batches, computed after undoing the feature scaling.
# The returned list is the cell's value, so the notebook displays it as well.
evaluate_model(
    model=model,
    test_loader=test_loader,
    scalers=scalers,
    target_indices=target_indices
)

R2 Score for Feature 0: -0.0077
R2 Score for Feature 1: -0.0003
R2 Score for Feature 2: -0.0026
R2 Score for Feature 3: -0.0001
R2 Score for Feature 4: -0.0008
R2 Score for Feature 5: -0.0026
R2 Score for Feature 6: -0.0004
R2 Score for Feature 7: -0.0003
R2 Score for Feature 8: -0.0003
R2 Score for Feature 9: -0.0001
R2 Score for Feature 10: -0.0005
R2 Score for Feature 11: -0.0087
R2 Score for Feature 12: -0.0004
R2 Score for Feature 13: -0.0087
R2 Score for Feature 14: -0.0001
R2 Score for Feature 15: -0.0008
R2 Score for Feature 16: -0.0001
R2 Score for Feature 17: -0.0031
R2 Score for Feature 18: -0.0000
R2 Score for Feature 19: -0.0024
R2 Score for Feature 20: -0.0000
R2 Score for Feature 21: -0.0004
R2 Score for Feature 22: -0.0011
R2 Score for Feature 23: -0.0015
R2 Score for Feature 24: -0.0071
R2 Score for Feature 25: -0.0006
R2 Score for Feature 26: -0.0038
R2 Score for Feature 27: -0.0073
R2 Score for Feature 28: -0.0067
R2 Score for Feature 29: -0.0000
R2 Score for Feature

[-0.007697582244873047,
 -0.0003352165222167969,
 -0.00260770320892334,
 -0.0001093149185180664,
 -0.000823974609375,
 -0.0025768280029296875,
 -0.00043833255767822266,
 -0.0003472566604614258,
 -0.00034332275390625,
 -7.414817810058594e-05,
 -0.0005167722702026367,
 -0.008720040321350098,
 -0.000408172607421875,
 -0.008693814277648926,
 -0.00012505054473876953,
 -0.000828862190246582,
 -5.173683166503906e-05,
 -0.0031354427337646484,
 -4.124641418457031e-05,
 -0.002390265464782715,
 -4.088878631591797e-05,
 -0.00035381317138671875,
 -0.0011425018310546875,
 -0.001547098159790039,
 -0.007055401802062988,
 -0.0005979537963867188,
 -0.0038323402404785156,
 -0.007278323173522949,
 -0.0066986083984375,
 -1.1920928955078125e-07,
 -0.005735993385314941,
 -0.0056226253509521484,
 -0.0013349056243896484,
 -0.0014873743057250977,
 -0.0029587745666503906,
 -0.005314469337463379,
 -4.172325134277344e-06,
 -0.00012576580047607422,
 -0.006925702095031738,
 -0.0025686025619506836,
 -2.59876251220703

# Application with trading strategies ?