# LSTM-BERT model

# Relevant Imports

In [48]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import re
import string
from sentence_transformers import SentenceTransformer
import nltk



# Check for MPS backend

In [49]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


# Preprocessing

In [50]:
filepath_num = '../../code_final/GARCH/GARCH_output.csv'
filepath_sent = '../../dataset_final/FinSen_S&P500/FinSen_text_annotated.csv'

data_num = pd.read_csv(filepath_num)
data_sent = pd.read_csv(filepath_sent)

In [51]:
data_sent.head()

Unnamed: 0.1,Unnamed: 0,Title,Tag,Time,Content,sentiment_score
0,0,"TSX Slightly Down, Books Weekly Gains",Stock Market,16/07/2023,"TSX Slightly Down, Books Weekly GainsUnited St...",0.13
1,1,UnitedHealth Hits 4-week High,stocks,15/07/2023,UnitedHealth Hits 4-week HighUnited States sto...,0.62
2,2,Cisco Systems Hits 4-week Low,stocks,15/07/2023,Cisco Systems Hits 4-week LowUnited States sto...,-0.43
3,3,AT&T Hits All-time Low,stocks,15/07/2023,AT&T Hits All-time LowUnited States stocksAT&T...,-0.28
4,4,Microsoft Hits 4-week High,stocks,15/07/2023,Microsoft Hits 4-week HighUnited States stocks...,0.6


## Data Alignment

### Text Data

In [52]:
import re
import nltk

# Download the sentence tokenizer only once
nltk.download("punkt_tab")  

from nltk.tokenize import sent_tokenize

# ----------------------------------------
# 1. Minimal cleaning function (unchanged)
# ----------------------------------------
def minimal_clean_text(text: str, to_lowercase: bool = False) -> str:
    """
    Minimal cleaning for transformer-based models:
    1. Remove HTML tags.
    2. Remove/replacing weird characters if needed.
    3. (Optional) Convert to lowercase if you're using an uncased model.
    """
    # 1. Remove HTML tags:
    text = re.sub(r'<.*?>', '', text)

    # 2. Remove or replace unwanted characters (non-ASCII -> space here):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 3. (Optional) Lowercasing:
    if to_lowercase:
        text = text.lower()

    # Remove excessive whitespace
    text = " ".join(text.split())

    return text

# --------------------------------------------
# 2. Helper function: remove duplicate sentences
# --------------------------------------------
def remove_duplicate_sentences(text: str) -> str:
    """
    Splits the text into sentences (using NLTK), 
    then removes exact duplicate sentences. 
    Joins back with a space.
    """
    sentences = sent_tokenize(text)
    seen = set()
    unique_sentences = []
    
    for s in sentences:
        s = s.strip()
        if s not in seen:
            unique_sentences.append(s)
            seen.add(s)
            
    return " ".join(unique_sentences)

# ---------------------------------------------
# 3. Clean the content only (exclude the Title)
# ---------------------------------------------
data_sent['cleaned_content'] = data_sent['Content'].apply(
    lambda x: minimal_clean_text(x, to_lowercase=False)
)

# ---------------------------------------------
# 4. Remove duplicate sentences from each entry
# ---------------------------------------------
data_sent['cleaned_content'] = data_sent['cleaned_content'].apply(remove_duplicate_sentences)

# (The group-by and merging part has been removed)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/skhanna/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [53]:
data_sent = data_sent.rename(columns={'Time': 'Date'})

In [54]:
# Step 1: Parse day-first
data_sent['Date'] = pd.to_datetime(data_sent['Date'], dayfirst=True)
# 2. Sort by date in ascending order
data_sent.sort_values(by='Date', inplace=True)
data_sent.head()


Unnamed: 0.1,Unnamed: 0,Title,Tag,Date,Content,sentiment_score,cleaned_content
15533,15533,Blackstone boosts IPO after Beijing takes $3bn...,News,2007-06-04,Blackstone boosts IPO after Beijing takes $3bn...,0.03,Blackstone boosts IPO after Beijing takes $3bn...
15532,15532,Trade Deficit Increases in March 2007,Balance of Trade,2007-06-04,Trade Deficit Increases in March 2007United St...,0.0,Trade Deficit Increases in March 2007United St...
15530,15530,Consumer Price Index 2.6 percent higher than i...,Inflation Rate,2007-06-06,Consumer Price Index 2.6 percent higher than i...,0.0,Consumer Price Index 2.6 percent higher than i...
15527,15527,"Nonfarm payroll employment increased by 157,00...",Unemployment Rate,2007-06-06,"Nonfarm payroll employment increased by 157,00...",0.01,"Nonfarm payroll employment increased by 157,00..."
15531,15531,U.S. Federal Reserve Kept Rates Unchanged at 5...,Interest Rate,2007-06-06,U.S. Federal Reserve Kept Rates Unchanged at 5...,-0.01,U.S. Federal Reserve Kept Rates Unchanged at 5...


In [55]:
data_sent.head(30)

Unnamed: 0.1,Unnamed: 0,Title,Tag,Date,Content,sentiment_score,cleaned_content
15533,15533,Blackstone boosts IPO after Beijing takes $3bn...,News,2007-06-04,Blackstone boosts IPO after Beijing takes $3bn...,0.03,Blackstone boosts IPO after Beijing takes $3bn...
15532,15532,Trade Deficit Increases in March 2007,Balance of Trade,2007-06-04,Trade Deficit Increases in March 2007United St...,0.0,Trade Deficit Increases in March 2007United St...
15530,15530,Consumer Price Index 2.6 percent higher than i...,Inflation Rate,2007-06-06,Consumer Price Index 2.6 percent higher than i...,0.0,Consumer Price Index 2.6 percent higher than i...
15527,15527,"Nonfarm payroll employment increased by 157,00...",Unemployment Rate,2007-06-06,"Nonfarm payroll employment increased by 157,00...",0.01,"Nonfarm payroll employment increased by 157,00..."
15531,15531,U.S. Federal Reserve Kept Rates Unchanged at 5...,Interest Rate,2007-06-06,U.S. Federal Reserve Kept Rates Unchanged at 5...,-0.01,U.S. Federal Reserve Kept Rates Unchanged at 5...
15529,15529,United States GDP Rises 0.6 percent in the fir...,GDP Growth Rate,2007-06-06,United States GDP Rises 0.6 percent in the fir...,0.02,United States GDP Rises 0.6 percent in the fir...
15528,15528,United States Trade Balance Deficit Rises to $...,Balance of Trade,2007-06-06,United States Trade Balance Deficit Rises to $...,-0.01,United States Trade Balance Deficit Rises to $...
15526,15526,US Economy is expanding after Q1 slowdown,GDP Growth Rate,2007-06-21,US Economy is expanding after Q1 slowdownUnite...,0.03,US Economy is expanding after Q1 slowdownUnite...
15525,15525,The Fed is widely expected to keep the benchma...,Interest Rate,2007-06-24,The Fed is widely expected to keep the benchma...,0.01,The Fed is widely expected to keep the benchma...
15524,15524,Consumer Confidence and Housing Weaken in US,GDP Growth Rate,2007-06-26,Consumer Confidence and Housing Weaken in USUn...,-0.66,Consumer Confidence and Housing Weaken in USUn...


### Merging Data

In [56]:
if 'Unnamed: 0' in data_sent.columns:
    data_sent = data_sent.drop(columns=['Unnamed: 0'])
data_sent.head()

Unnamed: 0,Title,Tag,Date,Content,sentiment_score,cleaned_content
15533,Blackstone boosts IPO after Beijing takes $3bn...,News,2007-06-04,Blackstone boosts IPO after Beijing takes $3bn...,0.03,Blackstone boosts IPO after Beijing takes $3bn...
15532,Trade Deficit Increases in March 2007,Balance of Trade,2007-06-04,Trade Deficit Increases in March 2007United St...,0.0,Trade Deficit Increases in March 2007United St...
15530,Consumer Price Index 2.6 percent higher than i...,Inflation Rate,2007-06-06,Consumer Price Index 2.6 percent higher than i...,0.0,Consumer Price Index 2.6 percent higher than i...
15527,"Nonfarm payroll employment increased by 157,00...",Unemployment Rate,2007-06-06,"Nonfarm payroll employment increased by 157,00...",0.01,"Nonfarm payroll employment increased by 157,00..."
15531,U.S. Federal Reserve Kept Rates Unchanged at 5...,Interest Rate,2007-06-06,U.S. Federal Reserve Kept Rates Unchanged at 5...,-0.01,U.S. Federal Reserve Kept Rates Unchanged at 5...


In [57]:
if 'Unnamed: 0' in data_num.columns:
    data_num = data_num.drop(columns=['Unnamed: 0'])
data_num.head()

Unnamed: 0,Date,Open,Close,High,Volume,aggregate_sentiment_score,Movement,Daily_Return,Volatility,sentiment_volatility,...,garch_residuals_lag7,rolling_cond_variance_3,rolling_cond_volatility_3,rolling_residuals_3,rolling_cond_variance_5,rolling_cond_volatility_5,rolling_residuals_5,rolling_cond_variance_7,rolling_cond_volatility_7,rolling_residuals_7
0,2007-08-01,1455.18,1465.81,1468.38,5256780000.0,-0.15,0,0.724264,1.257202,0.270222,...,,,,,,,,,,
1,2007-08-07,1467.61,1476.7,1488.3,4909390000.0,0.0,1,0.742934,1.384401,0.262336,...,,,,,,,,,,
2,2007-08-10,1453.08,1453.64,1462.02,5345780000.0,-0.67,0,-1.56159,1.159805,0.367736,...,,1.423092,1.192399,-0.037356,,,,,,
3,2007-08-13,1453.42,1452.92,1466.29,3696280000.0,-0.2,0,-0.049531,0.942009,0.275372,...,,1.371807,1.170432,-0.294422,,,,,,
4,2007-08-14,1452.86,1426.54,1456.73,3814630000.0,-0.23,0,-1.815654,1.229445,0.250898,...,,1.480917,1.215417,-1.151931,1.438991,1.198351,-0.39879,,,


In [58]:
# Merge on the 'date' column, keeping only dates present in both dataframes
data_num['Date'] = pd.to_datetime(data_num['Date'])
merged_df = pd.merge(data_num, data_sent, on='Date', how='inner')

merged_df.head()


Unnamed: 0,Date,Open,Close,High,Volume,aggregate_sentiment_score,Movement,Daily_Return,Volatility,sentiment_volatility,...,rolling_cond_volatility_5,rolling_residuals_5,rolling_cond_variance_7,rolling_cond_volatility_7,rolling_residuals_7,Title,Tag,Content,sentiment_score,cleaned_content
0,2007-08-01,1455.18,1465.81,1468.38,5256780000.0,-0.15,0,0.724264,1.257202,0.270222,...,,,,,,U.S. Companies added the smallest number of jo...,Unemployment Rate,U.S. Companies added the smallest number of jo...,-0.15,U.S. Companies added the smallest number of jo...
1,2007-08-07,1467.61,1476.7,1488.3,4909390000.0,0.0,1,0.742934,1.384401,0.262336,...,,,,,,US Federal Reserve keeps interest rates unchan...,Interest Rate,US Federal Reserve keeps interest rates unchan...,0.0,US Federal Reserve keeps interest rates unchan...
2,2007-08-10,1453.08,1453.64,1462.02,5345780000.0,-0.67,0,-1.56159,1.159805,0.367736,...,,,,,,Stock Markets Fall Worldwide! Recession?,Stock Market,Stock Markets Fall Worldwide! Recession?United...,-0.67,Stock Markets Fall Worldwide! Recession?United...
3,2007-08-13,1453.42,1452.92,1466.29,3696280000.0,-0.2,0,-0.049531,0.942009,0.275372,...,,,,,,Goldman Sachs fights back by putting $3 billio...,Stock Market,Goldman Sachs fights back by putting $3 billio...,-0.2,Goldman Sachs fights back by putting $3 billio...
4,2007-08-14,1452.86,1426.54,1456.73,3814630000.0,-0.23,0,-1.815654,1.229445,0.250898,...,1.198351,-0.39879,,,,U.S. Trade Deficit Narrows in June to $58.1 Bln,Balance of Trade,U.S. Trade Deficit Narrows in June to $58.1 Bl...,-0.23,U.S. Trade Deficit Narrows in June to $58.1 Bl...


In [59]:
merged_df.value_counts

<bound method DataFrame.value_counts of             Date     Open    Close     High        Volume  \
0     2007-08-01  1455.18  1465.81  1468.38  5.256780e+09   
1     2007-08-07  1467.61  1476.70  1488.30  4.909390e+09   
2     2007-08-10  1453.08  1453.64  1462.02  5.345780e+09   
3     2007-08-13  1453.42  1452.92  1466.29  3.696280e+09   
4     2007-08-14  1452.86  1426.54  1456.73  3.814630e+09   
...          ...      ...      ...      ...           ...   
15230 2023-07-14  4514.60  4505.41  4527.75  3.647450e+09   
15231 2023-07-14  4514.60  4505.41  4527.75  3.647450e+09   
15232 2023-07-14  4514.60  4505.41  4527.75  3.647450e+09   
15233 2023-07-14  4514.60  4505.41  4527.75  3.647450e+09   
15234 2023-07-14  4514.60  4505.41  4527.75  3.647450e+09   

       aggregate_sentiment_score  Movement  Daily_Return  Volatility  \
0                          -0.15         0      0.724264    1.257202   
1                           0.00         1      0.742934    1.384401   
2          

In [60]:
merged_df.columns

Index(['Date', 'Open', 'Close', 'High', 'Volume', 'aggregate_sentiment_score',
       'Movement', 'Daily_Return', 'Volatility', 'sentiment_volatility',
       'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1',
       'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1',
       'Volatility_lag1', 'Log_Return', 'garch_cond_variance',
       'garch_cond_volatility', 'garch_residuals', 'garch_cond_variance_lag1',
       'garch_cond_volatility_lag1', 'garch_residuals_lag1',
       'garch_cond_variance_lag2', 'garch_cond_volatility_lag2',
       'garch_residuals_lag2', 'garch_cond_variance_lag3',
       'garch_cond_volatility_lag3', 'garch_residuals_lag3',
       'garch_cond_variance_lag5', 'garch_cond_volatility_lag5',
       'garch_residuals_lag5', 'garch_cond_variance_lag7',
       'garch_cond_volatility_lag7', 'garch_residuals_lag7',
       'rolling_cond_variance_3', 'rolling_cond_volatility_3',
       'rolling_residuals_3', 'rolling_cond_variance_5',
       'rolling

In [61]:
merged_df['rolling_cond_volatility_3_lag1'] = merged_df['rolling_cond_variance_3'].shift(1)
merged_df['rolling_cond_volatility_5_lag1'] = merged_df['rolling_cond_variance_5'].shift(1)
merged_df.columns

Index(['Date', 'Open', 'Close', 'High', 'Volume', 'aggregate_sentiment_score',
       'Movement', 'Daily_Return', 'Volatility', 'sentiment_volatility',
       'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1',
       'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1',
       'Volatility_lag1', 'Log_Return', 'garch_cond_variance',
       'garch_cond_volatility', 'garch_residuals', 'garch_cond_variance_lag1',
       'garch_cond_volatility_lag1', 'garch_residuals_lag1',
       'garch_cond_variance_lag2', 'garch_cond_volatility_lag2',
       'garch_residuals_lag2', 'garch_cond_variance_lag3',
       'garch_cond_volatility_lag3', 'garch_residuals_lag3',
       'garch_cond_variance_lag5', 'garch_cond_volatility_lag5',
       'garch_residuals_lag5', 'garch_cond_variance_lag7',
       'garch_cond_volatility_lag7', 'garch_residuals_lag7',
       'rolling_cond_variance_3', 'rolling_cond_volatility_3',
       'rolling_residuals_3', 'rolling_cond_variance_5',
       'rolling

In [62]:
# Columns we want to retain
columns_to_keep = ['Date','Open', 'Movement', 'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1',
       'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1',
       'Volatility_lag1', 'garch_cond_variance_lag1',
       'garch_cond_volatility_lag1', 'garch_residuals_lag1', 'rolling_cond_volatility_3_lag1', 'rolling_cond_volatility_5_lag1','Title','cleaned_content']
merged_df = merged_df[columns_to_keep]
merged_df.head()

Unnamed: 0,Date,Open,Movement,sentiment_volatility_lag1,aggregate_sentiment_score_lag1,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1,garch_cond_variance_lag1,garch_cond_volatility_lag1,garch_residuals_lag1,rolling_cond_volatility_3_lag1,rolling_cond_volatility_5_lag1,Title,cleaned_content
0,2007-08-01,1455.18,0,0.268142,0.0,1455.27,1488.3,4524520000.0,-0.251552,0.991957,,,,,,U.S. Companies added the smallest number of jo...,U.S. Companies added the smallest number of jo...
1,2007-08-07,1467.61,1,0.270222,-0.15,1465.81,1468.38,5256780000.0,0.724264,1.257202,1.440581,1.200242,0.721654,,,US Federal Reserve keeps interest rates unchan...,US Federal Reserve keeps interest rates unchan...
2,2007-08-10,1453.08,0,0.262336,0.0,1476.7,1488.3,4909390000.0,0.742934,1.384401,1.311621,1.14526,0.740188,,,Stock Markets Fall Worldwide! Recession?,Stock Markets Fall Worldwide! Recession?United...
3,2007-08-13,1453.42,0,0.367736,-0.67,1453.64,1462.02,5345780000.0,-1.56159,1.159805,1.517074,1.231695,-1.573911,1.423092,,Goldman Sachs fights back by putting $3 billio...,Goldman Sachs fights back by putting $3 billio...
4,2007-08-14,1452.86,0,0.275372,-0.2,1452.92,1466.29,3696280000.0,-0.049531,0.942009,1.286727,1.13434,-0.049543,1.371807,,U.S. Trade Deficit Narrows in June to $58.1 Bln,U.S. Trade Deficit Narrows in June to $58.1 Bl...


In [63]:
merged_df.columns

Index(['Date', 'Open', 'Movement', 'sentiment_volatility_lag1',
       'aggregate_sentiment_score_lag1', 'Close_lag1', 'High_lag1',
       'Volume_lag1', 'Daily_Return_lag1', 'Volatility_lag1',
       'garch_cond_variance_lag1', 'garch_cond_volatility_lag1',
       'garch_residuals_lag1', 'rolling_cond_volatility_3_lag1',
       'rolling_cond_volatility_5_lag1', 'Title', 'cleaned_content'],
      dtype='object')

In [64]:
merged_df.value_counts

<bound method DataFrame.value_counts of             Date     Open  Movement  sentiment_volatility_lag1  \
0     2007-08-01  1455.18         0                   0.268142   
1     2007-08-07  1467.61         1                   0.270222   
2     2007-08-10  1453.08         0                   0.262336   
3     2007-08-13  1453.42         0                   0.367736   
4     2007-08-14  1452.86         0                   0.275372   
...          ...      ...       ...                        ...   
15230 2023-07-14  4514.60         1                   0.504529   
15231 2023-07-14  4514.60         1                   0.504529   
15232 2023-07-14  4514.60         1                   0.504529   
15233 2023-07-14  4514.60         1                   0.504529   
15234 2023-07-14  4514.60         1                   0.504529   

       aggregate_sentiment_score_lag1  Close_lag1  High_lag1   Volume_lag1  \
0                                0.00     1455.27    1488.30  4.524520e+09   
1          

In [65]:
merged_df.to_csv('multimodal_S&P500_all.csv')

## Splitting Data