# LSTM-BERT model

# Relevant Imports

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import re
import string
from sentence_transformers import SentenceTransformer



# Check for MPS backend

In [2]:
# Prefer the Metal Performance Shaders (MPS) backend when PyTorch reports it
# as available; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


# Preprocessing

In [3]:
# Locations of the two input tables: the GARCH output (numeric) and the
# annotated FinSen text file (sentiment), both relative to this notebook.
filepath_num = '../../code_final/GARCH/GARCH_output.csv'
filepath_sent = '../../dataset_final/FinSen_S&P500/FinSen_text_annotated.csv'

# Read both CSV files into DataFrames, numeric first and then sentiment.
data_num, data_sent = (
    pd.read_csv(csv_path) for csv_path in (filepath_num, filepath_sent)
)

In [16]:
def generate_textual_embeddings(texts, tokenizer, model, device, batch_size=16):
    """
    Generate sentence embeddings by mean-pooling BERT token embeddings.

    Texts are tokenized and encoded in batches. Each sentence embedding is the
    average of the model's last hidden states over the *real* tokens only:
    positions that the tokenizer added as padding (attention mask == 0) are
    excluded, so an embedding does not depend on which other texts happen to
    share its batch.

    Args:
    - texts (list of str): List of text inputs.
    - tokenizer: Pretrained tokenizer for the BERT model. It is called with
      padding=True, truncation=True, return_tensors='pt' and must return
      'input_ids' and 'attention_mask'.
    - model: Pretrained BERT model whose output exposes `last_hidden_state`.
      The model is not moved by this function; it is expected to already be
      on `device`.
    - device: Device the inputs are moved to (e.g. 'cpu', 'cuda' or 'mps').
    - batch_size (int): Batch size for processing texts.

    Returns:
    - embeddings (torch.Tensor): CPU tensor of shape
      (len(texts), embedding_dim). For empty `texts` an empty tensor with
      zero rows is returned (embedding_dim is taken from
      `model.config.hidden_size` when available, otherwise 0).
    """
    texts = list(texts)
    model.eval()  # Set model to evaluation mode (disables dropout etc.)

    if not texts:
        # torch.cat() raises on an empty list, so handle "no input" explicitly.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return torch.empty((0, hidden_size))

    embeddings = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)

            # Forward pass through BERT
            outputs = model(input_ids, attention_mask=attention_mask)
            token_embeddings = outputs.last_hidden_state

            # Masked mean pooling: zero out padded positions and divide by the
            # number of real tokens instead of the padded sequence length.
            mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            batch_embeddings = summed / token_counts

            embeddings.append(batch_embeddings.cpu())

    return torch.cat(embeddings, dim=0)


In [17]:
# NOTE(review): df_text, tokenizer and bert_model are not defined in the cells
# shown here; presumably they come from earlier cells. Confirm before a
# top-to-bottom re-run.

# Extract the 'cleaned_text' column as a plain Python list of strings
cleaned_column = df_text['cleaned_text']
texts = cleaned_column.tolist()

# Encode every cleaned text into a fixed-size embedding vector
text_embeddings = generate_textual_embeddings(
    texts,
    tokenizer,
    bert_model,
    device,
)

# Sanity check: print the shape of the resulting embedding tensor
print(f"Cleaned Text Embeddings Shape: {text_embeddings.shape}")


Cleaned Text Embeddings Shape: torch.Size([2800, 768])


In [18]:
# Persist the embedding tensor to 'textual_embeddings.pt' in the current
# working directory so later runs can reload it instead of re-encoding.
torch.save(obj=text_embeddings, f='textual_embeddings.pt')
