In [1]:
# Install Hugging Face Transformers and PyTorch libraries
!pip install transformers torch



In [2]:
#1. Load the pre-trained FinTwitBERT model and tokenizer

# Import tokenizer and sentiment classification model class
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub id of the checkpoint; defined once so the tokenizer and the
# model are always loaded from the same place.
MODEL_CHECKPOINT = "StephanAkkerman/FinTwitBERT"
# Size of the classification head: one output per sentiment class
# (bullish, neutral, bearish).
NUM_LABELS = 3

# Load the tokenizer that ships with the FinTwitBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load the checkpoint with a sequence-classification head on top.
# NOTE: the load log reports the pooler and classifier weights as newly
# initialized, so the model has to be fine-tuned before its predictions
# are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at StephanAkkerman/FinTwitBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
#2. Load the dataset and extract the tweets and their sentiment labels
import pandas as pd # Import pandas for handling CSV data


# Load the filtered dataset containing tweets and sentiment labels
actual_df = pd.read_csv("filtered_dataset.csv")

# Validate the file here, with readable messages, instead of letting a bad
# row surface later as an obscure error inside the tokenizer or the tensor
# conversion.
missing_columns = {"tweet", "sentiment"} - set(actual_df.columns)
if missing_columns:
    raise ValueError(
        f"filtered_dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )
if actual_df["tweet"].isna().any():
    raise ValueError("filtered_dataset.csv contains rows with a missing 'tweet' value")
# The classifier is created with three classes, so only the class ids
# 0, 1 and 2 are usable as labels.
if not actual_df["sentiment"].isin([0, 1, 2]).all():
    raise ValueError("'sentiment' must contain only the class ids 0, 1 and 2")

# Extract tweets and labels from the dataset as plain Python lists
tweets = actual_df["tweet"].tolist()
# Expected encoding: 0 = bullish, 1 = neutral, 2 = bearish
# (TODO: confirm against how filtered_dataset.csv was produced)
labels = actual_df["sentiment"].tolist()

In [4]:
# 3. Tokenize the tweet texts for input to the model
import torch

# Tokenize the tweets with padding and truncation for uniform input size
encodings = tokenizer(
    tweets,
    padding=True,           # Pad shorter tweets to the same length
    truncation=True,        # Truncate longer tweets to the maximum length
    max_length=64,          # Maximum number of tokens per tweet
    return_tensors="pt"     # Return PyTorch tensors
)

# Convert sentiment labels to a PyTorch tensor of class ids.
# - dtype=torch.long makes the ids int64 regardless of how pandas inferred the
#   column, which is the integer dtype PyTorch classification losses expect
#   for class-index targets.
# - torch.as_tensor also accepts an existing tensor, so re-running this cell
#   (when `labels` is already a tensor) neither fails nor emits the
#   copy-construct warning that torch.tensor(tensor) does.
labels = torch.as_tensor(labels, dtype=torch.long)

In [5]:
# 4. Split the dataset into training, validation, and test sets 
# (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

from sklearn.model_selection import train_test_split

# First cut: keep 70% of the sample positions for training and hold out the
# remaining 30%. Stratifying on the labels keeps the class proportions the
# same on both sides; the fixed seed makes the split reproducible.
sample_positions = range(len(labels))
train_idx, temp_idx = train_test_split(
    sample_positions,
    random_state=42,
    stratify=labels,
    test_size=0.3,
)

# Second cut: halve the held-out positions into validation and test
# (15% of the full dataset each), again stratified on their labels.
holdout_labels = labels[temp_idx]
val_idx, test_idx = train_test_split(
    temp_idx,
    random_state=42,
    stratify=holdout_labels,
    test_size=0.5,
)

In [6]:
# 5. Define a custom PyTorch Dataset to wrap tokenized tweets and labels
from torch.utils.data import Dataset
# Create a custom dataset class for tweet sentiment classification
class TweetDataset(Dataset):
    """Dataset exposing one split (train/val/test) of the tokenized tweets.

    Args:
        encodings: Mapping from field name to a container of per-sample
            values. Every value is indexed with ``indices``; the mapping must
            include ``'input_ids'`` and ``'attention_mask'``.
        labels: Per-sample labels, indexed with ``indices``.
        indices: Positions of the samples that belong to this split.
    """

    def __init__(self, encodings, labels, indices):
        # Slice every encoding field and the labels down to this split once,
        # up front, so that item lookup later is plain positional indexing.
        subset = {}
        for field, values in encodings.items():
            subset[field] = values[indices]
        self.encodings = subset
        self.labels = labels[indices]

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with input IDs, attention mask, and label."""
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample
# Wrap each split's sample positions in its own dataset object; all three
# share the same tokenized encodings and label tensor.
train_dataset = TweetDataset(encodings=encodings, labels=labels, indices=train_idx)
val_dataset = TweetDataset(encodings=encodings, labels=labels, indices=val_idx)
test_dataset = TweetDataset(encodings=encodings, labels=labels, indices=test_idx)