<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/AM_FinBERT_finetuning_V1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Install libraries**

In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# **2. Load FinBERT Tokenizer and Prepare Dataset**

In [None]:
# Load FinBERT tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the labelled financial dataset; the columns used below are 'text' and 'label'
df = pd.read_csv("Financial_Sentiment_Analysis.csv")

# Integer class ids used for training
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

# Convert text labels to class ids only when they are text: calling .map() on labels
# that are already numeric would silently turn every row into NaN.
raw_labels = df['label']
if pd.api.types.is_numeric_dtype(raw_labels):
    label_ids = raw_labels
else:
    # Normalise case/whitespace so e.g. "Positive " is still recognised
    label_ids = raw_labels.astype(str).str.strip().str.lower().map(label_mapping)

# Fail loudly instead of passing NaN or out-of-range classes to the trainer
invalid_rows = ~label_ids.isin(list(label_mapping.values()))
if invalid_rows.any():
    examples = sorted(raw_labels[invalid_rows].astype(str).unique())[:10]
    raise ValueError(
        f"{int(invalid_rows.sum())} rows have unrecognised labels, e.g. {examples}; "
        f"expected one of {list(label_mapping)} or {list(label_mapping.values())}"
    )
df['label'] = label_ids.astype(int)

# Split dataset (fixed random_state keeps the 80/20 split reproducible)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

# Tokenize data; sequences longer than 512 tokens are truncated
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert to Dataset format
train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"],
                                   "attention_mask": train_encodings["attention_mask"],
                                   "labels": train_labels})

val_dataset = Dataset.from_dict({"input_ids": val_encodings["input_ids"],
                                 "attention_mask": val_encodings["attention_mask"],
                                 "labels": val_labels})

# **3. Load FinBERT and Define Training Arguments**

In [None]:
# Load FinBERT model.
# The id -> name table stored in the model config is what gets saved with the model and
# what a text-classification pipeline prints. Set it from label_mapping so reported label
# names always correspond to the ids used for training, whatever order the checkpoint ships.
# NOTE(review): the ProsusAI/finbert checkpoint appears to use positive/negative/neutral
# order, which differs from label_mapping - confirm against its config.json.
id2label = {class_id: name for name, class_id in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=dict(label_mapping),
)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./finbert_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



# **4. Train and Save Model**

In [None]:
# Run fine-tuning with the Trainer configured above
trainer.train()

# Persist the fine-tuned weights and the tokenizer files into the same directory
output_path = "./finbert_the_quant_collective"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marjt23-mitra[0m ([33marjt23-mitra-fourthrev[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.3562,0.690291
2,0.5585,0.615275
3,0.3564,0.587452


('./finbert_the_quant_collective/tokenizer_config.json',
 './finbert_the_quant_collective/special_tokens_map.json',
 './finbert_the_quant_collective/vocab.txt',
 './finbert_the_quant_collective/added_tokens.json',
 './finbert_the_quant_collective/tokenizer.json')

# **5. Evaluate Model**

In [None]:
def compute_accuracy(predictions, labels):
    """Return the fraction of examples whose top-scoring class equals its label.

    Args:
        predictions: per-class scores, one row per example (argmax is taken over axis 1).
        labels: true class index for each row.

    Returns:
        Number of matching rows divided by ``len(labels)``.
    """
    predicted_classes = np.asarray(predictions).argmax(axis=1)
    n_correct = np.sum(predicted_classes == labels)
    return n_correct / len(labels)

# Score the held-out split: trainer.predict returns the model outputs alongside the true label ids
outputs = trainer.predict(val_dataset)
val_scores, val_targets = outputs.predictions, outputs.label_ids
accuracy = compute_accuracy(val_scores, val_targets)

print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.7857


# **6. Use Fine-Tuned FinBERT for Sentiment Analysis**

In [None]:
# Load fine-tuned model from the directory it was saved to
from transformers import pipeline

finbert_pipeline = pipeline("text-classification", model="./finbert_the_quant_collective", tokenizer=tokenizer)

# Test on new financial text
test_text = "regarding deposit inflows firmwide level average deposits 3% quarter-on-quarter end-of-period deposits 2% quarter-on-quarter Ã¢â‚¬â€œ implying intra-quarter reversal recent outflow trend consequence march events estimate retained approximately $50 billion deposit inflows quarter-end"
# Truncate exactly like the training tokenization (max_length=512) so longer passages
# are cut to the same length instead of being passed at full length.
result = finbert_pipeline(test_text, truncation=True, max_length=512)
print(result)  # a list with one {'label': ..., 'score': ...} dict per input text

Device set to use cuda:0


[{'label': 'neutral', 'score': 0.8798871040344238}]
