<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/ob_flan_t5_sentiment_jpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Oscar Bowden
Role: Research Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://uk.linkedin.com/in/oscar-bowden-4b14711b7
Date: 2025-02-24
Version: 2.3

Description:
    This notebook contains an inference pipeline for a Flan-T5 (base)
    model that has been fine-tuned for polar sentiment analysis
    of financial sentences (using Financial Phrasebank:
    https://huggingface.co/datasets/takala/financial_phrasebank).
===================================================
"""

# Imports

In [1]:
!pip install umap-learn datsets transformers torch evaluate scikit-learn > /dev/null 2>&1

In [3]:
#Imports

from google.colab import drive
import os

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import gc
from transformers import DataCollatorForSeq2Seq

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt
import re

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Data loading and pre-processing

In [4]:
# Mount Google Drive at /content/drive; the input CSV and the fine-tuned
# checkpoint used later in this notebook are both read from paths under it.
# force_remount=True asks Colab to redo the mount even if one already exists.

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [18]:
# Input CSV on the mounted Drive (JPMorgan Q&A answer topics by quarter, per the file name).
path1 = "/content/drive/MyDrive/JPMorgan_qna_answer_topics_by_quarter_gpt.csv"
# Second input is currently disabled. NOTE(review): path2 ends in "/", so it
# looks like a directory rather than a CSV file - fix before re-enabling.
#path2 = "/content/drive/MyDrive/bank_of_england/data/preprocessed_data/"

# df_q is the working frame that every later cell reads and extends.
df_q = pd.read_csv(path1)
#df_m = pd.read_csv(path2)

In [19]:
# Cleaning dataset: remove the 'Sentiment' column that came with the CSV.
# Reassigning (instead of inplace=True) and ignoring a missing column lets
# this cell be re-run without raising KeyError once the column is gone.

df_q = df_q.drop(columns=['Sentiment'], errors='ignore')

In [20]:
#Prepare topics for input into fine-tuned flan-t5

def prepare_text_for_inference(text):
    """
    Build the model prompt for a single piece of text.

    The value is converted with str(), surrounding whitespace is removed,
    and the result is appended to the "Classify sentiment: " prefix.
    """
    cleaned = str(text).strip()
    return "Classify sentiment: " + cleaned

# Create one prompt column per text column: snippets first, then topics.
for source_col, prompt_col in (("Snippet", "snippet_infer"), ("Topic", "topic_infer")):
    df_q[prompt_col] = df_q[source_col].apply(prepare_text_for_inference)

# Inference on fine-tuned Flan-T5

In [22]:
# Load fine-tuned model and tokeniser from the best checkpoint

# Drive folder that holds the fine-tuned Flan-T5 checkpoint
best_checkpoint = "/content/drive/MyDrive/bank_of_england/data/model_outputs/flan_t5_sent"

# Tokeniser and model are both restored from that same folder
tokenizer = T5Tokenizer.from_pretrained(best_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(best_checkpoint)

# Define the prediction function using your fine-tuned model
def predict_sentiment(prepared_text):
    """
    Generate a sentiment label for one prompt with the fine-tuned Flan-T5 model.

    Parameters
    ----------
    prepared_text : str
        Text handed to the tokenizer unchanged; callers are expected to have
        prepended the classification prompt already.

    Returns
    -------
    str
        The first generated sequence, decoded without special tokens and
        with surrounding whitespace stripped.
    """
    encoded = tokenizer(
        prepared_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Pure inference: skip gradient tracking while generating.
    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=2)
    first_sequence = generated[0]
    decoded = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()

# Run the model over each prompt column (topics first, then snippets) and
# store the raw predictions in the matching *_sent column.
for prompt_col, prediction_col in (("topic_infer", "topic_sent"), ("snippet_infer", "snippet_sent")):
    df_q[prediction_col] = df_q[prompt_col].apply(predict_sentiment)

In [23]:
# Translate the raw model outputs ("0"/"1"/"2") into readable labels.
# The label names also map to themselves, so re-running this cell keeps
# existing labels instead of silently turning every value into NaN.
# Any other value still becomes NaN, exactly as before.
sentiment_labels = {"0": "Negative", "1": "Neutral", "2": "Positive"}
label_lookup = {**sentiment_labels, **{name: name for name in sentiment_labels.values()}}

for sent_col in ("topic_sent", "snippet_sent"):
    df_q[sent_col] = df_q[sent_col].map(label_lookup)

# The prompt columns were only needed for inference. Reassigning with
# errors='ignore' keeps the cell re-runnable once they have been removed.
df_q = df_q.drop(columns=['snippet_infer', 'topic_infer'], errors='ignore')

In [27]:
# Show how many rows received each sentiment label, topics first.
topic_label_counts = df_q["topic_sent"].value_counts()
print(topic_label_counts)
# The leading newline and space separate the snippet table from the topic table.
print(f"\n {df_q['snippet_sent'].value_counts()}")

topic_sent
Neutral     370
Positive     36
Negative      2
Name: count, dtype: int64

 snippet_sent
Neutral     258
Positive    114
Negative     36
Name: count, dtype: int64


In [28]:
#Save csv

file_path = "/content/drive/MyDrive/bank_of_england/data/model_outputs/sent_output/JPM_answers_sent_output_040325_v1.csv"

# to_csv does not create missing folders, so make sure the output directory
# exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_q.to_csv(file_path, index=False)