<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/ob_flan_t5_sentiment_jpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Oscar Bowden
Role: Research Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://uk.linkedin.com/in/oscar-bowden-4b14711b7
Date: 2025-02-24
Version: 2.2

Description:
    This notebook contains a fine-tuning pipeline for Flan-T5 (base) polar sentiment analysis.
    The data analysed are pre-processed financial meeting transcripts
    from JPMorgan Chase. The Financial PhraseBank dataset
    (https://huggingface.co/datasets/takala/financial_phrasebank) is used for
    fine-tuning. Tuned model performance is assessed alongside a zero-shot approach.
===================================================
"""

# Imports

In [None]:
!pip install umap-learn datsets transformers torch evaluate scikit-learn > /dev/null 2>&1

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloadin

In [None]:
#Imports

from google.colab import drive
import os

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import gc
from transformers import DataCollatorForSeq2Seq

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt
import re

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Data loading and pre-processing

In [None]:
# Load topic modelled data (management and Q&A)

# Mount Google Drive at /content/drive so the project data folder is readable.
drive.mount('/content/drive', force_remount=True)

# NOTE(review): path1 and path2 are the same string and end in "/", i.e. they
# name a folder rather than a CSV file, so both read_csv calls below receive an
# identical argument. Presumably each path should end with the Q&A and the
# management file name respectively — TODO confirm and fill in the file names.
path1 = "/content/drive/MyDrive/bank_of_england/data/preprocessed_data/"
path2 = "/content/drive/MyDrive/bank_of_england/data/preprocessed_data/"

df_q = pd.read_csv(path1)  # presumably the Q&A transcript data — verify
df_m = pd.read_csv(path2)  # presumably the management discussion data — verify

Mounted at /content/drive


Unnamed: 0,Index,Quarter-Year,Question,Question_cleaned,Asked By,Role of the person asked the question,Answer,Answer_cleaned,Answered By,Role of the person answered the question
0,1,1Q23,"So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,Steven Chubak,"Analyst, Wolfe Research LLC","Well, I think you were already kind of complet...",['well think already kind complete answering q...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C..."
1,2,1Q23,"Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,Ken Usdin,"Analyst, Jefferies LLC","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
2,3,1Q23,"Hi, thanks. Jeremy, wanted to follow up again ...",['hi thanks jeremy wanted follow drivers nii r...,John McDonald,"Analyst, Autonomous Research","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
3,4,1Q23,My first question is you mentioned that your r...,['first question mentioned reserve build drive...,Erika Najarian,"Analyst, UBS Securities LLC","Yeah. So, Erika, as you know, we take \n not g...",['yeah so erika know take going go lot detail ...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
4,5,1Q23,Hey. Good morning. Maybe just a little bit on ...,['hey good morning maybe little bit deposit th...,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Yeah. A couple things there. So, first of all,...",['yeah couple things there so first all know r...,"Jeremy Barnum, Jamie Dimon","Chief Financial Officer, JPMorgan Chase & Co.;..."


In [None]:
#Prepare topics for input into fine-tuned flan-t5



In [None]:
# Preparing 'Question' and 'Answer' columns of JPM data

# `df` was never assigned in the notebook, so this cell raised NameError on a
# fresh kernel. Build it here from the frame loaded as `df_q`
# (NOTE(review): presumably the Q&A transcript — confirm). Working on a copy
# keeps the loaded data untouched and makes this cell safe to re-run.
df = df_q.copy()

# Create new columns that contain the list of sentences.
# str() turns non-string cells (e.g. NaN) into text before tokenising.
df['Question_Sentences'] = df['Question'].apply(lambda x: sent_tokenize(str(x)))
df['Answer_Sentences'] = df['Answer'].apply(lambda x: sent_tokenize(str(x)))

# Columns carried alongside every sentence so it can be traced back to its source row.
context_cols = ['Question', 'Asked By', 'Quarter-Year']

# Explode to one row per sentence, rename the sentence column to singular,
# and keep only the sentence plus its context columns.
df_exploded_q2 = df.explode('Question_Sentences')
df_exploded_q2 = df_exploded_q2.rename(columns={'Question_Sentences': 'Question_Sentence'})
df_exploded_q2 = df_exploded_q2[['Question_Sentence', *context_cols]]

df_exploded_a2 = df.explode('Answer_Sentences')
df_exploded_a2 = df_exploded_a2.rename(columns={'Answer_Sentences': 'Answer_Sentence'})
df_exploded_a2 = df_exploded_a2[['Answer_Sentence', *context_cols]]

print(df_exploded_q2.shape)
print(df_exploded_a2.shape)

# Remove rows with 3 or fewer words.
# The vectorised .str accessor yields NaN for missing sentences (an empty
# sentence list explodes to NaN); NaN > 3 is False, so such rows are dropped
# instead of raising AttributeError on `.split()`.
# .copy() makes the results independent frames, so columns can be added to
# them later without SettingWithCopyWarning.
question_has_enough_words = df_exploded_q2['Question_Sentence'].str.split().str.len() > 3
answer_has_enough_words = df_exploded_a2['Answer_Sentence'].str.split().str.len() > 3

df_exploded = df_exploded_q2[question_has_enough_words].copy()
df_exploded_a = df_exploded_a2[answer_has_enough_words].copy()

print(df_exploded.shape)
print(df_exploded_a.shape)

df_exploded.head()

(399, 4)
(1074, 4)
(342, 4)
(945, 4)


Unnamed: 0,Question_Sentence,Question,Asked By,Quarter-Year
0,"So, Jamie, I was actually hoping to get your p...","So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,In your letter you spent a fair amount of time...,"So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,But what are some of the changes that you're s...,"So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,"And along those same lines, how you're thinkin...","So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
1,"Hey, Jeremy, I was just wondering if you can j...","Hey, thanks. Good morning. Hey, Jeremy, I was ...",Ken Usdin,1Q23


# 3) Inference on fine-tuned Flan-T5

In [None]:
# Data preparation for JPM questions

def prepare_text_for_inference(text):
    """
    Build the model prompt for a single piece of text.

    The input is converted to a string, surrounding whitespace is removed,
    and the instruction prefix "Classify sentiment: " is prepended.
    """
    cleaned = str(text).strip()
    return "Classify sentiment: " + cleaned

# Add the prompt column with .assign, which returns a new frame, instead of
# writing a column in place into the boolean-filtered `df_exploded`
# (in-place writes on a filtered frame can raise SettingWithCopyWarning).
df_exploded = df_exploded.assign(
    Sentence_t5_tuned_infer=df_exploded["Question_Sentence"].apply(prepare_text_for_inference)
)

In [None]:
# Load fine-tuned model and tokeniser from the best checkpoint

# NOTE(review): `best_checkpoint` is read below, but its only assignment is the
# commented-out line here, and no `trainer` object is created in the cells
# visible above. Run top-to-bottom from what is shown, this cell would raise
# NameError unless `best_checkpoint` is set first (e.g. to a saved checkpoint
# directory) — TODO restore the assignment or define the path explicitly.
#best_checkpoint = trainer.state.best_model_checkpoint
#print("Best checkpoint path:", best_checkpoint)

# The model weights and the tokenizer are both loaded from the same checkpoint location.
model = T5ForConditionalGeneration.from_pretrained(best_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(best_checkpoint)

# Define the prediction function using your fine-tuned model
def predict_sentiment(prepared_text):
    """
    Predict sentiment for one prompt using the fine-tuned Flan-T5 model.

    Relies on the module-level `tokenizer` and `model` loaded from the
    checkpoint.

    Args:
        prepared_text: Input text that is already preprocessed
            (i.e. the prompt is prepended).

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(prepared_text, return_tensors="pt", truncation=True, padding=True)
    # Keep the input tensors on the same device as the model so generation
    # also works when the model has been moved to a GPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():  # inference only, so gradients are not tracked
        # Generation is capped at 2 new tokens.
        output = model.generate(**inputs, max_new_tokens=2)
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Apply inference on your prepared quarterly data.
# .assign returns a new frame rather than writing a column in place into the
# boolean-filtered `df_exploded` (which can raise SettingWithCopyWarning).
df_exploded = df_exploded.assign(
    flan_t5_sentiment_tuned=df_exploded["Sentence_t5_tuned_infer"].apply(predict_sentiment)
)