<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/ob_full_pipeline_jpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
"""
===================================================
Author: Oscar Bowden
Role: Research Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://uk.linkedin.com/in/oscar-bowden-4b14711b7
Date: 2025-02-17
Version: 1.0

Description:
    This notebook is a rough version of a modelling pipeline for pre-processed financial meeting transcript
    data (JPMorganChase). It employs BERTopic, finBERT and Flan-T5 to extract insights into the speakers
    from the Q&A sections of the transcripts.
===================================================
"""



In [33]:
!pip install bertopic
!pip install umap-learn



In [34]:
#Imports

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

In [35]:
#Load pre-processed data

url = "https://raw.githubusercontent.com/sheldonkemper/bank_of_england/main/data/preprocessed_data/JPMorgan_QNA_processed_data.xlsx"

df = pd.read_excel(url, engine='openpyxl')

df.head()

Unnamed: 0,Index,Quarter-Year,Asked By,Role of the person Asked the question,Question,Answered By,Role of the person answered the question,Answer
0,1,4Q24,John McDonald,"Analyst, Truist Securities, Inc.","Hi. Good morning. Jeremy, I wanted to ask abou...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Yeah. Good question, John, and welcome back, b..."
1,2,4Q24,Mike Mayo,"Analyst, Wells Fargo Securities LLC","Hi. Simple and then more difficult, I guess. J...",Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase",I do love what I do. And answering the second ...
2,3,4Q24,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Hey, Jim. I mean, it's obviously something we'..."
3,4,4Q24,Erika Najarian,"Analyst, UBS Securities LLC","Yes. Hi, good morning. Wanted to follow up on ...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Right, Erika. Okay. You are tempting me with m..."
4,5,4Q24,Erika,Unknown,"Does that conclude your question, Erika?",Jeremy Barnum,"Chief Financial Officer, JPMorganChase",Very good. We can go to the next question. Tha...


# Running FinBERT on question chunks - aiming to gather insights at the analyst/question level

* --------Currently using non-final version of data---------

In [36]:
# Load ProsusAI FinBERT model & tokenizer
MODEL_NAME = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.eval()  # Set model to evaluation mode

# Load your preprocessed data (assuming it's already in `df`)
TEXT_COLUMN = "Question"  # Adjust this if your column name is different

# Define class labels for FinBERT
LABELS = ["Negative", "Neutral", "Positive"]

# Function to get sentiment scores
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():  # No gradient calculation needed - only inferring, not training
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy().flatten()
    sentiment_label = LABELS[probs.argmax()]  # Get label with highest probability

    return {"Negative_q": probs[0], "Neutral_q": probs[1], "Positive_q": probs[2], "Sentiment_q": sentiment_label}

# Apply sentiment analysis to each chunk
df["sentiment_question"] = df[TEXT_COLUMN].apply(get_sentiment)

# Convert dictionary to separate columns
sentiment_df = df["sentiment_question"].apply(pd.Series)
df = pd.concat([df, sentiment_df], axis=1).drop(columns=["sentiment_question"])

# Show results
df.head()

Unnamed: 0,Index,Quarter-Year,Asked By,Role of the person Asked the question,Question,Answered By,Role of the person answered the question,Answer,Negative_q,Neutral_q,Positive_q,Sentiment_q
0,1,4Q24,John McDonald,"Analyst, Truist Securities, Inc.","Hi. Good morning. Jeremy, I wanted to ask abou...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Yeah. Good question, John, and welcome back, b...",0.122641,0.013932,0.863427,Positive
1,2,4Q24,Mike Mayo,"Analyst, Wells Fargo Securities LLC","Hi. Simple and then more difficult, I guess. J...",Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase",I do love what I do. And answering the second ...,0.064734,0.031453,0.903813,Positive
2,3,4Q24,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Hey, Jim. I mean, it's obviously something we'...",0.286975,0.036256,0.676769,Positive
3,4,4Q24,Erika Najarian,"Analyst, UBS Securities LLC","Yes. Hi, good morning. Wanted to follow up on ...",Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Right, Erika. Okay. You are tempting me with m...",0.100585,0.028308,0.871107,Positive
4,5,4Q24,Erika,Unknown,"Does that conclude your question, Erika?",Jeremy Barnum,"Chief Financial Officer, JPMorganChase",Very good. We can go to the next question. Tha...,0.04435,0.112143,0.843507,Positive


#Extract Topics using BERTopic (with all-MiniLM-L6-v2 transformer, UMAP for dimensionality reduction, reproducibility code and NaN value handling)

* --------Currently using non-final version of data---------

In [37]:
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

reset_session()  # Ensure consistent results

# 🚀 2️⃣ Load Preprocessed Data
TEXT_COLUMN = "Question"

# Remove NaN values and convert questions to a list
questions = df[TEXT_COLUMN].dropna().tolist()

# Initialize Sentence Transformer & UMAP for Dimensionality Reduction
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # High-quality embeddings
umap_model = UMAP(n_neighbors=10, min_dist=0.2)  # Optimize clustering

# Initialize BERTopic with Custom Embeddings & UMAP
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, verbose=True)

# Train Model on Questions (Fit & Transform)
topic_model.fit(questions)  # Fit model first
topics, probs = topic_model.transform(questions)  # Then transform data

# Add topic assignments to DataFrame
df.loc[df[TEXT_COLUMN].notna(), "topic_q"] = topics

print(topic_model.get_topic_freq().head(10))  # Top 10 most frequent topics
print(topic_model.get_topic(0))  # Keywords for Topic 0

2025-02-17 16:29:19,872 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2025-02-17 16:29:24,302 - BERTopic - Embedding - Completed ✓
2025-02-17 16:29:24,306 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-17 16:29:24,403 - BERTopic - Dimensionality - Completed ✓
2025-02-17 16:29:24,405 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-17 16:29:24,413 - BERTopic - Cluster - Completed ✓
2025-02-17 16:29:24,417 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-17 16:29:24,442 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2025-02-17 16:29:28,954 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-02-17 16:29:28,957 - BERTopic - Dimensionality - Completed ✓
2025-02-17 16:29:28,959 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-02-17 16:29:28,961 - BERTopic - Cluster - Completed ✓


   Topic  Count
0     -1     91
False


# Flan-T5 topic extraction: zero-shot prompting

In [38]:
# Load Flan-T5 and Tokeniser

MODEL_NAME = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()  # Set model to inference mode

# Load Data & Extract Segments for Topic Extraction
TEXT_COLUMN = "Question"

# Seperate Q data
questions = df[TEXT_COLUMN].dropna().tolist()

# Define Zero-Shot Prompt
def extract_topics_with_flan_t5(text):
    prompt = (
        f"Extract the main topic discussed in the following financial text. "
        f"Include between three and ten words in your answer."
        f"Do not include speaker names or any references to a Q&A format.\n\n"
        f"{text}\n\n"
        f"Topics (comma-separated):"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():  # Disable gradient calc for inference
        outputs = model.generate(**inputs, max_length=50)

    topics = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return topics

# 🚀 4️⃣ Apply Flan-T5 Topic Extraction to Each Segment
df["flan_t5_topics_q"] = df[TEXT_COLUMN].apply(extract_topics_with_flan_t5)

# 🚀 5️⃣ Compare Flan-T5 vs BERTopic Topics
df_comparison = df[["Question", "topic_q", "flan_t5_topics_q"]]


df_comparison.head()

Unnamed: 0,Question,topic_q,flan_t5_topics_q
0,"Hi. Good morning. Jeremy, I wanted to ask abou...",-1.0,Capital
1,"Hi. Simple and then more difficult, I guess. J...",-1.0,Jamie Dimon
2,"Hey. Good morning. Maybe just on regulation, w...",-1.0,regulation
3,"Yes. Hi, good morning. Wanted to follow up on ...",-1.0,Capital
4,"Does that conclude your question, Erika?",-1.0,Financial text


In [39]:
df_comparison

Unnamed: 0,Question,topic_q,flan_t5_topics_q
0,"Hi. Good morning. Jeremy, I wanted to ask abou...",-1.0,Capital
1,"Hi. Simple and then more difficult, I guess. J...",-1.0,Jamie Dimon
2,"Hey. Good morning. Maybe just on regulation, w...",-1.0,regulation
3,"Yes. Hi, good morning. Wanted to follow up on ...",-1.0,Capital
4,"Does that conclude your question, Erika?",-1.0,Financial text
...,...,...,...
86,"Hey Jeremy, you mentioned a degree of reinterm...",-1.0,JPMorgan Chase
87,Hi. Good morning.\nI do want to unpack the que...,-1.0,Basel IV
88,Thank you. So you talked about in your letter ...,-1.0,Consumers can move excess cash balances if the...
89,Good morning. You guys talked about one of the...,-1.0,Credit card balances
