<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/ob_full_pipeline_jpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Oscar Bowden
Role: Research Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://uk.linkedin.com/in/oscar-bowden-4b14711b7
Date: 2025-02-17
Version: 1.0

Description:
    This notebook is a rough version of a modelling pipeline for pre-processed financial meeting transcript
    data (JPMorganChase). It employs BERTopic, finBERT and Flan-T5 to extract insights into the speakers
    from the Q&A sections of the transcripts.
===================================================
"""



In [1]:
!pip install bertopic
!pip install umap-learn

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloadin

In [17]:
#Imports

from google.colab import drive
import os

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
"""
#Load pre-processed data

url = "https://raw.githubusercontent.com/sheldonkemper/bank_of_england/main/data/preprocessed_data/JPMorgan_QNA_processed_data.xlsx"

df = pd.read_excel(url, engine='openpyxl')

df.head()
"""

#This block will only work if file is in the github repository

In [28]:
# Load pre-processed data

drive.mount('/content/drive', force_remount=True)

path = "/content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/jpmorgan_qna_df_preprocessed_final.csv"

df = pd.read_csv(path)

df.head()

Mounted at /content/drive


Unnamed: 0,Index,Quarter-Year,Question,Question_cleaned,Asked By,Role of the person asked the question,Answer,Answer_cleaned,Answered By,Role of the person answered the question
0,1,1Q23,"So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,Steven Chubak,"Analyst, Wolfe Research LLC","Well, I think you were already kind of complet...",['well think already kind complete answering q...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C..."
1,2,1Q23,"Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,Ken Usdin,"Analyst, Jefferies LLC","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
2,3,1Q23,"Hi, thanks. Jeremy, wanted to follow up again ...",['hi thanks jeremy wanted follow drivers nii r...,John McDonald,"Analyst, Autonomous Research","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
3,4,1Q23,My first question is you mentioned that your r...,['first question mentioned reserve build drive...,Erika Najarian,"Analyst, UBS Securities LLC","Yeah. So, Erika, as you know, we take \n not g...",['yeah so erika know take going go lot detail ...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
4,5,1Q23,Hey. Good morning. Maybe just a little bit on ...,['hey good morning maybe little bit deposit th...,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Yeah. A couple things there. So, first of all,...",['yeah couple things there so first all know r...,"Jeremy Barnum, Jamie Dimon","Chief Financial Officer, JPMorgan Chase & Co.;..."


#Extract Topics using BERTopic (with all-MiniLM-L6-v2 transformer, UMAP for dimensionality reduction, reproducibility code and NaN value handling)

* Token length max = 512

In [13]:
# Check token length in Question_cleaned - returns true if any > 500
df["Question_cleaned"].apply(lambda x: len(x.split()) if isinstance(x, str) else 0).gt(500).any()

False

In [None]:
"""
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

reset_session()  # Ensure consistent results

# Load Preprocessed Data
TEXT_COLUMN = "Question_cleaned"

# Remove NaN values and convert questions to a list
questions = df[TEXT_COLUMN].dropna().tolist()

# Initialize Sentence Transformer & UMAP for Dimensionality Reduction
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # High-quality embeddings
umap_model = UMAP(n_neighbors=10, min_dist=0.2, metric='cosine', random_state=42)  # Optimize clustering

# Initialize BERTopic with Custom Embeddings & UMAP
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, verbose=True)

# Train Model on Questions (Fit & Transform)
topic_model.fit(questions)  # Fit model first
topics, probs = topic_model.transform(questions)  # Then transform data

# Add topic assignments to DataFrame
df.loc[df[TEXT_COLUMN].notna(), "topic_q"] = topics

print(topic_model.get_topic_freq().head(10))  # Top 10 most frequent topics
print(topic_model.get_topic(0))  # Keywords for Topic 0
"""

In [29]:
# üöÄ 1Ô∏è‚É£ Reset Session for Reproducibility
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

reset_session()  # Ensure consistent results

# üöÄ 2Ô∏è‚É£ Load Preprocessed Data
TEXT_COLUMN = "Question_cleaned"

# Remove NaN values and convert questions to a list
questions = df[TEXT_COLUMN].dropna().tolist()

# üöÄ 3Ô∏è‚É£ Initialize Sentence Transformer for Better Embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")  # High-quality sentence embeddings

# üöÄ 4Ô∏è‚É£ Optimize UMAP for Better Topic Separation
umap_model = UMAP(n_neighbors=5, min_dist=0.2, metric='cosine', random_state=42)

# üöÄ 5Ô∏è‚É£ Fine-Tune HDBSCAN for More Topic Diversity
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=5,  # Reduce to allow smaller topics
    min_samples=5,  # Controls how strictly points are assigned to clusters
    leaf_size=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# üöÄ 6Ô∏è‚É£ Improve Text Vectorization for More Unique Topics
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),  # Allows both single words and bigrams
    min_df=2,  # Removes rare words
    stop_words="english"  # Removes common English words
)

# üöÄ 7Ô∏è‚É£ Initialize BERTopic with Custom Components
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics='auto',  # Dynamically determine number of topics
    verbose=True
)

# üöÄ 8Ô∏è‚É£ Train Model on Questions (Fit & Transform)
topic_model.fit(questions)  # Fit model first
topics, probs = topic_model.transform(questions)  # Then transform data

# üöÄ 9Ô∏è‚É£ Add Topic Assignments to DataFrame
df.loc[df[TEXT_COLUMN].notna(), "topic_q"] = topics

# üöÄ üîü Show Topic Frequency and Keywords for Topic 0
print(topic_model.get_topic_freq().head(15))  # Show top 15 topics
print(topic_model.get_topic(0))  # Keywords for Topic 0



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-02-18 16:40:24,225 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2025-02-18 16:40:32,899 - BERTopic - Embedding - Completed ‚úì
2025-02-18 16:40:32,901 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-18 16:40:33,030 - BERTopic - Dimensionality - Completed ‚úì
2025-02-18 16:40:33,031 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-18 16:40:33,042 - BERTopic - Cluster - Completed ‚úì
2025-02-18 16:40:33,044 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-18 16:40:33,062 - BERTopic - Representation - Completed ‚úì
2025-02-18 16:40:33,064 - BERTopic - Topic reduction - Reducing number of topics
2025-02-18 16:40:33,088 - BERTopic - Topic reduction - Reduced number of topics from 2 to 2


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2025-02-18 16:40:39,670 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-02-18 16:40:39,674 - BERTopic - Dimensionality - Completed ‚úì
2025-02-18 16:40:39,676 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-02-18 16:40:39,682 - BERTopic - Cluster - Completed ‚úì


   Topic  Count
0      0     81
1      1     10
[('question', 0.4080269821095939), ('time', 0.21988386764199733), ('hear', 0.14939081699603973), ('questions', 0.14658924509466487), ('thank', 0.09135656814458598), ('okay', 0.09135656814458598), ('hear okay', 0.05255221377993026), ('', 1e-05), ('', 1e-05), ('', 1e-05)]


# Running FinBERT on question chunks - aiming to gather insights at the analyst/question level

* Token length max = 512

In [30]:
# Load ProsusAI FinBERT model & tokenizer
MODEL_NAME = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.eval()  # Set model to evaluation mode

# Load your preprocessed data (assuming it's already in `df`)
TEXT_COLUMN = "Question_cleaned"  # Adjust this if your column name is different

# Define class labels for FinBERT
LABELS = ["Negative", "Neutral", "Positive"]

# Function to get sentiment scores
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():  # No gradient calculation needed - only inferring, not training
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy().flatten()
    sentiment_label = LABELS[probs.argmax()]  # Get label with highest probability

    return {"Negative_q": probs[0], "Neutral_q": probs[1], "Positive_q": probs[2], "Sentiment_q": sentiment_label}

# Apply sentiment analysis to each chunk
df["sentiment_question"] = df[TEXT_COLUMN].apply(get_sentiment)

# Convert dictionary to separate columns
sentiment_df = df["sentiment_question"].apply(pd.Series)
df = pd.concat([df, sentiment_df], axis=1).drop(columns=["sentiment_question"])

# Show results
df.head()

Unnamed: 0,Index,Quarter-Year,Question,Question_cleaned,Asked By,Role of the person asked the question,Answer,Answer_cleaned,Answered By,Role of the person answered the question,topic_q,Negative_q,Neutral_q,Positive_q,Sentiment_q
0,1,1Q23,"So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,Steven Chubak,"Analyst, Wolfe Research LLC","Well, I think you were already kind of complet...",['well think already kind complete answering q...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...",0.0,0.185998,0.027652,0.78635,Positive
1,2,1Q23,"Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,Ken Usdin,"Analyst, Jefferies LLC","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",0.0,0.537501,0.106014,0.356486,Negative
2,3,1Q23,"Hi, thanks. Jeremy, wanted to follow up again ...",['hi thanks jeremy wanted follow drivers nii r...,John McDonald,"Analyst, Autonomous Research","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",0.0,0.105943,0.277307,0.61675,Positive
3,4,1Q23,My first question is you mentioned that your r...,['first question mentioned reserve build drive...,Erika Najarian,"Analyst, UBS Securities LLC","Yeah. So, Erika, as you know, we take \n not g...",['yeah so erika know take going go lot detail ...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",0.0,0.04943,0.678445,0.272125,Neutral
4,5,1Q23,Hey. Good morning. Maybe just a little bit on ...,['hey good morning maybe little bit deposit th...,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Yeah. A couple things there. So, first of all,...",['yeah couple things there so first all know r...,"Jeremy Barnum, Jamie Dimon","Chief Financial Officer, JPMorgan Chase & Co.;...",0.0,0.186244,0.032985,0.780771,Positive


# Flan-T5 topic extraction: zero-shot prompting

In [44]:
# Summary table to show questions asked per analyst per quarter

# Group Data by Speaker and Quarter
speaker_question_counts = df.groupby(["Asked By", "Quarter-Year"]).size().reset_index(name="question_count")

# üöÄ 2Ô∏è‚É£ Pivot to Create a Summary Table
summary_df = speaker_question_counts.pivot(index="Asked By", columns="Quarter-Year", values="question_count").fillna(0)

summary_df

Quarter-Year,1Q23,1Q24,2Q23,2Q24,3Q23,3Q24,4Q23,4Q24
Asked By,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Betsy L. Graseck,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
Charles W. Peabody,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Ebrahim H. Poonawala,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
Erika Najarian,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Gerard Cassidy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Glenn Schorr,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
Jamie Dimon,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Jeremy Barnum,0.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0
Jim Mitchell,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
John McDonald,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0


In [39]:
# üöÄ 1Ô∏è‚É£ Load Flan-T5 Model & Tokenizer
MODEL_NAME = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()  # Set model to inference mode

# üöÄ 2Ô∏è‚É£ Function to Convert Data into More Natural Input
def process_sentiment_data(speaker, data):
    """Converts structured sentiment scores into a readable summary for Flan-T5"""

    quarter_groups = data.groupby("Quarter-Year")
    sentiment_trends = []

    for quarter, group in quarter_groups:
        sentiments = group["Sentiment_q"].tolist()
        neg_scores = group["Negative_q"].tolist()
        neu_scores = group["Neutral_q"].tolist()
        pos_scores = group["Positive_q"].tolist()

        sentiment_counts = {sent: sentiments.count(sent) for sent in set(sentiments)}
        total = len(sentiments)

        avg_neg = sum(neg_scores) / total if total else 0
        avg_neu = sum(neu_scores) / total if total else 0
        avg_pos = sum(pos_scores) / total if total else 0

        # **üöÄ Convert Numerical Scores into a More Text-Like Input**
        dominant_sentiment = max(
            [("Negative", avg_neg), ("Neutral", avg_neu), ("Positive", avg_pos)],
            key=lambda x: x[1]
        )[0]  # Find dominant sentiment in the quarter

        sentiment_trends.append(
            f"In {quarter}, the speaker's sentiment was mainly {dominant_sentiment}. "
            f"Negative sentiment was {avg_neg:.2f}, Neutral was {avg_neu:.2f}, and Positive was {avg_pos:.2f}. "
            f"{sentiment_counts} sentiment labels were assigned."
        )

    return "\n".join(sentiment_trends)

# üöÄ 3Ô∏è‚É£ Define Improved Sentiment Analysis Function
def summarize_speaker_sentiment(speaker, data):
    """Summarizes sentiment patterns based on FinBERT outputs per quarter for a given speaker."""

    # Convert structured sentiment data into natural language
    sentiment_text = process_sentiment_data(speaker, data)

    # Example of expected output to guide the model
    example_output = (
        "Example Summary:\n"
        "In 1Q23, the speaker's sentiment was mainly Neutral, with occasional Negative shifts. "
        "By 2Q23, the tone became more Negative, particularly in economic discussions. "
        "In 3Q23, the sentiment was mixed, but Positive sentiment increased slightly due to improved outlook. "
        "Overall, sentiment fluctuated, but later quarters showed a shift toward optimism."
    )

    # **üöÄ More Explicit Prompt**
    prompt = (
        f"The following data shows how {speaker}'s sentiment evolved over multiple financial quarters. "
        f"Analyze the sentiment shifts and summarize how the speaker's tone has changed over time.\n\n"
        f"Sentiment Trends Per Quarter:\n{sentiment_text}\n\n"
        f"Your response should follow this format:\n\n{example_output}\n\n"
        f"Provide a detailed summary of {speaker}'s sentiment trends over time:"
    )

    # Tokenize & Generate Response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=150)  # Allow detailed response

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return summary

# üöÄ 4Ô∏è‚É£ Generate Summaries for Each Speaker
speaker_summaries = []
for speaker, group in df.groupby("Asked By"):
    if not group.empty:
        summary = summarize_speaker_sentiment(speaker, group)
        speaker_summaries.append({"Asked By": speaker, "sentiment_summary": summary})

# üöÄ 5Ô∏è‚É£ Convert Results into DataFrame
sentiment_summary_df = pd.DataFrame(speaker_summaries)


In [40]:
sentiment_summary_df

Unnamed: 0,Asked By,sentiment_summary
0,Betsy L. Graseck,Betsy L. Graseck's sentiment fluctuated over t...
1,Charles W. Peabody,"The speaker's sentiment fluctuated, but later ..."
2,Ebrahim H. Poonawala,"The speaker's sentiment fluctuated, but later ..."
3,Erika Najarian,"By 2Q23, the tone became more Negative, partic..."
4,Gerard Cassidy,"In 1Q23, the speaker's sentiment was mainly Ne..."
5,Glenn Schorr,"The speaker's sentiment fluctuated, but later ..."
6,Jamie Dimon,"In 1Q23, the speaker's sentiment was mainly Po..."
7,Jeremy Barnum,Jeremy Barnum's sentiment fluctuated over the ...
8,Jim Mitchell,"In 1Q23, the speaker's sentiment was mainly Ne..."
9,John McDonald,"In 1Q23, the speaker's sentiment was mainly Po..."
