<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/rb_jomorgan_summarisation_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **FLAN-T5-Large** is tested for text summarisation using JPMorgan Financial transcripts.

The model is applied to each individual quarter and year, where negative sentiment is detected from management Answers to anlaysts questions.


**A bespoke prompt** is applied for summarisation of answers:

**1)** *prompt = f"Summarize by generating a comprehensive and detailed summary that accurately captures all key points, facts, and discussions from the given text. Ensure that important details on management decisions, financial strategies, business growth, economic expectations, leadership transitions, regulatory frameworks, and technological improvements are retained. Preserve key statements and quotes where necessary to maintain factual accuracy: {text}"*

**ROUGE scores** are applied to measure alignment with reference texts through precision, recall, and F-measure, are used, helping assess models performance.


In [None]:
!pip install bertopic umap-learn hdbscan sentence-transformers
!pip install transformers torch
!pip install rouge_score
!pip install evaluate
!pip install --upgrade protobuf
!pip install tensorboard



In [None]:
!pip install tensorflow
import tensorflow as tf
import numpy as np
import random



In [None]:
import time
import torch
from google.colab import drive
import os
import sys
import pandas as pd
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from rouge_score import rouge_scorer
from typing import List, Union, Optional
import logging

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

In [54]:
# creating a pdf reader object
df_qna = pd.read_csv('/content/sample_data/JPMorgan_qna_answer_topics_by_quarter_gpt.csv', header=0)

print("Q&A DataFrame:")
display(df_qna.head(3))


Q&A DataFrame:


Unnamed: 0,Quarter-Year,Sentiment,Topic,Snippet
0,4Q24,Negative,Capital Management Strategy,"""We feel very comfortable with the notion that..."
1,4Q24,Negative,Capital Deployment and Buybacks,"""It means more capital return through buybacks..."
2,4Q24,Negative,Investment Focus and Operational Execution,"""The themes are remarkably consistent. So, we ..."


In [55]:
df_qna = df_qna[df_qna["Quarter-Year"] != "1Q23"]

In [56]:
filtered_df = df_qna[ (df_qna["Quarter-Year"] == "4Q24") & (df_qna["Sentiment"] == "Negative")]

In [57]:
sentiment = filtered_df["Snippet"].tolist()  #### genertaing list for modeling

###**Summarisation model**

In [None]:
reset_session()

In [None]:
class TextSummarizer:
    def __init__(self, model_name: str = "google/flan-t5-large", device: Optional[str] = None):
        """Initialize the summarizer with model and device."""
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
            logger.info(f"Successfully loaded {model_name}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def chunk_text(self,
                  text: Union[str, List[str]],
                  chunk_size: int = 400,
                  overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks."""
        # Validate parameters
        if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
            raise ValueError("Invalid chunk_size or overlap parameters")

        try:

            if isinstance(text, list):
                text = " ".join(text)

            if not text.strip():
                return []

            words = text.split()
            chunks = []
            start = 0

            while start < len(words):
                end = min(start + chunk_size, len(words))
                chunk = " ".join(words[start:end])
                chunks.append(chunk)
                start += chunk_size - overlap

            logger.debug(f"Split text into {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Error in chunk_text: {str(e)}")
            raise

    def summarize_text(self,
                      text: str,
                      min_new_tokens: int = 200,
                      max_new_tokens: int = 400) -> str:
        """Summarize a single piece of text."""

        if pd.isna(text) or not text.strip():
            logger.warning("Empty or NaN text provided")
            return ""

        try:
            prompt = f"Summarize by generating a comprehensive and detailed summary that accurately captures all key points, facts, and discussions from the given text. Ensure that important details on management decisions, financial strategies, business growth, economic expectations, leadership transitions, regulatory frameworks, and technological improvements are retained. Preserve key statements and quotes where necessary to maintain factual accuracy: {text}"


            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)


            with torch.no_grad():
                outputs = self.model.generate(
                    inputs.input_ids,
                    min_new_tokens=min_new_tokens,
                    max_new_tokens=max_new_tokens,
                    num_beams=4,
                    length_penalty=2.0,
                    no_repeat_ngram_size=3,
                    early_stopping=True,
                    do_sample=False
                )

            summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return summary

        except Exception as e:
            logger.error(f"Error in summarize_text: {str(e)}")
            raise

    def summarize_long_text(self,
                          text: Union[str, List[str]],
                          chunk_size: int = 400,
                          overlap: int = 50) -> str:
        """Handle long text summarization."""
        try:
            # Get chunks
            chunks = self.chunk_text(text, chunk_size, overlap)
            if not chunks:
                logger.warning("No valid chunks to summarize")
                return ""

            # Summarize chunks
            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                logger.debug(f"Summarizing chunk {i+1}/{len(chunks)}")
                summary = self.summarize_text(chunk)
                if summary.strip():
                    chunk_summaries.append(summary)

            if not chunk_summaries:
                logger.warning("No valid summaries generated")
                return ""

            # If single chunk, return its summary
            if len(chunk_summaries) == 1:
                return chunk_summaries[0]

            # Summarize the combined summaries
            logger.debug("Generating final summary")
            final_summary = self.summarize_text(
                " ".join(chunk_summaries),
                min_new_tokens=150,
                max_new_tokens=300
            )

            return final_summary

        except Exception as e:
            logger.error(f"Error in summarize_long_text: {str(e)}")
            raise

# Running the model
try:
    summarizer = TextSummarizer()

    # Perform summarization
    logger.info("Starting summarization of management discussion")
    summary_text = summarizer.summarize_long_text(sentiment)

    print("Final Summary:", summary_text)

    # Create DataFrame for summary
    df_summary = pd.DataFrame({"Summary": [summary_text]})
    df_summary.to_csv("summary_output.csv", index=False)  # Export to CSV

    # Save model and tokenizer
    model_save_path = "./saved_model"
    summarizer.model.save_pretrained(model_save_path)
    summarizer.tokenizer.save_pretrained(model_save_path)

    logger.info(f"Model and tokenizer saved to {model_save_path}")

except Exception as e:
    logger.error(f"Error during summarization: {str(e)}")
    print(f"An error occurred: {str(e)}")


Final Summary: Bank of America's chief financial officer says the company's QT portfolio is expected to taper off by the middle of the year and that the bank is also aggressively engaging with clients and acquiring all the new clients and deepening in a lot of different markets. "We're putting a LOT of effort into improving the sort of ability of our software engineers to be productive," he said. "Jenn Piepszak, who does not want to be the CEO, will be here as Chief Operating Officer and stay after that." "all we want is a coherent, rational, holistically-assessed regulatory framework that allows banks to do their job, supporting the economy, that isn't reflexively anti-bank." "in what you might call the affluent section of the Wealth Management space, we are significantly underpenetrated."


In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
discussion_str = " ".join(sentiment)

# Calculate ROUGE scores
scores = scorer.score(discussion_str, summary_text)
for key in scores:
    print(f'{key}: {scores[key]}')

rouge1: Score(precision=0.9185185185185185, recall=0.2309124767225326, fmeasure=0.36904761904761907)
rouge2: Score(precision=0.7835820895522388, recall=0.1958955223880597, fmeasure=0.31343283582089554)
rougeL: Score(precision=0.6962962962962963, recall=0.1750465549348231, fmeasure=0.27976190476190477)


### **Running model on full period and crating a CSV output with data from each individual quarters on negative sentiment text.**

In [59]:
summarizer = TextSummarizer()

summary_data = []

quarter_years = df_qna["Quarter-Year"].unique()

# Loop through each unique Quarter-Year
for quarter in quarter_years:

    filtered_df = df_qna[(df_qna["Quarter-Year"] == quarter) & (df_qna["Sentiment"] == "Negative")]

    sentiment = filtered_df["Snippet"].dropna().astype(str).tolist()

    if not sentiment:
        logger.warning(f"No valid snippets found for Quarter-Year: {quarter}")
        continue

    print(type(summarizer))
    # Generate the summary for the snippets in this Quarter-Year
    try:
        summary_text = summarizer.summarize_long_text(sentiment)
    except Exception as e:
        logger.error(f"Error summarizing for Quarter-Year {quarter}: {str(e)}")
        continue

    summary_data.append({"Summary": summary_text, "Quarter-Year": quarter})

df_summary = pd.DataFrame(summary_data)

df_summary.to_csv("summary_output.csv", index=False)

print(df_summary)

<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
                                             Summary Quarter-Year
0  Bank of America's chief financial officer says...         4Q24
1  Shares of Bank of America jumped more than 5% ...         3Q24
2  Credit Suisse's chief executive, Jamie Dimon, ...         2Q24
3  Morgan Stanley's chief financial officer discu...         1Q24
4  Bank of America's chief executive says the com...         4Q23
5  Credit Suisse's chief executive said the compa...         3Q23
6  Bank of America Merrill Lynch chief financial ...         2Q23
