<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/rb_jomorgan_summarisation_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bertopic umap-learn hdbscan sentence-transformers
!pip install transformers torch
!pip install rouge_score
!pip install evaluate
!pip install --upgrade protobuf
!pip install tensorboard

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [2]:
!pip install tensorflow
import tensorflow as tf
import numpy as np
import random

Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.30.0
    Uninstalling protobuf-6.30.0:
      Successfully uninstalled protobuf-6.30.0
Successfully installed protobuf-5.29.3


In [3]:
import time
import torch
from google.colab import drive
import os
import sys
import pandas as pd
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
from rouge_score import rouge_scorer
from typing import List, Union, Optional
import logging

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

In [5]:
# creating a pdf reader object
df_qna = pd.read_csv('/content/sample_data/jp_morgan_qna.csv', header=0)

print("Q&A DataFrame:")
display(df_qna.head(3))


Q&A DataFrame:


Unnamed: 0,Quarter,Question,Question_cleaned,Analyst,Analyst Role,Response,Response_cleaned,Executive,Executive Role Type
0,4Q24,"Hi. Good morning. Jeremy, I wanted to ask abou...",['hi good morning jeremy wanted ask capital kn...,John McDonald,"Analyst, Truist Securities, Inc.","Yeah. Good question, John, and welcome back, b...",['yeah good question john welcome back way so ...,Jeremy Barnum,CFO
1,4Q24,"Hi. Simple and then more difficult, I guess. J...",['hi simple difficult guess jamie whos success...,Mike Mayo,"Analyst, Wells Fargo Securities LLC",I do love what I do. And answering the second ...,['love do answering second question first look...,Jamie Dimon,CEO
2,4Q24,"Hey. Good morning. Maybe just on regulation, w...",['hey good morning maybe regulation new admini...,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Hey, Jim. I mean, it's obviously something we'...",['hey jim mean obviously something thinking lo...,Jeremy Barnum,CFO


In [6]:
df_qna = df_qna[df_qna["Quarter"] != "1Q23"]

In [7]:
filtered_df = df_qna[ (df_qna["Quarter"] == "4Q24") & (df_qna["Analyst"] == "John McDonald")]

In [8]:
sentiment = filtered_df["Response"].tolist()  #### genertaing list for modeling

###**Summarisation model**

In [9]:
reset_session()

In [10]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TextSummarizer:
    def __init__(self, model_name: str = "google/flan-t5-large", device: Optional[str] = None):
        """Initialize the summarizer with model and device."""
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
            logger.info(f"Successfully loaded {model_name}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def chunk_text(self,
                  text: Union[str, List[str]],
                  chunk_size: int = 400,
                  overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks."""
        # Validate parameters
        if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
            raise ValueError("Invalid chunk_size or overlap parameters")

        try:

            if isinstance(text, list):
                text = " ".join(text)

            if not text.strip():
                return []

            words = text.split()
            chunks = []
            start = 0

            while start < len(words):
                end = min(start + chunk_size, len(words))
                chunk = " ".join(words[start:end])
                chunks.append(chunk)
                start += chunk_size - overlap

            logger.debug(f"Split text into {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Error in chunk_text: {str(e)}")
            raise

    def summarize_text(self,
                      text: str,
                      min_new_tokens: int = 100,
                      max_new_tokens: int = 400) -> str:
        """Summarize a single piece of text."""

        if pd.isna(text) or not text.strip():
            logger.warning("Empty or NaN text provided")
            return ""

        try:
            prompt = f"Rewrite the following text into a concise and original summary while maintaining its key ideas: {text}"


            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)


            with torch.no_grad():
                outputs = self.model.generate(
                    inputs.input_ids,
                    min_new_tokens=min_new_tokens,
                    max_new_tokens=max_new_tokens,
                    num_beams=4,
                    length_penalty=2,
                    no_repeat_ngram_size=3,
                    early_stopping=True,
                    do_sample=False
                )

            summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return summary

        except Exception as e:
            logger.error(f"Error in summarize_text: {str(e)}")
            raise

    def summarize_long_text(self,
                          text: Union[str, List[str]],
                          chunk_size: int = 300,
                          overlap: int = 50) -> str:
        """Handle long text summarization."""
        try:
            # Get chunks
            chunks = self.chunk_text(text, chunk_size, overlap)
            if not chunks:
                logger.warning("No valid chunks to summarize")
                return ""

            # Summarize chunks
            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                logger.debug(f"Summarizing chunk {i+1}/{len(chunks)}")
                summary = self.summarize_text(chunk)
                if summary.strip():
                    chunk_summaries.append(summary)

            if not chunk_summaries:
                logger.warning("No valid summaries generated")
                return ""

            # If single chunk, return its summary
            if len(chunk_summaries) == 1:
                return chunk_summaries[0]

            # Summarize the combined summaries
            logger.debug("Generating final summary")
            final_summary = self.summarize_text(
                " ".join(chunk_summaries),
                min_new_tokens=150,
                max_new_tokens=300
            )

            return final_summary

        except Exception as e:
            logger.error(f"Error in summarize_long_text: {str(e)}")
            raise

# Running the model
try:
    summarizer = TextSummarizer()

    # Perform summarization
    logger.info("Starting summarization of management discussion")
    summary_text = summarizer.summarize_long_text(sentiment)

    print("Final Summary:", summary_text)

    # Create DataFrame for summary
    df_summary = pd.DataFrame({"Summary": [summary_text]})
    df_summary.to_csv("summary_output.csv", index=False)  # Export to CSV

    # Save model and tokenizer
    model_save_path = "./saved_model"
    summarizer.model.save_pretrained(model_save_path)
    summarizer.tokenizer.save_pretrained(model_save_path)

    logger.info(f"Model and tokenizer saved to {model_save_path}")

except Exception as e:
    logger.error(f"Error during summarization: {str(e)}")
    print(f"An error occurred: {str(e)}")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Final Summary: We've grown a lot and it has contributed quite a bit to our growth and to our ability to run the company efficiently. But any time you have that quantum of head count growth as well as that rate of headcount growth, you have to believe, all else equal, that some amount of inefficiency has been introduced. And so, this year, as we went through the budget cycle, we've gotten a little bit better at identifying the reds and the ambers that are behind the greens, and that's embedded in the culture of the company. So we do that everywhere and continue analyzing and iterating, and we throw resources against that stuff as they do that. The truth is — and I guess this is a good thing — that the themes are remarkably consistent. So, we are seeing the results of our kind of high certainty investment choices across all the categories that you know very well and that we highlighted on the outlook page for expenses, and those continue to be the main areas of focus. The execution gets 

In [11]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [12]:
discussion_str = " ".join(sentiment)

# Calculate ROUGE scores
scores = scorer.score(discussion_str, summary_text)
for key in scores:
    print(f'{key}: {scores[key]}')

rouge1: Score(precision=0.9948979591836735, recall=0.20440251572327045, fmeasure=0.33913043478260874)
rouge2: Score(precision=0.958974358974359, recall=0.19622245540398742, fmeasure=0.3257839721254356)
rougeL: Score(precision=0.5408163265306123, recall=0.1111111111111111, fmeasure=0.1843478260869565)


### **Running model on full period and crating a CSV output with data from each individual quarters on negative sentiment text.**

In [15]:
summarizer = TextSummarizer()

summary_data = []

quarter_years = df_qna["Quarter"].unique()
analysts = df_qna["Analyst"].unique()

# Loop through each unique Quarter-Year and Analyst
for quarter in quarter_years:
    for analyst in analysts:
        filtered_df = df_qna[(df_qna["Quarter"] == quarter) & (df_qna["Analyst"] == analyst)]

        snippets = filtered_df["Response"].dropna().astype(str).tolist()

        if not snippets:
            logger.warning(f"No valid snippets found for Quarter-Year: {quarter}, Analyst: {analyst}")
            continue

        print(type(summarizer))
        # Generate the summary for the snippets in this Quarter-Year and Analyst
        try:
            summary_text = summarizer.summarize_long_text(snippets)
        except Exception as e:
            logger.error(f"Error summarizing for Quarter {quarter}, Analyst {analyst}: {str(e)}")
            continue

        summary_data.append({"Summary": summary_text, "Quarter": quarter, "Analyst": analyst})

# Convert summary data to DataFrame
df_summary = pd.DataFrame(summary_data)

df_summary.to_csv("summary_output.csv", index=False)

print(df_summary)


<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>
<class '__main__.TextSummarizer'>




                                              Summary Quarter  \
0   We've grown a lot and it has contributed quite...    4Q24   
1   I think it's the rational thing to do. I've ha...    4Q24   
2   Jamie Dimon's comments are consistent with wha...    4Q24   
3   We're happy to see the clear recognition on th...    4Q24   
4   Let's go to the next question. Thanks. Yeah, t...    4Q24   
..                                                ...     ...   
72  The Wall Street Journal reports that the U.S. ...    2Q23   
73  I think the overall point that we're trying to...    2Q23   
74  What's the outlook for investment banking and ...    2Q23   
75  Thank you, guys. Thank you for your help. Than...    2Q23   
76  I would think about that as being really entir...    2Q23   

               Analyst  
0        John McDonald  
1            Mike Mayo  
2         Jim Mitchell  
3       Erika Najarian  
4                Erika  
..                 ...  
72       Steven Chubak  
73        Glenn Sch