<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/modelling/ob_full_pipeline_jpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Oscar Bowden
Role: Research Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://uk.linkedin.com/in/oscar-bowden-4b14711b7
Date: 2025-02-17
Version: 1.0

Description:
    This notebook is a rough version of a modelling pipeline for pre-processed financial meeting transcript
    data (JPMorganChase). It employs BERTopic, finBERT and Flan-T5 to extract insights into the speakers
    from the Q&A sections of the transcripts.
===================================================
"""



# Imports

In [1]:
!pip install bertopic
!pip install umap-learn
!pip install datsets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install scikit-learn

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloadin

In [16]:
#Imports

from google.colab import drive
import os

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import gc
from transformers import DataCollatorForSeq2Seq

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Data loading and pre-processing

In [3]:
# Load pre-processed (questions extracted via GPT) data

drive.mount('/content/drive', force_remount=True)

path = "/content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/jpmorgan_qna_df_preprocessed_final.csv"

df = pd.read_csv(path)

df.head()

Mounted at /content/drive


Unnamed: 0,Index,Quarter-Year,Question,Question_cleaned,Asked By,Role of the person asked the question,Answer,Answer_cleaned,Answered By,Role of the person answered the question
0,1,1Q23,"So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,Steven Chubak,"Analyst, Wolfe Research LLC","Well, I think you were already kind of complet...",['well think already kind complete answering q...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C..."
1,2,1Q23,"Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,Ken Usdin,"Analyst, Jefferies LLC","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
2,3,1Q23,"Hi, thanks. Jeremy, wanted to follow up again ...",['hi thanks jeremy wanted follow drivers nii r...,John McDonald,"Analyst, Autonomous Research","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
3,4,1Q23,My first question is you mentioned that your r...,['first question mentioned reserve build drive...,Erika Najarian,"Analyst, UBS Securities LLC","Yeah. So, Erika, as you know, we take \n not g...",['yeah so erika know take going go lot detail ...,Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co."
4,5,1Q23,Hey. Good morning. Maybe just a little bit on ...,['hey good morning maybe little bit deposit th...,Jim Mitchell,"Analyst, Seaport Global Securities LLC","Yeah. A couple things there. So, first of all,...",['yeah couple things there so first all know r...,"Jeremy Barnum, Jamie Dimon","Chief Financial Officer, JPMorgan Chase & Co.;..."


In [None]:
"""
# Summary table to show questions asked per analyst per quarter

# Group Data by Speaker and Quarter
speaker_question_counts = df.groupby(["Asked By", "Quarter-Year"]).size().reset_index(name="question_count")

# Pivot to Create a Summary Table
summary_df = speaker_question_counts.pivot(index="Asked By", columns="Quarter-Year", values="question_count").fillna(0)

summary_df
"""

* Now we will load the Financial PhraseBank dataset.
* https://huggingface.co/datasets/takala/financial_phrasebank
* Sentences_allagree version, which has 100% annotator agreement for every row.
* 8 annotators.
* Data is in the format: { "sentence": "Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings that were hit by larger expenditures on R&D and marketing .",
  "label": "negative"}
* Will need to prepare the quarterly question data in a similar way for the LLM to analyse sentiment.

In [24]:
# Load the Financial PhraseBank Dataset (100% Annotator Agreement)
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")

# Convert to Pandas DataFrame
df_fpb = pd.DataFrame(dataset["train"])

# Rename columns for consistency
df_fpb = df_fpb.rename(columns={"sentence": "text", "label": "sentiment"})  # Sentiment is already text

# Train-Test Split (80% Train, 20% Test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_fpb["text"], df_fpb["sentiment"], test_size=0.2, random_state=42
)

# Convert numeric labels to strings before creating Hugging Face datasets
train_labels = train_labels.astype(str)
test_labels = test_labels.astype(str)

# Convert to DataFrame for Hugging Face format
train_df = pd.DataFrame({"input_text": "Classify sentiment: " + train_texts, "output_text": train_labels})
test_df = pd.DataFrame({"input_text": "Classify sentiment: " + test_texts, "output_text": test_labels})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df).remove_columns(["__index_level_0__"])
test_dataset = Dataset.from_pandas(test_df).remove_columns(["__index_level_0__"])

# To view
df_check = train_dataset.to_pandas()
print(df_check.shape)
df_check.head(5)

(1811, 2)


Unnamed: 0,input_text,output_text
0,Classify sentiment: The robust growth was the ...,2
1,Classify sentiment: Operating profit fell to E...,0
2,Classify sentiment: Finnish electronics manufa...,0
3,Classify sentiment: Finland 's national carrie...,2
4,Classify sentiment: The Elcoteq group recently...,0


* Target variable is encoded and an instruction tag ("Classify sentiment:") is prepended to each sentence to prepare for later fine-tuning.
* 80/20 train/test split conducted.

In [23]:
# Preparing 'Question' column of JPM data

# Create a new column that contains the list of sentences from each 'Question'
df['Sentences'] = df['Question'].apply(lambda x: sent_tokenize(str(x)))

# Explode the DataFrame
df_exploded = df.explode('Sentences')

df_exploded = df_exploded.rename(columns={'Sentences': 'Sentence'})
df_exploded = df_exploded[['Sentence', 'Question', 'Asked By', 'Quarter-Year']]

print(df_exploded.shape)

#Remove rows with 3 or fewer words
df_exploded = df_exploded[df_exploded['Sentence'].apply(lambda x: len(x.split()) > 3)]

print(df_exploded.shape)

df_exploded.head()

(399, 4)
(342, 4)


Unnamed: 0,Sentence,Question,Asked By,Quarter-Year
0,"So, Jamie, I was actually hoping to get your p...","So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,In your letter you spent a fair amount of time...,"So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,But what are some of the changes that you're s...,"So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
0,"And along those same lines, how you're thinkin...","So, Jamie, I was actually hoping to get your p...",Steven Chubak,1Q23
1,"Hey, Jeremy, I was just wondering if you can j...","Hey, thanks. Good morning. Hey, Jeremy, I was ...",Ken Usdin,1Q23


* Metadata retained per sentence.
* Removed rows with 3 or fewer words in the sentence - resulted in 399 -> 342 rows. This was needed as many short irrelevant sentences such as "Thank you." or "Good morning." existed.

# 1) Flan-T5 sentiment extraction: zero-shot prompting

In [4]:
# Setting environment variable to reduce GPU RAM fragmentation

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [25]:
# 🚀 1️⃣ Load Flan-T5 Model & Tokenizer
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()  # Set model to inference mode

# 🚀 2️⃣ Define Optimized Sentiment Extraction Function
def extract_sentiment_flan_t5(text):
    """Uses Flan-T5 to classify sentiment as Positive, Negative, or Neutral."""

    # 🚀 Updated Prompt with Examples for Clarity
    prompt = (
        f"Classify the sentiment of the following financial question as Positive, Negative, or Neutral.\n\n"
        f"Examples:\n"
        f"1. 'Will the company increase dividends this quarter?' → Positive\n"
        f"2. 'How will rising inflation impact profitability?' → Negative\n"
        f"3. 'What are your expectations for the next quarter?' → Neutral\n\n"
        f"Now classify this question:\n"
        f"Question: {text}\n\n"
        f"Sentiment:"
    )

    # Tokenize & Generate Response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=5)  # Allow slightly longer responses

    sentiment = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # 🚀 Ensure valid output
    valid_sentiments = {"Positive", "Negative", "Neutral"}
    return sentiment if sentiment in valid_sentiments else "Uncertain"  # Default to "Uncertain" instead of Neutral

# 🚀 3️⃣ Apply Sentiment Extraction to Question Chunks
TEXT_COLUMN = "Sentence"
df_exploded["flan_t5_sentiment_zero_shot"] = df_exploded[TEXT_COLUMN].dropna().apply(extract_sentiment_flan_t5)

In [26]:
df_exploded["flan_t5_sentiment_zero_shot"].value_counts()

Unnamed: 0_level_0,count
flan_t5_sentiment_zero_shot,Unnamed: 1_level_1
Neutral,295
Positive,41
Negative,6


* Zero-shot prompting has produced the above proportions of sentiment.

# 2) Fine-tuning Flan-T5 with Financial Phrasebank

In [9]:
# 🚀 3️⃣ Load Flan-T5 Model & Tokenizer
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Tokenization Function (Ensure Proper List Format)
def tokenize_function(examples):
    inputs = tokenizer(
        examples["input_text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    targets = tokenizer(
        examples["output_text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

    # Replace pad token ids with -100 for labels so that they are ignored in the loss computation
    labels = targets["input_ids"]
    labels = [[token if token != tokenizer.pad_token_id else -100 for token in label] for label in labels]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }

# Apply tokenization correctly with proper batching
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print("✅ Tokenization successful!")

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

✅ Tokenization successful!


In [14]:
"""
# Check data consistency - debugging

# Part 1: Check Data Consistency
def check_data_consistency(tokenized_dataset, tokenizer, num_samples=5):
    print(f"Checking data consistency for {num_samples} samples...\n")
    # Randomly choose sample indices
    indices = random.sample(range(len(tokenized_dataset)), num_samples)

    for idx in indices:
        example = tokenized_dataset[idx]
        input_ids = example["input_ids"]
        attention_mask = example["attention_mask"]
        labels = example["labels"]

        # Decode input text (skip special tokens for clarity)
        decoded_input = tokenizer.decode(input_ids, skip_special_tokens=True)

        # For labels, filter out -100 tokens (which are ignored by the loss function)
        filtered_labels = [token for token in labels if token != -100]
        decoded_labels = tokenizer.decode(filtered_labels, skip_special_tokens=True)

        print(f"Sample index: {idx}")
        print("Decoded Input: ", decoded_input)
        print("Decoded Labels:", decoded_labels)
        print("Input length:", len(input_ids))
        print("Attention mask sum (non-pad tokens):", sum(attention_mask))
        print("Labels length:", len(labels))
        print("Count of ignored (-100) tokens in labels:", labels.count(-100))
        print("-"*50)

# Run the consistency check on your tokenized training dataset
check_data_consistency(tokenized_train, tokenizer)

# Part 2: Debug a Single Forward Pass to Check Loss Computation
def debug_single_batch(tokenized_dataset, model, tokenizer, batch_size=4):
    # Take a small batch from the dataset
    sample_batch = tokenized_dataset.select(range(batch_size))

    # Convert lists to tensors and move to the same device as the model
    input_ids = torch.tensor(sample_batch["input_ids"]).to(model.device)
    attention_mask = torch.tensor(sample_batch["attention_mask"]).to(model.device)
    labels = torch.tensor(sample_batch["labels"]).to(model.device)

    model.train()  # Ensure the model is in training mode

    # Forward pass (this will compute the loss because labels are provided)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    print(f"Loss for a single batch of size {batch_size}: {loss.item()}")

# Run the single-batch debug
debug_single_batch(tokenized_train, model, tokenizer)
"""

Checking data consistency for 5 samples...

Sample index: 51
Decoded Input:  Classify sentiment: Operating profit totaled EUR 9.4 mn , down from EUR 11.7 mn in 2004 .
Decoded Labels: 0
Input length: 128
Attention mask sum (non-pad tokens): 30
Labels length: 128
Count of ignored (-100) tokens in labels: 125
--------------------------------------------------
Sample index: 1518
Decoded Input:  Classify sentiment: The investment will be worth approximately EUR 100mn .
Decoded Labels: 1
Input length: 128
Attention mask sum (non-pad tokens): 17
Labels length: 128
Count of ignored (-100) tokens in labels: 126
--------------------------------------------------
Sample index: 563
Decoded Input:  Classify sentiment: Mobile communication and wireless broadband provider Nokia Inc NYSE : NOK today set new financial targets and forecasts for Nokia and the mobile device industry and also for Nokia Siemens Networks and the mobile and fixed infrastructure and related services market .
Decoded Labels: 1


In [8]:
# Clear GPU Cache

gc.collect()
torch.cuda.empty_cache()

In [15]:
print("Allocated:", torch.cuda.memory_allocated())
print("Reserved:", torch.cuda.memory_reserved())


Allocated: 41839444480
Reserved: 41888514048


In [10]:
# Define Training Arguments & Fine-Tune Model

output_dir = "/content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/flan_t5_large_200225"

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 🚀 Start Fine-Tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model saved to {output_dir}")

Epoch,Training Loss,Validation Loss
1,0.1907,0.083689
2,0.0443,0.054032
3,0.0157,0.057747


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


✅ Model saved to /content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/flan_t5_large_200225


In [11]:
# Load fine-tuned model for inference with test set

best_checkpoint = trainer.state.best_model_checkpoint
print("Best checkpoint path:", best_checkpoint)

# Load best model and tokenizer for evaluation
model = T5ForConditionalGeneration.from_pretrained(best_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(best_checkpoint)

# Define Sentiment Classification Function
def predict_sentiment(text):
    """Predicts sentiment using fine-tuned Flan-T5"""
    prompt = f"Classify sentiment: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=2)

    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Predict Sentiment on Test Set
predictions = [predict_sentiment(text) for text in test_texts.tolist()]

# Calculate Accuracy & F1-Score
accuracy = accuracy_score(test_labels.tolist(), predictions)
f1 = f1_score(test_labels.tolist(), predictions, average="weighted")

print(f"✅ Model Accuracy: {accuracy:.4f}")
print(f"✅ Model F1-Score: {f1:.4f}")

Best checkpoint path: /content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/flan_t5_large_200225/checkpoint-454
✅ Model Accuracy: 0.9757
✅ Model F1-Score: 0.9758


* The model appears to be performing well on the test set of Financial Phrasebook.
* Next, we will prepare the quarterly question data for input to the fine-tuned model.

# 3) Inference on fine-tuned Flan-T5

In [27]:
# Data preparation for JPM questions

def prepare_text_for_inference(text):
    text = str(text).strip()
    return f"Classify sentiment: {text}"

df_exploded["Sentence_t5_tuned_infer"] = df_exploded["Sentence"].apply(prepare_text_for_inference)

In [28]:
# Load your fine-tuned model and tokenizer from the best checkpoint

best_checkpoint = trainer.state.best_model_checkpoint
print("Best checkpoint path:", best_checkpoint)

model = T5ForConditionalGeneration.from_pretrained(best_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(best_checkpoint)

# Define the prediction function using your fine-tuned model
def predict_sentiment(prepared_text):
    """
    Predicts sentiment using the fine-tuned Flan-T5 model.
    Assumes the input text is already preprocessed (i.e., prompt prepended).
    """
    inputs = tokenizer(prepared_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=2)
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Apply inference on your prepared quarterly data
df_exploded["flan_t5_sentiment_tuned"] = df_exploded["Sentence_t5_tuned_infer"].apply(predict_sentiment)

Best checkpoint path: /content/drive/MyDrive/Colab Notebooks/DS_CA/BOE/flan_t5_large_200225/checkpoint-454


In [30]:
# Convert labels back to words

df_exploded["flan_t5_sentiment_tuned_label"] = df_exploded["flan_t5_sentiment_tuned"].map({"0": "Negative", "1": "Neutral", "2": "Positive"})

df_exploded["flan_t5_sentiment_tuned_label"].value_counts()

Unnamed: 0_level_0,count
flan_t5_sentiment_tuned_label,Unnamed: 1_level_1
Neutral,275
Positive,55
Negative,12


# 4) Comparison

#----OLD FinBERT and OLD T5 prompting code----


In [5]:
# Load ProsusAI FinBERT model & tokenizer
MODEL_NAME = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.eval()  # Set model to evaluation mode

# Load your preprocessed data (assuming it's already in `df`)
TEXT_COLUMN = "Question_cleaned"  # Adjust this if your column name is different

# Define class labels for FinBERT
LABELS = ["Negative", "Neutral", "Positive"]

# Function to get sentiment scores
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():  # No gradient calculation needed - only inferring, not training
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy().flatten()
    sentiment_label = LABELS[probs.argmax()]  # Get label with highest probability

    return {"fb_Negative_q": probs[0], "fb_Neutral_q": probs[1], "fb_Positive_q": probs[2], "fb_Sentiment_q": sentiment_label}

# Apply sentiment analysis to each chunk
df["finbert_sentiment_question"] = df[TEXT_COLUMN].apply(get_sentiment)

# Convert dictionary to separate columns
sentiment_df = df["finbert_sentiment_question"].apply(pd.Series)
df = pd.concat([df, sentiment_df], axis=1).drop(columns=["finbert_sentiment_question"])

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
df["fb_Sentiment_q"].value_counts()

Unnamed: 0_level_0,count
fb_Sentiment_q,Unnamed: 1_level_1
Positive,63
Negative,19
Neutral,9


* At a glance this is aligns with our rough expectations - considering JPM's performance

In [None]:
"""
# Load Flan-T5 Model & Tokenizer
MODEL_NAME = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()  # Set model to inference mode

# Function to Convert Data into More Natural Input
def process_sentiment_data(speaker, data):

    quarter_groups = data.groupby("Quarter-Year")
    sentiment_trends = []

    for quarter, group in quarter_groups:
        sentiments = group["Sentiment_q"].tolist()
        neg_scores = group["Negative_q"].tolist()
        neu_scores = group["Neutral_q"].tolist()
        pos_scores = group["Positive_q"].tolist()

        sentiment_counts = {sent: sentiments.count(sent) for sent in set(sentiments)}
        total = len(sentiments)

        avg_neg = sum(neg_scores) / total if total else 0
        avg_neu = sum(neu_scores) / total if total else 0
        avg_pos = sum(pos_scores) / total if total else 0

        # **Convert Numerical Scores into a More Text-Like Input**
        dominant_sentiment = max(
            [("Negative", avg_neg), ("Neutral", avg_neu), ("Positive", avg_pos)],
            key=lambda x: x[1]
        )[0]  # Find dominant sentiment in the quarter

        sentiment_trends.append(
            f"In {quarter}, the speaker's sentiment was mainly {dominant_sentiment}. "
            f"Negative sentiment was {avg_neg:.2f}, Neutral was {avg_neu:.2f}, and Positive was {avg_pos:.2f}. "
            f"{sentiment_counts} sentiment labels were assigned."
        )

    return "\n".join(sentiment_trends)

# Define Improved Sentiment Analysis Function
def summarize_speaker_sentiment(speaker, data):

    # Convert structured sentiment data into natural language
    sentiment_text = process_sentiment_data(speaker, data)

    # Example of expected output to guide the model
    example_output = (
        "Example Summary:\n"
        "In 1Q23, the speaker's sentiment was mainly Neutral, with occasional Negative shifts. "
        "By 2Q23, the tone became more Negative, particularly in economic discussions. "
        "In 3Q23, the sentiment was mixed, but Positive sentiment increased slightly due to improved outlook. "
        "Overall, sentiment fluctuated, but later quarters showed a shift toward optimism."
    )

    # **🚀 More Explicit Prompt**
    prompt = (
        f"The following data shows how {speaker}'s sentiment evolved over multiple financial quarters. "
        f"Analyze the sentiment shifts and summarize how the speaker's tone has changed over time.\n\n"
        f"Sentiment Trends Per Quarter:\n{sentiment_text}\n\n"
        f"Your response should follow this format:\n\n{example_output}\n\n"
        f"Provide a detailed summary of {speaker}'s sentiment trends over time:"
    )

    # Tokenize & Generate Response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=150)  # Allow detailed response

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return summary

# Generate Summaries for Each Speaker
speaker_summaries = []
for speaker, group in df.groupby("Asked By"):
    if not group.empty:
        summary = summarize_speaker_sentiment(speaker, group)
        speaker_summaries.append({"Asked By": speaker, "sentiment_summary": summary})

# Convert Results into DataFrame
sentiment_summary_df = pd.DataFrame(speaker_summaries)
"""