<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/Preprocessing/notebooks/processed/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Chiaki Tachikawa
Role: Data Sciense Lead,
Date:2025-02-11
Description:

# **Library**

In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [33]:
!pip install tensorflow
!pip install numpy
!pip install bertopic


import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import tensorflow as tf
from umap import UMAP
from bertopic.representation import MaximalMarginalRelevance



# **Function**

preprocessor function : if you add data name and column name, the values in the column are preprocessed.

In [99]:
#create function to preprocess data
def preprocessor (data, col, col1,col2):
  #Copy col1umn
  data[col1]=data[col]
  data[col2]=data[col]

  #Adding column1
  #Lower the lettercase
  data[col1] = data[col1].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col1] = data[col1].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #Tokenize the word
  data[col1] = data[col1].apply(word_tokenize)

  #Remove numbers
  data[col1] = data[col1].apply(lambda x: [word for word in x if not word.isdigit()])

  #remove symbol from comments
  data[col1] = data[col1].apply(lambda x: [word for word in x if x!=""])

  #remove short word
  data[col1] = data[col1].apply(lambda x: [word for word in x if len(word)>2])

  #remove symbols
  data[col1] = data[col1].apply (lambda x: [re.sub(r"[^a-z]", "", word) for word in x])

  #Adding column2
  #Lower the lettercase

  data[col2] = data[col2].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col2] = data[col2].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #remove symbols
  data[col2] = data[col2].apply (lambda x: [re.sub(r"[.,'?]", "", x)])

  return


remove_freq_words function: This function removes top X percent of frequent words from the column.

In [None]:
"""
def remove_freq_words(data, col, percent):
  #Flatten the list of tokens
  all_tokens = [token for sublist in data[col] for token in sublist]

#calculate word frequencies
  word_freq = Counter(all_tokens)

#Convert it to DF
  word_freq_df = pd.DataFrame(word_freq.items(), columns = ["word","freq"])

#Identify the to 5% most frequent words
  top_5_percent = word_freq_df.nlargest(int(len(word_freq_df)*percent), "freq")["word"]

  filtered_data = []
  for sentence in data[col]:
    filtered_sentence = [word for word in sentence if word not in top_5_percent.values]
    filtered_data.append(" ".join(filtered_sentence))

  print(filtered_data)
  """

    Splits the input text into chunks that do not exceed max_chunk_size characters.
    The splitting is based on sentence boundaries.

    Parameters:
        text (str): The full text to be chunked.
        max_chunk_size (int): Maximum number of characters per chunk.

    Returns:
        List[str]: A list of text chunks.

In [36]:
def chunk_text(text, max_chunk_size=500):

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
            else:
                # In case a single sentence exceeds max_chunk_size.
                chunks.append(sentence.strip())
                current_chunk = ""
        else:
            current_chunk += sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

## **Data**

JP Morgan QA section

In [100]:
#Obtaining management discussion / git bash
!git clone https://github.com/sheldonkemper/bank_of_england.git
!git switch Preprocessing
%cd bank_of_england/data/processed
%ls

#Defining qa_data
qa_data = pd.read_csv("jpmorgan_qa_section.csv")


Cloning into 'bank_of_england'...
remote: Enumerating objects: 815, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 815 (delta 110), reused 61 (delta 54), pack-reused 654 (from 1)[K
Receiving objects: 100% (815/815), 7.10 MiB | 10.45 MiB/s, done.
Resolving deltas: 100% (370/370), done.
Branch 'Preprocessing' set up to track remote branch 'Preprocessing' from 'origin'.
Switched to a new branch 'Preprocessing'
/content/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed/bank_of_england/data/processed
chunked_management_discussion.csv

In [96]:
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorganChase","Yeah. I think the conventional wisdom on QT, a...",4q24-earnings-transcript.pdf,4Q24,2025-01-15
1,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC","So, you'll stay around maybe for a few more ye...",4q24-earnings-transcript.pdf,4Q24,2025-01-15
2,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC",All right. Thank you.,4q24-earnings-transcript.pdf,4Q24,2025-01-15
3,Operator,,,Thank you. Our next question comes from Jim Mi...,4q24-earnings-transcript.pdf,4Q24,2025-01-15
4,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",4q24-earnings-transcript.pdf,4Q24,2025-01-15


In [40]:
#Checking the type of data
qa_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739 entries, 0 to 738
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   speaker            739 non-null    object
 1   marker             637 non-null    object
 2   job_title          636 non-null    object
 3   utterance          738 non-null    object
 4   filename           739 non-null    object
 5   financial_quarter  739 non-null    object
 6   call_date          739 non-null    object
dtypes: object(7)
memory usage: 40.5+ KB


In [101]:
preprocessor(qa_data, "utterance", "tokenised_data", "cleaned_data")

In [102]:
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date,tokenised_data,cleaned_data
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorganChase","Yeah. I think the conventional wisdom on QT, a...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[yeah, think, conventional, wisdom, pretending...",[yeah think conventional wisdom qt im pretendi...
1,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC","So, you'll stay around maybe for a few more ye...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[stay, around, maybe, years, base, case, right...",[so stay around maybe years base case right now]
2,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC",All right. Thank you.,4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[right, thank, you]",[right thank you]
3,Operator,,,Thank you. Our next question comes from Jim Mi...,4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[thank, you, next, question, comes, jim, mitch...",[thank you next question comes jim mitchell se...
4,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[hey, good, morning, maybe, regulation, new, a...",[hey good morning maybe regulation new adminis...


JP morgan management discussion

In [107]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [110]:
#defining santader dataframe
jpmorgan_body_df=pd.read_csv("chunked_management_discussion.csv")

In [111]:
preprocessor(jpmorgan_body_df, "chunk_text", "tokenized_data","cleaned_data")

In [112]:
jpmorgan_body_df.head()

Unnamed: 0,filename,chunk_index,chunk_text,financial_quarter,call_date,tokenized_data,cleaned_data
0,4q24-earnings-transcript.pdf,1,"Operator: Good morning, ladies and gentlemen. ...",4Q24,2025-01-15,"[operator, good, morning, ladies, gentlemen, w...",[operator: good morning ladies gentlemen welco...
1,4q24-earnings-transcript.pdf,2,"At this time, I would like to turn the call ov...",4Q24,2025-01-15,"[time, would, like, turn, call, jpmorganchase,...",[time would like turn call jpmorganchases chai...
2,4q24-earnings-transcript.pdf,3,"NII ex. Markets was down $548 million or 2%, d...",4Q24,2025-01-15,"[nii, markets, million, driven, impact, lower,...",[nii ex markets $548 million 2% driven impact ...
3,4q24-earnings-transcript.pdf,4,And Markets revenue was up $1.2 billion or 21%...,4Q24,2025-01-15,"[markets, revenue, , billion, expenses, , bill...",[markets revenue $12 billion 21% expenses $228...
4,4q24-earnings-transcript.pdf,5,I'll\nremind you that there were a number of s...,4Q24,2025-01-15,"[ll, remind, number, significant, items, , exc...",[ill remind number significant items 2024 excl...


Santander management discussion

In [114]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [115]:
santander_management_df=pd.read_csv("chunked_santander_management_discussion.csv")

In [116]:
santander_management_df.head()

Unnamed: 0,filename,chunk_index,chunk_text,financial_quarter,call_date
0,video_2024_Q1_1,1,"Good morning, everybody. And welcome to Banco ...",2024 Q1,Unknown
1,video_2024_Q1_1,2,Secondary reporting will be what used to be pr...,2024 Q1,Unknown
2,video_2024_Q1_1,3,"But now to the presentation, I am joined here ...",2024 Q1,Unknown
3,video_2024_Q1_1,4,"First, let me share with you what we will focu...",2024 Q1,Unknown
4,video_2024_Q1_1,5,"That's an 11% increase versus Q1, 23. 9% in co...",2024 Q1,Unknown


In [117]:
preprocessor(santander_management_df,"chunk_text","tokenized_data","cleaned_data")

In [118]:
santander_management_df.head()

Unnamed: 0,filename,chunk_index,chunk_text,financial_quarter,call_date,tokenized_data,cleaned_data
0,video_2024_Q1_1,1,"Good morning, everybody. And welcome to Banco ...",2024 Q1,Unknown,"[good, morning, everybody, welcome, banco, san...",[good morning everybody welcome banco santande...
1,video_2024_Q1_1,2,Secondary reporting will be what used to be pr...,2024 Q1,Unknown,"[secondary, reporting, used, primary, mainly, ...",[secondary reporting used primary 2023 mainly ...
2,video_2024_Q1_1,3,"But now to the presentation, I am joined here ...",2024 Q1,Unknown,"[presentation, joined, today, ceo, mr, ektorgr...",[presentation joined today ceo mr ektorgrisi c...
3,video_2024_Q1_1,4,"First, let me share with you what we will focu...",2024 Q1,Unknown,"[first, let, share, focus, today, first, talk,...",[first let share focus today first talk q1 res...
4,video_2024_Q1_1,5,"That's an 11% increase versus Q1, 23. 9% in co...",2024 Q1,Unknown,"[that, increase, versus, cost, euros, begonia,...",[thats 11% increase versus q1 23 9% cost euros...


UBS qna section

In [120]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [121]:
ubs_qna_df=pd.read_csv("ubs_qna_section.csv")

In [122]:
ubs_qna_df.head()

Unnamed: 0,speaker,job_title,utterance,call_date,financial_quarter,source_file
0,Unknown,,"Chis Hallam, Goldman Sachs Yes. Good morning, ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
1,Sergio P. Ermotti,,"Okay. Thank you. On capital requirements, you ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
2,Chris Hallam,Goldman Sachs,"Very clear. Thanks. Kian Abouhossein, JPMorgan...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
3,Sergio P. Ermotti,,"So, Sarah, take the first question. I'll take ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
4,Sarah Youngwood,,"So, when we give you the 74%, we focused inten...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf


In [124]:
preprocessor(ubs_qna_df, "utterance", "tokenized_data","cleaned_data")

In [125]:
ubs_qna_df.head()

Unnamed: 0,speaker,job_title,utterance,call_date,financial_quarter,source_file,tokenized_data,cleaned_data
0,Unknown,,"Chis Hallam, Goldman Sachs Yes. Good morning, ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[chis, hallam, goldman, sachs, yes, good, morn...",[chis hallam goldman sachs yes good morning ev...
1,Sergio P. Ermotti,,"Okay. Thank you. On capital requirements, you ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[okay, thank, you, capital, requirements, know...",[okay thank you capital requirements know situ...
2,Chris Hallam,Goldman Sachs,"Very clear. Thanks. Kian Abouhossein, JPMorgan...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[clear, thanks, kian, abouhossein, jpmorgan, y...",[clear thanks kian abouhossein jpmorgan yeah t...
3,Sergio P. Ermotti,,"So, Sarah, take the first question. I'll take ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[sarah, take, first, question, ll, take, second]",[so sarah take first question ill take second]
4,Sarah Youngwood,,"So, when we give you the 74%, we focused inten...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[give, focused, intentionally, viewed, economi...",[so give 74% focused intentionally viewed econ...


UBS management discussion

In [130]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [131]:
ubs_manag_df=pd.read_csv("ubs_management_discussion.csv")

In [132]:
ubs_manag_df.head()

Unnamed: 0,speaker,utterance,call_date,financial_quarter,source_file
0,Unknown,"Youngwood, Group Chief Financial Officer Inclu...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
1,Sergio P. Ermotti,"Thank you, Sarah, good morning, everyone. I am...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
2,Latin America.,"In Asset Management, the combination will impr...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
3,Sarah Youngwood,"Thank you, Sergio. Good morning, everyone. rel...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
4,Group Chief Financial Officer,Including analyst Q&A session,7 May 2024,1Q24,1q24-earnings-call-remarks.pdf


In [135]:
#chuncking

file_name=[]
quarter_name=[]
chunk_total=[]
speaker_name=[]
call_date=[]


dummy_df={
    ("file")
}
for t in range(len(ubs_manag_df["utterance"])):
  #print(ubs_manag_df["file_name"][t])

  chunks = chunk_text(ubs_manag_df["utterance"][t], max_chunk_size=500)
  #print(len(chunks))
  for u in range(len(chunks)):
     file_name.append(ubs_manag_df["source_file"][t])
     quarter_name.append(ubs_manag_df["financial_quarter"][t])
     speaker_name.append(ubs_manag_df["speaker"])
     call_date.append(ubs_manag_df["call_date"])
  #print(len(chunks))
  #print("t",chunks[1])
  if chunks:
    dummy_df["file_name"][t]=ubs_manag_df["file_name"][t]
    print(len(chunks))
    chunk_total.append(chunks)
    #dummy_df["chunks"][t]=chunks[t]
  else:
    print(f"No chunks produced for {t['filename']}.")

print(len(file_name))
print(chunk_total)
flat_list = [item for sublist in chunk_total for item in sublist]
print(len(flat_list))
data={
    "file_name":file_name,"chunks":flat_list,"quarter":quarter_name
}
dummy_df = pd.DataFrame(data)

KeyError: 'file_name'

# **Export the output as a csv file**

In [128]:
#export preprocessed data for Modeling
preprocessed_qa_csv_path = "/content/sample_data/ubs_qna_df_preprocessed.csv"

ubs_qna_df.to_csv(preprocessed_qa_csv_path, index=False)

# **Modeling**

In [None]:
# Define a function to reset the session.
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)
#reset_session()


#embedding model to numarise the text
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#umap model to reduce dimention
umap_model = UMAP(n_neighbors=20, min_dist=0.1)

#representation mode to adjust topic or keywords. diversity is adjustable depending on the result
representation_model = MaximalMarginalRelevance(diversity=0.2)

#BERTopic model: add above models in the parameters
bertModel = BERTopic(embedding_model=embedding_model, umap_model=umap_model, representation_model=representation_model, verbose=True)

In [None]:
#Fitting data into bertModel
bertModel.fit(filtered_data)
topic, probabilities = bertModel.transform(filtered_data)

# **Result**

In [None]:
bertModel.get_topic_freq().head(10)

In [None]:
bertModel.get_topic(0)

In [None]:
import plotly.io as pio

fig = bertModel.visualize_barchart(top_n_topics=10, n_words=5)
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=50, r=50, t=100, b=50),
    font=dict(size=12),
    title=dict(
        text="Top 10 Topics and Their Key Words",
        font=dict(size=16),
        x=0.5,
        y=0.98,
        xanchor="center",
        yanchor="top"
    )
)

fig.show()