<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/tidy_up_preprocessing_notebook/notebooks/model_testing_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Chiaki Tachikawa
Role: Data Science Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/chiaki-tachikawa
Date: 2025-02-13
Version: 1.1

Description:
    This notebook implements a system for cleaning and exporting transcript data for the Bank of England project. The workflow includes:
    - Importing necessary libraries and downloading NLTK data.
    - Defining and applying a `preprocessor` function to clean and tokenize text data.
    - Reading and preprocessing various CSV files containing transcript data.
    - Exporting the preprocessed data to new CSV files for further analysis.

===================================================
"""

# **Library**

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
"""

!pip install tensorflow
!pip install numpy
!pip install bertopic


import tensorflow as tf
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import tensorflow as tf
from umap import UMAP
from bertopic.representation import MaximalMarginalRelevance

"""

'\n\n!pip install tensorflow\n!pip install numpy\n!pip install bertopic\n\n\nimport tensorflow as tf\nimport numpy as np\nimport random\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nimport tensorflow as tf\nfrom umap import UMAP\nfrom bertopic.representation import MaximalMarginalRelevance\n\n'

# **Function**

preprocessor function : The function modifies the DataFrame data in place, adding two new columns (col1 and col2) with preprocessed text.


Input:
  - name of dataframe
  - name of column which contains the text to clean
  - name of column which is tokenized
  - name of column which is cleaned

In [None]:
#create function to preprocess data
def preprocessor (data, col, col1,col2):
  #Copy col1umn
  data[col1]=data[col]
  data[col2]=data[col]

  #Adding column1
  #Lower the lettercase
  data[col1] = data[col1].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col1] = data[col1].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #Tokenize the word
  data[col1] = data[col1].apply(word_tokenize)

  #Remove numbers
  data[col1] = data[col1].apply(lambda x: [word for word in x if not word.isdigit()])

  #remove symbol from comments
  data[col1] = data[col1].apply(lambda x: [word for word in x if x!=""])

  #remove short word
  data[col1] = data[col1].apply(lambda x: [word for word in x if len(word)>2])

  #remove symbols
  data[col1] = data[col1].apply (lambda x: [re.sub(r"[^a-z]", "", word) for word in x])

  #Adding column2
  #Lower the lettercase
  data[col2] = data[col2].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col2] = data[col2].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #remove symbols
  data[col2] = data[col2].apply (lambda x: [re.sub(r"[.,'?]", "", x)])

  return


## **Data**

JP Morgan QA section

In [None]:
#Obtaining management discussion / git bash
!git clone https://github.com/sheldonkemper/bank_of_england.git
!git switch Preprocessing
%cd bank_of_england/data/processed
%ls

Cloning into 'bank_of_england'...
remote: Enumerating objects: 907, done.[K
remote: Counting objects: 100% (249/249), done.[K
remote: Compressing objects: 100% (191/191), done.[K
remote: Total 907 (delta 157), reused 71 (delta 57), pack-reused 658 (from 1)[K
Receiving objects: 100% (907/907), 8.53 MiB | 21.14 MiB/s, done.
Resolving deltas: 100% (421/421), done.
fatal: invalid reference: Preprocessing
/content/bank_of_england/data/processed/bank_of_england/data/processed
chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [None]:
#Defining qa_data
qa_data = pd.read_csv("jpmorgan_qa_section.csv")
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorganChase","Yeah. I think the conventional wisdom on QT, a...",4q24-earnings-transcript.pdf,4Q24,2025-01-15
1,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC","So, you'll stay around maybe for a few more ye...",4q24-earnings-transcript.pdf,4Q24,2025-01-15
2,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC",All right. Thank you.,4q24-earnings-transcript.pdf,4Q24,2025-01-15
3,Operator,,,Thank you. Our next question comes from Jim Mi...,4q24-earnings-transcript.pdf,4Q24,2025-01-15
4,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",4q24-earnings-transcript.pdf,4Q24,2025-01-15


In [None]:
#preprocessing data
preprocessor(qa_data, "utterance", "tokenised_data", "cleaned_data")

In [None]:
#present preprocessed dataframe
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date,tokenised_data,cleaned_data
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorganChase","Yeah. I think the conventional wisdom on QT, a...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[yeah, think, conventional, wisdom, pretending...",[yeah think conventional wisdom qt im pretendi...
1,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC","So, you'll stay around maybe for a few more ye...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[stay, around, maybe, years, base, case, right...",[so stay around maybe years base case right now]
2,Mike Mayo,Q,"Analyst, Wells Fargo Securities LLC",All right. Thank you.,4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[right, thank, you]",[right thank you]
3,Operator,,,Thank you. Our next question comes from Jim Mi...,4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[thank, you, next, question, comes, jim, mitch...",[thank you next question comes jim mitchell se...
4,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","Hey. Good morning. Maybe just on regulation, w...",4q24-earnings-transcript.pdf,4Q24,2025-01-15,"[hey, good, morning, maybe, regulation, new, a...",[hey good morning maybe regulation new adminis...


JP morgan management discussion

In [None]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [None]:
#defining santader dataframe
jpmorgan_body_df=pd.read_csv("chunked_management_discussion.csv")

In [None]:
#preprocess data
preprocessor(jpmorgan_body_df, "chunk_text", "tokenized_data","cleaned_data")

In [None]:
jpmorgan_body_df.head()

Unnamed: 0,filename,chunk_index,chunk_text,financial_quarter,call_date,tokenized_data,cleaned_data
0,4q24-earnings-transcript.pdf,1,"Operator: Good morning, ladies and gentlemen. ...",4Q24,2025-01-15,"[operator, good, morning, ladies, gentlemen, w...",[operator: good morning ladies gentlemen welco...
1,4q24-earnings-transcript.pdf,2,"At this time, I would like to turn the call ov...",4Q24,2025-01-15,"[time, would, like, turn, call, jpmorganchase,...",[time would like turn call jpmorganchases chai...
2,4q24-earnings-transcript.pdf,3,"NII ex. Markets was down $548 million or 2%, d...",4Q24,2025-01-15,"[nii, markets, million, driven, impact, lower,...",[nii ex markets $548 million 2% driven impact ...
3,4q24-earnings-transcript.pdf,4,And Markets revenue was up $1.2 billion or 21%...,4Q24,2025-01-15,"[markets, revenue, , billion, expenses, , bill...",[markets revenue $12 billion 21% expenses $228...
4,4q24-earnings-transcript.pdf,5,I'll\nremind you that there were a number of s...,4Q24,2025-01-15,"[ll, remind, number, significant, items, , exc...",[ill remind number significant items 2024 excl...


UBS qna section

In [None]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [None]:
#define ubs q&a data
ubs_qna_df=pd.read_csv("ubs_qna_section.csv")

In [None]:
#preprocessing ubs Q&A data
preprocessor(ubs_qna_df, "utterance", "tokenized_data","cleaned_data")

In [None]:
ubs_qna_df.head()

Unnamed: 0,speaker,job_title,utterance,call_date,financial_quarter,source_file,tokenized_data,cleaned_data
0,Unknown,,"Chis Hallam, Goldman Sachs Yes. Good morning, ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[chis, hallam, goldman, sachs, yes, good, morn...",[chis hallam goldman sachs yes good morning ev...
1,Sergio P. Ermotti,,"Okay. Thank you. On capital requirements, you ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[okay, thank, you, capital, requirements, know...",[okay thank you capital requirements know situ...
2,Chris Hallam,Goldman Sachs,"Very clear. Thanks. Kian Abouhossein, JPMorgan...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[clear, thanks, kian, abouhossein, jpmorgan, y...",[clear thanks kian abouhossein jpmorgan yeah t...
3,Sergio P. Ermotti,,"So, Sarah, take the first question. I'll take ...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[sarah, take, first, question, ll, take, second]",[so sarah take first question ill take second]
4,Sarah Youngwood,,"So, when we give you the 74%, we focused inten...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[give, focused, intentionally, viewed, economi...",[so give 74% focused intentionally viewed econ...


UBS management discussion

In [None]:
%ls

chunked_management_discussion.csv            santander_management_discussion.csv
chunked_santander_management_discussion.csv  ubs_management_discussion.csv
jpmorgan_management_discussion.csv           ubs_qna_section.csv
jpmorgan_qa_section.csv


In [None]:
#defining ubs management discussion
ubs_manag_df=pd.read_csv("ubs_management_discussion.csv")
ubs_manag_df.head()

Unnamed: 0,speaker,utterance,call_date,financial_quarter,source_file
0,Unknown,"Youngwood, Group Chief Financial Officer Inclu...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
1,Sergio P. Ermotti,"Thank you, Sarah, good morning, everyone. I am...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
2,Latin America.,"In Asset Management, the combination will impr...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
3,Sarah Youngwood,"Thank you, Sergio. Good morning, everyone. rel...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf
4,Group Chief Financial Officer,Including analyst Q&A session,7 May 2024,1Q24,1q24-earnings-call-remarks.pdf


In [None]:
#preprocessing ubs management discussion
preprocessor(ubs_manag_df,"utterance", "tokenized_data","cleaned_data")
ubs_manag_df.head()

Unnamed: 0,speaker,utterance,call_date,financial_quarter,source_file,tokenized_data,cleaned_data
0,Unknown,"Youngwood, Group Chief Financial Officer Inclu...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[youngwood, group, chief, financial, officer, ...",[youngwood group chief financial officer inclu...
1,Sergio P. Ermotti,"Thank you, Sarah, good morning, everyone. I am...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[thank, you, sarah, good, morning, everyone, h...",[thank you sarah good morning everyone happy b...
2,Latin America.,"In Asset Management, the combination will impr...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[asset, management, combination, improve, posi...",[asset management combination improve position...
3,Sarah Youngwood,"Thank you, Sergio. Good morning, everyone. rel...",25 April 2023,1Q23,1q23-earnings-call-remarks.pdf,"[thank, you, sergio, good, morning, everyone, ...",[thank you sergio good morning everyone relate...
4,Group Chief Financial Officer,Including analyst Q&A session,7 May 2024,1Q24,1q24-earnings-call-remarks.pdf,"[including, analyst, session]",[including analyst q&a session]


# **Export the output as a csv file**

JP morgan QA section

In [None]:
#export preprocessed data
preprocessed_qa_csv_path1 = "/content/sample_data/jpmorgan_qna_df_preprocessed.csv"
qa_data.to_csv(preprocessed_qa_csv_path1, index=False)

JP morgan management discussion

In [None]:
#export preprocessed data
preprocessed_qa_csv_path2 = "/content/sample_data/jpmorgan_management_df_preprocessed.csv"
jpmorgan_body_df.to_csv(preprocessed_qa_csv_path2, index=False)

UBS QA section

In [None]:
#export preprocessed data
preprocessed_qa_csv_path3 = "/content/sample_data/ubs_qa_df_preprocessed.csv"
ubs_qna_df.to_csv(preprocessed_qa_csv_path3, index=False)

UBS management discussion

In [None]:
#export preprocessed data
preprocessed_qa_csv_path4 = "/content/sample_data/ubs_management_df_preprocessed.csv"
ubs_manag_df.to_csv(preprocessed_qa_csv_path4, index=False)

# **Modeling**

In [None]:
# Define a function to reset the session.
def reset_session():
    tf.keras.backend.clear_session()
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)
#reset_session()


#embedding model to numarise the text
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#umap model to reduce dimention
umap_model = UMAP(n_neighbors=20, min_dist=0.1)

#representation mode to adjust topic or keywords. diversity is adjustable depending on the result
representation_model = MaximalMarginalRelevance(diversity=0.2)

#BERTopic model: add above models in the parameters
bertModel = BERTopic(embedding_model=embedding_model, umap_model=umap_model, representation_model=representation_model, verbose=True)

NameError: name 'SentenceTransformer' is not defined

In [None]:
#Fitting data into bertModel
bertModel.fit(filtered_data)
topic, probabilities = bertModel.transform(filtered_data)

# **Result**

In [None]:
bertModel.get_topic_freq().head(10)

In [None]:
bertModel.get_topic(0)

In [None]:
import plotly.io as pio

fig = bertModel.visualize_barchart(top_n_topics=10, n_words=5)
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=50, r=50, t=100, b=50),
    font=dict(size=12),
    title=dict(
        text="Top 10 Topics and Their Key Words",
        font=dict(size=16),
        x=0.5,
        y=0.98,
        xanchor="center",
        yanchor="top"
    )
)

fig.show()