<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/tidy_up_preprocessing_notebook/notebooks/processed/ct_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [137]:
"""
===================================================
Author: Chiaki Tachikawa
Role: Data Science Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/chiaki-tachikawa
Date: 2025-02-13
Version: 1.1

Description:
    This notebook implements a system for cleaning and exporting transcript data for the Bank of England project. The workflow includes:
    - Importing necessary libraries and downloading NLTK data.
    - Defining and applying a `preprocessor` function to clean and tokenize text data.
    - Reading and preprocessing various CSV files containing transcript data.
    - Exporting the preprocessed data to new CSV files for further analysis.

===================================================
"""



# **Library**

In [138]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from google.colab import drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Function**

preprocessor function : The function modifies the DataFrame data in place, adding two new columns (col1 and col2) with preprocessed text.


Input:
  - name of dataframe
  - name of column which contains the text to clean
  - name of column which is tokenized
  - name of column which is cleaned

In [139]:
#create function to preprocess data
def preprocessor (data, col, col1,col2):
  #Copy col1umn
  data[col1]=data[col]
  data[col2]=data[col]


  #Adding column1
  #Lower the lettercase
  data[col1] = data[col1].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col1] = data[col1].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #Tokenize the word
  data[col1] = data[col1].apply(nltk.word_tokenize)

  #Remove numbers
  data[col1] = data[col1].apply(lambda x: [word for word in x if not word.isdigit()])

  #remove symbol from comments
  data[col1] = data[col1].apply(lambda x: [word for word in x if x!=""])

  #remove short word
  data[col1] = data[col1].apply(lambda x: [word for word in x if len(word)>2])

  #remove symbols
  data[col1] = data[col1].apply (lambda x: [re.sub(r"[^a-z]", "", word) for word in x])

  #lemmatization
  lemmatizer = WordNetLemmatizer()
  data[col1] = data[col1].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])



  #Adding column2
  #Lower the lettercase
  data[col2] = data[col2].str.lower()

  #Remove stop words
  stop_words = set(stopwords.words("english"))
  data[col2] = data[col2].apply(lambda x: " ".join([word for word in str(x).split() if word not in (stop_words)]))

  #remove symbols
  data[col2] = data[col2].apply (lambda x: [re.sub(r"[.,'?]", "", x)])

  return


find_row: This function searches upwards from the given current_row_num in the DataFrame df to find the first row where the value in column "M" is "A". It returns the index of that row. If no such row is found, it returns 0

In [140]:
def find_row (df, col, current_row_num):
  #list_name=[]
  i = current_row_num-1
  while i > 0:
    if df[col][i] == "A":
      break
    else:
      i-=1
  return i

find_row_empty: This function searches upwards from the given current_row_num to find the first row where col1 has the value "A" and col2 is not an empty list. It returns the index of that row.

In [141]:
def find_row_empty (df, col1, col2, current_row_num):
  #list_name=[]
  i = current_row_num-1
  while i > 0:
    if df[col1][i] == "A" and df[col2][i] != []:
      break
    else:
      i-=1
  return i

In [142]:
def create_ques_num_column (data, new_col,marker_col):
  #Create question number column
  data[new_col]=None
  #set global var to count question number
  num = 0
  #if Q was found, num adds 1 otherwise none
  for i in data.index:
    if data.loc[i,marker_col]=="Q":
      data.at[i,new_col]=num
      num +=1
    else:
      continue

In [143]:
# Function to extract names
def extract_name(full_string):
    return full_string.split(',')[0]

In [144]:
#check if there is "A" before "Q" from the current location
def find_last_a (df, col, current_row_num):
  #list_name=[]
  i = current_row_num-1
  while i > 0:
    if df[col][i] == "Q":
      j = i-1
      while j > 0:
        if df[col][j]=="A":
          pass
        else:
          break
        j-=1
      break
    else:
      i-=1
  return i

## **Data**

In [145]:
#drive.mount('/content/drive')

In [146]:
#!ls"/content/bank_of_england/data/preprocessed_data/Archived/jpmorgan_qa_section_preprocessed.csv"

In [147]:
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


JP Morgan QA section

In [148]:
"""
#Obtaining management discussion / git bash
!git clone https://github.com/sheldonkemper/bank_of_england.git
!git switch Preprocessing
%cd bank_of_england/data/preprocessed_data/archived
%ls
"""

'\n#Obtaining management discussion / git bash\n!git clone https://github.com/sheldonkemper/bank_of_england.git\n!git switch Preprocessing\n%cd bank_of_england/data/preprocessed_data/archived\n%ls\n'

In [149]:
#Defining qa_data
qa_data = pd.read_csv("/content/drive/MyDrive/jpmorgan_qna_df_preprocessed_ver7 .csv")
qa_data.head()

Unnamed: 0,filename,financial_quarter,call_date,speaker,marker,question_num,job_title,metadata,answer_cleaned,analyst,analyst_title,metadata_question,question_cleaned
0,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Steven Chubak,Q,[],"Analyst, Wolfe Research LLC","So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,[],[],[],['so jamie actually hoping get perspective see...
1,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, I think you were already kind of complet...",['well think already kind complete answering q...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well think already kind complete answering q...
2,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, we've told you that we're kind of pencil...",['well weve told were kind penciling $12 billi...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well weve told were kind penciling $12 billi...
3,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Ken Usdin,Q,[],"Analyst, Jefferies LLC","Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,[],[],[],['hey thanks good morning hey jeremy wondering...
4,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['1'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","['Hey, thanks. Good morning. Hey, Jeremy, I wa...",['yeah sure let summarize drivers change outlo...


In [134]:
qa_data = qa_data.sort_index(ascending = False)

In [110]:
"""
#ascend df by financial_quarter
quarter_order = ["1Q23", "2Q23","3Q23","4Q23","1Q24","2Q24","3Q24","4Q24"]
qa_data["financial_quarter"] = pd.Categorical(qa_data["financial_quarter"], categories=quarter_order, ordered=True)
qa_data = qa_data.sort_values("financial_quarter", kind="mergesort")
"""

In [135]:
#preprocessing data
preprocessor(qa_data, "utterance", "question_tokenised_data", "question_cleaned")
preprocessor(qa_data,"utterance","answer_tokenised_data","answer_cleaned")

In [136]:
#remove operater
#qa_data = qa_data.loc[qa_data["speaker"]!="Operator"]

#remove less than 20 words
qa_data["count"] = qa_data["question_tokenised_data"].apply(lambda x: len(x))
qa_data = qa_data.loc[qa_data["count"]>20]
qa_data.head()

Unnamed: 0,speaker,job_title,marker,utterance,filename,financial_quarter,call_date,question_tokenised_data,question_cleaned,answer_tokenised_data,answer_cleaned,count
581,"Jim Mitchell, Analyst, Seaport Global Securiti...","Analyst, Seaport Global Securities LLC",Q,Hey. Good morning. Maybe just a little bit on ...,1q23-earnings-transcript.pdf,1Q23,2023-04-14,"[hey, good, morning, maybe, little, bit, depos...",[hey good morning maybe little bit deposit tho...,"[hey, good, morning, maybe, little, bit, depos...",[hey good morning maybe little bit deposit tho...,31
580,"Jeremy Barnum, Chief Financial Officer, JPMorg...","Chief Financial Officer, JPMorgan Chase & Co.",A,"Yeah. A couple things there. So, first of all,...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,55
579,"Jamie Dimon, Chairman & Chief Executive Office...","Chairman & Chief Executive Officer, JPMorgan C...",A,"If I add, I would say, categorically, there's ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,47
578,"Jim Mitchell, Analyst, Seaport Global Securiti...","Analyst, Seaport Global Securities LLC",Q,"No, fair – all fair points. And maybe just a f...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,31
576,"Jeremy Barnum, Chief Financial Officer, JPMorg...","Chief Financial Officer, JPMorgan Chase & Co.",A,"Yeah. And we always say, right, we underwrite ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,44


In [150]:
#reset index
qa_data.reset_index(drop=True, inplace=True)
qa_data.head()

Unnamed: 0,filename,financial_quarter,call_date,speaker,marker,question_num,job_title,metadata,answer_cleaned,analyst,analyst_title,metadata_question,question_cleaned
0,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Steven Chubak,Q,[],"Analyst, Wolfe Research LLC","So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,[],[],[],['so jamie actually hoping get perspective see...
1,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, I think you were already kind of complet...",['well think already kind complete answering q...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well think already kind complete answering q...
2,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, we've told you that we're kind of pencil...",['well weve told were kind penciling $12 billi...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well weve told were kind penciling $12 billi...
3,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Ken Usdin,Q,[],"Analyst, Jefferies LLC","Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,[],[],[],['hey thanks good morning hey jeremy wondering...
4,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['1'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","['Hey, thanks. Good morning. Hey, Jeremy, I wa...",['yeah sure let summarize drivers change outlo...


In [74]:
#Create question number column
create_ques_num_column(qa_data,"question_number_inline","marker")

In [75]:
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date,tokenised_data,cleaned_data,question_tokenised_data,question_cleaned,answer_tokenised_data,answer_cleaned,count,question_number_inline
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. A couple things there. So, first of all,...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'couple', 'things', 'there', 'first',...",['yeah couple things there so first all know r...,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,55,
1,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. And we always say, right, we underwrite ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'always', 'say', 'right', 'underwrite...",['yeah always say right underwrite cycle think...,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,44,
2,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","No, fair – all fair points. And maybe just a f...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['fair', 'fair', 'points', 'maybe', 'followup'...",['no fair – fair points maybe follow-up johns ...,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,31,0.0
3,Jamie Dimon,A,"Chairman & Chief Executive Officer, JPMorgan C...","If I add, I would say, categorically, there's ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['add', 'would', 'say', 'categorically', 'ther...",['add would say categorically theres pricing p...,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,47,
4,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. So a few things on there, Gerard. So we ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'things', 'there', 'gerard', 'previou...",['yeah things there gerard previously said tar...,"[yeah, thing, there, gerard, previously, said,...",[yeah things there gerard previously said targ...,"[yeah, thing, there, gerard, previously, said,...",[yeah things there gerard previously said targ...,106,


In [76]:
#adding new columns
qa_data["analyst"]=None
qa_data["analyst_title"]=None
qa_data["metadata_question"]=None
qa_data["question_num"]=None

#

for i in qa_data.index:
    name_list=[]
    title_list=[]
    question_list=[]
    quest_num_list=[]
    #If the value in the "marker" column at row i is "Q", then set the values of "analyst", "analyst_title", "metadata_question", and "question_num" columns at row i to "x".
    if qa_data.loc[i,"marker"]=="Q":
      qa_data.at[i,"analyst"]="x"
      qa_data.at[i,"analyst_title"]="x"
      qa_data.at[i,"metadata_question"]="x"
      qa_data.at[i,"question_num"]="x"
    elif find_last_a(qa_data,"marker",i) >=0 and qa_data.loc[i-1,"marker"]=="Q":
      name_list.append(qa_data["speaker"][i-1])
      title_list.append(qa_data["job_title"][i-1])
      question_list.append(qa_data["utterance"][i-1])
      quest_num_list.append(str(qa_data["question_number_inline"][i-1]))
    elif find_last_a(qa_data,"marker",i) >=0 and qa_data.loc[i-1,"marker"]=="A":
      qa_data.at[i,"analyst"] = qa_data.at[i-1,"analyst"]
      qa_data.at[i, "analyst_title"] = qa_data.at[i-1,"analyst_title"]
      qa_data.at[i,"metadata_question"]=qa_data.at[i-1, "metadata_question"]
      qa_data.at[i,"question_num"] = qa_data.at[i-1, "question_num"]

    #Initialize lists and populate them with "speaker", "job_title", "utterance", and "question_number_inline" values from rows between last_a and i in qa_data.
    else:
      last_a = find_row(qa_data,"marker", i)+ 1
      for j in range(last_a, i):
        name_list.append(qa_data["speaker"][j])
        title_list.append(qa_data["job_title"][j])
        question_list.append(qa_data["utterance"][j])
        quest_num_list.append(str(qa_data["question_number_inline"][j]))
 # Assign the lists name_list, title_list, question_list, and quest_num_list to the "analyst", "analyst_title", "metadata_question", and "question_num" columns at row i in qa_data, respectively.
    qa_data.at[i,"analyst"]=name_list
    qa_data.at[i,"analyst_title"]=title_list
    qa_data.at[i,"metadata_question"]=question_list
    qa_data.at[i,"question_num"]=quest_num_list

In [77]:
qa_data.head()

Unnamed: 0,speaker,marker,job_title,utterance,filename,financial_quarter,call_date,tokenised_data,cleaned_data,question_tokenised_data,question_cleaned,answer_tokenised_data,answer_cleaned,count,question_number_inline,analyst,analyst_title,metadata_question,question_num
0,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. A couple things there. So, first of all,...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'couple', 'things', 'there', 'first',...",['yeah couple things there so first all know r...,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,"[yeah, couple, thing, there, first, all, know,...",[yeah couple things there so first all know ri...,55,,[],[],[],[]
1,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. And we always say, right, we underwrite ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'always', 'say', 'right', 'underwrite...",['yeah always say right underwrite cycle think...,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,"[yeah, always, say, right, underwrite, cycle, ...",[yeah always say right underwrite cycle think ...,44,,[],[],[],[]
2,Jim Mitchell,Q,"Analyst, Seaport Global Securities LLC","No, fair – all fair points. And maybe just a f...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['fair', 'fair', 'points', 'maybe', 'followup'...",['no fair – fair points maybe follow-up johns ...,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,"[fair, fair, point, maybe, followup, john, que...",[no fair – fair points maybe follow-up johns q...,31,0.0,[],[],[],[]
3,Jamie Dimon,A,"Chairman & Chief Executive Officer, JPMorgan C...","If I add, I would say, categorically, there's ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['add', 'would', 'say', 'categorically', 'ther...",['add would say categorically theres pricing p...,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,"[add, would, say, categorically, there, pricin...",[add would say categorically theres pricing po...,47,,[Jim Mitchell],"[Analyst, Seaport Global Securities LLC]","[No, fair – all fair points. And maybe just a ...",[0]
4,Jeremy Barnum,A,"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. So a few things on there, Gerard. So we ...",1q23-earnings-transcript.pdf,1Q23,2023-04-14,"['yeah', 'things', 'there', 'gerard', 'previou...",['yeah things there gerard previously said tar...,"[yeah, thing, there, gerard, previously, said,...",[yeah things there gerard previously said targ...,"[yeah, thing, there, gerard, previously, said,...",[yeah things there gerard previously said targ...,106,,[],[],[],[]


In [78]:
for i in range(len(qa_data)):
  #if the first row is answer, then return all analyst columns as x, if the row is answer and there is not analyst, then popurate questioner analyst name and other data, otherwise pass the function
  if i ==0 and qa_data["marker"][i]=="A":
    qa_data.at[i,"analyst"] ="x"
    qa_data.at[i,"analyst_title"] ="x"
    qa_data.at[i,"metadata_question"] ="x"
    qa_data.at[i,"question_num"]="x"
  elif qa_data["marker"][i]=="A"and qa_data["analyst"][i]==[]:
    a = find_row_empty(qa_data,"marker","analyst",i)
    qa_data.at[i,"analyst"] = qa_data.loc[a,"analyst"]
    qa_data.at[i,"analyst_title"] = qa_data.loc[a,"analyst_title"]
    qa_data.at[i,"metadata_question"] = qa_data.loc[a,"metadata_question"]
    qa_data.at[i,"question_num"] =qa_data.loc[a,"question_num"]
  else:
    continue


In [79]:
#rename column
qa_data.rename(columns={"utterance":"metadata"},inplace=True)


In [80]:
#reorganise column
qa_data=qa_data[["filename","financial_quarter","call_date","speaker","marker","question_num","job_title","metadata", "answer_cleaned","analyst","analyst_title","metadata_question","question_cleaned"]]

In [81]:
# Apply the function to the DataFrame
qa_data['speaker'] = qa_data['speaker'].apply(extract_name)

In [None]:
"""
#ascend df by financial_quarter
quarter_order = ["1Q23", "2Q23","3Q23","4Q23","1Q24","2Q24","3Q24","4Q24"]
qa_data["financial_quarter"] = pd.Categorical(qa_data["financial_quarter"], categories=quarter_order, ordered=True)
df_sorted = qa_data.sort_values("financial_quarter", kind="mergesort")
"""

In [151]:
qa_data["answer"]=None
current_answer = ""
current_question_num=0
for i in qa_data.index:
  if qa_data.loc[i,"marker"]=="A" :
    current_answer +=qa_data.loc[i, "metadata"]
  elif qa_data.loc[i,"marker"]=="Q":
    qa_data.at[i-1, "answer"] = current_answer
    current_answer=""
    current_question_num +=1
  else:
      continue


In [155]:
qa_data.head(50)

Unnamed: 0,filename,financial_quarter,call_date,speaker,marker,question_num,job_title,metadata,answer_cleaned,analyst,analyst_title,metadata_question,question_cleaned,answer
0,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Steven Chubak,Q,[],"Analyst, Wolfe Research LLC","So, Jamie, I was actually hoping to get your p...",['so jamie actually hoping get perspective see...,[],[],[],['so jamie actually hoping get perspective see...,
1,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, I think you were already kind of complet...",['well think already kind complete answering q...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well think already kind complete answering q...,
2,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, we've told you that we're kind of pencil...",['well weve told were kind penciling $12 billi...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well weve told were kind penciling $12 billi...,"Well, I think you were already kind of complet..."
3,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Ken Usdin,Q,[],"Analyst, Jefferies LLC","Hey, thanks. Good morning. Hey, Jeremy, I was ...",['hey thanks good morning hey jeremy wondering...,[],[],[],['hey thanks good morning hey jeremy wondering...,
4,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['1'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","['Hey, thanks. Good morning. Hey, Jeremy, I wa...",['yeah sure let summarize drivers change outlo...,"Yeah, sure. So let me just summarize the drive..."
5,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Ken Usdin,Q,[],"Analyst, Jefferies LLC","Yeah, and as a follow-up on the point about ra...",['yeah follow-up point rate expectations comin...,[],[],[],['yeah follow-up point rate expectations comin...,
6,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['2'],"Chairman & Chief Executive Officer, JPMorgan C...","Well first of all, I don't quite believe it. S...",['well first all quite believe it fed rate cur...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","[""Yeah, and as a follow-up on the point about ...",['well first all quite believe it fed rate cur...,"Well first of all, I don't quite believe it. S..."
7,1q23-earnings-transcript.pdf,1Q23,2023-04-14,John McDonald,Q,[],"Analyst, Autonomous Research","Hi, thanks. Jeremy, wanted to follow up again ...",['hi thanks jeremy wanted follow drivers nii r...,[],[],[],['hi thanks jeremy wanted follow drivers nii r...,
8,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['3'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,"['John McDonald, Analyst, Autonomous Research']","['Analyst, Autonomous Research']","['Hi, thanks. Jeremy, wanted to follow up agai...",['yeah john really good question weve obviousl...,"Yeah. John, it's a really good question, and w..."
9,1q23-earnings-transcript.pdf,1Q23,2023-04-14,John McDonald,Q,[],"Analyst, Autonomous Research",Okay. And then I wanted to ask Jamie – there's...,['okay wanted ask jamie – theres narrative ind...,[],[],[],['okay wanted ask jamie – theres narrative ind...,


In [153]:
filtered_df = qa_data.dropna()

In [154]:
filtered_df.head(50)

Unnamed: 0,filename,financial_quarter,call_date,speaker,marker,question_num,job_title,metadata,answer_cleaned,analyst,analyst_title,metadata_question,question_cleaned,answer
2,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['0'],"Chairman & Chief Executive Officer, JPMorgan C...","Well, we've told you that we're kind of pencil...",['well weve told were kind penciling $12 billi...,"['Steven Chubak, Analyst, Wolfe Research LLC']","['Analyst, Wolfe Research LLC']","[""So, Jamie, I was actually hoping to get your...",['well weve told were kind penciling $12 billi...,"Well, I think you were already kind of complet..."
4,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['1'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah, sure. So let me just summarize the drive...",['yeah sure let summarize drivers change outlo...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","['Hey, thanks. Good morning. Hey, Jeremy, I wa...",['yeah sure let summarize drivers change outlo...,"Yeah, sure. So let me just summarize the drive..."
6,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['2'],"Chairman & Chief Executive Officer, JPMorgan C...","Well first of all, I don't quite believe it. S...",['well first all quite believe it fed rate cur...,"['Ken Usdin, Analyst, Jefferies LLC']","['Analyst, Jefferies LLC']","[""Yeah, and as a follow-up on the point about ...",['well first all quite believe it fed rate cur...,"Well first of all, I don't quite believe it. S..."
8,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['3'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. John, it's a really good question, and w...",['yeah john really good question weve obviousl...,"['John McDonald, Analyst, Autonomous Research']","['Analyst, Autonomous Research']","['Hi, thanks. Jeremy, wanted to follow up agai...",['yeah john really good question weve obviousl...,"Yeah. John, it's a really good question, and w..."
11,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['4'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. So, Erika, as you know, we take – not go...",['yeah so erika know take – going go lot detai...,"['John McDonald, Analyst, Autonomous Research']","['Analyst, Autonomous Research']","[""Okay. And then I wanted to ask Jamie – there...",['yeah so erika know take – going go lot detai...,Yeah. I wouldn't use the word credit crunch if...
13,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['5'],"Chief Financial Officer, JPMorgan Chase & Co.",Okay. Let's take a crack. Let's see what the b...,['okay lets take crack lets see boss thinks th...,"['Erika Najarian, Analyst, UBS Securities LLC']","['Analyst, UBS Securities LLC']","[""So, as you think about all of what you've ju...",['okay lets take crack lets see boss thinks th...,Okay. Let's take a crack. Let's see what the b...
16,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jamie Dimon,A,['6'],"Chairman & Chief Executive Officer, JPMorgan C...","If I add, I would say, categorically, there's ...",['add would say categorically theres pricing p...,"['Jim Mitchell, Analyst, Seaport Global Securi...","['Analyst, Seaport Global Securities LLC']","[""Hey. Good morning. Maybe just a little bit o...",['add would say categorically theres pricing p...,"Yeah. A couple things there. So, first of all,..."
18,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['7'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. And we always say, right, we underwrite ...",['yeah always say right underwrite cycle think...,"['Jim Mitchell, Analyst, Seaport Global Securi...","['Analyst, Seaport Global Securities LLC']","[""No, fair – all fair points. And maybe just a...",['yeah always say right underwrite cycle think...,"Yeah. And we always say, right, we underwrite ..."
20,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Jeremy Barnum,A,['8'],"Chief Financial Officer, JPMorgan Chase & Co.","Yeah. So a few things on there, Gerard. So we ...",['yeah things there gerard previously said tar...,"['Gerard Cassidy, Analyst, RBC Capital Markets...","['Analyst, RBC Capital Markets LLC']","[""In your comments about your CET1 ratio, obvi...",['yeah things there gerard previously said tar...,"Yeah. So a few things on there, Gerard. So we ..."
21,1q23-earnings-transcript.pdf,1Q23,2023-04-14,Gerard Cassidy,Q,[],"Analyst, RBC Capital Markets LLC","Sure. And then just as a follow-up, if I heard...",['sure follow-up heard correctly give us littl...,[],[],[],['sure follow-up heard correctly give us littl...,


JP morgan management discussion

In [None]:
%ls

In [None]:
#defining santader dataframe
jpmorgan_body_df=pd.read_csv("jpmorgan_management_discussion.csv")
jpmorgan_body_df.head()

In [None]:
#preprocess data
preprocessor(jpmorgan_body_df, "chunk_text", "tokenized_data","cleaned_data")

In [None]:
jpmorgan_body_df.head()

UBS qna section

In [None]:
%ls

In [None]:
#define ubs q&a data
ubs_qna_df=pd.read_csv("ubs_qna_section.csv")

In [None]:
#preprocessing ubs Q&A data
preprocessor(ubs_qna_df, "utterance", "tokenized_data","cleaned_data")

In [None]:
ubs_qna_df.head()

UBS management discussion

In [None]:
%ls

In [None]:
#defining ubs management discussion
ubs_manag_df=pd.read_csv("ubs_management_discussion.csv")
ubs_manag_df.head()

In [None]:
#preprocessing ubs management discussion
preprocessor(ubs_manag_df,"utterance", "tokenized_data","cleaned_data")
ubs_manag_df.head()

# **Export the output as a csv file**

JP morgan QA section

In [156]:
#export preprocessed data
preprocessed_qa_csv_path1 = "/content/drive/MyDrive/bank_of_england/data/preprocessed_data/jpmorgan_qna_df_preprocessed_ver7.csv"
filtered_df.to_csv("jp_morgan.csv", index=False)

JP morgan management discussion

In [None]:
#export preprocessed data
preprocessed_qa_csv_path2 = "/content/sample_data/jpmorgan_management_df_preprocessed.csv"
jpmorgan_body_df.to_csv(preprocessed_qa_csv_path2, index=False)

UBS QA section

In [None]:
#export preprocessed data
preprocessed_qa_csv_path3 = "/content/sample_data/ubs_qa_df_preprocessed.csv"
ubs_qna_df.to_csv(preprocessed_qa_csv_path3, index=False)

UBS management discussion

In [None]:
#export preprocessed data
preprocessed_qa_csv_path4 = "/content/sample_data/ubs_management_df_preprocessed.csv"
ubs_manag_df.to_csv(preprocessed_qa_csv_path4, index=False)