<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/tidy_up_preprocessing_notebook/notebooks/processed/ct_preprocessing_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
"""
===================================================
Author: Chiaki Tachikawa
Role: Data Science Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/chiaki-tachikawa
Date: 2025-02-27
Version: 1.1

Description:
    This notebook implements a system for cleaning and exporting transcript data for the Bank of England project. The workflow includes:
    - Importing necessary libraries and downloading NLTK data.
    - Defining and applying a `preprocessor` function to clean and tokenize text data.
    - Reading and preprocessing various CSV files containing transcript data.
    - Exporting the preprocessed data to new CSV files for further analysis.

===================================================
"""



# **Library**

In [30]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from google.colab import drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Function**

preprocessor function : The function modifies the DataFrame data in place, adding two new columns (col1 and col2) with preprocessed text.


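Input:
  - `data`: the DataFrame to preprocess (modified in place)
  - `col`: name of the existing column that contains the text to clean
  - `col1`: name of the new column that receives the tokenised text
  - `col2`: name of the new column that receives the cleaned text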

In [31]:
#create function to preprocess data
def preprocessor (data, col, col1,col2):
  """Add a tokenised column and a cleaned-text column to ``data`` in place.

  Both new columns start from ``data[col]`` lower-cased, with English stop
  words removed (stop words are matched against whitespace-separated words).

  Args:
    data: DataFrame holding the text; it is modified in place.
    col: name of the existing column containing the raw text.
    col1: name of the column that receives, per row, a list of lemmatised
      tokens. Pure-digit tokens and tokens of two characters or fewer are
      dropped, every character outside a-z is stripped, and tokens left
      empty by that stripping are removed.
    col2: name of the column that receives, per row, a one-element list
      holding the text with the characters . , ' ? removed.

  Returns:
    None. The result is the two columns written to ``data``.
  """
  # Built once and shared by both output columns.
  stop_words = set(stopwords.words("english"))
  lemmatizer = WordNetLemmatizer()

  def _drop_stop_words(text):
    # str() keeps non-string cells usable; a missing value becomes the text "nan".
    return " ".join(word for word in str(text).split() if word not in stop_words)

  def _tokenise(text):
    tokens = nltk.word_tokenize(text)
    #Remove numbers
    tokens = [word for word in tokens if not word.isdigit()]
    #remove short word
    tokens = [word for word in tokens if len(word) > 2]
    #remove symbols
    tokens = [re.sub(r"[^a-z]", "", word) for word in tokens]
    # Stripping can leave "" behind (e.g. "..." or "2023-04"); drop those here.
    # This must test each token and must run after the stripping step.
    tokens = [word for word in tokens if word != ""]
    #lemmatization
    return [lemmatizer.lemmatize(word) for word in tokens]

  # Lower-case and remove stop words once; both columns derive from this.
  base_text = data[col].str.lower().apply(_drop_stop_words)

  #Adding column1
  data[col1] = base_text.apply(_tokenise)

  #Adding column2
  # NOTE(review): the cleaned text is wrapped in a one-element list, exactly as
  # before, so existing consumers of this column keep working - confirm whether
  # a plain string is what downstream code actually wants.
  data[col2] = base_text.apply(lambda text: [re.sub(r"[.,'?]", "", text)])

  return


## **Data**

In [32]:
# Mount Google Drive at /content/drive (uses google.colab, so this cell only runs in Colab).
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


JP Morgan QA section

In [33]:
#Defining qa_data
# Read the JP Morgan Q&A section CSV from /content into a DataFrame.
qa_data = pd.read_csv("/content/jpmorgan_qa_section.csv")
# Preview the first rows.
qa_data.head()

Unnamed: 0,Analyst,Analyst Role,Question,Executive,Executive Role Type,Response,filename,Quarter,call_date
0,Steven Chubak,"Analyst, Wolfe Research LLC","Hey, good morning.",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase","Good morning, Steve.",1q23-earnings-transcript.pdf,1Q23,2023-04-14
1,Steven Chubak,"Analyst, Wolfe Research LLC","So, Jamie, I was actually hoping to get your p...",Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...","Well, I think you were already kind of complet...",1q23-earnings-transcript.pdf,1Q23,2023-04-14
2,Steven Chubak,"Analyst, Wolfe Research LLC",Got it. And just in terms of appetite for the ...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...","Oh, yeah.",1q23-earnings-transcript.pdf,1Q23,2023-04-14
3,Steven Chubak,"Analyst, Wolfe Research LLC",...elevated macro uncertainties.,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...","Well, we've told you that we're kind of pencil...",1q23-earnings-transcript.pdf,1Q23,2023-04-14
4,Ken Usdin,"Analyst, Jefferies LLC","Hey, thanks. Good morning. Hey, Jeremy, I was ...",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase","Yeah, sure. So let me just summarize the drive...",1q23-earnings-transcript.pdf,1Q23,2023-04-14


In [34]:
#preprocessing data
# Each call adds a tokenised column and a cleaned column to qa_data in place.
preprocessor(qa_data, "Question", "question_tokenised_data", "Question_cleaned")
preprocessor(qa_data,"Response","answer_tokenised_data","Response_cleaned")

#reorganise column
# Only the columns listed here are kept; the two tokenised columns and the other
# input columns are dropped at this point.
# .copy() makes the selection an independent frame, so later assignments into
# qa_data do not trigger pandas' SettingWithCopyWarning.
qa_data=qa_data[["filename","Quarter","Question","Question_cleaned","Analyst","Analyst Role","Response","Response_cleaned","Executive","Executive Role Type"]].copy()

In [35]:
# Standardize Roles
# Ordered (pattern, replacement) pairs. Every rule is applied, in this order,
# to each role string.
_ROLE_RULES = [
  (r"Chairman & Chief Executive Officer, JPMorgan Chase", "CEO"),
  (r"Chief Executive Officer, JPMorgan Chase", "CEO"),
  (r"Vice Chairman, JPMorgan Chase", "Vice President"),
  # NOTE(review): this rule replaces the text with itself, so it changes nothing.
  (r"Managing Director$", "Managing Director"),
  (r"Head of Investor Relations, JPMorgan Chase", "Head of IR"),
  (r"Chief Financial Officer, JPMorgan Chase", "CFO"),
  (r"Chief Operating Officer, JPMorgan Chase", "COO"),
  (r"Chief Financial Officer, JPMorganChase", "CFO"),
  (r"Chairman & Chief Executive Officer, JPMorganChase", "CEO"),
]

def _standardise_role(role):
  """Return ``role`` with every rule in ``_ROLE_RULES`` applied in order.

  Values that are not strings (for example missing values) are returned unchanged.
  """
  if not isinstance(role, str):
    return role
  for pattern, short_name in _ROLE_RULES:
    role = re.sub(pattern, short_name, role)
  return role

# Transform the whole column at once instead of looping over .loc[i, ...]:
# this does not depend on the index being 0..n-1, and assign() builds a new
# frame so no chained-assignment warning is raised.
qa_data = qa_data.assign(**{"Executive Role Type": qa_data["Executive Role Type"].apply(_standardise_role)})


In [36]:
# Print the number of rows in qa_data.
print(len(qa_data))

143


In [27]:
#Check for missing values: print the number of nulls in each column of qa_data
print(f'Check if there is nil values on DF: {qa_data.isnull().sum()}')

Check if there is nil values on DF: filename               0
Quarter                0
Question               0
Question_cleaned       0
Analyst                0
Analyst Role           0
Response               0
Response_cleaned       0
Executive              0
Executive Role Type    0
dtype: int64


JP Morgan management discussion

In [28]:
#defining jp morgan management discussion dataframe
# NOTE(review): this path is relative to the kernel's working directory, and the
# recorded run of this cell raised FileNotFoundError - the CSV has to be present
# there before the cell (and the cells that use jpmorgan_body_df) can run.
jpmorgan_body_df=pd.read_csv("jpmorgan_management_discussion.csv")
# Preview the first rows.
jpmorgan_body_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'jpmorgan_management_discussion.csv'

In [None]:
#Cleaning transcript
# Adds the "tokenized_data" and "cleaned_data" columns, derived from "chunk_text",
# to jpmorgan_body_df in place.
preprocessor(jpmorgan_body_df, "chunk_text", "tokenized_data","cleaned_data")

In [None]:
# Preview the first rows of the management discussion frame.
jpmorgan_body_df.head()

# **Export the output as a csv file**

JP Morgan QA section

In [37]:
#export preprocessed data
# Destination on the mounted Google Drive.
preprocessed_qa_csv_path1 = "/content/drive/MyDrive/bank_of_england/data/preprocessed_data/jpmorgan_qna_df_preprocessed_ver7.csv"
# Write to the destination declared above. The path variable was previously
# unused and the file went to "jp_morgan.csv" in the working directory instead.
# index=False keeps the DataFrame index out of the file.
qa_data.to_csv(preprocessed_qa_csv_path1, index=False)

JP Morgan management discussion

In [None]:
#export preprocessed data
# Destination for the preprocessed management discussion CSV
# (despite the "qa" in the variable name).
preprocessed_qa_csv_path2 = "/content/sample_data/jpmorgan_management_df_preprocessed.csv"
# index=False keeps the DataFrame index out of the file.
jpmorgan_body_df.to_csv(preprocessed_qa_csv_path2, index=False)