<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/cleansed/sk_processed_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

Dependencies:
    - pdfplumber
    - pandas (for the combined Q&A / management discussion DataFrames)
    - re
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""



Modules

In [4]:
!pip install pdfplumber  # Install pdfplumber library

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Step 1: Import required libraries
import pdfplumber
import re
import os
from google.colab import drive

In [6]:
# Mount Google Drive at /content/drive. force_remount=True is passed so the call
# requests a fresh mount (presumably even when Drive is already mounted — confirm
# against the google.colab docs).
drive.mount('/content/drive', force_remount=True)

# Root data folder of the project. Assumes the 'BOE' folder already exists under
# 'MyDrive' and has been shared with whoever runs this notebook.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: list the folder contents to confirm the mount and the path work.
print(os.listdir(BOE_path))

Mounted at /content/drive
['model', 'cleansed', 'processed', 'raw']


In [8]:
import os
import re
import pdfplumber
import pandas as pd

# -------------------------------
# 1. Define the path to your raw folder on Google Drive
#    Every file in this folder whose name ends in ".pdf" is processed by the loop below.
#    NOTE(review): this path spells the Drive root "My Drive", whereas BOE_path in the
#    mount cell uses "MyDrive" — confirm both spellings resolve to the same folder.
# -------------------------------
raw_dir = "/content/drive/My Drive/BOE/bank_of_england/data/raw/"

# -------------------------------
# 2. Define helper functions for processing
# -------------------------------

def clean_transcript(text):
    """
    Cleans the raw transcript text extracted from a PDF.

    Steps, in order:
      1. Blank out separator lines made of ten or more dots.
      2. Blank out lines that contain only digits (page numbers).
      3. Remove "On page N" / "Starting on page N" cross-references.
      4. Collapse a full stop followed by a stray comma (". ," -> ".").
      5. Strip trailing whitespace from lines and collapse blank lines.
      6. Truncate the text at the first occurrence of "Disclaimer".

    Args:
        text (str): Raw transcript text (pages joined with newlines).

    Returns:
        str: The cleaned text; an empty string when `text` is empty or None.
    """
    if not text:
        return ""

    # Line-anchored (MULTILINE) patterns are used instead of '\n...\n' patterns:
    # the latter consume the newline shared by two adjacent lines, so every second
    # consecutive separator / page-number line used to survive, as did lines with
    # surrounding spaces and lines at the very start or end of the text.
    text = re.sub(r'^[ \t]*\.{10,}[ \t]*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

    text = re.sub(r'On page \d+', '', text)
    text = re.sub(r'Starting on page \d+', '', text)

    # Also covers the '%. ,' artefact, so no separate replace is needed for it.
    text = re.sub(r'\.\s*,', '.', text)

    # The blanked lines above are removed here together with other empty lines.
    text = re.sub(r'\s+\n', '\n', text)
    text = re.sub(r'\n+', '\n', text).strip()

    # Everything from the first "Disclaimer" onwards is dropped.
    if "Disclaimer" in text:
        text = text.split("Disclaimer")[0].strip()
    return text

def extract_metadata(text):
    """
    Extracts financial quarter and call date from the transcript text.

    Args:
        text (str): Transcript text to search.

    Returns:
        tuple: (financial_quarter, call_date)
            financial_quarter (str | None): First "<digit>Q<yy>" token found, with
                whitespace removed and a four-digit "20yy" year shortened to "yy"
                (e.g. "3Q 2022" -> "3Q22"). None when no token is found.
            call_date (str | None): First "<Month> <d>, <yyyy>" date found, with
                internal whitespace normalised to single spaces. None when no date
                is found.
    """
    if not text:
        return None, None

    # The optional '20' lets "1Q2023" / "1Q 2023" resolve to "1Q23" instead of the
    # truncated "1Q20"; "1Q20" itself still matches because the group is optional.
    quarter_match = re.search(r'(\dQ)\s*(?:20)?(\d{2})', text)
    financial_quarter = (
        quarter_match.group(1) + quarter_match.group(2) if quarter_match else None
    )

    # Prefer a real month name so that an earlier "Note 12, 2021"-style phrase is not
    # mistaken for the call date; fall back to the generic word pattern otherwise.
    month_names = (
        r'(?:January|February|March|April|May|June|July|August|'
        r'September|October|November|December)'
    )
    date_match = (
        re.search(r'\b(' + month_names + r'\s+\d{1,2},\s+\d{4})', text)
        or re.search(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})', text)
    )
    # \s+ may have matched a line break; normalise so the value stays on one line.
    call_date = " ".join(date_match.group(1).split()) if date_match else None
    return financial_quarter, call_date

def split_sections(transcript):
    """
    Cut a transcript in two at the first "QUESTION AND ANSWER" marker.

    The marker is matched case-insensitively, with any run of whitespace between
    the three words. The marker itself stays at the start of the second part.

    Returns:
        tuple: (management_discussion, qa_section)
            When the marker is found both parts are whitespace-stripped.
            When it is not found the transcript is returned unchanged as the
            first part and the second part is an empty string.
    """
    marker = re.search(r'(?i)(QUESTION\s+AND\s+ANSWER)', transcript)
    if marker is None:
        return transcript, ""

    cut = marker.start()
    before, from_marker = transcript[:cut], transcript[cut:]
    return before.strip(), from_marker.strip()

def parse_qa_section(qa_text, job_role_word_threshold=10):
    """
    Turn the Q&A portion of a transcript into a list of speaker turns.

    Each turn is a dict with the keys 'speaker', 'marker', 'job_title' and
    'utterance' (in that order). Lines are stripped and blank lines ignored.
    A new turn starts when a line is either:
      * "Name Marker": any text followed by whitespace and a final single
        "Q" or "A" (e.g. "John McDonald Q"); 'marker' is that letter, or
      * "Speaker: Utterance": any line containing a colon (e.g. "Operator: Thank
        you..."); 'marker' is None and the text after the colon seeds 'utterance'.
    The "Name Marker" form is tried first.

    Any other line belongs to the current turn. If the turn has neither a job
    title nor any utterance yet, and the line has fewer than
    job_role_word_threshold words and contains a comma, it is stored as
    'job_title'; otherwise it is appended to 'utterance', space-separated.
    Text appearing before the first header is collected under speaker 'Unknown'.
    """
    name_marker_re = re.compile(r'^(?P<speaker>.+?)\s+(?P<marker>[QA])$')
    speaker_colon_re = re.compile(r'^(?P<speaker>[^:]+):\s*(?P<utterance>.*)$')

    def _turn(speaker, marker, utterance):
        # Single place that fixes the key set and key order of every turn.
        return {'speaker': speaker, 'marker': marker, 'job_title': "", 'utterance': utterance}

    turns = []
    active = None
    for raw_line in qa_text.split('\n'):
        text = raw_line.strip()
        if not text:
            continue

        header = name_marker_re.match(text)
        # The colon form is only considered when the "Name Marker" form did not match.
        labelled = speaker_colon_re.match(text) if header is None else None

        if header is not None or labelled is not None:
            # A header closes the turn in progress and opens a new one.
            if active is not None:
                turns.append(active)
            if header is not None:
                active = _turn(header.group('speaker').strip(), header.group('marker'), "")
            else:
                active = _turn(
                    labelled.group('speaker').strip(),
                    None,
                    labelled.group('utterance').strip(),
                )
            continue

        if active is None:
            # Content seen before any speaker header.
            active = _turn('Unknown', None, text)
            continue

        nothing_recorded_yet = not (active['job_title'] or active['utterance'])
        if (
            nothing_recorded_yet
            and len(text.split()) < job_role_word_threshold
            and ',' in text
        ):
            active['job_title'] = text
            continue

        active['utterance'] = (
            " ".join((active['utterance'], text)) if active['utterance'] else text
        )

    if active is not None:
        turns.append(active)
    return turns

# -------------------------------
# 3. Process each PDF in the raw folder
# -------------------------------
all_qa_entries = []  # Parsed Q&A rows accumulated across every transcript

# os.listdir() returns names in arbitrary order; sorting makes the row order of the
# combined DataFrame reproducible from run to run.
for filename in sorted(os.listdir(raw_dir)):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(raw_dir, filename)
        print("Processing file:", file_path)
        # Extract text from PDF, page by page; pages for which extract_text()
        # returns nothing are skipped.
        transcript_text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    transcript_text += page_text + "\n"
        print("Extracted text preview for", filename, ":", transcript_text[:1000])

        # Clean the transcript text
        transcript_clean = clean_transcript(transcript_text)
        print("Cleaned text preview for", filename, ":", transcript_clean[:1000])

        # Extract metadata (either value may be None when nothing matches)
        financial_quarter, call_date = extract_metadata(transcript_clean)
        print("Extracted Financial Quarter:", financial_quarter)
        print("Extracted Call Date:", call_date)

        # Split the transcript into sections; only the Q&A part is used in this cell
        management_discussion, qa_section = split_sections(transcript_clean)
        # Remove the Q&A header from the Q&A section if present
        qa_section = re.sub(r'(?i)^QUESTION\s+AND\s+ANSWER\s+SECTION\s*', '', qa_section, count=1).strip()

        # Parse the Q&A section
        qa_entries = parse_qa_section(qa_section)
        # Add metadata to each Q&A entry
        for entry in qa_entries:
            entry['financial_quarter'] = financial_quarter
            entry['call_date'] = call_date
        all_qa_entries.extend(qa_entries)
        print("Processed file:", filename)

# Convert all parsed Q&A entries into a DataFrame (built once from the list of dicts)
df_qa_all = pd.DataFrame(all_qa_entries)
print("\nCombined Parsed Q&A Section Preview:")
print(df_qa_all.head(10))

# You can now save df_qa_all to a CSV file or further process it.
# Example:
# df_qa_all.to_csv("/content/drive/My Drive/BOE/bank_of_england/data/processed/combined_qa_section.csv", index=False)


Processing file: /content/drive/My Drive/BOE/bank_of_england/data/raw/1q23-earnings-transcript.pdf
Extracted text preview for 1q23-earnings-transcript.pdf : 1Q23 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
April 14, 2023
MANAGEMENT DISCUSSION SECTION
......................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorgan Chase’s First Quarter 2023 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. Please stand by.
At this time, I would like to turn the call over to JPMorgan Chase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy
Barnum.
Mr. Barnum, please go ahead.
.................................................................................

In [12]:
import os
import re
import pdfplumber
import pandas as pd

# -------------------------------
# 1. Define the path to your raw folder on Google Drive
#    Every file in this folder whose name ends in ".pdf" is processed by the loop below.
#    NOTE(review): this path spells the Drive root "My Drive", whereas BOE_path in the
#    mount cell uses "MyDrive" — confirm both spellings resolve to the same folder.
# -------------------------------
raw_dir = "/content/drive/My Drive/BOE/bank_of_england/data/raw/"

# -------------------------------
# 2. Define helper functions for processing
# -------------------------------

def clean_transcript(text):
    """
    Cleans the raw transcript text extracted from a PDF.

    Steps, in order:
      1. Blank out separator lines made of ten or more dots.
      2. Blank out lines that contain only digits (page numbers).
      3. Remove "On page N" / "Starting on page N" cross-references.
      4. Collapse a full stop followed by a stray comma (". ," -> ".").
      5. Strip trailing whitespace from lines and collapse blank lines.
      6. Truncate the text at the first occurrence of "Disclaimer".

    Args:
        text (str): Raw transcript text (pages joined with newlines).

    Returns:
        str: The cleaned text; an empty string when `text` is empty or None.
    """
    if not text:
        return ""

    # Line-anchored (MULTILINE) patterns are used instead of '\n...\n' patterns:
    # the latter consume the newline shared by two adjacent lines, so every second
    # consecutive separator / page-number line used to survive, as did lines with
    # surrounding spaces and lines at the very start or end of the text.
    text = re.sub(r'^[ \t]*\.{10,}[ \t]*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

    text = re.sub(r'On page \d+', '', text)
    text = re.sub(r'Starting on page \d+', '', text)

    # Also covers the '%. ,' artefact, so no separate replace is needed for it.
    text = re.sub(r'\.\s*,', '.', text)

    # The blanked lines above are removed here together with other empty lines.
    text = re.sub(r'\s+\n', '\n', text)
    text = re.sub(r'\n+', '\n', text).strip()

    # Everything from the first "Disclaimer" onwards is dropped.
    if "Disclaimer" in text:
        text = text.split("Disclaimer")[0].strip()
    return text

def extract_metadata(text):
    """
    Extracts the financial quarter and call date from the transcript text.

    Args:
        text (str): Transcript text to search.

    Returns:
        tuple: (financial_quarter, call_date)
            financial_quarter (str | None): First "<digit>Q<yy>" token found, with
                whitespace removed and a four-digit "20yy" year shortened to "yy"
                (e.g. "3Q 2022" -> "3Q22"). None when no token is found.
            call_date (str | None): First "<Month> <d>, <yyyy>" date found, with
                internal whitespace normalised to single spaces. None when no date
                is found.
    """
    if not text:
        return None, None

    # The optional '20' lets "1Q2023" / "1Q 2023" resolve to "1Q23" instead of the
    # truncated "1Q20"; "1Q20" itself still matches because the group is optional.
    quarter_match = re.search(r'(\dQ)\s*(?:20)?(\d{2})', text)
    financial_quarter = (
        quarter_match.group(1) + quarter_match.group(2) if quarter_match else None
    )

    # Prefer a real month name so that an earlier "Note 12, 2021"-style phrase is not
    # mistaken for the call date; fall back to the generic word pattern otherwise.
    month_names = (
        r'(?:January|February|March|April|May|June|July|August|'
        r'September|October|November|December)'
    )
    date_match = (
        re.search(r'\b(' + month_names + r'\s+\d{1,2},\s+\d{4})', text)
        or re.search(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})', text)
    )
    # \s+ may have matched a line break; normalise so the value stays on one line.
    call_date = " ".join(date_match.group(1).split()) if date_match else None
    return financial_quarter, call_date

def split_sections(transcript):
    """
    Cut a transcript in two at the first "QUESTION AND ANSWER" marker.

    The marker is matched case-insensitively, with any run of whitespace between
    the three words. The marker itself stays at the start of the second part.

    Returns:
        tuple: (management_discussion, qa_section)
            When the marker is found both parts are whitespace-stripped.
            When it is not found the transcript is returned unchanged as the
            first part and the second part is an empty string.
    """
    marker = re.search(r'(?i)(QUESTION\s+AND\s+ANSWER)', transcript)
    if marker is None:
        return transcript, ""

    cut = marker.start()
    before, from_marker = transcript[:cut], transcript[cut:]
    return before.strip(), from_marker.strip()

def parse_qa_section(qa_text, job_role_word_threshold=10):
    """
    Turn the Q&A portion of a transcript into a list of speaker turns.

    Each turn is a dict with the keys 'speaker', 'marker', 'job_title' and
    'utterance' (in that order). Lines are stripped and blank lines ignored.
    A new turn starts when a line is either:
      * "Name Marker": any text followed by whitespace and a final single
        "Q" or "A" (e.g. "John McDonald Q"); 'marker' is that letter, or
      * "Speaker: Utterance": any line containing a colon (e.g. "Operator: Thank
        you..."); 'marker' is None and the text after the colon seeds 'utterance'.
    The "Name Marker" form is tried first.

    Any other line belongs to the current turn. If the turn has neither a job
    title nor any utterance yet, and the line has fewer than
    job_role_word_threshold words and contains a comma, it is stored as
    'job_title'; otherwise it is appended to 'utterance', space-separated.
    Text appearing before the first header is collected under speaker 'Unknown'.
    """
    name_marker_re = re.compile(r'^(?P<speaker>.+?)\s+(?P<marker>[QA])$')
    speaker_colon_re = re.compile(r'^(?P<speaker>[^:]+):\s*(?P<utterance>.*)$')

    def _turn(speaker, marker, utterance):
        # Single place that fixes the key set and key order of every turn.
        return {'speaker': speaker, 'marker': marker, 'job_title': "", 'utterance': utterance}

    turns = []
    active = None
    for raw_line in qa_text.split('\n'):
        text = raw_line.strip()
        if not text:
            continue

        header = name_marker_re.match(text)
        # The colon form is only considered when the "Name Marker" form did not match.
        labelled = speaker_colon_re.match(text) if header is None else None

        if header is not None or labelled is not None:
            # A header closes the turn in progress and opens a new one.
            if active is not None:
                turns.append(active)
            if header is not None:
                active = _turn(header.group('speaker').strip(), header.group('marker'), "")
            else:
                active = _turn(
                    labelled.group('speaker').strip(),
                    None,
                    labelled.group('utterance').strip(),
                )
            continue

        if active is None:
            # Content seen before any speaker header.
            active = _turn('Unknown', None, text)
            continue

        nothing_recorded_yet = not (active['job_title'] or active['utterance'])
        if (
            nothing_recorded_yet
            and len(text.split()) < job_role_word_threshold
            and ',' in text
        ):
            active['job_title'] = text
            continue

        active['utterance'] = (
            " ".join((active['utterance'], text)) if active['utterance'] else text
        )

    if active is not None:
        turns.append(active)
    return turns

# -------------------------------
# 3. Process each PDF in the raw folder and aggregate results
# -------------------------------
all_qa_entries = []  # List to store parsed Q&A entries for all transcripts
all_md_entries = []  # List to store management discussion entries for all transcripts

# os.listdir() returns names in arbitrary order; sorting makes the processing order
# (and therefore the row order before the date sort) reproducible.
for filename in sorted(os.listdir(raw_dir)):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(raw_dir, filename)
        print("Processing file:", file_path)

        # Extract text from PDF, page by page; pages for which extract_text()
        # returns nothing are skipped.
        transcript_text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    transcript_text += page_text + "\n"
        print("Extracted text preview for", filename, ":", transcript_text[:1000])

        # Clean the transcript text
        transcript_clean = clean_transcript(transcript_text)
        print("Cleaned text preview for", filename, ":", transcript_clean[:1000])

        # Extract metadata (either value may be None when nothing matches)
        financial_quarter, call_date = extract_metadata(transcript_clean)
        print("Extracted Financial Quarter:", financial_quarter)
        print("Extracted Call Date:", call_date)

        # Split the transcript into sections
        management_discussion, qa_section = split_sections(transcript_clean)
        # Remove the Q&A header from the Q&A section if present
        qa_section = re.sub(r'(?i)^QUESTION\s+AND\s+ANSWER\s+SECTION\s*', '', qa_section, count=1).strip()

        # Append the management discussion (with metadata and filename)
        md_entry = {
            'filename': filename,
            'management_discussion': management_discussion,
            'financial_quarter': financial_quarter,
            'call_date': call_date
        }
        all_md_entries.append(md_entry)

        # Parse the Q&A section and add metadata to each entry
        qa_entries = parse_qa_section(qa_section)
        for entry in qa_entries:
            entry['filename'] = filename
            entry['financial_quarter'] = financial_quarter
            entry['call_date'] = call_date
        all_qa_entries.extend(qa_entries)

        print("Processed file:", filename)

# Convert aggregated lists to DataFrames (built once from the lists of dicts)
df_qa_all = pd.DataFrame(all_qa_entries)
df_md_all = pd.DataFrame(all_md_entries)

# -------------------------------
# 4. Format 'call_date' as datetime and sort descending by call_date
# -------------------------------
# The column only exists when at least one row was collected; without the guard an
# empty raw folder (or transcripts with no Q&A turns) raises KeyError here.
# Dates that do not fit '%B %d, %Y' become NaT (errors='coerce').
# kind='mergesort' is a stable sort: every turn of a call shares the same call_date,
# and a stable sort keeps those turns in their original speaking order, which the
# default sort algorithm does not guarantee.
if 'call_date' in df_qa_all.columns:
    df_qa_all['call_date'] = pd.to_datetime(df_qa_all['call_date'], format='%B %d, %Y', errors='coerce')
    df_qa_all = df_qa_all.sort_values(by='call_date', ascending=False, kind='mergesort')

if 'call_date' in df_md_all.columns:
    df_md_all['call_date'] = pd.to_datetime(df_md_all['call_date'], format='%B %d, %Y', errors='coerce')
    df_md_all = df_md_all.sort_values(by='call_date', ascending=False, kind='mergesort')

print("\nCombined Parsed Q&A Section Preview (Sorted):")
print(df_qa_all.head(10))

print("\nCombined Management Discussion DataFrame Preview (Sorted):")
print(df_md_all.head())

# -------------------------------
# 5. Save the DataFrames as CSV Files
# -------------------------------
qa_csv_path = "/content/drive/My Drive/BOE/bank_of_england/data/processed/qa_section.csv"
md_csv_path = "/content/drive/My Drive/BOE/bank_of_england/data/processed/management_discussion.csv"

# Create the output folder if it is missing so to_csv does not fail on a fresh Drive.
os.makedirs(os.path.dirname(qa_csv_path), exist_ok=True)
os.makedirs(os.path.dirname(md_csv_path), exist_ok=True)

df_qa_all.to_csv(qa_csv_path, index=False)
print("\nQ&A DataFrame saved to:", qa_csv_path)

df_md_all.to_csv(md_csv_path, index=False)
print("Management Discussion DataFrame saved to:", md_csv_path)


Processing file: /content/drive/My Drive/BOE/bank_of_england/data/raw/1q23-earnings-transcript.pdf
Extracted text preview for 1q23-earnings-transcript.pdf : 1Q23 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
April 14, 2023
MANAGEMENT DISCUSSION SECTION
......................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorgan Chase’s First Quarter 2023 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. Please stand by.
At this time, I would like to turn the call over to JPMorgan Chase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy
Barnum.
Mr. Barnum, please go ahead.
.................................................................................