<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/cleaning/sk_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    performing text extraction and cleaning using pdfplumber and regular expressions, splitting the
    transcript into Management Discussion and Q&A sections, and parsing the Q&A into a pandas DataFrame.

Dependencies:
    - pdfplumber
    - pandas
    - re
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""

Modules

In [4]:
!pip install pdfplumber  # Install pdfplumber library

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Step 1: Import required libraries
import pdfplumber
import re
import os
from google.colab import drive

In [6]:
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a
# mount is already present, so re-running this cell is safe.
drive.mount('/content/drive', force_remount=True)

# Root of the project data folder. Assumes the 'BOE' folder lives directly under
# 'MyDrive' and has already been shared with the current user.
# NOTE(review): later cells in this notebook hard-code their PDF path under
# '/content/drive/My Drive/...' (with a space) instead of reusing BOE_path.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: list the contents of the data folder to confirm the mount worked
# and the directory is reachable (os.listdir raises if the path does not exist).
print(os.listdir(BOE_path))

Mounted at /content/drive
['cleansed', 'raw', 'model']


In [11]:
import re
import pdfplumber

# Define the path to your PDF file
pdf_file_path = '/content/drive/My Drive/BOE/bank_of_england/data/raw/4q24-earnings-transcript.pdf'

# Read and extract text from the PDF.
# Page texts are collected first and joined once (instead of growing a string with
# += inside the loop). Pages for which extract_text() returns nothing are skipped;
# every kept page is terminated with a newline.
with pdfplumber.open(pdf_file_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]
transcript_text = "".join(page_text + "\n" for page_text in page_texts if page_text)

# Optional: Print a snippet of the extracted text for a quick review
print("Extracted text preview:")
print(transcript_text[:1000])

# Clean the extracted text (remove extraneous lines and artifacts)

# Step 1: Remove divider lines (10 or more dots) and standalone page numbers.
# The lines are filtered one by one rather than with '\n...\n' regexes: a
# newline-delimited pattern consumes the newline that an adjacent junk line would
# need in order to match, so the second of two consecutive divider/page-number
# lines (and one on the very first line) used to survive.
divider_line = re.compile(r'\s*\.{10,}\s*')
page_number_line = re.compile(r'\d+')
transcript_clean = "\n".join(
    line
    for line in transcript_text.split("\n")
    if not (divider_line.fullmatch(line) or page_number_line.fullmatch(line))
)

# Step 2: Remove "On page <number>" and "Starting on page <number>" references
transcript_clean = re.sub(r'On page \d+', '', transcript_clean)
transcript_clean = re.sub(r'Starting on page \d+', '', transcript_clean)

# Step 3: Fix stray punctuation issues
# Remove stray comma(s) that follow a period (e.g., ". ," or "%. ,"). One pattern
# covers the general case, the '%. ,' case and repeated commas (". , ,").
# NOTE(review): this also drops the comma in legitimate text such as "Inc.," —
# confirm that is acceptable for downstream use.
transcript_clean = re.sub(r'\.(?:\s*,)+', '.', transcript_clean)

# Step 4: Normalize whitespace and newlines
# (strip whitespace before each newline, collapse blank lines, trim both ends)
transcript_clean = re.sub(r'\s+\n', '\n', transcript_clean)
transcript_clean = re.sub(r'\n+', '\n', transcript_clean).strip()

# Final cleaned text preview
print("\nFinal Cleaned text preview:")
print(transcript_clean[:2000])


Extracted text preview:
NOVEMBER 2024
4Q24 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
January 15, 2025
MANAGEMENT DISCUSSION SECTION
........................................................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorganChase's Fourth Quarter 2024 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on JPMorganChase's
website. Please refer to the disclaimer in the back concerning forward-looking statements. Please stand by.
At this time, I would like to turn the call over to JPMorganChase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy Barnum.
Mr. Barnum, please go ahead.
.........................

In [16]:
import re
import pdfplumber
import pandas as pd

# -------------------------------
# 1. Define the path to your PDF file and extract text
# -------------------------------
pdf_file_path = '/content/drive/My Drive/BOE/bank_of_england/data/raw/4q24-earnings-transcript.pdf'


def _extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Each page's text is followed by a newline. Pages for which
    ``extract_text()`` returns nothing (a falsy value) are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Join once instead of growing a string with += inside the page loop.
    return "".join(page_text + "\n" for page_text in page_texts if page_text)


def _clean_transcript_text(raw_text):
    """Remove layout artifacts from raw transcript text and normalize whitespace.

    Steps, in order:
      1. drop divider lines (10+ dots) and lines that consist only of digits;
      2. delete "On page <n>" / "Starting on page <n>" references;
      3. remove stray comma(s) that follow a period (". ,", "%. ,");
      4. strip whitespace before newlines, collapse blank lines, trim the ends.

    Returns:
        str: the cleaned text.
    """
    # Lines are filtered one by one rather than with '\n...\n' regexes: such a
    # pattern consumes the newline an adjacent junk line would need in order to
    # match, so the second of two consecutive divider/page-number lines (and one
    # on the very first line) would otherwise survive.
    divider_line = re.compile(r'\s*\.{10,}\s*')
    page_number_line = re.compile(r'\d+')
    cleaned = "\n".join(
        line
        for line in raw_text.split("\n")
        if not (divider_line.fullmatch(line) or page_number_line.fullmatch(line))
    )

    # Remove "On page <number>" and "Starting on page <number>" references
    cleaned = re.sub(r'On page \d+', '', cleaned)
    cleaned = re.sub(r'Starting on page \d+', '', cleaned)

    # Fix stray punctuation: one pattern covers ". ,", "%. ," and repeated commas.
    # NOTE(review): this also drops the comma in legitimate text such as "Inc.," —
    # confirm that is acceptable for downstream use.
    cleaned = re.sub(r'\.(?:\s*,)+', '.', cleaned)

    # Normalize whitespace and newlines
    cleaned = re.sub(r'\s+\n', '\n', cleaned)
    return re.sub(r'\n+', '\n', cleaned).strip()


transcript_text = _extract_pdf_text(pdf_file_path)

# Optional: Print a snippet of the extracted text for a quick review
print("Extracted text preview:")
print(transcript_text[:1000])

# -------------------------------
# 2. Clean the extracted text
# -------------------------------
transcript_clean = _clean_transcript_text(transcript_text)

# Remove the disclaimer at the bottom (assumes the disclaimer starts with "Disclaimer").
# Everything from the first occurrence of "Disclaimer" onwards is discarded; when the
# word is absent, partition() returns the whole text, so it is left unchanged.
transcript_clean = transcript_clean.partition("Disclaimer")[0].strip()

# Optional: Print a snippet of the final cleaned text for review
print("\nFinal Cleaned text preview:")
print(transcript_clean[:2000])

# -------------------------------
# 3. Split the transcript into sections
# -------------------------------
def split_sections(transcript):
    """
    Splits the transcript into Management Discussion and Question & Answer sections.

    The Q&A section is assumed to be introduced by a marker like
    "QUESTION AND ANSWER" (matched case-insensitively). A marker that begins a
    line is treated as the section heading and takes precedence, so an earlier
    mention of "question and answer" inside a sentence of the prepared remarks
    does not cut the Management Discussion short. If no line starts with the
    marker, the first occurrence anywhere in the text is used.

    Args:
        transcript (str): The cleaned transcript text.

    Returns:
        tuple: (management_discussion, qa_section). Both parts are stripped of
        surrounding whitespace. When no marker is found, the transcript is
        returned unchanged as management_discussion and qa_section is "".
    """
    # Preferred: the marker at the start of a line (a section heading).
    qa_marker = re.search(r'(?im)^[ \t]*QUESTION\s+AND\s+ANSWER', transcript)
    if qa_marker is None:
        # Fallback: first occurrence anywhere in the text.
        qa_marker = re.search(r'(?i)(QUESTION\s+AND\s+ANSWER)', transcript)

    if qa_marker:
        management_discussion = transcript[:qa_marker.start()].strip()
        qa_section = transcript[qa_marker.start():].strip()
    else:
        management_discussion = transcript
        qa_section = ""
    return management_discussion, qa_section

# Separate the prepared remarks from the Q&A portion of the cleaned transcript
management_discussion, qa_section = split_sections(transcript_clean)

# Show the first 1,000 characters of each part as a quick sanity check
print("\nManagement Discussion Section Preview:", management_discussion[:1000], sep="\n")
print("\nQuestion & Answer Section Preview:", qa_section[:1000], sep="\n")

# -------------------------------
# 4. Parse the Q&A Section (handling extra markers and job title lines)
# -------------------------------
def parse_qa_section(qa_text):
    """
    Turn the Q&A part of a transcript into a list of speaker turns.

    A line of the form "<name> Q" or "<name> A" (e.g., "John McDonald Q") opens a
    new turn. The line immediately after such a header is taken to be the
    speaker's job title when it contains a comma and has fewer than 10 words; it
    is then stored separately and not added to the utterance. Every other
    non-blank line is appended, space-separated, to the utterance of the turn in
    progress. Lines seen before the first header are collected under the speaker
    'Unknown' with no marker.

    Returns:
        list of dict: One dictionary per turn with the keys 'speaker', 'marker',
        'job_title', and 'utterance'.
    """
    header_pattern = re.compile(r'^(?P<speaker>.+?)\s+(?P<marker>[QA])$')
    stripped_lines = [raw.strip() for raw in qa_text.split('\n')]
    line_count = len(stripped_lines)

    turns = []
    active = None          # the turn currently being filled
    skip_next = False      # set when the following line was consumed as a job title

    for position, text in enumerate(stripped_lines):
        if skip_next:
            skip_next = False
            continue
        if not text:
            continue

        header = header_pattern.match(text)
        if header is None:
            # Ordinary content line: extend the turn in progress (or start 'Unknown').
            if active is None:
                active = {'speaker': 'Unknown', 'marker': None, 'job_title': None, 'utterance': text}
            elif active['utterance']:
                active['utterance'] = active['utterance'] + " " + text
            else:
                active['utterance'] = text
            continue

        # Speaker header: close the previous turn and open a new, empty one.
        if active is not None:
            turns.append(active)
        active = {
            'speaker': header.group('speaker').strip(),
            'marker': header.group('marker'),
            'job_title': None,
            'utterance': "",
        }

        # Peek at the following line for a job title (short and containing a comma).
        following = stripped_lines[position + 1] if position + 1 < line_count else ""
        if ',' in following and len(following.split()) < 10:
            active['job_title'] = following
            skip_next = True

    if active is not None:
        turns.append(active)
    return turns

# Run the parser over the Q&A text; each list element is one speaker turn
qa_entries = parse_qa_section(qa_section)

# Tabulate the turns (one row per entry) and print the first ten rows for inspection
df_qa = pd.DataFrame(qa_entries)
print("\nParsed Q&A Section Preview:", df_qa.head(10), sep="\n")


Extracted text preview:
NOVEMBER 2024
4Q24 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
January 15, 2025
MANAGEMENT DISCUSSION SECTION
........................................................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorganChase's Fourth Quarter 2024 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on JPMorganChase's
website. Please refer to the disclaimer in the back concerning forward-looking statements. Please stand by.
At this time, I would like to turn the call over to JPMorganChase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy Barnum.
Mr. Barnum, please go ahead.
.........................