<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/cleaning/sk_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
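    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.
    The cleaned transcript is then split into its Management Discussion and Question & Answer sections,
    the Q&A section is parsed into one row per speaker turn, and both results are saved as CSV files.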

Dependencies:
    - pdfplumber
    - re
    - pandas (for assembling the parsed sections into DataFrames and writing CSV files)
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""

Modules

In [4]:
!pip install pdfplumber  # Install pdfplumber library

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
# Step 1: Import required libraries
import pdfplumber
import re
import os
from google.colab import drive

In [24]:
# Attach Google Drive at /content/drive; force_remount=True re-mounts even if it is already attached.
drive.mount('/content/drive', force_remount=True)

# Root of the project's data folder (the 'BOE' folder is assumed to live in 'MyDrive' and to be shared).
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: show what the data folder contains so a wrong path is obvious immediately.
boe_contents = os.listdir(BOE_path)
print(boe_contents)

Mounted at /content/drive
['cleansed', 'raw', 'model']


In [25]:
import re
import pdfplumber
import pandas as pd

# -------------------------------
# 1. Define the path to your PDF file
# -------------------------------
pdf_file_path = '/content/drive/My Drive/BOE/bank_of_england/data/raw/4q24-earnings-transcript.pdf'

# -------------------------------
# 2. Extract text from the PDF
# -------------------------------
# Read the text of every page; pages that yield no text are dropped and each
# remaining page is terminated with a newline before concatenation.
with pdfplumber.open(pdf_file_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]
transcript_text = "".join(text + "\n" for text in page_texts if text)

print("Extracted text preview:")
print(transcript_text[:1000])

# -------------------------------
# 3. Clean the extracted text
# -------------------------------
# Ordered regex substitutions; the order matters because later rules see the
# output of earlier ones.
leading_substitutions = (
    (r'\n\s*\.{10,}\s*\n', '\n'),     # lines made of 10+ dots (separator rows)
    (r'\n\d+\n', '\n'),               # lines holding only digits (presumably page numbers)
    (r'On page \d+', ''),             # "On page N" references
    (r'Starting on page \d+', ''),    # "Starting on page N" references
    (r'\.\s*,', '.'),                 # a period followed by a comma collapses to a period
)
transcript_clean = transcript_text
for pattern, replacement in leading_substitutions:
    transcript_clean = re.sub(pattern, replacement, transcript_clean)

# Literal clean-up of any '%. ,' left behind (re.sub does not rescan its own output).
transcript_clean = transcript_clean.replace('%. ,', '%.')

# Trim trailing whitespace before newlines, then squeeze runs of newlines into one.
transcript_clean = re.sub(r'\n+', '\n', re.sub(r'\s+\n', '\n', transcript_clean)).strip()

# Keep only the text before the first (case-sensitive) "Disclaimer", if present.
before_disclaimer, disclaimer_found, _ = transcript_clean.partition("Disclaimer")
if disclaimer_found:
    transcript_clean = before_disclaimer.strip()

print("\nFinal Cleaned text preview:")
print(transcript_clean[:2000])

# -------------------------------
# 3a. Extract Financial Quarter and Call Date
# -------------------------------
# First token shaped like "4Q24": one digit, the letter Q, two digits.
quarter_match = re.search(r'(\dQ\d{2})', transcript_clean)
financial_quarter = None if quarter_match is None else quarter_match.group(1)

# First date written as "Month Day, Year" (e.g., "January 15, 2025").
date_match = re.search(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})', transcript_clean)
call_date = None if date_match is None else date_match.group(1)

print("\nExtracted Financial Quarter:", financial_quarter)
print("Extracted Call Date:", call_date)

# -------------------------------
# 4. Split the transcript into sections
# -------------------------------
def split_sections(transcript):
    """
    Cut a transcript in two at the first "QUESTION AND ANSWER" marker.

    The marker is matched case-insensitively and may have any whitespace
    (including line breaks) between its words. Everything before the marker is
    the Management Discussion; the marker and everything after it is the Q&A.

    Returns:
        tuple: (management_discussion, qa_section). Both parts are stripped when
        the marker is found. When it is not found, the transcript is returned
        unchanged as the first element and the second element is "".
    """
    marker = re.compile(r'(?i)(QUESTION\s+AND\s+ANSWER)').search(transcript)
    if marker is None:
        # No Q&A marker: the whole transcript counts as management discussion.
        return transcript, ""

    cut = marker.start()
    return transcript[:cut].strip(), transcript[cut:].strip()

management_discussion, qa_section = split_sections(transcript_clean)

# Show the first 1000 characters of each section as a quick visual check of the split.
for heading, section_text in (
    ("\nManagement Discussion Section Preview:", management_discussion),
    ("\nQuestion & Answer Section Preview:", qa_section),
):
    print(heading)
    print(section_text[:1000])

# Drop a leading "QUESTION AND ANSWER SECTION" header (case-insensitive) from the Q&A text, if present.
qa_header_pattern = r'(?i)^QUESTION\s+AND\s+ANSWER\s+SECTION\s*'
qa_section = re.sub(qa_header_pattern, '', qa_section, count=1).strip()

# -------------------------------
# 5. Parse the Q&A Section (capturing job roles in their own field)
# -------------------------------
# Header line in "Name Marker" form, e.g. "John McDonald Q".
_NAME_MARKER_HEADER = re.compile(r'^(?P<speaker>.+?)\s+(?P<marker>[QA])$')
# Header line in "Speaker: Utterance" form, e.g. "Operator: Thank you...".
_COLON_HEADER = re.compile(r'^(?P<speaker>[^:]+):\s*(?P<utterance>.*)$')
# Lowercase particles that may appear inside a person's name after the first word.
_NAME_PARTICLES = frozenset(
    {'al', 'bin', 'da', 'de', 'del', 'della', 'den', 'der', 'di', 'la', 'le', 'van', 'von'}
)


def _looks_like_speaker_label(label, max_words):
    """Return True when `label` reads like a speaker name rather than sentence text."""
    words = label.split()
    if not words or len(words) > max_words:
        return False
    if not words[0][0].isupper():
        return False
    return all(word[0].isupper() or word in _NAME_PARTICLES for word in words[1:])


def parse_qa_section(qa_text, job_role_word_threshold=10, max_speaker_words=6):
    """
    Parses the Q&A section of the transcript, capturing speaker names, extra markers (e.g., "Q" or "A"),
    and separating out a job title (if present) into its own field.

    Expected header formats:
      - "Name Marker" format (e.g., "John McDonald Q")
      - "Speaker: Utterance" format (e.g., "Operator: Thank you...")

    A line is only accepted as a header when the captured speaker label looks like a name:
    at most `max_speaker_words` words, each starting with an uppercase letter (a few lowercase
    name particles such as "van" or "de" are allowed after the first word). This is a heuristic:
    it stops wrapped utterance lines such as "Here is my question: what about NII?" or
    "...moved everything into bucket A" from being mistaken for a new speaker, but a short
    capitalised phrase followed by a colon (e.g. "Note: ...") will still be read as a header.

    For lines that follow a "Name Marker" header, if the first line is short (fewer than
    job_role_word_threshold words) and contains a comma, it is assumed to be the job role and
    stored in the 'job_title' field. All other lines are appended to the current utterance,
    joined by single spaces. Text that appears before any header is attributed to 'Unknown'.

    Args:
        qa_text (str): The Q&A text, one header/utterance fragment per line.
        job_role_word_threshold (int): Exclusive upper bound on the word count of a job-title line.
        max_speaker_words (int): Maximum number of words allowed in a speaker label.

    Returns:
        list of dict: Each dict contains 'speaker', 'marker', 'job_title', and 'utterance'.
        An empty or missing `qa_text` yields an empty list.
    """
    if not qa_text:
        return []

    entries = []
    current_entry = None

    for raw_line in qa_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # "Name Marker" header, e.g. "John McDonald Q".
        name_marker = _NAME_MARKER_HEADER.match(line)
        if name_marker and _looks_like_speaker_label(
            name_marker.group('speaker').strip(), max_speaker_words
        ):
            if current_entry is not None:
                entries.append(current_entry)
            current_entry = {
                'speaker': name_marker.group('speaker').strip(),
                'marker': name_marker.group('marker'),
                'job_title': "",
                'utterance': ""
            }
            continue

        # "Speaker: Utterance" header, e.g. "Operator: Thank you...".
        colon_header = _COLON_HEADER.match(line)
        if colon_header and _looks_like_speaker_label(
            colon_header.group('speaker').strip(), max_speaker_words
        ):
            if current_entry is not None:
                entries.append(current_entry)
            current_entry = {
                'speaker': colon_header.group('speaker').strip(),
                'marker': None,
                'job_title': "",
                'utterance': colon_header.group('utterance').strip()
            }
            continue

        # Continuation text that arrives before any header has been seen.
        if current_entry is None:
            current_entry = {'speaker': 'Unknown', 'marker': None, 'job_title': "", 'utterance': line}
            continue

        # The first line after a header may be the speaker's job title.
        if (
            not current_entry['job_title']
            and not current_entry['utterance']
            and len(line.split()) < job_role_word_threshold
            and ',' in line
        ):
            current_entry['job_title'] = line
            continue

        # Otherwise the line extends the current utterance.
        if current_entry['utterance']:
            current_entry['utterance'] += " " + line
        else:
            current_entry['utterance'] = line

    if current_entry is not None:
        entries.append(current_entry)
    return entries

# Fixed column order for the parsed Q&A rows. Passing it explicitly keeps the
# schema stable even when no entries were parsed (otherwise the frame, and the
# CSV written from it, would have no speaker/marker/job_title/utterance columns).
qa_columns = ['speaker', 'marker', 'job_title', 'utterance']

qa_entries = parse_qa_section(qa_section)
# Add the extracted financial quarter and call date as new columns (same value on every row).
df_qa = pd.DataFrame(qa_entries, columns=qa_columns).assign(
    financial_quarter=financial_quarter,
    call_date=call_date,
)

print("\nParsed Q&A Section Preview:")
print(df_qa.head(10))


Extracted text preview:
NOVEMBER 2024
4Q24 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
January 15, 2025
MANAGEMENT DISCUSSION SECTION
........................................................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorganChase's Fourth Quarter 2024 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on JPMorganChase's
website. Please refer to the disclaimer in the back concerning forward-looking statements. Please stand by.
At this time, I would like to turn the call over to JPMorganChase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy Barnum.
Mr. Barnum, please go ahead.
.........................

In [26]:
# -------------------------------
# 6. Save the DataFrames as CSV Files
# -------------------------------

# Output locations in the 'cleansed' data folder.
qa_csv_path = '/content/drive/My Drive/BOE/bank_of_england/data/cleansed/qa_section.csv'
md_csv_path = '/content/drive/My Drive/BOE/bank_of_england/data/cleansed/management_discussion.csv'

# Q&A rows: one CSV line per parsed speaker turn, without the DataFrame index.
df_qa.to_csv(qa_csv_path, index=False)
print("\nQ&A DataFrame saved to:", qa_csv_path)

# The Management Discussion section is not parsed into turns; it is stored as a
# single-row frame holding the full text plus the quarter and call date.
md_record = {
    'management_discussion': [management_discussion],
    'financial_quarter': [financial_quarter],
    'call_date': [call_date],
}
df_md = pd.DataFrame(md_record)

df_md.to_csv(md_csv_path, index=False)
print("Management Discussion DataFrame saved to:", md_csv_path)



Q&A DataFrame saved to: /content/drive/My Drive/BOE/bank_of_england/data/cleansed/qa_section.csv
Management Discussion DataFrame saved to: /content/drive/My Drive/BOE/bank_of_england/data/cleansed/management_discussion.csv
