<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/cleaning/sk_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

Dependencies:
    - pdfplumber
    - re
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""

Modules

In [4]:
!pip install pdfplumber  # Install pdfplumber library

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Step 1: Import required libraries
import pdfplumber
import re
import os
from google.colab import drive

In [6]:
# Attach Google Drive under /content/drive; force_remount=True asks for a fresh
# mount even if the drive is already attached in this session.
drive.mount('/content/drive', force_remount=True)

# Project data directory: expects the shared 'BOE' folder to sit inside 'MyDrive'.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: list the top-level entries of the data directory so that a
# missing or mis-shared folder is caught here rather than in a later cell.
data_dir_entries = os.listdir(BOE_path)
print(data_dir_entries)

Mounted at /content/drive
['cleansed', 'raw', 'model']


In [7]:
# Step 3: Define the path to the PDF file.
# Built from BOE_path (set in the Drive-mount cell) so that every cell agrees
# on a single spelling of the Drive root instead of hard-coding a second one.
pdf_file_path = os.path.join(BOE_path, 'raw', '4q24-earnings-transcript.pdf')

# Step 4: Read and extract text from the PDF.
# Collect the text of each page first, then join once; pages that yield no
# text are skipped, and every kept page is terminated with a newline.
with pdfplumber.open(pdf_file_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]
transcript_text = "".join(text + "\n" for text in page_texts if text)

# Optional: Print a snippet of the extracted text for a quick review
print("Extracted text preview:")
print(transcript_text[:1000])

# Step 5: Clean the extracted text (remove extraneous lines and artifacts).
# The patterns are anchored per line with re.MULTILINE rather than matching
# the surrounding '\n' characters. Consuming the newline on both sides means
# that, of two consecutive matching lines, only the first is removed (the
# second has lost its leading '\n'), and a matching line at the very start of
# the text is never removed. '[^\S\n]' is "any whitespace except newline".

# Remove lines that are just long sequences of dots (section separators)
transcript_clean = re.sub(
    r'^[^\S\n]*\.{10,}[^\S\n]*$\n?', '', transcript_text, flags=re.MULTILINE
)
# Remove page numbers that appear on their own lines (digits only, optionally
# padded with spaces/tabs)
transcript_clean = re.sub(
    r'^[^\S\n]*\d+[^\S\n]*$\n?', '', transcript_clean, flags=re.MULTILINE
)
# Normalize whitespace and newlines: strip trailing whitespace on each line,
# then collapse runs of newlines into one and trim both ends of the text.
transcript_clean = re.sub(r'\s+\n', '\n', transcript_clean)
transcript_clean = re.sub(r'\n+', '\n', transcript_clean).strip()

# Optional: Print a snippet of the cleaned transcript
print("\nCleaned text preview:")
print(transcript_clean[:1000])


Extracted text preview:
NOVEMBER 2024
4Q24 FINANCIAL RESULTS
EARNINGS CALL TRANSCRIPT
January 15, 2025
MANAGEMENT DISCUSSION SECTION
........................................................................................................................................................................................................................................................................................
Operator: Good morning, ladies and gentlemen. Welcome to JPMorganChase's Fourth Quarter 2024 Earnings Call. This call is being recorded.
Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on JPMorganChase's
website. Please refer to the disclaimer in the back concerning forward-looking statements. Please stand by.
At this time, I would like to turn the call over to JPMorganChase's Chairman and CEO, Jamie Dimon, and Chief Financial Officer, Jeremy Barnum.
Mr. Barnum, please go ahead.
.........................