<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/import/sk_import_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and saving these files to a raw directory.

Dependencies:
    - pdfplumber
    - requests
    - pandas
    - re
    - google.colab (for mounting Google Drive)
    - os
    - concurrent.futures

===================================================
"""



In [6]:
!pip install pdfplumber  # Install pdfplumber library

Collecting pdfplumber
  Using cached pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Using cached pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Using cached pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
Using cached pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.5 pypdfium2-4.30.1


In [7]:
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlsplit

import pandas as pd
import pdfplumber
import requests
from google.colab import drive

In [8]:
# -------------------------------
# 1. Mount Google Drive and locate the shared project data folder
# -------------------------------
# Mount Drive at /content/drive; force_remount=True requests a fresh mount
# even when a mount from an earlier run is still present.
drive.mount('/content/drive', force_remount=True)

# Root of the project data on Drive.
# Assumes the 'BOE' folder sits directly in 'MyDrive' and has already been shared with the user.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: list the entries of the data folder.
# os.listdir raises if the path is not reachable, which surfaces a bad mount early.
print(os.listdir(BOE_path))

Mounted at /content/drive
['raw', 'processed', 'preprocessed_data']


In [9]:
# Notebook 1: Import Transcripts — download configuration (target bank, output folder, source URLs)

# -------------------------------
# 2. Define local paths on your Google Drive
# -------------------------------
bank = 'jpmorgan'  # name of the per-bank subfolder created under raw/
raw_dir = f"{BOE_path}/raw/{bank}"  # BOE_path must already be defined by an earlier cell

# Create the target directory (including missing parents); exist_ok=True makes re-runs harmless
os.makedirs(raw_dir, exist_ok=True)

# -------------------------------
# 3. Define the list of transcript URLs you are interested in
# -------------------------------
# JPMorgan Chase quarterly earnings-call transcripts, 1Q23 through 4Q24 (one PDF per quarter).
# The last path segment of each URL is what the download step uses as the local file name.
transcript_urls = [
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/1st-quarter/1q23-earnings-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/2nd-quarter/2q23-earnings-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/3rd-quarter/jpm-3q23-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/4th-quarter/jpm-4q23-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/1st-quarter/jpm-1q24-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/2nd-quarter/jpm-2q24-earnings-call-transcript-final.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/3rd-quarter/jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/4th-quarter/4q24-earnings-transcript.pdf"
]

# transcript_urls = [
#     # 2023 Transcripts (from the archive)
#     # Q1 2023 (placeholder URL – please verify)
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2023/1q23/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_2038370922.1996821412.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjMvMXEyMy8xcTIzLWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY=/1q23-earnings-call-remarks.pdf",
#     # Q2 2023 (placeholder URL – please verify)
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2023/2q23/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_2038370922_copy.1634234040.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjMvMnEyMy8ycTIzLWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY=/2q23-earnings-call-remarks.pdf",
#     # Q3 2023 (actual URL as found on UBS website)
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2023/3q23/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_1665858674.1136236242.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjMvM3EyMy8zcTIzLWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY%3D/3q23-earnings-call-remarks.pdf",
#     # Q4 2023 (actual URL as found on UBS website)
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2023/4q23/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_984441358_copy_.1148964796.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjMvNHEyMy80cTIzLWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY%3D/4q23-earnings-call-remarks.pdf",

#     # 2024 Transcripts (current reporting page)
#     # Q1 2024
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2024/1q24/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_1853274911_copy.1827040041.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjQvMXEyNC8xcTI0LWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY=/1q24-earnings-call-remarks.pdf",
#     # Q2 2024 (placeholder URL – please verify)
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2024/2q24/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_1458805504.1014368069.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjQvMnEyNC8ycTI0LWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY=/2q24-earnings-call-remarks.pdf",
#     # Q3 2024
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2024/3q24/_jcr_content/mainpar/toplevelgrid_copy_co/col1/linklistreimagined_c/link_1458805504_copy.1529925360.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjQvM3EyNC8zcTI0LWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY%3D/3q24-earnings-call-remarks.pdf",
#     # Q4 2024
#     "https://www.ubs.com/global/en/investor-relations/financial-information/quarterly-reporting/qr-shared/2024/4q24/_jcr_content/mainpar/toplevelgrid_copy/col1/linklistreimagined_c/link_1665858674_copy.0922295456.file/PS9jb250ZW50L2RhbS9hc3NldHMvY2MvaW52ZXN0b3ItcmVsYXRpb25zL3F1YXJ0ZXJsaWVzLzIwMjQvNHEyNC80cTI0LWVhcm5pbmdzLWNhbGwtcmVtYXJrcy5wZGY%3D/4q24-earnings-call-remarks.pdf"
# ]

# -------------------------------
# 4. Define a function to download a PDF
# -------------------------------
def download_pdf(url, raw_dir):
    """Download a PDF from ``url`` into ``raw_dir`` unless it is already there.

    The local file name is the last segment of the URL *path*; any query
    string or fragment is ignored so it cannot leak into the file name.

    Args:
        url: HTTP(S) address of the PDF.
        raw_dir: Existing directory that receives the file.

    Returns:
        A tuple ``(url, status, file_path)``. ``status`` is ``"exists"`` when
        the file was already present, ``"downloaded"`` on success, or a
        message starting with ``"Failed"`` / ``"Error"`` otherwise.
        ``file_path`` is ``None`` whenever nothing was saved.

    Raises:
        Nothing: every failure is reported through the returned status so a
        single bad URL cannot abort a batch of downloads.
    """
    # Use only the path component: "a.pdf?download=1" must map to "a.pdf".
    file_name = urlsplit(url).path.rsplit("/", 1)[-1]
    if file_name in ("", ".", ".."):
        # Without this guard file_path would be a directory, which always
        # "exists", and the URL would be silently reported as downloaded.
        return (url, "Error: URL does not contain a file name", None)
    file_path = os.path.join(raw_dir, file_name)

    # Skip download if file already exists
    if os.path.exists(file_path):
        return (url, "exists", file_path)

    # Write to a side file first so an interrupted write never leaves a
    # truncated PDF under the final name (it would be skipped as "exists").
    part_path = file_path + ".part"
    try:
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            return (url, f"Failed (status code: {response.status_code})", None)
        with open(part_path, "wb") as f:
            f.write(response.content)
        os.replace(part_path, file_path)
        return (url, "downloaded", file_path)
    except Exception as e:
        # Deliberately broad: the caller collects statuses instead of handling exceptions.
        return (url, f"Error: {str(e)}", None)
    finally:
        # Remove a leftover partial file; after a successful rename there is
        # nothing to remove, which is why a missing file is not an error here.
        try:
            os.remove(part_path)
        except OSError:
            pass

# -------------------------------
# 5. Fetch every transcript in parallel and report each outcome
# -------------------------------
results = []
with ThreadPoolExecutor(max_workers=4) as pool:
    # One task per URL; the pool runs at most four of them at a time.
    pending = [pool.submit(download_pdf, link, raw_dir) for link in transcript_urls]
    # as_completed yields tasks in finish order, so `results` follows
    # completion order rather than the order of transcript_urls.
    for finished in as_completed(pending):
        outcome = finished.result()
        print(outcome)
        results.append(outcome)

# Repeat all outcomes together once every download has finished.
print("\nSummary of Downloads:")
for outcome in results:
    print(outcome)


('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/2nd-quarter/2q23-earnings-transcript.pdf', 'downloaded', '/content/drive/MyDrive/BOE/bank_of_england/data/raw/jpmorgan/2q23-earnings-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/3rd-quarter/jpm-3q23-earnings-call-transcript.pdf', 'downloaded', '/content/drive/MyDrive/BOE/bank_of_england/data/raw/jpmorgan/jpm-3q23-earnings-call-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/4th-quarter/jpm-4q23-earnings-call-transcript.pdf', 'downloaded', '/content/drive/MyDrive/BOE/bank_of_england/data/raw/jpmorgan/jpm-4q23-earnings-call-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/1st-quarter/1q23-e