<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/blob/main/notebooks/import/sk_import_jpmorgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

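Dependencies:
    - pdfplumber
    - re
    - requests (for downloading the transcript PDFs)
    - pandas
    - concurrent.futures (for downloading in parallel)
    - google.colab (for mounting Google Drive)
    - os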

===================================================
"""



In [14]:
!pip install pdfplumber  # Install pdfplumber library



In [15]:
import os
import requests
import pdfplumber
import re
import pandas as pd
from google.colab import drive

In [16]:
# Attach Google Drive under /content/drive; force_remount=True asks Colab to
# remount even when the drive is already attached.
drive.mount('/content/drive', force_remount=True)

# Root of the shared project data (expects the 'BOE' folder to be present in 'MyDrive')
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: show the entries of the data folder so a missing or
# mis-shared folder is noticed immediately.
print(os.listdir(path=BOE_path))

Mounted at /content/drive
['raw', 'model', 'cleansed', 'processed']


In [24]:
# Notebook 1: Import Transcripts and Append Data (Improved Download Code)

# -------------------------------
# 1. Mount Google Drive and import necessary libraries
# -------------------------------
# NOTE: `drive`, `os` and `requests` are also imported in an earlier cell; they are
# repeated here so this cell can be run on its own.
from google.colab import drive
# Without force_remount this call only reports "already mounted" when the drive
# was mounted by an earlier cell.
drive.mount('/content/drive')

import os
import requests
# Thread pool helpers used further down to download several PDFs in parallel.
from concurrent.futures import ThreadPoolExecutor, as_completed

# -------------------------------
# 2. Define local paths on your Google Drive
# -------------------------------
# raw_dir receives the downloaded transcript PDFs; processed_dir is created
# alongside it (presumably for later pipeline stages - not used in this cell).
raw_dir = "/content/drive/My Drive/BOE/bank_of_england/data/raw/"
processed_dir = "/content/drive/My Drive/BOE/bank_of_england/data/processed/"

# Make sure both folders are present; existing folders are left untouched
for _required_dir in (raw_dir, processed_dir):
    os.makedirs(_required_dir, exist_ok=True)

# -------------------------------
# 3. Define the list of transcript URLs you are interested in
# -------------------------------
# JPMorgan Chase quarterly earnings-call transcripts, 1Q23 through 4Q24 (eight quarters).
# The file names are not uniform from quarter to quarter, so every URL is listed
# explicitly instead of being generated from a pattern.
# The last path segment of each URL becomes the local file name.
transcript_urls = [
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/1st-quarter/1q23-earnings-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/2nd-quarter/2q23-earnings-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/3rd-quarter/jpm-3q23-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/4th-quarter/jpm-4q23-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/1st-quarter/jpm-1q24-earnings-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/2nd-quarter/jpm-2q24-earnings-call-transcript-final.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/3rd-quarter/jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf",
    "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/4th-quarter/4q24-earnings-transcript.pdf"
]

# -------------------------------
# 4. Define a function to download a PDF
# -------------------------------
def download_pdf(url, raw_dir):
    """Download one PDF into ``raw_dir`` unless a usable copy is already there.

    The function never raises: every outcome is reported through the returned
    tuple, so it can be submitted to worker threads without extra handling.

    Args:
        url: Address of the PDF. The local file name is the last segment of
            the URL *path*; a query string or fragment is ignored.
        raw_dir: Directory in which the file is stored.

    Returns:
        A tuple ``(url, status, file_path)`` where ``status`` is one of:

        * ``"exists"``      - a non-empty file is already present (no request made);
        * ``"downloaded"``  - the file was fetched and saved;
        * ``"Failed (...)"`` - the server answered, but not with a usable PDF;
        * ``"Error: ..."``  - the URL has no file name or an exception occurred.

        ``file_path`` is the local path for the first two statuses and
        ``None`` for the failure statuses.
    """
    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import urlparse

    # Use only the path component so "?download=1" or "#page=2" never ends up
    # in the file name.
    file_name = urlparse(url).path.rsplit("/", 1)[-1]
    if not file_name:
        # Without this guard file_path would equal raw_dir itself, which
        # exists, and the directory would be reported as an existing download.
        return (url, "Error: URL does not contain a file name", None)
    file_path = os.path.join(raw_dir, file_name)

    # Skip the download only for a real, non-empty file; an empty file is the
    # left-over of an interrupted run and must be fetched again.
    if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
        return (url, "exists", file_path)

    # Write to a temporary name first so a crash mid-write can never leave a
    # truncated file that later runs would mistake for a finished download.
    partial_path = file_path + ".part"
    try:
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            return (url, f"Failed (status code: {response.status_code})", None)

        # A 200 response can still be an HTML error/consent page; PDF files
        # carry the "%PDF" marker at (or very near) the start of the file.
        if b"%PDF" not in response.content[:1024]:
            return (url, "Failed (response is not a PDF)", None)

        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, file_path)  # atomic within one directory
        return (url, "downloaded", file_path)
    except Exception as e:
        # Deliberately broad: a worker must report its failure, not raise.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        return (url, f"Error: {str(e)}", None)

# -------------------------------
# 5. Download all PDFs concurrently and log the results
# -------------------------------
# Up to four downloads run at once; each outcome tuple is echoed as soon as
# its worker finishes, so the order follows completion, not the URL list.
results = []
with ThreadPoolExecutor(max_workers=4) as pool:
    pending = [pool.submit(download_pdf, link, raw_dir) for link in transcript_urls]
    for finished in as_completed(pending):
        outcome = finished.result()
        print(outcome)
        results.append(outcome)

# Recap of every outcome once all workers are done
print("\nSummary of Downloads:")
for outcome in results:
    print(outcome)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/1st-quarter/1q23-earnings-transcript.pdf', 'exists', '/content/drive/My Drive/BOE/bank_of_england/data/raw/1q23-earnings-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2023/2nd-quarter/2q23-earnings-transcript.pdf', 'exists', '/content/drive/My Drive/BOE/bank_of_england/data/raw/2q23-earnings-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings/2024/1st-quarter/jpm-1q24-earnings-call-transcript.pdf', 'exists', '/content/drive/My Drive/BOE/bank_of_england/data/raw/jpm-1q24-earnings-call-transcript.pdf')
('https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and