In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

Dependencies:
    - pdfplumber
    - re
    - requests (for downloading the transcript PDFs)
    - pandas
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""

In [None]:
import os
import requests
import pdfplumber
import re
import pandas as pd

In [None]:
# Mount Google Drive and confirm the shared project data folder is reachable.
#
# `drive` comes from google.colab; the notebook previously called drive.mount()
# without importing it, which raises NameError on a fresh kernel. The import
# sits beside its only visible use because it is only available inside Colab.
from google.colab import drive

# force_remount=True re-mounts even if Drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

# Assuming 'BOE' folder is in 'MyDrive' and already shared
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: list the folder contents so a wrong path or a missing share
# fails here rather than in a later cell.
print(os.listdir(BOE_path))

Mounted at /content/drive
['cleansed', 'raw', 'model']


In [None]:
# Notebook 1: Import Transcripts and Append Data

# -------------------------------
# 1. Source URL root and Google Drive folders
# -------------------------------
# Root under which the per-year / per-quarter transcript PDFs are addressed.
base_url = "https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/investor-relations/documents/quarterly-earnings"

# raw_dir receives the downloaded PDFs; processed_dir is only created here.
# NOTE(review): these paths spell the Drive root "My Drive" (with a space)
# while BOE_path uses "MyDrive" — confirm both resolve to the same folder.
raw_dir = "/content/drive/My Drive/BOE/bank_of_england/data/raw/"
processed_dir = "/content/drive/My Drive/BOE/bank_of_england/data/processed/"

# Create both folders up front; exist_ok=True keeps a re-run of this cell harmless.
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# -------------------------------
# 2. Download the earnings-call transcript PDF for every quarter of each listed year
# -------------------------------
# Years to fetch, and for each quarter the URL directory segment and the
# filename prefix used to build the download URL.
years = [2022, 2023]  # adjust as needed
quarters = {
    1: {"dir": "1st-quarter", "prefix": "1q"},
    2: {"dir": "2nd-quarter", "prefix": "2q"},
    3: {"dir": "3rd-quarter", "prefix": "3q"},
    4: {"dir": "4th-quarter", "prefix": "4q"}
}

# Seconds allowed for connecting and for each read. Without a timeout,
# requests waits indefinitely and one stalled connection would hang the cell.
REQUEST_TIMEOUT = 30

for year in years:
    year_str = str(year)
    year_suffix = year_str[-2:]  # e.g., "23" for 2023
    # Iterate the mapping itself so adding or removing a quarter needs no
    # matching change to a hard-coded range.
    for q, quarter in quarters.items():
        quarter_dir = quarter["dir"]
        file_prefix = quarter["prefix"]
        file_name = f"{file_prefix}{year_suffix}-earnings-transcript.pdf"
        url = f"{base_url}/{year}/{quarter_dir}/{file_name}"
        print("Downloading:", url)
        try:
            r = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Network-level failure (DNS, refused connection, timeout, ...):
            # report it and move on, matching the best-effort handling of
            # non-200 responses below, instead of aborting the whole run.
            print(f"Failed to download {url} (error: {exc})")
            continue
        if r.status_code == 200:
            file_path = os.path.join(raw_dir, file_name)
            # Write the bytes as received; an existing file is overwritten.
            with open(file_path, "wb") as f:
                f.write(r.content)
            print("Saved to:", file_path)
        else:
            print(f"Failed to download {url} (status code: {r.status_code})")