<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/processed/sk_processed_ubs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

Dependencies:
    - pdfplumber
    - re
    - pandas
    - spacy (with the en_core_web_sm English model)
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""



Modules

In [3]:
# !pip install pdfplumber
# Make sure spaCy and the English model are installed:
# !pip install spacy
# !python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import re
import pdfplumber
import pandas as pd
from google.colab import drive
import spacy

In [5]:
# -------------------------------
# 1. Mount Google Drive and define folder paths
# -------------------------------
# Mount the user's Drive at /content/drive. force_remount=True is passed so the
# mount is requested again even when this cell is re-run in the same session.
drive.mount('/content/drive', force_remount=True)

# Assuming 'BOE' folder is in 'MyDrive' and already shared
# (the path below is hard-coded to that layout; adjust it if your Drive differs).
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Now you (and others with access) can work with files in this directory
# For example, you can list the contents:
# (this doubles as a sanity check: os.listdir raises if the folder is missing)
print(os.listdir(BOE_path))

Mounted at /content/drive
['model', 'preprocessed_data', 'processed', 'raw']


In [43]:
# Load the small English spaCy pipeline; the Q&A parser uses its named-entity
# recogniser to confirm that a candidate header line really names a person.
nlp = spacy.load("en_core_web_sm")

# -------------------------------
# 2. Define paths for raw and processed data
# -------------------------------
BOE_path = "/content/drive/MyDrive/BOE/bank_of_england/data"  # adjust as needed
raw_dir, processed_dir = (
    os.path.join(BOE_path, "raw", "ubs"),   # input: UBS earnings-call PDFs
    os.path.join(BOE_path, "processed"),    # output: parsed CSV files
)

# Make sure both folders exist (no error if they are already there).
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# -------------------------------
# 4. Define helper functions for cleaning and splitting the transcript
# -------------------------------
def clean_transcript(text):
    """Clean raw transcript text extracted from a PDF.

    The steps run in this order:
      1. Drop lines consisting only of a long run of dots (10 or more).
      2. Drop lines that contain nothing but a number (page numbers).
      3. Remove the phrases "On page X" and "Starting on page X".
      4. Drop whole lines of the form "Page X of Y" (case-insensitive).
      5. Repair ". ," punctuation artefacts.
      6. Strip trailing whitespace on lines and collapse blank lines.
      7. Discard everything from the first occurrence of "Disclaimer" onwards.

    Args:
        text (str): Raw transcript text.

    Returns:
        str: The cleaned transcript.
    """
    # Remove long sequences of dots (page breaks)
    text = re.sub(r'\n\s*\.{10,}\s*\n', '\n', text)
    # Remove isolated numbers on a line. Each line is matched on its own
    # (multiline ^...$) rather than as "\n<digits>\n": the latter consumes the
    # newline shared by two adjacent number-only lines, so every second one
    # of a consecutive run used to survive. The emptied lines are collapsed by
    # the whitespace normalisation below.
    text = re.sub(r'(?m)^[ \t]*\d+[ \t]*$', '', text)
    # Remove phrases like "On page X" and "Starting on page X"
    text = re.sub(r'On page \d+', '', text)
    text = re.sub(r'Starting on page \d+', '', text)
    # Remove page number lines like "Page 24 of 35" (case-insensitive, whole-line match)
    text = re.sub(r'(?im)^Page\s+\d+\s+of\s+\d+\s*$', '', text)
    # Fix punctuation issues: a full stop followed by a stray comma becomes a full stop
    text = re.sub(r'\.\s*,', '.', text)
    # Catches a second stray comma left behind after the substitution above (e.g. "%. , ,")
    text = text.replace('%. ,', '%.')
    # Normalize whitespace: trim line ends, then collapse runs of newlines
    text = re.sub(r'\s+\n', '\n', text)
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove disclaimer text if present (cuts at the FIRST "Disclaimer" in the text)
    if "Disclaimer" in text:
        text = text.split("Disclaimer")[0].strip()
    return text

def extract_metadata(text):
    """
    Pull the reporting quarter and the call date out of a transcript.

    The quarter is taken from the first phrase shaped like "Fourth quarter 2024"
    (case-insensitive) and normalised to the short form "4Q24". The call date is
    the first line that consists solely of a day-first date such as
    "4 February 2025"; it is returned as written, without parsing.

    Returns:
        tuple: (financial_quarter, call_date); either element is None when the
        corresponding pattern is not found.
    """
    ordinal_to_digit = {"first": "1", "second": "2", "third": "3", "fourth": "4"}

    financial_quarter = None
    quarter_hit = re.search(r'(?i)(First|Second|Third|Fourth)\s+quarter\s+(\d{4})', text)
    if quarter_hit is not None:
        ordinal, year = quarter_hit.groups()
        # Two-digit year suffix, e.g. "2024" -> "24"
        financial_quarter = ordinal_to_digit.get(ordinal.lower(), "") + "Q" + year[-2:]

    # The date must sit alone on its line (surrounding whitespace is tolerated).
    date_hit = re.search(r'^\s*(\d{1,2}\s+[A-Za-z]+\s+\d{4})\s*$', text, re.MULTILINE)
    call_date = None if date_hit is None else date_hit.group(1)

    return financial_quarter, call_date

def split_sections(transcript):
    """
    Cut a transcript into its prepared-remarks part and its Q&A part.

    The boundary is the first occurrence of the heading
    "Analyst Q&A (CEO and CFO)" (matched case-insensitively, flexible spacing).
    The heading itself stays at the start of the Q&A part.

    Returns:
        tuple: (management_announcements, qa_section). Without a heading the
        whole transcript is returned unchanged as the first element and the
        second element is an empty string.
    """
    qa_heading = re.compile(r'(?i)Analyst\s+Q&A\s*\(CEO\s+and\s+CFO\)')
    hit = qa_heading.search(transcript)
    if hit is None:
        return transcript, ""

    boundary = hit.start()
    return transcript[:boundary].strip(), transcript[boundary:].strip()

# ----- Management Section Parsing (working code) -----
def parse_management_section(management_text, speaker_threshold=4):
    """
    Turn the Management Announcements text into a list of speaker turns.

    Each turn is a dict with the keys 'speaker' and 'utterance'.

    Lines are processed one at a time:
      - Blank lines are ignored.
      - Lines containing any boilerplate marker ("slide", "results", "speeches",
        "transcript", "numbers for slides", "available", "www.ubs.com";
        case-insensitive substring match) are ignored.
      - Lines that are just a date such as "4 February 2025" are ignored.
      - A line with at most `speaker_threshold` words that is in title case
        starts a new turn with that line as the speaker.
      - Any other line is appended to the current turn's utterance; text seen
        before the first speaker is attributed to 'Unknown'.
    """
    # "slide" is checked alongside the header keywords: any hit drops the line.
    boilerplate_markers = (
        "slide", "results", "speeches", "transcript",
        "numbers for slides", "available", "www.ubs.com",
    )
    date_only = re.compile(r'^\d{1,2}\s+[A-Za-z]+\s+\d{4}$')

    turns = []
    active = None

    for raw_line in management_text.split('\n'):
        content = raw_line.strip()
        if not content:
            continue

        lowered = content.lower()
        if any(marker in lowered for marker in boilerplate_markers):
            continue
        if date_only.match(content):
            continue

        if len(content.split()) <= speaker_threshold and content.istitle():
            # Short title-case line: close the running turn and open a new one.
            if active is not None:
                turns.append(active)
            active = {'speaker': content, 'utterance': ""}
        elif active is None:
            # Text before any speaker header has been seen.
            active = {'speaker': 'Unknown', 'utterance': content}
        elif active['utterance']:
            active['utterance'] += " " + content
        else:
            active['utterance'] = content

    if active is not None:
        turns.append(active)
    return turns

# ----- Q&A Section Parsing using a Simple Regex Approach (with spaCy check) -----
def _append_continuation(current_entry, line):
    """Add `line` to the running entry's utterance, or start an 'Unknown' entry if there is none."""
    if current_entry is None:
        return {'speaker': 'Unknown', 'job_title': "", 'utterance': line}
    if current_entry['utterance']:
        current_entry['utterance'] += " " + line
    else:
        # First text after a header: no separator, so no leading space.
        current_entry['utterance'] = line
    return current_entry


def _looks_like_speaker(candidate, header_word_threshold):
    """Return True when `candidate` passes the title-case, greeting and spaCy PERSON checks."""
    words = candidate.split()
    if not (2 <= len(words) <= header_word_threshold and candidate.istitle()):
        return False
    # "Hi Todd" / "Hello Sergio" are title case and name a person, but they are
    # the start of a question, not a header. Checked before the model call so
    # spaCy is not run for lines that are rejected anyway.
    if words[0].lower().rstrip(".,") in {"hi", "hello", "hey"}:
        return False
    # `nlp` is the spaCy pipeline loaded at module level.
    return any(ent.label_ == "PERSON" for ent in nlp(candidate).ents)


def parse_qa_section_simple(qa_text, header_word_threshold=6):
    """
    Parses the Q&A section into speaker turns using a regex plus a spaCy check.

    A line starts a new turn only if ALL of the following hold:
      - it does not start with "And", "But" or "Or" (any case, optionally
        followed by a comma) and whitespace;
      - it matches the header pattern: a name starting with a capital letter,
        optionally followed by ',', ':' or '-' and a job title / company;
      - the name part has between 2 and `header_word_threshold` words and is in
        title case;
      - the name part does not begin with a greeting ("Hi", "Hello", "Hey");
      - spaCy finds a PERSON entity in the name part.

    Every other non-empty line is appended to the current turn's utterance
    (text before the first header is attributed to 'Unknown').
    Lines starting with "Slide" (any case) and empty lines are skipped.

    Note: '-' and '.' are also allowed inside the name itself, so a line such
    as "Name - Title" made only of letters is captured whole as the name.

    Args:
        qa_text (str): Q&A section text; a leading "Analyst Q&A (CEO and CFO)"
            heading is removed if present.
        header_word_threshold (int): Maximum number of words in a speaker name.

    Returns:
        list[dict]: One dict per turn with keys 'speaker', 'job_title', 'utterance'.
    """
    # Remove the marker line if it exists
    qa_text = re.sub(r'(?i)^Analyst\s+Q&A\s*\(CEO\s+and\s+CFO\)', '', qa_text).strip()

    # A header regex that looks for a name optionally followed by punctuation and a job title
    header_regex = re.compile(r'^(?P<speaker>[A-Z][A-Za-z\s\.\-]+)(?:[,:\-]\s*(?P<job_title>.+))?$')
    # Lines opening with a conjunction are always continuations.
    conjunction_regex = re.compile(r'^(And|But|Or)[,]?\s', re.IGNORECASE)

    entries = []
    current_entry = None

    for line in qa_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        if line.lower().startswith("slide"):
            continue

        header = None if conjunction_regex.match(line) else header_regex.match(line)
        if header is not None:
            candidate = header.group('speaker').strip()
            if _looks_like_speaker(candidate, header_word_threshold):
                # Close the running turn and open a new one for this speaker.
                if current_entry is not None:
                    entries.append(current_entry)
                job_title = (header.group('job_title') or "").strip()
                current_entry = {'speaker': candidate, 'job_title': job_title, 'utterance': ""}
                continue

        # Anything that is not a confirmed header continues the current turn.
        current_entry = _append_continuation(current_entry, line)

    if current_entry is not None:
        entries.append(current_entry)
    return entries

# -------------------------------
# 6. Process all PDFs in the raw/ubs directory and accumulate results
# -------------------------------
management_entries_all = []
qa_entries_all = []

# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the resulting DataFrames (and CSVs) the same on every run and filesystem.
pdf_filenames = sorted(
    name for name in os.listdir(raw_dir) if name.lower().endswith(".pdf")
)

for filename in pdf_filenames:
    file_path = os.path.join(raw_dir, filename)
    print(f"Processing file: {filename}")

    # Pull the text of every page while the PDF is open.
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages that yield no text (None or "") are skipped; each kept page ends with a newline.
    transcript_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    cleaned_text = clean_transcript(transcript_text)
    financial_quarter, call_date = extract_metadata(cleaned_text)
    management_text, qa_text = split_sections(cleaned_text)

    # Parse the sections
    management_entries = parse_management_section(management_text)
    qa_entries = parse_qa_section_simple(qa_text)

    # Add metadata and filename info to each parsed entry
    metadata = {
        'call_date': call_date,
        'financial_quarter': financial_quarter,
        'source_file': filename,
    }
    for parsed_entries, accumulator in (
        (management_entries, management_entries_all),
        (qa_entries, qa_entries_all),
    ):
        for entry in parsed_entries:
            entry.update(metadata)
            accumulator.append(entry)

# Convert the accumulated lists to DataFrames (one row per speaker turn)
df_management = pd.DataFrame(management_entries_all)
df_qa = pd.DataFrame(qa_entries_all)


Processing file: 1q23-earnings-call-remarks.pdf
Processing file: 1q24-earnings-call-remarks.pdf
Processing file: 2q23-earnings-call-remarks.pdf
Processing file: 2q24-earnings-call-remarks.pdf
Processing file: 3q23-earnings-call-remarks.pdf
Processing file: 3q24-earnings-call-remarks.pdf
Processing file: 4q23-earnings-call-remarks.pdf
Processing file: 4q24-earnings-call-remarks.pdf
Management announcements saved to: /content/drive/MyDrive/BOE/bank_of_england/data/processed/ubs_management_discussion.csv
Q&A section saved to: /content/drive/MyDrive/BOE/bank_of_england/data/processed/ubs_qna_section.csv


In [44]:
# -------------------------------
# 7. Save the results to CSV files
# -------------------------------
# Both output files live directly in the processed-data folder.
management_csv_path, qa_csv_path = (
    os.path.join(processed_dir, csv_name)
    for csv_name in ("ubs_management_discussion.csv", "ubs_qna_section.csv")
)

# index=False keeps the pandas row index out of the written files.
df_management.to_csv(management_csv_path, index=False)
df_qa.to_csv(qa_csv_path, index=False)

print("Management announcements saved to:", management_csv_path)
print("Q&A section saved to:", qa_csv_path)