In [3]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.0

Description:
    This notebook is dedicated to the data engineering functions for the Bank of England Employer Project.
    It includes code for mounting Google Drive, reading raw PDF files (e.g., earnings call transcripts),
    and performing text extraction and cleaning using pdfplumber and regular expressions.

Dependencies:
    - pdfplumber
    - pandas
    - re
    - google.colab (for mounting Google Drive)
    - os

===================================================
"""



Modules

In [6]:
# Mount Google Drive at /content/drive, forcing a remount.
drive.mount('/content/drive', force_remount=True)

# Project data root: expects the shared 'BOE' folder to sit directly under 'MyDrive'.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Sanity check: show what is visible under the data root to anyone with access.
data_root_listing = os.listdir(BOE_path)
print(data_root_listing)

Mounted at /content/drive
['model', 'cleansed', 'processed', 'raw']


In [None]:
# -------------------------------
# 4. Define helper functions for processing
# -------------------------------
def clean_transcript(text):
    """Clean raw transcript text extracted from a PDF.

    Steps, applied in order:
      1. Drop lines consisting of ten or more dots (optionally padded with whitespace).
      2. Drop lines consisting only of digits (presumably page numbers).
      3. Remove "On page N" and "Starting on page N" references.
      4. Repair ". ," punctuation artefacts (a period followed by a stray comma).
      5. Strip whitespace that precedes a newline and collapse repeated newlines.
      6. Discard everything from the first occurrence of "Disclaimer" onwards.

    Args:
        text (str): Raw transcript text. An empty or None value yields "".

    Returns:
        str: The cleaned text with leading/trailing whitespace removed.
    """
    if not text:
        return ""

    # Lookaheads leave the closing newline unconsumed so that it can also serve as
    # the opening newline of the next junk line. Consuming it (as '\n...\n' would)
    # makes consecutive dotted / page-number lines survive every other time.
    text = re.sub(r'\n\s*\.{10,}\s*(?=\n)', '', text)
    text = re.sub(r'\n\d+(?=\n)', '', text)

    text = re.sub(r'On page \d+', '', text)
    text = re.sub(r'Starting on page \d+', '', text)

    text = re.sub(r'\.\s*,', '.', text)
    # Second pass for the '%. ,' sequences that the substitution above can itself create.
    text = text.replace('%. ,', '%.')

    text = re.sub(r'\s+\n', '\n', text)
    text = re.sub(r'\n+', '\n', text).strip()

    # Everything after the first "Disclaimer" is treated as boilerplate.
    if "Disclaimer" in text:
        text = text.split("Disclaimer")[0].strip()
    return text

def extract_metadata(text):
    """Extract the financial quarter and call date from transcript text.

    The quarter is the first match of "<digit>Q<two digits>" (whitespace allowed
    between "Q" and the year, e.g. '1Q24' or '1Q 24'). The call date is the first
    match of "<word> <1-2 digits>, <4 digits>" (e.g. 'April 12, 2024'); the word is
    not checked against real month names here.

    Args:
        text (str): Transcript text. An empty or None value yields (None, None).

    Returns:
        tuple: (financial_quarter, call_date); each element is a str, or None when
        the corresponding pattern is not found.
    """
    if not text:
        return None, None

    quarter_match = re.search(r'(\dQ\s*\d{2})', text)
    # The pattern accepts any whitespace between "Q" and the year (PDF extraction can
    # break the token across lines), so remove all of it rather than spaces only.
    financial_quarter = re.sub(r'\s+', '', quarter_match.group(1)) if quarter_match else None

    date_match = re.search(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})', text)
    call_date = date_match.group(1) if date_match else None
    return financial_quarter, call_date

def split_sections(transcript):
    """
    Divide a transcript into its Management Discussion and Q&A parts.

    The split point is the first case-insensitive occurrence of
    "QUESTION AND ANSWER" or "ANALYST Q&A". The marker itself stays at the
    start of the Q&A part. Both parts are whitespace-stripped when a marker
    is found; without a marker the transcript is returned untouched together
    with an empty Q&A string.

    Returns a tuple: (management_discussion, qa_section)
    """
    marker_hit = re.search(r'(?i)(QUESTION\s+AND\s+ANSWER|ANALYST\s+Q&A)', transcript)
    if marker_hit is None:
        # No Q&A marker: the whole text counts as management discussion.
        return transcript, ""

    cut = marker_hit.start()
    return transcript[:cut].strip(), transcript[cut:].strip()

def parse_qa_section(qa_text, job_role_word_threshold=10):
    """
    Turn the Q&A text into a list of speaker-turn dictionaries.

    Lines are processed one at a time (blank lines are skipped):
      * "Name Q" / "Name A" starts a new turn with that marker and an empty utterance.
      * "Speaker: text" starts a new turn with marker None and `text` as utterance.
      * Any other line continues the current turn. The first such line of a turn
        that has no job title and no utterance yet is stored as the job title when
        it contains a comma and has fewer than `job_role_word_threshold` words.
      * Text appearing before any header is attributed to speaker 'Unknown'.

    Each dictionary has the keys 'speaker', 'marker', 'job_title', 'utterance'.
    """
    parsed = []
    active = None

    for raw_line in qa_text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Header styles: "Name Q" / "Name A" first, then "Speaker: Utterance".
        marker_header = re.match(r'^(?P<speaker>.+?)\s+(?P<marker>[QA])$', stripped)
        colon_header = None
        if marker_header is None:
            colon_header = re.match(r'^(?P<speaker>[^:]+):\s*(?P<utterance>.*)$', stripped)

        if marker_header is not None or colon_header is not None:
            # A new header closes whatever turn was in progress.
            if active is not None:
                parsed.append(active)
            if marker_header is not None:
                active = {
                    'speaker': marker_header.group('speaker').strip(),
                    'marker': marker_header.group('marker'),
                    'job_title': "",
                    'utterance': ""
                }
            else:
                active = {
                    'speaker': colon_header.group('speaker').strip(),
                    'marker': None,
                    'job_title': "",
                    'utterance': colon_header.group('utterance').strip()
                }
            continue

        if active is None:
            # Text ahead of the first recognised header.
            active = {'speaker': 'Unknown', 'marker': None, 'job_title': "", 'utterance': stripped}
            continue

        # A short, comma-bearing first line under a header is taken as the job title.
        turn_is_blank = not active['job_title'] and not active['utterance']
        if turn_is_blank and len(stripped.split()) < job_role_word_threshold and ',' in stripped:
            active['job_title'] = stripped
            continue

        # Otherwise the line extends the utterance, space-joined.
        active['utterance'] = active['utterance'] + " " + stripped if active['utterance'] else stripped

    if active is not None:
        parsed.append(active)
    return parsed

# -------------------------------
# 5. Process each PDF in the raw folder and aggregate results
# -------------------------------
all_qa_entries = []  # Parsed Q&A turns across all transcripts
all_md_entries = []  # One Management Discussion record per transcript

# Explicit column schemas: if no PDF (or no Q&A turn) is found, the DataFrames are
# still created with a 'call_date' column, so later date parsing and sorting do not
# fail with a KeyError on an empty frame.
qa_columns = ['speaker', 'marker', 'job_title', 'utterance', 'filename', 'financial_quarter', 'call_date']
md_columns = ['filename', 'management_discussion', 'financial_quarter', 'call_date']

# os.listdir() returns entries in arbitrary order; sorting makes runs reproducible.
for filename in sorted(os.listdir(raw_dir)):
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(raw_dir, filename)
    print("Processing file:", file_path)

    # Extract text page by page; pages with no extractable text are skipped.
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    transcript_text = "".join(page_text + "\n" for page_text in page_texts if page_text)
    print("Extracted text preview for", filename, ":", transcript_text[:1000])

    # Clean the transcript text
    transcript_clean = clean_transcript(transcript_text)
    print("Cleaned text preview for", filename, ":", transcript_clean[:1000])

    # Extract metadata: financial quarter and call date
    financial_quarter, call_date = extract_metadata(transcript_clean)
    print("Extracted Financial Quarter:", financial_quarter)
    print("Extracted Call Date:", call_date)

    # Split into Management Discussion and Q&A sections
    management_discussion, qa_section = split_sections(transcript_clean)
    # split_sections keeps the marker at the start of the Q&A text; drop it so it
    # is not parsed as part of the first speaker turn.
    qa_section = re.sub(r'(?i)^(QUESTION\s+AND\s+ANSWER|ANALYST\s+Q&A)\s*', '', qa_section, count=1).strip()

    # Record the Management Discussion together with the transcript metadata
    all_md_entries.append({
        'filename': filename,
        'management_discussion': management_discussion,
        'financial_quarter': financial_quarter,
        'call_date': call_date
    })

    # Parse the Q&A section and tag every turn with the transcript metadata
    qa_entries = parse_qa_section(qa_section)
    for entry in qa_entries:
        entry['filename'] = filename
        entry['financial_quarter'] = financial_quarter
        entry['call_date'] = call_date
    all_qa_entries.extend(qa_entries)

    print("Processed file:", filename)

# Convert aggregated lists to DataFrames (column order matches the dict key order)
df_qa_all = pd.DataFrame(all_qa_entries, columns=qa_columns)
df_md_all = pd.DataFrame(all_md_entries, columns=md_columns)

# -------------------------------
# 6. Format 'call_date' as datetime and sort descending by call_date
# -------------------------------
# Parse the extracted 'Month D, YYYY' strings. With errors='coerce', values that do
# not match the format (including missing dates) become NaT instead of raising.
df_qa_all['call_date'] = pd.to_datetime(df_qa_all['call_date'], format='%B %d, %Y', errors='coerce')
df_md_all['call_date'] = pd.to_datetime(df_md_all['call_date'], format='%B %d, %Y', errors='coerce')

# Use a stable sort. Every Q&A row from one call shares the same call_date, and the
# default quicksort does not guarantee the relative order of ties, which could
# scramble the order of speaker turns within a transcript.
df_qa_all = df_qa_all.sort_values(by='call_date', ascending=False, kind='mergesort')
df_md_all = df_md_all.sort_values(by='call_date', ascending=False, kind='mergesort')

print("\nCombined Parsed Q&A Section Preview (Sorted):")
print(df_qa_all.head(10))

print("\nCombined Management Discussion DataFrame Preview (Sorted):")
print(df_md_all.head())

# -------------------------------
# 7. Save the DataFrames as CSV files
# -------------------------------
# Output locations inside the processed-data folder.
qa_csv_path = os.path.join(processed_dir, "ubs_qa_section.csv")
md_csv_path = os.path.join(processed_dir, "ubs_management_discussion.csv")

# Write each frame without its index and report where it went (Q&A first).
for frame, csv_path, saved_message in (
    (df_qa_all, qa_csv_path, "\nQ&A DataFrame saved to:"),
    (df_md_all, md_csv_path, "Management Discussion DataFrame saved to:"),
):
    frame.to_csv(csv_path, index=False)
    print(saved_message, csv_path)