<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modules

In [1]:
import os
import sys
from google.colab import drive

ModuleNotFoundError: No module named 'google'

In [None]:


# Mount Google Drive at /content/drive, passing force_remount=True.
# NOTE(review): `drive` comes from `google.colab`; the first cell's recorded
# output shows that import failing, so this cell needs a Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Project data directory. Assumes the shared 'BOE' folder sits directly under
# 'MyDrive' of the mounted account.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# List the directory contents as a quick check that the mount and the path
# are usable (os.listdir raises if the directory does not exist).
print(os.listdir(BOE_path))

In [None]:

# Make the project's `src` directory importable. The location follows the same
# Drive layout as the data directory ('MyDrive/BOE/bank_of_england/...');
# 'My Drive/bank_of_england/src' named a different folder.
sys.path.append('/content/drive/MyDrive/BOE/bank_of_england/src')

from preprocessing import preprocess_pipeline

# Example usage: run one sentence through the project's preprocessing pipeline
sample_text = "JP Morgan's earnings increased by 20% in Q4 2024."
processed_text = preprocess_pipeline(sample_text)
# Illustrative expected output: "jpmorgan earning increase q4"
# (the exact result depends on preprocess_pipeline — verify against the module)
print(processed_text)


In [None]:
import pdfplumber
import pandas as pd
import re

# File path to the uploaded JPMorgan 4Q24 earnings-call transcript PDF.
# NOTE(review): "/mnt/data" is a different location from the Google Drive
# paths used in the cells above — confirm it exists in the target runtime.
pdf_path = "/mnt/data/4q24-earnings-transcript.pdf"

# Function to extract financial quarter and call date from the first page
def extract_financial_info(pdf_path):
    """
    Read the financial quarter and the call date from the first page of a PDF.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A ``(financial_quarter, call_date)`` tuple of strings: the first
        quarter tag of the form "4Q24" and the first date of the form
        "January 15, 2025" found in the first page's text. Either element is
        "Unknown" when nothing matches, which includes a PDF without pages
        and a first page that yields no text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # An empty document or a page without extractable text must not crash
        # the regex searches below, so fall back to an empty string.
        first_page_text = (pdf.pages[0].extract_text() if pdf.pages else "") or ""

    # Extract Financial Quarter (e.g., "4Q24")
    financial_quarter_match = re.search(r"(\dQ\d{2})", first_page_text)
    financial_quarter = financial_quarter_match.group(1) if financial_quarter_match else "Unknown"

    # Extract Call Date (e.g., "January 15, 2025")
    call_date_match = re.search(r"([A-Za-z]+ \d{1,2}, \d{4})", first_page_text)
    call_date = call_date_match.group(1) if call_date_match else "Unknown"

    return financial_quarter, call_date

# Read the quarter tag and call date from the transcript once. These
# module-level constants are written into every row built by
# extract_presentation_data and extract_qna_data further down.
FINANCIAL_QUARTER, CALL_DATE = extract_financial_info(pdf_path)

import pdfplumber
import re

# Function to extract and clean text from the PDF
def extract_clean_text(pdf_path):
    """
    Extract the transcript body from a PDF as cleaned, newline-delimited text.

    The first page is skipped. On every later page that yields text, the
    "JPMorgan Chase & Co." header (through the end of its line), "Page X of Y"
    footers and copyright notices are removed. Pages are then joined with
    newlines, lines holding only a page number are dropped, words hyphenated
    across a line break are re-joined, and everything from the closing
    forward-looking-statements disclaimer onward is cut off.

    Line breaks are deliberately kept: the speaker / job-title / remarks
    parsing applied to this text later in the file relies on line boundaries.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        The cleaned text as one string (empty if no page yields text).
    """
    text_data = []
    with pdfplumber.open(pdf_path) as pdf:
        # Skip the first page and process remaining pages
        for page in pdf.pages[1:]:
            text = page.extract_text()
            if not text:
                continue  # page without extractable text

            # Remove headers, footers, and copyright notices
            text = re.sub(r"JPMorgan Chase & Co\..*", "", text)
            text = re.sub(r"Page \d+ of \d+", "", text)
            text = re.sub(r"Copyright © \d{4} JPMorgan Chase & Co\.", "", text)

            text_data.append(text.strip())

    # Join extracted text with explicit new lines for proper structure
    full_text = "\n".join(text_data)

    # Remove standalone page numbers (if appearing alone on a line)
    full_text = re.sub(r"\n\d+\n", "\n", full_text)

    # Remove hyphenation at line breaks (e.g., "finan-\ncial" -> "financial")
    full_text = re.sub(r"(?<=\w)-\n(?=\w)", "", full_text)

    # No blanket newline-to-space merge here: the earlier attempt used a
    # variable-width look-behind, which Python's `re` rejects with re.error,
    # and flattening the text would erase the line structure needed later.

    # Remove disclaimers section at the end of the document
    disclaimer_match = re.search(
        r"Disclaimer\s+This document contains forward-looking statements.*?JPMorgan Chase & Co\. does not undertake to update any forward-looking statements",
        full_text,
        re.IGNORECASE | re.DOTALL,
    )
    if disclaimer_match:
        full_text = full_text[:disclaimer_match.start()].strip()

    return full_text


# Pull the cleaned transcript text out of the PDF
cleaned_text = extract_clean_text(pdf_path)

# Locate the Q&A heading (case-insensitive) and cut the transcript there:
# everything before it is the presentation, the heading and everything after
# it is the Q&A. Without a heading the whole text counts as presentation.
qna_start_match = re.search(r"QUESTION AND ANSWER SECTION", cleaned_text, re.IGNORECASE)
if qna_start_match is None:
    presentation_text, qna_text = cleaned_text, ""
else:
    presentation_text = cleaned_text[:qna_start_match.start()].strip()
    qna_text = cleaned_text[qna_start_match.start():].strip()

# Function to clean text while preserving alignment of speaker-comment pairs
def clean_text_preserving_speaker_alignment(text):
    """
    Re-flow transcript text so every speaker turn is name / title / remarks.

    Lines holding only a page number are removed first. The text is then
    walked line by line: a line made only of letters and spaces that starts
    with a capital letter, directly followed by a line made only of letters,
    spaces and commas, is treated as a speaker header (name, then job title)
    and both lines are kept as they are. All lines between two headers are
    joined with single spaces into one remarks line, which undoes line and
    page breaks inside a speaker's remarks.

    The result therefore has the ``name\\ntitle\\nremarks`` layout that
    extract_presentation_data and extract_qna_data parse. Names or titles
    containing other characters (periods, ampersands, digits, ...) are not
    recognised as headers and end up inside the surrounding remarks.

    Args:
        text: Newline-delimited transcript text.

    Returns:
        The re-flowed text with leading and trailing whitespace stripped.
    """
    # Remove standalone page numbers appearing alone on a line
    text = re.sub(r"\n\d+\n", "\n", text)

    name_line = re.compile(r"[A-Z][A-Za-z \t]+")
    # Require at least one letter so a whitespace-only line is never a title
    title_line = re.compile(r"[ \t]*[A-Za-z][A-Za-z \t,]*")

    lines = text.split("\n")
    rebuilt = []   # finished output lines
    pending = []   # fragments of the remarks currently being collected

    idx = 0
    while idx < len(lines):
        starts_turn = (
            idx + 1 < len(lines)
            and name_line.fullmatch(lines[idx])
            and title_line.fullmatch(lines[idx + 1])
        )
        if starts_turn:
            # Close the previous speaker's remarks before opening a new turn
            if pending:
                rebuilt.append(" ".join(pending))
                pending = []
            rebuilt.extend(lines[idx:idx + 2])
            idx += 2  # the title line is consumed together with the name
        else:
            pending.append(lines[idx])
            idx += 1

    if pending:
        rebuilt.append(" ".join(pending))

    return "\n".join(rebuilt).strip()

# Clean the presentation and the Q&A section independently; the *_fixed
# strings are what extract_presentation_data / extract_qna_data parse below.
presentation_text_fixed = clean_text_preserving_speaker_alignment(presentation_text)
qna_text_fixed = clean_text_preserving_speaker_alignment(qna_text)

# Build the presentation table: one row per speaker turn
def extract_presentation_data(text):
    """
    Parse presentation text into a DataFrame with one row per speaker turn.

    A turn is a name line, a job-title line and the remarks that follow, up
    to the next name/title pair or the end of the text. Each row carries the
    module-level FINANCIAL_QUARTER and CALL_DATE plus the stripped Speaker,
    Job_Title and Text values. With no matching turn the DataFrame is empty.
    """
    turn_re = re.compile(
        r"([A-Z][A-Za-z\s]+)\n([A-Za-z\s,]+)\n(.+?)(?=\n[A-Z][A-Za-z\s]+\n[A-Za-z\s,]+|\Z)",
        re.DOTALL,
    )

    rows = [
        {
            "Financial_Quarter": FINANCIAL_QUARTER,
            "Call_Date": CALL_DATE,
            "Speaker": name.strip(),
            "Job_Title": role.strip(),
            # Fold line breaks inside the remarks into single spaces
            "Text": " ".join(remarks.splitlines()).strip(),
        }
        for name, role, remarks in turn_re.findall(text)
    ]
    return pd.DataFrame(rows)

# Build the Q&A table: one row per speaker turn, labelled Question/Answer
def extract_qna_data(text):
    """
    Parse Q&A text into a DataFrame with one row per speaker turn.

    Turns are recognised the same way as in the presentation section (name
    line, job-title line, remarks up to the next name/title pair or the end).
    The Type column alternates strictly by position: the first turn found is
    a "Question", the second an "Answer", and so on. Each row also carries the
    module-level FINANCIAL_QUARTER and CALL_DATE. With no matching turn the
    DataFrame is empty.
    """
    turn_re = re.compile(
        r"([A-Z][A-Za-z\s]+)\n([A-Za-z\s,]+)\n(.+?)(?=\n[A-Z][A-Za-z\s]+\n[A-Za-z\s,]+|\Z)",
        re.DOTALL,
    )

    rows = []
    for position, (name, role, remarks) in enumerate(turn_re.findall(text)):
        # Even positions are questions, odd positions are answers
        turn_kind = "Question" if position % 2 == 0 else "Answer"
        flattened = " ".join(remarks.splitlines())
        rows.append({
            "Financial_Quarter": FINANCIAL_QUARTER,
            "Call_Date": CALL_DATE,
            "Speaker": name.strip(),
            "Job_Title": role.strip(),
            "Type": turn_kind,
            "Text": flattened.strip(),
        })

    return pd.DataFrame(rows)

# Process Presentation and Q&A Data
presentation_df = extract_presentation_data(presentation_text_fixed)
qna_df = extract_qna_data(qna_text_fixed)

# Save the cleaned data to CSV files
presentation_csv_path = "/mnt/data/jpm_presentation.csv"
qna_csv_path = "/mnt/data/jpm_qna.csv"

presentation_df.to_csv(presentation_csv_path, index=False)
qna_df.to_csv(qna_csv_path, index=False)

# Display the cleaned data for review. `ace_tools` is only present in some
# hosted sandboxes; where it cannot be imported, print the frames instead so
# the cell still completes after the CSV files have been written.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Final JPMorgan Presentation Data", dataframe=presentation_df)
    tools.display_dataframe_to_user(name="Final JPMorgan Q&A Data", dataframe=qna_df)
else:
    print("Final JPMorgan Presentation Data")
    print(presentation_df)
    print("Final JPMorgan Q&A Data")
    print(qna_df)

# Return file paths for download (shown as the notebook cell's result)
presentation_csv_path, qna_csv_path
