<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/processed/sk_chunking_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modules

In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.1

Description:
    This notebook implements a system for cleaning, chunking, and exporting transcript data
    for the Bank of England project. The workflow reads a CSV file containing video transcripts
    (with metadata such as filename, financial quarter, and call date), applies source‐specific
    cleaning (e.g., removing header text in JP Morgan transcripts), and splits each transcript
    into smaller chunks based on sentence boundaries (with a maximum chunk size of 500 characters).
    Each chunk is annotated with its original filename, chunk index, financial quarter, and call date,
    and then saved to a new CSV file. This processed, chunked data supports downstream modeling
    tasks—such as topic modeling with BERTopic—and further analysis within our data engineering infrastructure.

===================================================
"""




In [None]:
# Colab setup: install the third-party packages for this workflow.
# Only nltk is imported in the cells shown here; bertopic and sentence-transformers are
# presumably for the downstream topic-modelling step mentioned in the header - TODO confirm.
!pip install nltk bertopic sentence-transformers

In [None]:
# Install OpenAI Whisper straight from its GitHub repository (pip re-resolves it on every run).
# NOTE(review): none of the visible cells import whisper - confirm this install is still
# needed by this notebook before keeping it.
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-86pj5jpn
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-86pj5jpn
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
import os
import csv
import nltk
from nltk.tokenize import sent_tokenize
import re
import os
from google.colab import drive

In [None]:
# Attach Google Drive under /content/drive, forcing a fresh mount each time the cell runs.
drive.mount('/content/drive', force_remount=True)

# Project data root: the shared 'BOE' folder is expected to live under 'MyDrive'.
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Raw inputs for Santander (audio files are assumed to sit under raw/santander).
raw_data_path = os.path.join(BOE_path, 'raw', 'santander')

# Print both directory listings as a quick sanity check that the mount worked.
for label, directory in (
    ("BOE Directory Contents:", BOE_path),
    ("Raw Data Directory Contents:", raw_data_path),
):
    print(label, os.listdir(directory))


Mounted at /content/drive
BOE Directory Contents: ['raw', 'processed', 'model', 'preprocessed_data']
Raw Data Directory Contents: ['MP4.zip']


In [None]:
# Ensure the 'punkt_tab' resource is downloaded.
# It is fetched here, ahead of the sent_tokenize call made in chunk_text below; recent NLTK
# releases load their Punkt sentence-tokenizer data from this resource (verify against the
# installed NLTK version).
nltk.download('punkt_tab')

def clean_transcript(text, marker="MANAGEMENT DISCUSSION SECTION"):
    """
    Cleans the transcript text based on the source.

    For JP Morgan transcripts, if the text contains the marker
    "MANAGEMENT DISCUSSION SECTION", everything up to and including the
    first occurrence of that marker is removed and the remainder is
    stripped of surrounding whitespace.

    For Santander transcripts (or any transcript without that marker),
    the text is returned unchanged (no stripping is applied).

    Parameters:
        text (str | None): The raw transcript text.
        marker (str): Header marker that separates boilerplate from the
            transcript body. Defaults to the JP Morgan section heading, so
            existing single-argument calls behave as before.

    Returns:
        str: The cleaned transcript; "" when `text` is None or empty.
    """
    # Missing transcripts (None) previously crashed on the `in` test; treat them as empty.
    if not text:
        return ""
    # An empty marker cannot be searched for (str.partition rejects it), so nothing is removed.
    if not marker:
        return text
    # partition splits on the FIRST occurrence only, matching the old split(marker, 1).
    _, found, remainder = text.partition(marker)
    return remainder.strip() if found else text

def _split_oversized(sentence, max_chunk_size):
    """Break one sentence into pieces of at most max_chunk_size characters, preferring word boundaries."""
    if len(sentence) <= max_chunk_size:
        return [sentence] if sentence else []

    pieces = []
    current = ""
    for word in sentence.split():
        # A single token longer than the limit has no word boundary to use: hard-slice it.
        while len(word) > max_chunk_size:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chunk_size])
            word = word[max_chunk_size:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chunk_size:
            # `current` is non-empty here because `word` alone always fits after the loop above.
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_chunk_size=500, tokenizer=None):
    """
    Splits the input text into chunks that do not exceed max_chunk_size characters.
    The splitting is based on sentence boundaries; consecutive sentences are joined
    with a single space for as long as the joined chunk still fits.

    A sentence that is itself longer than max_chunk_size is broken on whitespace
    (and, for a single over-long token, sliced) so that the size limit really holds.
    Previously such sentences were emitted whole, violating the limit. Whitespace
    inside a sentence that has to be broken up is normalised to single spaces.

    Parameters:
        text (str | None): The full text to be chunked.
        max_chunk_size (int): Maximum number of characters per chunk (must be >= 1).
        tokenizer (callable | None): Function mapping a string to a list of sentence
            strings. Defaults to NLTK's sent_tokenize, looked up when the function
            is called.

    Returns:
        List[str]: A list of text chunks, each at most max_chunk_size characters long.
            Empty or None input yields an empty list.

    Raises:
        ValueError: If max_chunk_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text or not text.strip():
        return []
    if tokenizer is None:
        # Resolved lazily so the default keeps using the notebook-level NLTK import.
        tokenizer = sent_tokenize

    chunks = []
    parts = []      # sentence pieces that make up the chunk being built
    parts_len = 0   # length of " ".join(parts), tracked incrementally
    for sentence in tokenizer(text):
        for piece in _split_oversized(sentence.strip(), max_chunk_size):
            # The joining space only counts when the chunk already has content.
            needed = len(piece) + (1 if parts else 0)
            if parts and parts_len + needed > max_chunk_size:
                chunks.append(" ".join(parts))
                parts, parts_len = [], 0
                needed = len(piece)
            parts.append(piece)
            parts_len += needed
    if parts:
        chunks.append(" ".join(parts))
    return chunks

In [None]:
# =====================================================
# Step 1: Load the transcript CSV file.
# =====================================================

location = "/content/drive/MyDrive/BOE/bank_of_england/data/processed/"
filename = "management_discussion.csv"

# Build the path with os.path.join: the earlier f"/{location}/{filename}" form produced
# doubled slashes ("//content/...//management_discussion.csv").
csv_file = os.path.join(location, filename)
expected_headers = ["filename", "management_discussion", "financial_quarter", "call_date"]


def _field(row, key, default=""):
    """Return row[key] with surrounding whitespace stripped.

    csv.DictReader fills columns that are missing from a short row with None, so
    row.get(key, default) returns None rather than the default and a bare .strip()
    would raise AttributeError. `default` is used only when the value is None;
    a present-but-empty cell stays "".
    """
    value = row.get(key)
    return (default if value is None else value).strip()


if os.path.exists(csv_file):
    print(f"CSV file exists at: {csv_file}")
else:
    # Only reported here; the open() below raises FileNotFoundError for a missing file.
    print(f"CSV file does not exist at: {csv_file}")

with open(csv_file, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # next(..., None) keeps a completely empty file from raising StopIteration.
    first_row = next(reader, None)
    print("First row read from file:", first_row)
    # Rewind so the DictReader starts from the top of the file in either branch.
    f.seek(0)
    if first_row == expected_headers:
        dict_reader = csv.DictReader(f)
    else:
        print("Header does not match expected headers; using manual fieldnames and skipping first row.")
        dict_reader = csv.DictReader(f, fieldnames=expected_headers)
        # The first row is treated as an unrecognised header and discarded.
        next(dict_reader, None)

    transcripts = [
        {
            "filename": _field(row, "filename"),
            "management_discussion": _field(row, "management_discussion"),
            "financial_quarter": _field(row, "financial_quarter", "Unknown"),
            "call_date": _field(row, "call_date", "Unknown"),
        }
        for row in dict_reader
    ]

row_count = len(transcripts)

print(f"Total rows loaded: {row_count}")
for i, t in enumerate(transcripts, start=1):
    print(f"Row {i}:")
    print(f"  filename: {t['filename']}")
    print(f"  financial_quarter: {t['financial_quarter']}")
    print(f"  call_date: {t['call_date']}")
    print(f"  Transcript length: {len(t['management_discussion'])}")

# =====================================================
# Step 2: Clean and chunk each transcript, and save the chunked data.
# =====================================================
chunked_data = []  # One dict per chunk, carrying the parent transcript's metadata.
for transcript in transcripts:
    # Strip source-specific header text (e.g. JP Morgan) before chunking.
    cleaned_text = clean_transcript(transcript["management_discussion"])
    if not cleaned_text:
        print(f"Transcript for {transcript['filename']} is empty after cleaning.")
        continue

    chunks = chunk_text(cleaned_text, max_chunk_size=500)
    if not chunks:
        print(f"No chunks produced for {transcript['filename']}.")
        continue

    # chunk_index is 1-based within each transcript.
    chunked_data.extend(
        {
            "filename": transcript["filename"],
            "chunk_index": position,
            "chunk_text": piece,
            "financial_quarter": transcript["financial_quarter"],
            "call_date": transcript["call_date"],
        }
        for position, piece in enumerate(chunks, start=1)
    )

print(f"Total chunks obtained: {len(chunked_data)}")

# Write every chunk record to a new CSV next to the input, named "chunked_<input filename>".
chunked_csv_file = os.path.join("/content/drive/MyDrive/BOE/bank_of_england/data/processed", f"chunked_{filename}")
fieldnames = ["filename", "chunk_index", "chunk_text", "financial_quarter", "call_date"]
with open(chunked_csv_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(chunked_data)

print(f"Chunked data saved to {chunked_csv_file}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


CSV file exists at: //content/drive/MyDrive/BOE/bank_of_england/data/processed//management_discussion.csv
First row read from file: ['filename', 'management_discussion', 'financial_quarter', 'call_date']
Total rows loaded: 8
Row 1:
  filename: 4q24-earnings-transcript.pdf
  financial_quarter: 4Q24
  call_date: 2025-01-15
  Transcript length: 13232
Row 2:
  filename: jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf
  financial_quarter: 3Q24
  call_date: 2024-10-11
  Transcript length: 9213
Row 3:
  filename: jpm-2q24-earnings-call-transcript-final.pdf
  financial_quarter: 2Q24
  call_date: 2024-07-12
  Transcript length: 11025
Row 4:
  filename: jpm-1q24-earnings-call-transcript.pdf
  financial_quarter: 1Q24
  call_date: 2024-04-12
  Transcript length: 11789
Row 5:
  filename: jpm-4q23-earnings-call-transcript.pdf
  financial_quarter: 4Q23
  call_date: 2024-01-12
  Transcript length: 13699
Row 6:
  filename: jpm-3q23-earnings-call-transcript.pdf
  financial_quarter: 3Q23
