<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/processed/sk_process_santander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modules

In [None]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.1

Description:
    This notebook implements a system for processing and converting video transcripts into a single CSV file
    for the Bank of England project. The workflow processes MP4 files stored in the raw data directory on Google Drive
    by using a machine learning-based speech-to-text model (e.g., OpenAI’s Whisper) to transcribe the audio content into text.
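    Each transcript is appended as a record in the CSV file along with metadata: the financial quarter (year and quarter),
    which is inferred from the video file name, and the call date, which is looked up from Santander's Financial and
    Economic Information page. Files whose names are already recorded in the CSV are reported as duplicates in the cell output.
    This pipeline supports the ongoing integration of transcripts across multiple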
    quarters and years, facilitating further analysis and reporting within our data engineering infrastructure.

===================================================
"""


In [14]:
# Install whisper (if not already installed)
# !pip install git+https://github.com/openai/whisper.git

In [15]:
import os
import glob
import subprocess
import requests
from bs4 import BeautifulSoup
import whisper
import re
import csv
import whisper

In [16]:
import os
from google.colab import drive

# (Re)mount Google Drive at /content/drive; force_remount refreshes an existing mount.
drive.mount('/content/drive', force_remount=True)

# Root of the shared project data ('BOE' is expected to live under 'MyDrive').
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# Show what is inside the BOE data directory.
boe_entries = os.listdir(BOE_path)
print("BOE Directory Contents:", boe_entries)

# Raw Santander inputs are expected under raw/santander.
raw_data_path = os.path.join(BOE_path, 'raw', 'santander')
raw_entries = os.listdir(raw_data_path)
print("Raw Data Directory Contents:", raw_entries)


Mounted at /content/drive
BOE Directory Contents: ['model', 'processed', 'raw']
Raw Data Directory Contents: ['text']


## Process All Downloaded MP4 Files

In [None]:
import os
import glob
import csv
import whisper
import re
import requests
from bs4 import BeautifulSoup

def get_call_dates(timeout=30):
    """
    Scrape Santander's Financial and Economic Information page and build a
    mapping of financial quarter to call date.

    The scraper assumes that each quarterly result is contained in a <div>
    with class "quarterly-result" that has:
        - an <h2> tag containing text like "Q3 2023 Results"
        - a <span> tag with class "call-date" containing text such as
          "Call Date: 25 October 2023"
    Adjust the selectors if the page structure is different.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dictionary mapping keys like "2023 Q3" to the call-date text with
        the "Call Date: " label removed. The dictionary is empty when the
        page cannot be fetched or when no block matches the selectors.
    """
    url = "https://www.santander.com/en/shareholders-and-investors/financial-and-economic-information"
    call_date_mapping = {}
    try:
        # requests applies no timeout by default; without one an unresponsive
        # server would block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: callers fall back to "Unknown" when the mapping is empty.
        print("Error fetching call dates:", e)
        return call_date_mapping

    soup = BeautifulSoup(response.text, 'html.parser')
    # Look for quarterly result blocks. (Adjust the tag/class as needed.)
    for result in soup.find_all("div", class_="quarterly-result"):
        header_elem = result.find("h2")
        call_date_elem = result.find("span", class_="call-date")
        if header_elem is None or call_date_elem is None:
            continue

        # Expect header text in the format "Q3 2023 Results" (or similar).
        match = re.search(r'(Q[1-4])\s+(\d{4})', header_elem.get_text(strip=True))
        if match is None:
            continue

        quarter, year = match.groups()
        call_date_text = call_date_elem.get_text(strip=True)
        # Remove any label (e.g., "Call Date: ").
        call_date_mapping[f"{year} {quarter}"] = call_date_text.replace("Call Date: ", "")

    if not call_date_mapping:
        # Make a selector mismatch visible instead of silently yielding
        # "Unknown" call dates for every transcript.
        print("Warning: no call dates found on the page; check the selectors in get_call_dates().")
    return call_date_mapping

def parse_financial_quarter(filename):
    """
    Derive the financial quarter label from a video file name.

    The name is searched for a four-digit year, an underscore, and a quarter
    token Q1-Q4 (e.g., "video_2023_Q3_1" yields "2023 Q3").
    Returns "Unknown" when no such pattern is present.
    """
    found = re.search(r'(\d{4})_(Q[1-4])', filename)
    if found is None:
        return "Unknown"
    yr, qtr = found.groups()
    return f"{yr} {qtr}"

# Define directories – adjust these paths as needed.
raw_dir = '/content/drive/MyDrive/BOE/bank_of_england/data/raw/santander'
processed_dir = '/content/drive/MyDrive/BOE/bank_of_england/data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Load the Whisper transcription model.
model = whisper.load_model("base")

# Define the CSV file where all transcripts will be appended.
all_transcripts_csv = os.path.join(processed_dir, "all_transcripts.csv")

# Collect the file names already recorded in the CSV so they are not processed again.
# A zero-byte file is treated like a missing one so that it still receives a header.
existing_files = set()
if os.path.exists(all_transcripts_csv) and os.path.getsize(all_transcripts_csv) > 0:
    with open(all_transcripts_csv, "r", newline="", encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            # .get() tolerates a CSV whose header lacks the "filename" column.
            recorded_name = row.get("filename")
            if recorded_name:
                existing_files.add(recorded_name)
else:
    # Create the CSV with the desired header.
    with open(all_transcripts_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["filename", "management_discussion", "financial_quarter", "call_date"])

# Fetch the mapping of financial quarter to call date from Santander's page.
call_date_mapping = get_call_dates()

# Process each MP4 file in the raw directory (sorted for a reproducible order).
mp4_files = sorted(glob.glob(os.path.join(raw_dir, "*.mp4")))

for mp4_file in mp4_files:
    # Use the file's base name as an identifier.
    base_name = os.path.splitext(os.path.basename(mp4_file))[0]

    # Check for duplicates BEFORE the expensive transcription so that re-running
    # the notebook neither re-transcribes files nor appends duplicate rows.
    if base_name in existing_files:
        print(f"Duplicate found for {base_name}. Skipping.")
        continue

    print(f"\nProcessing MP4 file: {mp4_file}")
    # Transcribe the video using Whisper.
    result = model.transcribe(mp4_file)
    transcript_text = result["text"]

    # Extract the financial quarter from the filename.
    financial_quarter = parse_financial_quarter(base_name)
    # Look up the call date from our mapping; default to "Unknown" if not found.
    call_date = call_date_mapping.get(financial_quarter, "Unknown")

    # Append the new record to the CSV with headers: filename, management_discussion, financial_quarter, call_date.
    with open(all_transcripts_csv, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([base_name, transcript_text, financial_quarter, call_date])

    # Mark as processed only once the row has actually been written.
    existing_files.add(base_name)

    print(f"Transcript for '{base_name}' appended (financial_quarter: {financial_quarter}, call_date: {call_date}).")


  checkpoint = torch.load(fp, map_location=device)



Processing MP4 file: /content/drive/MyDrive/BOE/bank_of_england/data/raw/santander/video_2023_Q4_8.mp4
Transcript for 'video_2023_Q4_8' appended (financial_quarter: 2023 Q4, call_date: Unknown).

Processing MP4 file: /content/drive/MyDrive/BOE/bank_of_england/data/raw/santander/video_2024_Q3_3.mp4
