<a href="https://colab.research.google.com/github/sheldonkemper/bank_of_england/blob/main/notebooks/cleansed/sk_processed_santander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modules

In [4]:
"""
===================================================
Author: Sheldon Kemper
Role: Data Engineering Lead, Bank of England Employer Project (Quant Collective)
LinkedIn: https://www.linkedin.com/in/sheldon-kemper
Date: 2025-02-04
Version: 1.1

Description:
    This notebook implements a system for processing and converting video transcripts into a single CSV file
    for the Bank of England project. The workflow processes MP4 files stored in the raw data directory on Google Drive
    by using a machine learning-based speech-to-text model (e.g., OpenAI’s Whisper) to transcribe the audio content into text.
    Each transcript is appended as a record in the CSV file along with metadata—such as the year, quarter, and a duplicate indicator—
    which are inferred from the video file name. This pipeline supports the ongoing integration of transcripts across multiple
    quarters and years, facilitating further analysis and reporting within our data engineering infrastructure.

===================================================
"""




In [5]:
# Install whisper (if not already installed)
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-t6nt6q0m
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-t6nt6q0m
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [6]:
import os
import glob
import subprocess
import requests
from bs4 import BeautifulSoup
import whisper
import re
import csv
import whisper

In [14]:
import os
from google.colab import drive

# Mount Google Drive to the root location with force_remount
drive.mount('/content/drive', force_remount=True)

# Assuming 'BOE' folder is in 'MyDrive' and already shared
BOE_path = '/content/drive/MyDrive/BOE/bank_of_england/data'

# List the contents of the BOE directory
print("BOE Directory Contents:", os.listdir(BOE_path))

# Define the raw data path (assuming your audio files are under raw/santander)
raw_data_path = os.path.join(BOE_path, 'raw', 'santander')
print("Raw Data Directory Contents:", os.listdir(raw_data_path))


Mounted at /content/drive
BOE Directory Contents: ['raw', 'processed', 'model', 'preprocessed_data']
Raw Data Directory Contents: []


## Process All Downloaded MP4 Files

In [13]:
import os
import glob
import csv
import whisper
import re
import requests
from bs4 import BeautifulSoup

def get_call_dates():
    """
    Scrapes the Santander Financial and Economic Information page to build a mapping
    of financial quarter to call date. This function assumes that the page contains quarterly
    result sections within <div class="documents-wrapper"> elements. Within each wrapper:
      - A <div class="title-document"> contains a <span class="text-title"> with text like "Q4 2024".
      - The first <div class="documents-block__date"> element within the wrapper holds the call date (e.g., "05-02-2025").
    Returns a dictionary mapping keys like "2024 Q4" to the call date.
    """
    url = "https://www.santander.com/en/shareholders-and-investors/financial-and-economic-information"
    call_date_mapping = {}
    try:
        response = requests.get(url)
        response.raise_for_status()
    except Exception as e:
        print("Error fetching call dates:", e)
        return call_date_mapping

    soup = BeautifulSoup(response.text, 'html.parser')
    wrappers = soup.find_all("div", class_="documents-wrapper")
    for wrapper in wrappers:
        title_document = wrapper.find("div", class_="title-document")
        if title_document:
            span_title = title_document.find("span", class_="text-title")
            if span_title:
                title_text = span_title.get_text(strip=True)
                # Expect title text like "Q4 2024"; extract quarter and year.
                match = re.search(r'(Q[1-4])\s+(\d{4})', title_text)
                if match:
                    quarter = match.group(1)
                    year = match.group(2)
                    key = f"{year} {quarter}"
                    # Look for the call date in the first <div class="documents-block__date">
                    date_elem = wrapper.find("div", class_="documents-block__date")
                    if date_elem:
                        call_date = date_elem.get_text(strip=True)
                        if call_date:
                            call_date_mapping[key] = call_date
                        else:
                            call_date_mapping[key] = "Unknown"
    return call_date_mapping

def parse_financial_quarter(filename):
    """
    Given a filename (e.g., "video_2023_Q3_1"), extract and return a string like "2023 Q3".
    If the pattern is not found, return "Unknown".
    """
    match = re.search(r'(\d{4})_(Q[1-4])', filename)
    if match:
        year = match.group(1)
        quarter = match.group(2)
        return f"{year} {quarter}"
    return "Unknown"

# Define directories – adjust these paths as needed.
raw_dir = '/content/drive/MyDrive/BOE/bank_of_england/data/raw/santander'
processed_dir = '/content/drive/MyDrive/BOE/bank_of_england/data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Load the Whisper transcription model.
model = whisper.load_model("base")

# Define the CSV file where all transcripts will be appended.
all_transcripts_csv = os.path.join(processed_dir, "santander_management_discussion.csv")

# Prepare a set to store already processed file names for duplicate checking.
existing_files = set()
if os.path.exists(all_transcripts_csv):
    with open(all_transcripts_csv, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if "filename" in row:
                existing_files.add(row["filename"])

# If the CSV file doesn't exist, create it with the desired header.
if not os.path.exists(all_transcripts_csv):
    with open(all_transcripts_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["filename", "management_discussion", "financial_quarter", "call_date"])

# Fetch the mapping of financial quarter to call date from Santander's page.
call_date_mapping = get_call_dates()
print("Call Date Mapping:", call_date_mapping)

# Process each MP4 file in the raw directory.
mp4_files = glob.glob(os.path.join(raw_dir, "*.mp4"))

for mp4_file in mp4_files:
    print(f"\nProcessing MP4 file: {mp4_file}")
    # Transcribe the video using Whisper.
    result = model.transcribe(mp4_file)
    transcript_text = result["text"]

    # Use the file's base name as an identifier.
    base_name = os.path.splitext(os.path.basename(mp4_file))[0]

    # Extract the financial quarter from the filename.
    financial_quarter = parse_financial_quarter(base_name)
    # Look up the call date from our mapping; default to "Unknown" if not found.
    call_date = call_date_mapping.get(financial_quarter, "Unknown")

    # Check for duplicates.
    duplicate_flag = "Yes" if base_name in existing_files else "No"
    existing_files.add(base_name)
    if duplicate_flag == "Yes":
        print(f"Duplicate found for {base_name}.")

    # Append the new record to the CSV with headers: filename, management_discussion, financial_quarter, call_date.
    with open(all_transcripts_csv, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([base_name, transcript_text, financial_quarter, call_date])

    print(f"Transcript for '{base_name}' appended (financial_quarter: {financial_quarter}, call_date: {call_date}).")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/BOE/bank_of_england/data/processed/santander_management_discussion.csv'

In [10]:
!pip install nltk bertopic sentence-transformers

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading

In [21]:
import os
import csv
import nltk
from nltk.tokenize import sent_tokenize
import re

# Ensure the 'punkt_tab' resource is downloaded.
nltk.download('punkt_tab')

def chunk_text(text, max_chunk_size=500):
    """
    Splits the input text into chunks that do not exceed max_chunk_size characters.
    The splitting is based on sentence boundaries.

    Parameters:
        text (str): The full text to be chunked.
        max_chunk_size (int): Maximum number of characters per chunk.

    Returns:
        List[str]: A list of text chunks.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
            else:
                # In case a single sentence exceeds max_chunk_size.
                chunks.append(sentence.strip())
                current_chunk = ""
        else:
            current_chunk += sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# =====================================================
# Step 1: Load the transcript CSV file.
# =====================================================
csv_file = "/content/drive/MyDrive/BOE/bank_of_england/data/processed/santander_management_discussion.csv"
expected_headers = ["filename", "management_discussion", "financial_quarter", "call_date"]

if os.path.exists(csv_file):
    print(f"CSV file exists at: {csv_file}")
else:
    print(f"CSV file does not exist at: {csv_file}")

with open(csv_file, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    first_row = next(reader)
    print("First row read from file:", first_row)
    # If the first row matches the expected headers, we use it normally.
    if first_row == expected_headers:
        f.seek(0)
        dict_reader = csv.DictReader(f)
    else:
        print("Header does not match expected headers; using manual fieldnames and skipping first row.")
        f.seek(0)
        dict_reader = csv.DictReader(f, fieldnames=expected_headers)
        next(dict_reader)  # Skip the first row if it is an unexpected header.

    transcripts = []
    row_count = 0
    for row in dict_reader:
        row_count += 1
        transcripts.append({
            "filename": row.get("filename", "").strip(),
            "management_discussion": row.get("management_discussion", "").strip(),
            "financial_quarter": row.get("financial_quarter", "Unknown").strip(),
            "call_date": row.get("call_date", "Unknown").strip()
        })

print(f"Total rows loaded: {row_count}")
for i, t in enumerate(transcripts, start=1):
    print(f"Row {i}:")
    print(f"  filename: {t['filename']}")
    print(f"  financial_quarter: {t['financial_quarter']}")
    print(f"  call_date: {t['call_date']}")
    print(f"  Transcript length: {len(t['management_discussion'])}")

# =====================================================
# Step 2: Chunk each transcript using sentence-based chunking.
# =====================================================
all_chunks = []
for t in transcripts:
    transcript = t["management_discussion"]
    if transcript:
        chunks = chunk_text(transcript, max_chunk_size=500)
        if chunks:
            all_chunks.extend(chunks)
        else:
            print(f"No chunks produced for {t['filename']}.")
    else:
        print(f"Transcript for {t['filename']} is empty.")

print(f"Total chunks obtained: {len(all_chunks)}")

# =====================================================
# Step 3: Apply BERTopic for topic modeling on the chunks.
# =====================================================
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Initialize a SentenceTransformer embedding model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Create a BERTopic model instance.
topic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True)

# Check that there are enough documents (BERTopic performs best with at least 3).
if len(all_chunks) < 3:
    print(f"Not enough documents for BERTopic modeling; got only {len(all_chunks)} document(s).")
else:
    topics, probabilities = topic_model.fit_transform(all_chunks)
    print("Topic Information:")
    print(topic_model.get_topic_info())
    # Optionally, print the topic assignment for each chunk.
    for i, chunk in enumerate(all_chunks):
        print(f"Chunk {i+1} assigned to topic: {topics[i]}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


CSV file exists at: /content/drive/MyDrive/BOE/bank_of_england/data/processed/santander_management_discussion.csv
First row read from file: ['video_2023_Q4_8', " Good morning everybody and welcome to Banco Santander's conference called to discuss our financial results for the fourth quarter of 2023. Just as a reminder, both the results report and presentation we will be following today are available to you on our website. I am joined here today by our executive chair, Ms. Ana Botín, our CEO, Mr. Hector Grisi and our CFO, Mr. José García Candera. Following their presentations, we will open the floor for any and all questions that you may have in the Q&A session. With this I will hand over to Ms. Botín. Another floor is yours. So thank you very much, Oña. Good morning to everybody. It's a great pleasure to be with all of you. Thank you for joining. As a reminder, we have recently announced a last step towards one Santander. We finished the creation of the five global businesses which we 

In [22]:
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info.csv", index=False)
print(topic_info)


    Topic  Count                                       Name  \
0      -1    382                         -1_you_the_in_that   
1       0    166                           0_nii_in_the_nia   
2       1    114                     1_our_bank_are_digital   
3       2     78                         2_risk_cost_of_the   
4       3     49          3_exchange_argentina_quarter_rate   
5       4     47                       4_brazil_the_not_you   
6       5     42               5_spain_deposits_deposit_you   
7       6     37  6_capital_organic_distributions_hierarchy   
8       7     37             7_profit_euros_revenue_million   
9       8     35                       8_civ_business_in_by   
10      9     33     9_global_platform_deployment_platforms   
11     10     32            10_basis_points_related_capital   
12     11     31           11_fees_fee_insurance_businesses   
13     12     30                       12_cab_and_flows_our   
14     13     30     13_products_retail_transformation_

In [23]:
for topic_id in topic_info["Topic"]:
    if topic_id != -1:  # Skip outlier/noise topic if present.
        print(f"Topic {topic_id}:", topic_model.get_topic(topic_id))


Topic 0: [('nii', 0.01984199204083605), ('in', 0.01913810314529516), ('the', 0.017732410065564624), ('nia', 0.01751173331771802), ('sensitivity', 0.016503683565833843), ('rates', 0.01631712502623381), ('and', 0.014423367061838256), ('to', 0.014270827129089463), ('uk', 0.013998034099051327), ('we', 0.013748197927568533)]
Topic 1: [('our', 0.025362552480045465), ('bank', 0.023590619536824464), ('are', 0.019009710663697183), ('digital', 0.017968623829829167), ('and', 0.017274440897989535), ('to', 0.01711463869348948), ('customers', 0.016849150765688388), ('we', 0.016662314837262826), ('transformation', 0.015183910769792162), ('of', 0.014082311650558762)]
Topic 2: [('risk', 0.050861157175143536), ('cost', 0.036056037344599175), ('of', 0.022211803596249694), ('the', 0.019632726330100134), ('in', 0.019174125450313006), ('year', 0.01823948818032784), ('provisions', 0.016980083934693257), ('normalization', 0.016551264159518676), ('at', 0.014911349342177922), ('levels', 0.014769039696883417)]
T

In [24]:
fig_overview = topic_model.visualize_topics()
fig_overview.show()

# For a more detailed heatmap of topic similarities:
fig_heatmap = topic_model.visualize_heatmap()
fig_heatmap.show()
