In [59]:
import sys
import os

sys.path.append(os.path.abspath("../tools"))
from envvars import load_env_vars_from_directory
import numpy as np
import pandas as pd
import faiss
import glob
import re
import pysrt
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Comment

In [60]:
# Variables de entorno
# Cargar variables de entorno desde archivos .env
load_env_vars_from_directory("../.env")

# Comprobar si las variables de entorno necesarias están definidas
cal_file =os.getenv('PODCAST_CAL_FILE')
if not cal_file:
    raise ValueError("PODCAST_CAL_FILE environment variable is not set.")
if not os.path.exists(cal_file):
    raise FileNotFoundError(f"Calendar '{cal_file}' does not exist.")
workdir = os.path.join(os.getenv('PODCAST_WORKDIR'), 'Tmp')
if not workdir:
    raise ValueError("PODCAST_WORKDIR environment variable is not set.")
if not os.path.exists(workdir):
    raise FileNotFoundError(f"Work directory '{workdir}' does not exist.")
prefix = os.getenv('PODCAST_PREFIX')
if not prefix:
    raise ValueError("PODCAST_PREFIX environment variable is not set.")


In [61]:
# Leer el calendario de podcasts
caldf = pd.read_csv(cal_file, parse_dates=["date"], index_col="episode" )
caldf

Unnamed: 0_level_0,date
episode,Unnamed: 1_level_1
250725,2025-07-25
250718,2025-07-18
250711,2025-07-11
250704,2025-07-04
250627,2025-06-27
...,...
200201,2020-02-01
200125,2020-01-25
200118,2020-01-18
200111,2020-01-11


In [62]:
workdir

'/home/jmrobles/Podcasts/Cowboys de Medianoche/Tmp'

In [63]:
mp3files = glob.glob(os.path.join(workdir, f"{prefix}*mp3"))
for i in range(len(mp3files)):
    mp3files[i] = os.path.basename(mp3files[i])[:-4]
mp3files[:10]

['cm220423',
 'cm241129',
 'cm241108',
 'cm250117',
 'cm220219',
 'cm250124',
 'cm230602',
 'cm240315',
 'cm230210',
 'cm250404']

In [64]:
epnumber_regex = re.compile(rf"{prefix}([\d]+).*")
def get_epnumber(epname):
    global epnumber_regex
    return int(re.search(epnumber_regex, epname).group(1))
epnumbers = [get_epnumber(ep) for ep in mp3files]
print (f"Found {len(epnumbers)} episodes in {workdir} with prefix {prefix}")
print (f"Last 10 epnumbers: {epnumbers[-10:]}")

Found 161 episodes in /home/jmrobles/Podcasts/Cowboys de Medianoche/Tmp with prefix cm
Last 10 epnumbers: [220514, 250214, 240301, 230929, 230728, 231103, 240419, 230623, 240531, 230519]


In [65]:
epdf = pd.DataFrame(mp3files, columns=["epname"])
epdf["epnumber"] = epdf["epname"].apply(get_epnumber)

epdf["epdate"] = epdf["epnumber"].apply(
    lambda x: caldf.loc[x]["date"] if x in caldf.index else None
)
epdf.set_index("epname", inplace=True)
epdf

Unnamed: 0_level_0,epnumber,epdate
epname,Unnamed: 1_level_1,Unnamed: 2_level_1
cm220423,220423,2022-04-23
cm241129,241129,2024-11-29
cm241108,241108,2024-11-08
cm250117,250117,2025-01-17
cm220219,220219,2022-02-19
...,...,...
cm231103,231103,2023-11-03
cm240419,240419,2024-04-19
cm230623,230623,2023-06-23
cm240531,240531,2024-05-31


In [66]:
textpat = re.compile(r'^\[(?P<speaker>.*)\]: *(?P<spoken>.*)')

In [67]:

epints = {}
for epname in epdf.index:
    epsrt = os.path.join(workdir, f"{epname}_whisper_es.srt")
    if not os.path.exists(epsrt):
        print(f"Warning: SRT file for {epname} does not exist, skipping.")
        continue
    epsubs = pysrt.open(epsrt, encoding='utf-8')
    epints[epname] = {}
    for eps in epsubs:
        m = re.match(textpat, eps.text)
        if m is None:
            print (f"Fallo en parsear {eps.text}")
            continue
        speaker = m.group('speaker')
        if speaker.startswith('Unknown'):
            # Si el hablante es desconocido, lo ignoramos
            continue
        # spoken = m.group('spoken')
        start = eps.start.ordinal/1_000
        end = eps.end.ordinal/1_000
        epints[epname][speaker] = epints[epname].get(speaker, 0) + end - start
print(f"Found {len(epints.keys())} episodes with speaker information.")
for k in list(epints.keys())[:5]:
    print(f"{k}: {epints[k]}")

Found 161 episodes with speaker information.
cm220423: {'Luis Herrero': 838.0099999999993, 'Eduardo Torres-Dulce': 1840.4700000000003, 'José Luis Garci': 1393.199999999999, 'Luis Alberto de Cuenca': 141.15999999999894}
cm241129: {'Luis Alberto de Cuenca': 961.1800000000012, 'Luis Herrero': 1411.3899999999967, 'José Luis Garci': 1335.1499999999996, 'Eduardo Torres-Dulce': 495.39999999999964, 'Inocencio Arias': 26.980000000000018}
cm241108: {'Luis Alberto de Cuenca': 236.5699999999997, 'Luis Herrero': 1114.9999999999973, 'Eduardo Torres-Dulce': 1612.7800000000016, 'José Luis Garci': 953.7999999999984}
cm250117: {'Luis Herrero': 778.1900000000023, 'Luis Alberto de Cuenca': 444.8700000000008, 'Eduardo Torres-Dulce': 1204.859999999997, 'José Luis Garci': 1441.239999999997, 'Inocencio Arias': 370.8600000000006}
cm220219: {'Luis Herrero': 1163.130000000001, 'Eduardo Torres-Dulce': 1407.960000000001, 'José Luis Garci': 1298.3799999999992, 'Luis Alberto de Cuenca': 156.2500000000009}


In [68]:

for epname, speakers in epints.items():
    print(f"Processing episode {epname} with speakers {speakers.keys()}")
    html_whisper_files = glob.glob(os.path.join(workdir, f'{epname}*_whisper*.html'))
    for htmlf in html_whisper_files:
        with open(htmlf, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        comments = soup.find_all(
            string=lambda text: isinstance(text, Comment)
                                and f" ha hablado " in text
        )
        if not comments:
            print(f"No comments found in {htmlf}")
            continue
        if len(comments) < len(speakers):
            print(f"Warning: Not enough comments in {htmlf} for speakers {speakers.keys()}"
                  f" (found: {len(comments)}, expected: {len(speakers)})")
            continue
        speaker_summary = soup.find('span', id='speaker-summary')
        if speaker_summary:
            ul = speaker_summary.find('ul')
            if ul:
                # Borrar los elementos de la lista
                ul.clear()
            else:
                print(f"No 'ul' found in 'speaker-summary' span in {htmlf}")
        else:
            print(f"No 'speaker-summary' span found in {htmlf}")
        cix = 0
        for speaker in speakers:
            # Paso de número de segundos a formato de texto
            seconds = int(speakers[speaker])
            datestr = f"{seconds // 3600:02d}:{(seconds % 3600) // 60:02d}:{seconds % 60:02d}"
            # Reemplazar el comentario con el tiempo de habla
            new_text = f"{speaker} ha hablado {datestr} en el segmento"
            comments[cix].replace_with(Comment(new_text))
            if ul:
                li = soup.new_tag('li')
                li.string = f"{speaker}: {datestr}"
                ul.append(li)
            cix += 1
        for i in range(cix, len(comments)):
            # Borrar los comentarios restantes teniendo en cuenta que no existe clear()
            comments[i].replace_with('')
            
        with open(htmlf, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        #Localizar el span "speaker_summary" y dentro de el una lista ul. Se borrará y rellenará con los hablantes
        # tomados de los comentarios


 

Processing episode cm220423 with speakers dict_keys(['Luis Herrero', 'Eduardo Torres-Dulce', 'José Luis Garci', 'Luis Alberto de Cuenca'])
Processing episode cm241129 with speakers dict_keys(['Luis Alberto de Cuenca', 'Luis Herrero', 'José Luis Garci', 'Eduardo Torres-Dulce', 'Inocencio Arias'])
Processing episode cm241108 with speakers dict_keys(['Luis Alberto de Cuenca', 'Luis Herrero', 'Eduardo Torres-Dulce', 'José Luis Garci'])
Processing episode cm250117 with speakers dict_keys(['Luis Herrero', 'Luis Alberto de Cuenca', 'Eduardo Torres-Dulce', 'José Luis Garci', 'Inocencio Arias'])
Processing episode cm220219 with speakers dict_keys(['Luis Herrero', 'Eduardo Torres-Dulce', 'José Luis Garci', 'Luis Alberto de Cuenca'])
Processing episode cm250124 with speakers dict_keys(['Luis Alberto de Cuenca', 'Luis Herrero', 'José Luis Garci', 'Eduardo Torres-Dulce', 'Inocencio Arias'])
Processing episode cm230602 with speakers dict_keys(['Luis Alberto de Cuenca', 'Luis Herrero', 'José Luis Gar