In [None]:
from bs4 import BeautifulSoup, Comment
import os
import sys
import re
sys.path.append('..')
from dateestimation import DateEstimation 
from tools.envvars import load_env_vars_from_directory

In [None]:
# Configuration cell: resolve every path/setting the notebook needs and fail
# fast when something is missing.
# NOTE(review): load_env_vars_from_directory presumably populates os.environ
# from the files under '../.env' -- confirm against tools.envvars.
load_env_vars_from_directory('../.env')

# Required: calendar file, which must already exist on disk.
cal_file = os.environ.get('PODCAST_CAL_FILE')
if not cal_file:
    raise ValueError("PODCAST_CAL_FILE environment variable is not set.")
if not os.path.exists(cal_file):
    raise FileNotFoundError(f"Calendar '{cal_file}' does not exist.")

# Required: file-name prefix shared by all episode files.
prefix = os.environ.get('PODCAST_PREFIX')
if not prefix:
    raise ValueError("PODCAST_PREFIX environment variable is not set.")

# Optional: templates directory; it must contain podcast.css.
templates = os.environ.get('PODCAST_TEMPLATES', '../templates')
css = os.path.join(templates, 'podcast.css')
if not os.path.exists(css):
    raise FileNotFoundError(f"CSS file '{css}' does not exist.")

# Optional: working directory holding the episode files.
workdir = os.environ.get('PODCAST_WORKDIR', '../workdir')
if not os.path.exists(workdir):
    raise FileNotFoundError(f"Work directory '{workdir}' does not exist.")

# Echo the effective configuration, one setting per line.
for config_line in (
    f"Using calendar file: {cal_file}",
    f"Using podcast prefix: {prefix}",
    f"Using templates directory: {templates}",
    f"Using work directory: {workdir}",
    f"Using CSS file: {css}",
):
    print(config_line)

In [None]:
# Alias for the work directory; the cells below build their glob patterns from tdir.
tdir = workdir
# IPython shell escape: list the contents of tdir as a quick sanity check.
!ls "$tdir"

In [None]:
def get_epnumber(filename, episode_prefix=None):
    """Extract the episode number from a podcast file name.

    The number is the run of digits that directly follows the prefix in the
    file's basename and is itself followed by a non-digit character
    (e.g. ``<prefix>214_whisper.html`` -> ``214``).

    Args:
        filename: Path or bare name of the file.
        episode_prefix: Literal prefix preceding the number. Defaults to the
            notebook-level ``prefix`` setting.

    Returns:
        The episode number as an ``int``, or ``None`` when the basename does
        not contain ``<prefix><digits><non-digit>``.
    """
    if episode_prefix is None:
        episode_prefix = prefix

    # Only the file name matters; directory components may contain digits too.
    basename = os.path.basename(filename)

    # The prefix is literal text, so escape it: characters such as '.' or '+'
    # must not be interpreted as regex operators.
    match = re.search(rf'{re.escape(episode_prefix)}(\d+)\D', basename)
    if match is None:
        # Callers test for None; previously this path raised UnboundLocalError.
        return None
    return int(match.group(1))

In [None]:
import glob

# Collect every episode audio file in the work directory and preview the first ten.
mp3_pattern = os.path.join(tdir, f'{prefix}*.mp3')
mp3_files = glob.glob(mp3_pattern)
print(mp3_files[:10])

In [None]:
import pysrt

In [None]:
# Matches a line of the form "[<speaker>]: <spoken>", capturing the speaker
# label and the spoken text as named groups.
textpat = re.compile(r'^\[(?P<speaker>.*)\]: *(?P<spoken>.*)')

In [None]:
def change_speaker_in_srt_file(srt_file, old_speaker, new_speaker):
    """Rename a speaker label in an SRT subtitle file, in place.

    Every subtitle whose text starts with ``[old_speaker]:`` has that tag
    replaced by ``[new_speaker]:``. The file is rewritten only when at least
    one subtitle actually changed, so re-running the notebook does not touch
    files that are already up to date.

    Args:
        srt_file: Path of the UTF-8 encoded SRT file to update.
        old_speaker: Speaker name currently used in the ``[name]:`` tag.
        new_speaker: Speaker name to write instead.

    Returns:
        The number of subtitles that were modified.
    """
    old_tag = f"[{old_speaker}]:"
    new_tag = f"[{new_speaker}]:"

    subs = pysrt.open(srt_file, encoding='utf-8')
    changed = 0
    for sub in subs:
        # Only subtitles attributed to the old speaker are touched.
        if sub.text.startswith(old_tag):
            sub.text = sub.text.replace(old_tag, new_tag)
            changed += 1

    if changed:
        subs.save(srt_file, encoding='utf-8')
    return changed


In [None]:
# Speaker renames to apply, keyed by episode number.
# Each value maps the label found in the generated files to the real name.
change_dict = {
    214: {'Unknown 1': 'José Ignacio Andrés de Nailted'},
    215: {'Unknown 1': 'Curro Ávalos'},
    216: {'Unknown 1': 'Moises Hamui'},
    217: {'Unknown 1': 'Raquel Roca'},
    218: {'Unknown 1': 'Carlos Fernández'},
    219: {'Unknown 1': 'Cristina Cifuentes'},
    220: {'Unknown 1': 'Roxana Falasco'},
    221: {'Unknown 1': 'Sara Escudero'},
    222: {'Unknown 1': 'María Jesús Álava Reyes'},
    223: {'Unknown 1': 'Juan Vicente Olmos Llorente'},
    224: {'Unknown 1': 'Agustín Peralt'},
    241: {'Unknown 1': 'Tom Horsey'},
}

In [None]:
# For every episode audio file, apply the configured speaker renames to all
# subtitle files that belong to the same episode.
for mp3f in mp3_files:
    epnumber = get_epnumber(mp3f)
    if epnumber is None:
        print(f"Could not extract episode number from {mp3f}")
        continue

    print(f"Processing episode number: {epnumber}")
    if epnumber not in change_dict:
        print(f"No changes defined for episode number {epnumber}, skipping.")
        continue

    episode_renames = change_dict[epnumber]
    srt_pattern = os.path.join(tdir, f'{prefix}{epnumber}_*.srt')
    for strf in glob.glob(srt_pattern):
        print(f"Processing subtitle file: {strf}")
        for old_speaker, new_speaker in episode_renames.items():
            print(f"Changing speaker from '{old_speaker}' to '{new_speaker}' in {strf}")
            change_speaker_in_srt_file(strf, old_speaker, new_speaker)


In [None]:
# Apply the configured speaker renames to the whisper HTML transcripts and
# rebuild the per-speaker summary list of each file.
html_whisper_files = glob.glob(os.path.join(tdir, f'{prefix}*_whisper*.html'))
for htmlf in html_whisper_files:
    epnumber = get_epnumber(htmlf)
    if epnumber is None:
        # Report the HTML file being processed (not a leftover mp3 name).
        print(f"Could not extract episode number from {htmlf}")
        continue
    if epnumber not in change_dict:
        print(f"No changes defined for episode number {epnumber}, skipping.")
        continue
    print(f"Processing HTML file: {htmlf}")
    with open(htmlf, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Every element whose class looks like "speaker-<n>".
    speakers = soup.find_all(class_=re.compile(r'speaker-\d+'))

    # Apply each rename to both the speaker elements and the
    # "<!-- <speaker> ha hablado ... -->" comments. Doing both inside the loop
    # guarantees that every configured pair is applied, not just the last one.
    for old_speaker, new_speaker in change_dict[epnumber].items():
        print(f"Changing speaker from '{old_speaker}' to '{new_speaker}' in HTML file {htmlf}")
        for speaker in speakers:
            # Swap the name and keep the remaining text of the element.
            if old_speaker in speaker.text:
                speaker.string = speaker.text.replace(old_speaker, new_speaker)

        comments = soup.find_all(
            string=lambda text: isinstance(text, Comment)
                                and f"{old_speaker} ha hablado" in text
        )
        if not comments:
            print(f"No comments found for speaker '{old_speaker}' in {htmlf}")
        for comment in comments:
            print(f"Changing comment from '{comment}' to include speaker '{new_speaker}'")
            new_text = comment.replace(old_speaker, new_speaker)
            comment.replace_with(Comment(new_text))

    # Locate the <ul> inside <span id="speaker-summary">; it is emptied and then
    # refilled from the comments. Reset `ul` for every file so a missing list
    # can never fall back to the list of a previously processed document.
    ul = None
    speaker_summary = soup.find('span', id='speaker-summary')
    if speaker_summary is not None:
        ul = speaker_summary.find('ul')
        if ul is not None:
            ul.clear()
        else:
            print(f"No 'ul' found in 'speaker-summary' span in {htmlf}")
    else:
        print(f"No 'speaker-summary' span found in {htmlf}")

    # With the renames applied, walk all "ha hablado" comments and rebuild the
    # speaker list (only possible when the summary list exists).
    if ul is not None:
        comments = soup.find_all(
            string=lambda text: isinstance(text, Comment)
                                and " ha hablado" in text
        )
        for comment in comments:
            match = re.search(r'(.*?) ha hablado (.*?) en el segmento', comment)
            if not match:
                continue
            speaker_name = match.group(1).strip()
            speaker_time = match.group(2).strip()
            # Speakers still labelled "Unknown..." or "?..." are left out.
            if speaker_name.startswith('Unknown') or speaker_name.startswith('?'):
                continue
            li = soup.new_tag('li')
            li.string = f"{speaker_name}: {speaker_time}"
            ul.append(li)

    # Save the modified document back to the same file.
    with open(htmlf, 'w', encoding='utf-8') as f:
        f.write(str(soup))

