In [40]:
from bs4 import BeautifulSoup, Comment
import os
import sys
import re
sys.path.append('..')
from dateestimation import DateEstimation 
from tools.envvars import load_env_vars_from_directory

In [41]:
def _ensure_set(name, value):
    """Raise ValueError when the environment variable `name` resolved to an empty/missing `value`."""
    if not value:
        raise ValueError(f"{name} environment variable is not set.")


def _ensure_exists(label, path):
    """Raise FileNotFoundError when `path` is not present on disk; `label` names it in the message."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"{label} '{path}' does not exist.")


# Presumably populates os.environ from the '../.env' location (project helper;
# its exact behavior is not visible in this notebook).
load_env_vars_from_directory('../.env')

# Calendar file: mandatory, and it must already exist.
cal_file = os.getenv('PODCAST_CAL_FILE')
_ensure_set('PODCAST_CAL_FILE', cal_file)
_ensure_exists("Calendar", cal_file)

# File-name prefix shared by every episode file: mandatory.
prefix = os.getenv('PODCAST_PREFIX')
_ensure_set('PODCAST_PREFIX', prefix)

# Templates directory (optional, with a default) must contain podcast.css.
templates = os.getenv('PODCAST_TEMPLATES', '../templates')
css = os.path.join(templates, 'podcast.css')
_ensure_exists("CSS file", css)

# Work directory (optional, with a default) must exist.
workdir = os.getenv('PODCAST_WORKDIR', '../workdir')
_ensure_exists("Work directory", workdir)

# Echo the effective configuration.
for label, value in (
    ("calendar file", cal_file),
    ("podcast prefix", prefix),
    ("templates directory", templates),
    ("work directory", workdir),
    ("CSS file", css),
):
    print(f"Using {label}: {value}")

Using calendar file: /home/jmrobles/Podcasts/Cowboys de Medianoche/DB/cmcal.csv
Using podcast prefix: cm
Using templates directory: /home/jmrobles/Podcasts/Cowboys de Medianoche/templates
Using work directory: /home/jmrobles/Podcasts/Cowboys de Medianoche/
Using CSS file: /home/jmrobles/Podcasts/Cowboys de Medianoche/templates/podcast.css


In [42]:
# Alias for the work directory; the glob patterns in later cells are built from `tdir`.
tdir = workdir
# IPython shell escape: list the directory contents ($tdir is expanded from the Python variable).
!ls "$tdir"

addtocal.bash			cm200613.meta
Archivo				cm200613.mp3
Barroco				cm200613_vosk_audio_es.html
cm200104.meta			cm200613_vosk_es.html
cm200104.mp3			cm200613_whisper_audio_en.html
cm200104_vosk_audio_es.html	cm200613_whisper_audio_es.html
cm200104_vosk_es.html		cm200613_whisper_en.html
cm200104_whisper_audio_en.html	cm200613_whisper_en.srt
cm200104_whisper_audio_es.html	cm200613_whisper_es.html
cm200104_whisper_en.html	cm200613_whisper_es.srt
cm200104_whisper_en.srt		cm200620.meta
cm200104_whisper_es.html	cm200620.mp3
cm200104_whisper_es.srt		cm200620_vosk_audio_es.html
cm200111.meta			cm200620_vosk_es.html
cm200111.mp3			cm200620_whisper_audio_en.html
cm200111_vosk_audio_es.html	cm200620_whisper_audio_es.html
cm200111_vosk_es.html		cm200620_whisper_en.html
cm200111_whisper_audio_en.html	cm200620_whisper_en.srt
cm200111_whisper_audio_es.html	cm200620_whisper_es.html
cm200111_whisper_en.html	cm200620_whisper_es.srt
cm200111_whisper_en.srt		cm200627.meta
cm200111_whisper_es.html	cm200627.mp

In [43]:
def get_epnumber(filename, podcast_prefix=None):
    """Extract the numeric episode id from a podcast file name.

    The base name of `filename` is searched for the prefix immediately
    followed by a run of digits and then at least one non-digit character
    (for example ``cm200613.mp3`` or ``cm200613_whisper_es.srt`` with
    prefix ``cm``).

    Args:
        filename: Path or bare file name to inspect.
        podcast_prefix: Literal prefix that precedes the episode number.
            Defaults to the notebook-level ``prefix`` variable.

    Returns:
        The episode number as an ``int``, or ``None`` when the base name
        does not contain the expected pattern.
    """
    if podcast_prefix is None:
        # Fall back to the prefix configured for the whole notebook.
        podcast_prefix = prefix

    basename = os.path.basename(filename)

    # The prefix is escaped so it is always matched literally, even if it
    # contains regex metacharacters such as '.' or '+'.
    match = re.search(rf'{re.escape(podcast_prefix)}(\d+)\D', basename)
    if match is None:
        # Callers test for None; never leave the result unbound.
        return None
    return int(match.group(1))

In [44]:
import glob
# Every episode audio file in the work directory is named <prefix>*.mp3.
mp3_pattern = os.path.join(tdir, f'{prefix}*.mp3')
mp3_files = glob.glob(mp3_pattern)
# Preview the first few matches as a sanity check.
print(mp3_files[:10])

['/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200229.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm201024.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200516.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200118.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm201128.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200725.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200404.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm200320.mp3', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm201003.mp3']


In [45]:
import pysrt

In [46]:
# Matches text of the form "[<speaker>]: <spoken>" anchored at the start of the
# string, exposing the named groups `speaker` and `spoken`. The `speaker` group
# is greedy, so it extends to the last "]:" in the string.
# NOTE(review): not referenced by any cell visible in this notebook excerpt.
textpat = re.compile(r'^\[(?P<speaker>.*)\]: *(?P<spoken>.*)')

In [47]:
def change_speaker_in_srt_file(srt_file, old_speaker, new_speaker):
    """Rename a speaker label inside an SRT subtitle file, rewriting it in place.

    Only cues whose text begins with ``[old_speaker]:`` are touched; within
    such a cue every occurrence of that label becomes ``[new_speaker]:``.
    The file is always saved back to the same path as UTF-8.

    Args:
        srt_file: Path of the SRT file to read and overwrite.
        old_speaker: Speaker name currently shown between the brackets.
        new_speaker: Speaker name to write instead.
    """
    old_tag = f"[{old_speaker}]:"
    new_tag = f"[{new_speaker}]:"

    subtitles = pysrt.open(srt_file, encoding='utf-8')
    for cue in subtitles:
        if not cue.text.startswith(old_tag):
            continue
        cue.text = cue.text.replace(old_tag, new_tag)
    subtitles.save(srt_file, encoding='utf-8')


In [None]:
# Speaker renames to apply, keyed by episode number (the integer that
# get_epnumber extracts from a file name). Each value maps the speaker label
# currently present in the transcripts to the name that should replace it.
# Episodes that are not listed here are skipped by the processing cells.
change_dict = {
    
   # 200218: {},
   200926: {
       'Unknown 3': 'José Luis Garci',
       'Unknown 4': 'Luis Alberto de Cuenca',
   }
   
}

In [49]:
# For each episode audio file, rename speakers in all of its subtitle files.
for mp3f in mp3_files:
    epnumber = get_epnumber(mp3f)
    if epnumber is None:
        # The file name did not yield an episode number.
        print(f"Could not extract episode number from {mp3f}")
        continue

    print (f"Processing episode number: {epnumber}")
    if epnumber in change_dict:
        # Subtitle files are named <prefix><episode>_*.srt in the work directory.
        srt_pattern = os.path.join(tdir, f'{prefix}{epnumber}_*.srt')
        for strf in glob.glob(srt_pattern):
            print(f"Processing subtitle file: {strf}")
            # One pass over the file per rename (each call reads and rewrites it).
            for old_speaker, new_speaker in change_dict[epnumber].items():
                print(f"Changing speaker from '{old_speaker}' to '{new_speaker}' in {strf}")
                change_speaker_in_srt_file(strf, old_speaker, new_speaker)
    else:
        print(f"No changes defined for episode number {epnumber}, skipping.")
        continue


Processing episode number: 200229
No changes defined for episode number 200229, skipping.
Processing episode number: 200530
Processing subtitle file: /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_en.srt
Changing speaker from 'Unknown 3' to 'Luis Alberto de Cuenca' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_en.srt
Changing speaker from 'Unknown 4' to 'José Luis Garci' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_en.srt
Processing subtitle file: /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_es.srt
Changing speaker from 'Unknown 3' to 'Luis Alberto de Cuenca' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_es.srt
Changing speaker from 'Unknown 4' to 'José Luis Garci' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200530_whisper_es.srt
Processing episode number: 201024
Processing subtitle file: /home/jmrobles/Podcasts/Cowboys de Medianoche/cm201024_whisper_es.srt
Changing speaker from 'Unkno

In [50]:
# Apply the speaker renames from `change_dict` to every Whisper HTML transcript
# in the work directory, then rebuild the per-speaker summary list.
html_whisper_files = glob.glob(os.path.join(tdir, f'{prefix}*_whisper*.html'))
for htmlf in html_whisper_files:
    epnumber = get_epnumber(htmlf)
    if epnumber is None:
        # Report the HTML file being examined (not a variable left over from
        # the subtitle-processing cell).
        print(f"Could not extract episode number from {htmlf}")
        continue
    if epnumber not in change_dict:
        print(f"No changes defined for episode number {epnumber}, skipping.")
        continue
    print(f"Processing HTML file: {htmlf}")
    with open(htmlf, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    renames = change_dict[epnumber]

    # 1) Rename inside every element whose class matches "speaker-<n>".
    speakers = soup.find_all(class_=re.compile(r'speaker-\d+'))
    for old_speaker, new_speaker in renames.items():
        print(f"Changing speaker from '{old_speaker}' to '{new_speaker}' in HTML file {htmlf}")
        for speaker in speakers:
            # Replace the old name and keep the rest of the element text.
            # NOTE(review): this is a plain substring match, so 'Unknown 1'
            # would also match inside 'Unknown 10' - confirm labels never collide.
            if old_speaker in speaker.text:
                speaker.string = speaker.text.replace(old_speaker, new_speaker)

    # 2) Rename inside the "<!-- <speaker> ha hablado ... -->" comments.
    for oldsp, newsp in renames.items():
        print(f"Changing comment from '{oldsp}' to '{newsp}' in {htmlf}")
        comments = soup.find_all(
            string=lambda text: isinstance(text, Comment)
                                and f"{oldsp} ha hablado" in text
        )
        if not comments:
            print(f"No comments found for speaker '{oldsp}' in {htmlf}")
        for comment in comments:
            print (f"Changing comment from '{comment}' to include speaker '{newsp}'")
            new_text = comment.replace(oldsp, newsp)
            comment.replace_with(Comment(new_text))

    # 3) Locate the <span id="speaker-summary"> and the <ul> inside it; the
    #    list is emptied and refilled from the comments below.
    #    `ul` is reset for every file so that a list belonging to a previously
    #    processed document can never be reused by accident.
    ul = None
    speaker_summary = soup.find('span', id='speaker-summary')
    if speaker_summary is None:
        print(f"No 'speaker-summary' span found in {htmlf}")
    else:
        ul = speaker_summary.find('ul')
        if ul is None:
            # No list yet: create an empty one inside the summary span.
            print(f"No 'ul' found in 'speaker-summary' span in {htmlf}")
            ul = soup.new_tag('ul')
            speaker_summary.append(ul)
        else:
            # Drop the existing list items.
            ul.clear()

    # 4) With the comments updated, walk all "... ha hablado ..." comments and
    #    compose the speaker list. Skipped when there is no summary span to fill.
    if ul is not None:
        comments = soup.find_all(
            string=lambda text: isinstance(text, Comment)
                                and " ha hablado" in text
        )
        for comment in comments:
            # Pull the speaker name and the spoken time out of the comment.
            match = re.search(r'(.*?) ha hablado (.*?) en el segmento', comment)
            if not match:
                continue
            speaker_name = match.group(1).strip()
            speaker_time = match.group(2).strip()
            # Speakers still labelled "Unknown..." or "?..." are left out.
            if speaker_name.startswith('Unknown') or speaker_name.startswith('?'):
                continue
            li = soup.new_tag('li')
            li.string = f"{speaker_name}: {speaker_time}"
            ul.append(li)

    # 5) Save the modified document over the original file.
    with open(htmlf, 'w', encoding='utf-8') as f:
        f.write(str(soup))



Processing HTML file: /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200118_whisper_es.html
Changing speaker from 'Unknown 2' to 'Luis Alberto de Cuenca' in HTML file /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200118_whisper_es.html
Changing comment from 'Unknown 2' to 'Luis Alberto de Cuenca' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200118_whisper_es.html
No comments found for speaker 'Unknown 2' in /home/jmrobles/Podcasts/Cowboys de Medianoche/cm200118_whisper_es.html
Processing HTML file: /home/jmrobles/Podcasts/Cowboys de Medianoche/cm201219_whisper_audio_en.html
Changing speaker from 'Unknown 3' to 'José Luis Garci' in HTML file /home/jmrobles/Podcasts/Cowboys de Medianoche/cm201219_whisper_audio_en.html
Changing speaker from 'Unknown 4' to 'Andrés Arconada' in HTML file /home/jmrobles/Podcasts/Cowboys de Medianoche/cm201219_whisper_audio_en.html
Changing speaker from 'Unknown 7' to 'Luis Alberto de Cuenca' in HTML file /home/jmrobles/Podcasts/Cowboys de Medianoche