In [1]:
from bs4 import BeautifulSoup
import os
import sys
import re
sys.path.append('..')
from dateestimation import DateEstimation 
from tools.envvars import load_env_vars_from_directory

In [2]:
load_env_vars_from_directory('../.env')
cal_file =os.getenv('PODCAST_CAL_FILE')
if not cal_file:
    raise ValueError("PODCAST_CAL_FILE environment variable is not set.")
if not os.path.exists(cal_file):
    raise FileNotFoundError(f"Calendar '{cal_file}' does not exist.")

prefix = os.getenv('PODCAST_PREFIX')
if not prefix:
    raise ValueError("PODCAST_PREFIX environment variable is not set.")

templates = os.getenv('PODCAST_TEMPLATES', '../templates')
css = os.path.join(templates, 'podcast.css')
if not os.path.exists(css):
    raise FileNotFoundError(f"CSS file '{css}' does not exist.")

print(f"Using calendar file: {cal_file}")
print(f"Using podcast prefix: {prefix}")
print(f"Using templates directory: {templates}")
print(f"Using CSS file: {css}")

Using calendar file: /home/jmrobles/Podcasts/Cowboys de Medianoche/DB/cmcal.csv
Using podcast prefix: cm
Using templates directory: /home/jmrobles/Podcasts/Cowboys de Medianoche/templates
Using CSS file: /home/jmrobles/Podcasts/Cowboys de Medianoche/templates/podcast.css


In [3]:
tdir = "/home/jmrobles/Podcasts/Cowboys de Medianoche"
!ls "$tdir"

addtocal.bash			cm251017_whisper_audio_es.html
Archivo				cm251017_whisper_en.html
Barroco				cm251017_whisper_en.srt
cm251010.meta			cm251017_whisper_es.html
cm251010.mp3			cm251017_whisper_es.srt
cm251010_vosk_audio_es.html	DB
cm251010_vosk_es.html		Pendiente
cm251010_whisper_audio_en.html	Sttcast
cm251010_whisper_audio_es.html	Summaries
cm251010_whisper_en.html	templates
cm251010_whisper_en.srt		Tmp
cm251010_whisper_es.html	ToSummarize
cm251010_whisper_es.srt		training.meta
cm251017.meta			training.mp3
cm251017.mp3			training.mp3.gen
cm251017_vosk_audio_es.html	training.mp3.simple
cm251017_vosk_es.html		validate.bash
cm251017_whisper_audio_en.html


In [4]:
def get_epnumber(filename):
    # cálculo de basename del archivo
    basename = os.path.basename(filename)
    
    get_epnumber = re.compile(rf'{prefix}(\d+)_')
    match = get_epnumber.search(basename)
    if match:
        epnumber = int(match.group(1))
    return epnumber

In [5]:
import glob
html_files = glob.glob(os.path.join(tdir, f'cm*.html'))
print (html_files[:10])

['/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251010_whisper_en.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251017_whisper_en.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251017_whisper_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251017_whisper_audio_en.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251017_vosk_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251010_vosk_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251010_whisper_audio_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251010_whisper_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251017_vosk_audio_es.html', '/home/jmrobles/Podcasts/Cowboys de Medianoche/cm251010_vosk_audio_es.html']


In [6]:
de = DateEstimation(cal_file)


In [7]:
with open(css, 'r', encoding='utf-8') as f:
    style_content = f.read()


In [8]:

def put_style(htmlfile, style_content):
    with open(htmlfile, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    existing_styles = soup.find('head').find_all('style')
    # Borrar estilos existentes
    for style in existing_styles:
        style.decompose()
    style_tag = soup.new_tag('style')
    style_tag.string = style_content
    soup.head.append(style_tag)
    with open(htmlfile, 'w', encoding='utf-8') as f:
        f.write(str(soup))

In [9]:

    
def put_date(html, epname, epnumber):
    print(f"Processing episode {epnumber} - {epname}")
    with open(html, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    date = de.estimate_date_from_epnumber(epnumber).strftime('%Y-%m-%d')
    print(f"Estimated date for episode {epnumber}: {date}")
    title = soup.find('h2', class_='title')
    titles = title.find_all('span')
    # Hay que reemplazar el contenido de title
    title.clear()
    tid = soup.new_tag('span', id='epid')
    div = soup.new_tag('div')
    div.append(f"{epname} - {date}")
    tid.append(div)
    title.append(tid)
    for t in titles:
        # Si es la de id 'epid', se sustituye por el nombre y la fecha
        if t.get('id') == 'epid':
            continue
        title.append(t)

    
    # Guardar el HTML modificado y pretty-printed
    with open(html, 'w', encoding='utf-8') as f:
        # Utilizar str(soup) para obtener el HTML como cadena
        # y escribirlo en el archivo
        f.write(str(soup.prettify()))
    


In [10]:
# El notebook puede incluir fecha y/o cambiar estilos
# Las variables siguientes controlan lo que se hace
put_date_enabled = True
put_style_enabled = False

In [11]:
epname_regex = re.compile(rf'({prefix}.*)_(whisper|vosk).*')
counter = 0
for html in  html_files:
    epname_fname = os.path.basename(html)
    match = epname_regex.match(epname_fname)
    if not match:
        print(f"Skipping {epname_fname} as it does not match the expected pattern.")
        continue
    epname = match.group(1)
    print(f"Processing {epname} ({counter})...")
    counter += 1
    # Obtener el número de episodio
    epnumber = get_epnumber(html)
    if put_date_enabled:
        put_date(html, epname, epnumber)
    if put_style_enabled:
        put_style(html, style_content)


   

Processing cm251010 (0)...
Processing episode 251010 - cm251010


TypeError: Invalid type <class 'pandas.core.series.Series'>. Must be int or float.