In [1]:
import sys
import os

sys.path.append(os.path.abspath("../tools"))
from envvars import load_env_vars_from_directory
import numpy as np
import pandas as pd
import faiss
import glob
import re
import pysrt
import requests
from urllib.parse import urljoin

In [2]:
# Environment configuration.
# Load environment variables through the project helper, pointed at ../.env,
# then validate every setting the rest of the notebook relies on.
load_env_vars_from_directory("../.env")


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} environment variable is not set.")
    return value


def _require_existing_path(name, label):
    """Return the path stored in environment variable ``name``.

    ``label`` is the human-readable description used in the error message.

    Raises:
        ValueError: if the variable is unset or empty.
        FileNotFoundError: if the path does not exist on disk.
    """
    path = _require_env(name)
    if not os.path.exists(path):
        raise FileNotFoundError(f"{label} '{path}' does not exist.")
    return path


# Files and directories: each must be set AND exist on disk.
faiss_file = _require_existing_path('STTCAST_FAISS_FILE', "Faiss file")
db_file = _require_existing_path('STTCAST_DB_FILE', "Database file")
cal_file = _require_existing_path('PODCAST_CAL_FILE', "Calendar")
workdir = _require_existing_path('PODCAST_WORKDIR', "Work directory")

# Plain string settings: only need to be set.
prefix = _require_env('PODCAST_PREFIX')
context_server_host = _require_env('CONTEXT_SERVER_HOST')

# Parse the port defensively: an unset variable falls back to "" so that int()
# raises ValueError (instead of TypeError on None), and both the unset and the
# non-numeric case surface with the same explicit message.
try:
    context_server_port = int(os.getenv('CONTEXT_SERVER_PORT', ''))
except ValueError:
    raise ValueError("CONTEXT_SERVER_PORT environment variable is not set or invalid.") from None
# Reject 0 (as before) and anything outside the valid TCP port range.
if not 0 < context_server_port < 65536:
    raise ValueError("CONTEXT_SERVER_PORT environment variable is not set or invalid.")

# Base URL of the context server; always non-empty once host and port are valid.
context_server_url = f"http://{context_server_host}:{context_server_port}/"

In [3]:
# Load the podcast calendar CSV, indexed by its "episode" column and with the
# "date" column parsed as datetimes.
caldf = pd.read_csv(
    cal_file,
    index_col="episode",
    parse_dates=["date"],
)
caldf

Unnamed: 0_level_0,date
episode,Unnamed: 1_level_1
1,2015-04-17
2,2015-04-23
3,2015-04-30
4,2015-05-07
5,2015-05-14
...,...
510,2025-05-01
511,2025-05-08
512,2025-05-15
513,2025-05-22


In [4]:
# Collect the episode audio files found in the work directory and keep only
# their base names without the ".mp3" extension.
# - The pattern requires the literal ".mp3" suffix, so the extension that is
#   stripped is always the one that was matched.
# - glob returns matches in arbitrary order, so the list is sorted to make the
#   episode order reproducible across runs.
mp3files = sorted(
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob(os.path.join(workdir, f"{prefix}*.mp3"))
)
mp3files[:10]

['ep514_A', 'ep514_B']

In [5]:
# The episode number is the run of digits that immediately follows the
# configured prefix. The prefix is escaped so that any regex metacharacters
# it contains are matched literally.
epnumber_regex = re.compile(rf"{re.escape(prefix)}(\d+)")


def get_epnumber(epname):
    """Extract the integer episode number from an episode name.

    Args:
        epname: episode name containing the prefix followed by digits
            (e.g. "<prefix>514_A").

    Returns:
        The digits following the prefix, as an int.

    Raises:
        ValueError: if ``epname`` does not contain the prefix followed by at
            least one digit.
    """
    match = epnumber_regex.search(epname)
    if match is None:
        raise ValueError(f"Cannot extract an episode number from '{epname}'.")
    return int(match.group(1))


[get_epnumber(ep) for ep in mp3files]

[514, 514]

In [6]:
def _episode_date(number):
    """Return the calendar date for an episode number, or None if it is not in the calendar."""
    if number in caldf.index:
        return caldf.loc[number]["date"]
    return None


# One row per audio file: episode name (used as the index), episode number
# parsed from the name, and the date looked up in the calendar.
epdf = pd.DataFrame(mp3files, columns=["epname"])
epdf = epdf.assign(epnumber=epdf["epname"].apply(get_epnumber))
epdf = epdf.assign(epdate=epdf["epnumber"].apply(_episode_date))
epdf = epdf.set_index("epname")
epdf

Unnamed: 0_level_0,epnumber,epdate
epname,Unnamed: 1_level_1,Unnamed: 2_level_1
ep514_A,514,2025-05-29
ep514_B,514,2025-05-29


In [7]:
# Subtitle text format: "[speaker]: spoken text".
# The speaker group is non-greedy so it stops at the first "]:"; a greedy ".*"
# would run up to the last "]:" on the line and swallow part of the spoken
# text into the speaker name.
# NOTE(review): "." does not match newlines, so for a multi-line subtitle only
# the first line ends up in "spoken" — confirm that is intended.
textpat = re.compile(r'^\[(?P<speaker>.*?)\]: *(?P<spoken>.*)')

In [8]:

# Parse every episode transcript into a list of speaker interventions.
# epints maps episode name -> list of dicts with keys 'tag' (speaker),
# 'content' (spoken text), 'start' and 'end' (subtitle ordinal divided by
# 1000; pysrt ordinals are presumably milliseconds, making these seconds).
epints = {}
for epname in epdf.index:
    srt_path = os.path.join(workdir, f"{epname}_whisper_es.srt")
    if not os.path.exists(srt_path):
        print(f"Warning: SRT file for {epname} does not exist, skipping.")
        continue

    subtitles = pysrt.open(srt_path, encoding='utf-8')
    interventions = []
    epints[epname] = interventions
    for subtitle in subtitles:
        parsed = textpat.match(subtitle.text)
        if parsed is None:
            # Entry does not follow the "[speaker]: text" layout; report and drop it.
            print (f"Fallo en parsear {subtitle.text}")
            continue
        interventions.append(
            {
                'tag': parsed.group('speaker'),
                'content': parsed.group('spoken'),
                'start': subtitle.start.ordinal / 1_000,
                'end': subtitle.end.ordinal / 1_000,
            }
        )
    print(f"Número de intervenciones en {epname}: {len(interventions)}")

Número de intervenciones en ep514_A: 155
Número de intervenciones en ep514_B: 332


In [21]:
# Send each episode's parsed segments to the context server, one POST per episode.
addsegments_path = "/addsegments"
addsegments_url = urljoin(context_server_url, addsegments_path)
# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on an unresponsive server and the cell never finishes. The read
# timeout is generous because the server presumably computes embeddings for
# every segment before replying — tune as needed.
addsegments_timeout = (10, 600)

for epname in epdf.index:
    if epname not in epints:
        print(f"Warning: No interventions found for {epname}, skipping.")
        continue

    # An episode number missing from the calendar yields None/NaT here; skip it
    # instead of crashing on .isoformat() or sending the literal string "NaT".
    epdate = epdf.loc[epname, "epdate"]
    if pd.isna(epdate):
        print(f"Warning: No calendar date for {epname}, skipping.")
        continue

    payload = {
        "epname": epname,
        "epdate": epdate.isoformat(),
        # NOTE(review): epname comes from a glob on f"{prefix}*", so it already
        # starts with the prefix and this value repeats it (e.g.
        # "<prefix><prefix>514_A.mp3"). Left unchanged because the server-side
        # contract is not visible from here — confirm and fix if unintended.
        "epfile": f"{prefix}{epname}.mp3",
        "segments": epints[epname]
    }
    try:
        response = requests.post(addsegments_url, json=payload, timeout=addsegments_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure (including timeouts) and move on to
        # the next episode.
        print(f"Error adding embeddings for {epname}: {e}")
    else:
        print(f"Successfully added embeddings for {epname}")
   

Successfully added embeddings for ep514_A
Successfully added embeddings for ep514_B
