In [31]:
import sys
import os

# Make the sibling "../tools" directory importable. The relative path is
# resolved against the current working directory, so `envvars` can then be
# imported as a top-level module.
sys.path.append(os.path.abspath("../tools"))
from envvars import load_env_vars_from_directory
import numpy as np
import pandas as pd
import faiss
import glob
import re
import pysrt
import requests
from urllib.parse import urljoin
import hmac
import hashlib
import time
import json

In [None]:
# Imports
import sys
import os

# Put the project root (the parent of this notebook's directory) at the front
# of sys.path so the `tools` and `api` packages can be imported.
# `__file__` may be undefined (the original guards for that case), so fall
# back to the current working directory when it is missing.
if "__file__" in globals():
    # Use the real __file__ value; the string literal "__file__" would just
    # resolve to a path inside the current working directory.
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
else:
    notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
# Only insert when it is not already first, so re-running the cell does not
# keep stacking duplicate entries while import precedence stays the same.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from tools.logs import logcfg
from tools.envvars import load_env_vars_from_directory
from api.apihmac import create_auth_headers, serialize_body

import logging
import pandas as pd
import glob
import re
import requests
from urllib.parse import urlparse
from datetime import datetime

# Remove handlers already attached to the root logger to avoid duplicated output
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)

# Configure logging; when __file__ is not defined, pass the notebook name instead
if "__file__" in globals():
    logcfg(__file__)
else:
    logcfg("addepisodes.ipynb")

# Load environment variables from the ".env" entry under the project root
env_dir = os.path.join(project_root, '.env')
load_env_vars_from_directory(directory=env_dir)

# Load the authentication key; abort early when it is missing or empty
context_server_api_key = os.environ.get('CONTEXT_SERVER_API_KEY')
if context_server_api_key is None or context_server_api_key == "":
    raise ValueError("CONTEXT_SERVER_API_KEY environment variable is not set.")

In [33]:

# Read the podcast calendar: a CSV indexed by "episode" with a parsed "date" column.
cal_file = os.getenv('PODCAST_CAL_FILE')
if not cal_file:
    # Fail fast with a clear message, like the CONTEXT_SERVER_API_KEY check,
    # instead of letting read_csv fail on a None path.
    raise ValueError("PODCAST_CAL_FILE environment variable is not set.")
caldf = pd.read_csv(cal_file, parse_dates=["date"], index_col="episode")
caldf

Unnamed: 0_level_0,date
episode,Unnamed: 1_level_1
0,2019-10-25
1,2019-11-01
2,2019-11-14
3,2019-11-20
4,2019-11-29
...,...
249,2025-10-25
250,2025-11-08
251,2025-11-18
252,2025-11-24


In [41]:
# Settings read from the environment: episode directory, episode filename
# prefix and the context server base URL.
workdir = os.getenv('PODCAST_WORKDIR')
prefix = os.getenv('PODCAST_PREFIX')
context_server_url = os.getenv('CONTEXT_SERVER_URL')

# Fail fast when a variable is unset, like the CONTEXT_SERVER_API_KEY check;
# otherwise None leaks into os.path.join, the episode regex and urlparse and
# produces confusing errors further down.
for _env_name, _env_value in (
    ('PODCAST_WORKDIR', workdir),
    ('PODCAST_PREFIX', prefix),
    ('CONTEXT_SERVER_URL', context_server_url),
):
    if _env_value is None:
        raise ValueError(f"{_env_name} environment variable is not set.")

In [42]:
# Episode names are the basenames, without extension, of the
# "<prefix>*.mp3" files found in workdir.
# The pattern requires the literal ".mp3" suffix, and splitext strips exactly
# the extension, so a name that merely ends in "mp3" is no longer picked up
# and truncated by a fixed 4-character slice.
mp3files = [
    os.path.splitext(os.path.basename(mp3_path))[0]
    for mp3_path in glob.glob(os.path.join(workdir, f"{prefix}*.mp3"))
]
mp3files[:10]

['ll253', 'll252']

In [43]:
# Episode names look like "<prefix><digits>...". The prefix is escaped so any
# regex metacharacters it contains are matched literally.
epnumber_regex = re.compile(rf"{re.escape(prefix)}([\d]+).*")


def get_epnumber(epname):
    """Return the episode number embedded in ``epname`` as an int.

    Args:
        epname: Episode name containing the configured prefix followed by digits.

    Returns:
        The digits that follow the prefix, converted to ``int``.

    Raises:
        ValueError: If ``epname`` does not contain the prefix followed by digits.
    """
    found = epnumber_regex.search(epname)
    if found is None:
        # Explicit error instead of an AttributeError on None.
        raise ValueError(
            f"Episode name {epname!r} does not match pattern {epnumber_regex.pattern!r}"
        )
    return int(found.group(1))


epnumbers = [get_epnumber(ep) for ep in mp3files]
print (f"Found {len(epnumbers)} episodes in {workdir} with prefix {prefix}")
print (f"Last 10 epnumbers: {epnumbers[-10:]}")

Found 2 episodes in /home/jmrobles/Podcasts/Listening Leaders/ with prefix ll
Last 10 epnumbers: [253, 252]


In [44]:
def _calendar_date(number):
    """Return the calendar date for an episode number, or None if it is not in caldf."""
    if number in caldf.index:
        return caldf.loc[number, "date"]
    return None


# One row per episode name, with its number and calendar date, indexed by name.
epdf = (
    pd.DataFrame(mp3files, columns=["epname"])
    .assign(epnumber=lambda frame: frame["epname"].apply(get_epnumber))
    .assign(epdate=lambda frame: frame["epnumber"].apply(_calendar_date))
    .set_index("epname")
)
epdf

Unnamed: 0_level_0,epnumber,epdate
epname,Unnamed: 1_level_1,Unnamed: 2_level_1
ll253,253,2025-12-02
ll252,252,2025-11-24


In [45]:
# Subtitle text is expected to look like "[speaker]: spoken words".
# - The speaker group is lazy, so it stops at the first "]:"; a later "]:"
#   inside the spoken text is no longer absorbed into the speaker name.
# - The spoken group uses a scoped DOTALL flag so multi-line subtitles keep
#   all their lines instead of being cut at the first newline.
textpat = re.compile(r'^\[(?P<speaker>.*?)\]: *(?P<spoken>(?s:.*))')

In [46]:

# Build, for every episode, the list of speaker-tagged segments parsed from
# its "<epname>_whisper_es.srt" subtitle file.
epints = {}
for epname in epdf.index:
    srt_path = os.path.join(workdir, f"{epname}_whisper_es.srt")
    if not os.path.exists(srt_path):
        print(f"Warning: SRT file for {epname} does not exist, skipping.")
        continue
    cues = pysrt.open(srt_path, encoding='utf-8')
    segments = []
    epints[epname] = segments
    for cue in cues:
        parsed = textpat.match(cue.text)
        if parsed is None:
            # Text without a leading "[speaker]:" tag is reported and dropped.
            print (f"Fallo en parsear {cue.text}")
            continue
        segments.append({
            'tag': parsed.group('speaker'),
            'content': parsed.group('spoken'),
            # presumably `ordinal` is in milliseconds, making these seconds — TODO confirm
            'start': cue.start.ordinal/1_000,
            'end': cue.end.ordinal/1_000,
        })
    print(f"Número de intervenciones en {epname}: {len(segments)}")

Número de intervenciones en ll253: 93
Número de intervenciones en ll252: 81


In [47]:
# POST every parsed episode to the context server's /addsegments endpoint.
addsegments_path = "/addsegments"
addsegments_url = urlparse(context_server_url)._replace(path=addsegments_path).geturl()
# (connect, read) timeouts in seconds. Without a timeout requests can wait
# forever on an unresponsive server; a timeout raises a RequestException
# subclass, which the handler below already catches.
request_timeout = (10, 300)
for epname in epdf.index:
    if epname not in epints:
        print(f"Warning: No interventions found for {epname}, skipping.")
        continue
    epdate = epdf.loc[epname, "epdate"]
    if pd.isna(epdate):
        # Episodes missing from the calendar have no usable date; skip them
        # rather than crashing or sending the string "NaT".
        print(f"Warning: No calendar date found for {epname}, skipping.")
        continue
    payload = {
        "epname": epname,
        "epdate": epdate.isoformat(),
        # epname already starts with the prefix (it is the mp3 basename), so
        # the file name is just "<epname>.mp3".
        "epfile": f"{epname}.mp3",
        "segments": epints[epname]
    }
    try:
        # Build the HMAC authentication headers with the centralized module
        auth_headers = create_auth_headers(
            context_server_api_key,
            "POST",
            addsegments_path,
            payload,
            client_id='addepisodes_notebook'
        )

        # Send the exact JSON string that was used for the signature
        body_str = serialize_body(payload)
        response = requests.post(
            addsegments_url,
            data=body_str,
            headers=auth_headers,
            timeout=request_timeout,
        )
        response.raise_for_status()
        print(f"Successfully added embeddings for {epname}")
    except requests.RequestException as e:
        print(f"Error adding embeddings for {epname}: {e}")
        continue

[2025-12-08 08:10:39,794] INFO     MainThread   - HMAC Debug - Enviando a /addsegments: timestamp=1765177839, signature=44e1267c93846b09...
[2025-12-08 08:10:39,795] INFO     MainThread   - HMAC Debug - Clave API (primeros 10 chars): s8haLCo-vw...
[2025-12-08 08:10:39,795] INFO     MainThread   - HMAC Debug - Clave API (primeros 10 chars): s8haLCo-vw...
[2025-12-08 08:10:43,558] INFO     MainThread   - HMAC Debug - Enviando a /addsegments: timestamp=1765177843, signature=fe6d55b92ec39e03...
[2025-12-08 08:10:43,559] INFO     MainThread   - HMAC Debug - Clave API (primeros 10 chars): s8haLCo-vw...
[2025-12-08 08:10:43,558] INFO     MainThread   - HMAC Debug - Enviando a /addsegments: timestamp=1765177843, signature=fe6d55b92ec39e03...
[2025-12-08 08:10:43,559] INFO     MainThread   - HMAC Debug - Clave API (primeros 10 chars): s8haLCo-vw...


Successfully added embeddings for ll253
Successfully added embeddings for ll252
Successfully added embeddings for ll252


In [None]:
# Check whether epdf.index contains duplicated episode names
total_rows = len(epdf)
unique_names = epdf.index.nunique()
print(f"Total episodios en epdf: {total_rows}")
print(f"Episodios únicos: {unique_names}")
if total_rows != unique_names:
    repeated_mask = epdf.index.duplicated()
    print("¡ADVERTENCIA! Hay episodios duplicados:")
    print(epdf.index[repeated_mask].tolist())

In [None]:
# Show the kernel's current working directory (relative paths resolve against it)
!pwd
