In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import math
import numpy as np
import os
import glob
import re


In [2]:
# Episode calendar: read the CSV, parse the `date` column as timestamps and
# use the `episode` column as the index so dates can be looked up by number.
CALFILE = "/home/jmrobles/Podcasts/Coffee Break/DB/cbcal.csv"
caldf = pd.read_csv(
    CALFILE,
    index_col=['episode'],
    parse_dates=["date"],
)
caldf

Unnamed: 0_level_0,date
episode,Unnamed: 1_level_1
1,2015-04-17
2,2015-04-23
3,2015-04-30
4,2015-05-07
5,2015-05-14
...,...
511,2025-05-08
512,2025-05-15
513,2025-05-22
514,2025-05-29


In [3]:
# Collect the episode names: every file in MP3DIR matching "ep*mp3", reduced
# to its base name with the last four characters (the ".mp3" suffix) removed.
MP3DIR = "/home/jmrobles/Podcasts/Coffee Break/Archivo"
mp3files = [
    os.path.basename(mp3path)[:-4]
    for mp3path in glob.glob(os.path.join(MP3DIR, "ep*mp3"))
]
mp3files[:10]


['ep333',
 'ep177',
 'ep448_B',
 'ep468_B',
 'ep277',
 'ep426_B',
 'ep428_B',
 'ep242',
 'ep024',
 'ep118']

In [4]:
# One row per episode file, with the episode name in the "epname" column.
epdf = pd.DataFrame({"epname": mp3files})
epdf

Unnamed: 0,epname
0,ep333
1,ep177
2,ep448_B
3,ep468_B
4,ep277
...,...
633,ep479_B
634,ep351
635,ep380
636,ep093


In [5]:
# Captures the run of digits that follows "ep" in an episode name. Matching
# digits (instead of "any three characters") keeps numbers of any length
# intact and never captures separators such as "_".
epnumber_regex = re.compile(r"ep(\d+)")


def get_epnumber(epname):
    """Return the episode number embedded in an episode name.

    The number is the first run of digits that directly follows "ep"
    anywhere in ``epname``; leading zeros are dropped by the int conversion.

    Args:
        epname: Episode name or file name, e.g. ``"ep093"`` or ``"ep448_B"``.

    Returns:
        The episode number as an ``int`` (``93`` and ``448`` for the
        examples above).

    Raises:
        ValueError: If ``epname`` contains no "ep" followed by digits.
    """
    found = epnumber_regex.search(epname)
    if found is None:
        # Fail with a message that names the offending input instead of an
        # AttributeError on None.
        raise ValueError(f"no episode number found in {epname!r}")
    return int(found.group(1))


get_epnumber("ep444_adfad fadf.mp3")

444

In [6]:
# Derive the numeric episode id from each episode name. Series.map applies
# get_epnumber element-wise to the "epname" column, so no per-row Series has
# to be built as with DataFrame.apply(axis=1).
epdf["epnumber"] = epdf["epname"].map(get_epnumber)
epdf

Unnamed: 0,epname,epnumber
0,ep333,333
1,ep177,177
2,ep448_B,448
3,ep468_B,468
4,ep277,277
...,...,...
633,ep479_B,479
634,ep351,351
635,ep380,380
636,ep093,93


In [7]:
# Index the frame by episode name. The guard keeps this cell re-runnable:
# once "epname" is the index it is no longer a column, and calling
# set_index("epname") a second time would raise KeyError.
if epdf.index.name != "epname":
    epdf = epdf.set_index("epname")
epdf

Unnamed: 0_level_0,epnumber
epname,Unnamed: 1_level_1
ep333,333
ep177,177
ep448_B,448
ep468_B,468
ep277,277
...,...
ep479_B,479
ep351,351
ep380,380
ep093,93


In [8]:
# Spot check: calendar date of episode 5, fetched with one row/column lookup.
caldf.loc[5, 'date']

Timestamp('2015-05-14 00:00:00')

In [9]:
from sttcastdb import SttcastDB

ModuleNotFoundError: No module named 'sttcastdb'

In [10]:
# Path of the database file that is handed to SttcastDB below.
DBFILE = "/home/jmrobles/Podcasts/Coffee Break/DB/coffeebreak.db"

# NOTE(review): create_if_not_exists=True presumably makes SttcastDB create
# the database file when it is missing — confirm against the sttcastdb module.
cbdb = SttcastDB(DBFILE, create_if_not_exists=True)

In [11]:
# Preview: print the calendar date resolved for the first ten episodes.
first_epnumbers = epdf["epnumber"].head(10)
for epname, epnumber in first_epnumbers.items():
    print (f"epname: {epname}, epnumber: {epnumber}, date: {caldf.loc[epnumber].date}")

epname: ep333, epnumber: 333, date: 2021-09-16 00:00:00
epname: ep177, epnumber: 177, date: 2018-08-23 00:00:00
epname: ep448_B, epnumber: 448, date: 2024-01-25 00:00:00
epname: ep468_B, epnumber: 468, date: 2024-06-13 00:00:00
epname: ep277, epnumber: 277, date: 2020-07-23 00:00:00
epname: ep426_B, epnumber: 426, date: 2023-08-24 00:00:00
epname: ep428_B, epnumber: 428, date: 2023-09-07 00:00:00
epname: ep242, epnumber: 242, date: 2019-11-21 00:00:00
epname: ep024, epnumber: 24, date: 2015-09-24 00:00:00
epname: ep118, epnumber: 118, date: 2017-07-06 00:00:00


In [12]:
# Attach the calendar date of every episode with a single label-based lookup
# instead of one caldf.loc call per row. Selecting by a list of labels still
# raises KeyError when an episode number is absent from the calendar, and
# repeated episode numbers (e.g. "ep448" and "ep448_B") simply repeat the
# date. to_numpy() drops the calendar's episode index so the values are
# assigned to epdf by position.
epdf["epdate"] = caldf.loc[epdf["epnumber"].to_numpy(), "date"].to_numpy()
epdf

Unnamed: 0_level_0,epnumber,epdate
epname,Unnamed: 1_level_1,Unnamed: 2_level_1
ep333,333,2021-09-16
ep177,177,2018-08-23
ep448_B,448,2024-01-25
ep468_B,468,2024-06-13
ep277,277,2020-07-23
...,...,...
ep479_B,479,2024-09-26
ep351,351,2022-01-20
ep380,380,2022-09-08
ep093,93,2017-01-12


In [13]:
import pysrt

In [14]:
# Parses a subtitle text of the form "[speaker]: spoken words".
# The speaker group is non-greedy so it stops at the FIRST "]:"; a greedy
# ".*" would run to the last "]:" on the line and swallow part of the spoken
# text into the speaker tag. Any spaces after the colon are skipped.
textpat = re.compile(r'^\[(?P<speaker>.*?)\]: *(?P<spoken>.*)')


In [6]:
# For every episode: open its "<epname>_whisper_es.srt" subtitle file, turn
# each subtitle into a {tag, content, start, end} record and hand the records
# to the database together with the episode name, date and mp3 file name.
for epname, row in epdf.iterrows():
    epnumber, epdate = row['epnumber'], row['epdate']
    epsrt = os.path.join(MP3DIR, epname +'_whisper_es.srt')
    print (f"epname: {epname}, epnumber: {epnumber}, date: {epdate}, srt: {epsrt}")
    epints = []
    for eps in pysrt.open(epsrt):
        parsed = textpat.match(eps.text)
        if parsed is not None:
            epints.append(
                {
                    'tag': parsed.group('speaker'),
                    'content': parsed.group('spoken'),
                    # ordinal is divided by 1000; presumably milliseconds ->
                    # seconds — TODO confirm against pysrt.
                    'start': eps.start.ordinal/1_000,
                    'end': eps.end.ordinal/1_000,
                }
            )
        else:
            # Subtitles that do not look like "[speaker]: text" are reported
            # and left out of the episode.
            print (f"Fallo en parsear {eps.text}")
    cbdb.add_episode(epname, epdate, epname+'.mp3', epints)

NameError: name 'epdf' is not defined

In [16]:
# Report every episode that has no "<ep>_whisper_en.srt" file in MP3DIR.
# NOTE(review): the ingest loop reads "_whisper_es.srt" files while this check
# looks for "_whisper_en.srt" — confirm the "en" suffix is intentional.
for ep in mp3files:
    srtfile = f"{ep}_whisper_en.srt"
    srt_found = os.path.exists(os.path.join(MP3DIR, srtfile))
    if not srt_found:
        print(f"No existe {srtfile}")