In [1]:
import os
from pywhispercpp.model import Model
import sqlite3
from pathlib import Path
from yt_dlp import YoutubeDL
import re

### config

In [2]:
# path of the SQLite database file opened by get_connection()
db_path = 'data/podcast.db'

# directory for input audio files
# (only referenced by commented-out code in the cells visible here)
input_files_dir = Path('input_files/')
# print(input_files_dir.resolve())
# print(list(input_files_dir.glob('*.mp3')))
# path of a single local mp3 file, relative to the notebook's working directory
file = './s10e43_tiny_benchmark.mp3'

### database

##### init database

In [3]:
# make sure the folder that will contain the database file exists.
# The folder is derived from db_path (instead of a hard-coded 'data') so that
# changing db_path in the config cell cannot leave the two out of sync.
os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True)

# create/connect to sqlite database
def get_connection():
    """Open and return a new sqlite3 connection to the database at `db_path`.

    A new connection is created on every call. Using the returned connection
    in a `with` statement only commits / rolls back the transaction; it does
    not close the connection, so callers are responsible for calling
    `.close()` when they are done.
    """
    return sqlite3.connect(db_path)

In [None]:
# create db structure
def init_db():
    """Create the podcast tables if they do not exist yet.

    Tables created (all with CREATE TABLE IF NOT EXISTS, so calling this
    repeatedly is harmless):
    - episodes: one row per podcast episode
    - participants: one row per person
    - transcription_segments: timed text segments linked to an episode
      (and optionally to a participant)
    - episodes_participants: junction table linking participants to episodes

    The connection is always closed when the function returns; the sqlite3
    connection context manager alone would only commit, not close.
    """
    db = get_connection()
    try:
        cursor = db.cursor()

        # episodes table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS episodes (
                id INTEGER PRIMARY KEY,
                title TEXT,
                date TEXT,
                url_path TEXT,
                description TEXT,
                season_number INTEGER,
                episode_number INTEGER,
                index_number INTEGER
            )
        """)

        # participants table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS participants (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL
            )
        """)

        # transcript segments table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS transcription_segments (
                id INTEGER PRIMARY KEY,
                episode_id INTEGER REFERENCES episodes(id),
                start_time REAL,
                end_time REAL,
                text TEXT,
                participant_id INTEGER REFERENCES participants(id) -- speaker for diarisation
            )
        """)

        # --- junction table for a participant in an episode
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS episodes_participants (
                episode_id INTEGER REFERENCES episodes(id),
                participant_id INTEGER REFERENCES participants(id),
                role TEXT,
                PRIMARY KEY (episode_id, participant_id)
            )
        """)

        db.commit()
    finally:
        # release the database handle even if one of the statements failed
        db.close()

# init the db
init_db()

##### check the structure of the tables

In [4]:
def inspect_table_structure(table_name):
    """Print the columns of `table_name` as reported by `PRAGMA table_info`.

    Prints a blank line, the table name, then one tuple per column exactly as
    returned by SQLite. Returns nothing.

    `table_name` is interpolated directly into the SQL text, so only pass
    trusted, known table names.
    """
    db = get_connection()
    try:
        cursor = db.cursor()
        cursor.execute(f'PRAGMA table_info({table_name})')
        print(f'\n{table_name}')
        for col in cursor.fetchall():
            print(col)
    finally:
        # read-only query: nothing to commit, but the connection must be closed
        db.close()

# Usage
tables = ['episodes', 'participants', 'transcription_segments', 'episodes_participants']
for table in tables:
    inspect_table_structure(table)


episodes
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'title', 'TEXT', 0, None, 0)
(2, 'date', 'TEXT', 0, None, 0)
(3, 'url_path', 'TEXT', 0, None, 0)
(4, 'description', 'TEXT', 0, None, 0)
(5, 'season_number', 'INTEGER', 0, None, 0)
(6, 'episode_number', 'INTEGER', 0, None, 0)
(7, 'index_number', 'INTEGER', 0, None, 0)

participants
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'name', 'TEXT', 1, None, 0)

transcription_segments
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'episode_id', 'INTEGER', 0, None, 0)
(2, 'start_time', 'REAL', 0, None, 0)
(3, 'end_time', 'REAL', 0, None, 0)
(4, 'text', 'TEXT', 0, None, 0)
(5, 'participant_id', 'INTEGER', 0, None, 0)

episodes_participants
(0, 'episode_id', 'INTEGER', 0, None, 1)
(1, 'participant_id', 'INTEGER', 0, None, 2)
(2, 'role', 'TEXT', 0, None, 0)


### model

In [5]:
# model = Model(model='large-v3', models_dir='./whisper.cpp/models')
# model = Model(model='base.en', models_dir='./whisper.cpp/models')

def get_model(model_name:str):
    """Build a pywhispercpp `Model` for `model_name`.

    Model files are looked up in './whisper.cpp/models'. A new `Model`
    instance is created on every call.
    """
    whisper_model = Model(
        model=model_name,
        models_dir='./whisper.cpp/models',
    )
    return whisper_model

### process episode - insert episode into sqlite db

##### gathering episode data

In [6]:
# parse the season number and the episode number out of an episode title.
# the title is expected to contain a marker formatted like 'S01E01'
# (uppercase S and E, each followed by digits), anywhere in the string.
def extract_season_episode(episode_title:str):
    """Return `(season_number, episode_number)` parsed from `episode_title`.

    Both values are ints when the 'S<digits>E<digits>' marker is found.
    `(None, None)` is returned when the title is None or has no such marker.
    """
    if episode_title is None:
        return None, None

    found = re.search(r'S(\d+)E(\d+)', episode_title)
    if found is None:
        return None, None

    season_digits, episode_digits = found.groups()
    return int(season_digits), int(episode_digits)

# function to convert the index number to reverse
# def convert_index_number():
#     print('convert index')

# getting episode data
def fetch_episodes_data(feed_url, episode_items):
    """Fetch metadata for the selected items of a feed, without downloading media.

    Args:
        feed_url: URL handed to yt-dlp's `extract_info`.
        episode_items: value passed as yt-dlp's 'playlist_items' option
            (e.g. '4' or '240-242').

    Returns:
        A list with one dict per entry, holding the keys "title",
        "description", "url", "upload_date", "playlist_index",
        "season_number" and "episode_number". The last two are parsed from
        the title by `extract_season_episode` and may be None.
    """
    ydl_options = {
        'extract_flat': False,
        'playlist_items': episode_items,
        'quiet': True,
        'skip_download': True
    }
    with YoutubeDL(ydl_options) as downloader:
        feed_info = downloader.extract_info(feed_url)
        # when the result has no 'entries', treat the result itself as the only entry
        entries = feed_info.get('entries', [feed_info])

        collected = []
        for entry in entries:
            season, episode_num = extract_season_episode(entry.get('title'))
            collected.append({
                "title": entry.get("title"),
                "description": entry.get("description"),
                "url": entry.get("webpage_url") or entry.get("original_url"),
                "upload_date": entry.get("upload_date"),
                "playlist_index": entry.get("playlist_index"),
                "season_number": season,
                "episode_number": episode_num,
            })
        return collected

In [7]:
# download episode
def download_episode(episode_url, download_path, episode_filename):
    """Download one episode with yt-dlp and return the path of the saved file.

    Args:
        episode_url: URL of the episode media.
        download_path: directory to save into (created if missing).
        episode_filename: file name without extension; yt-dlp appends the
            extension of the downloaded media.

    Returns:
        The file path yt-dlp used for the download. The extension comes from
        the media itself, so it is not assumed to be '.mp3'.
    """
    os.makedirs(download_path, exist_ok=True)

    ydl_config = {
        'outtmpl': f"{download_path}/{episode_filename}.%(ext)s",
        # 'format':'',
        'quiet': True
    }
    with YoutubeDL(ydl_config) as ydl:
        # extract_info(download=True) downloads like ydl.download() but also
        # returns the info dict needed to resolve the real output file name
        info = ydl.extract_info(episode_url, download=True)
        if not info:
            # no metadata available: fall back to the historical '.mp3' guess
            return f"{download_path}/{episode_filename}.mp3"
        # resolve the output template with the actual extension
        return ydl.prepare_filename(info)

In [24]:
# usage test
# fetch the metadata of playlist items 240 to 242 of the feed and display the
# returned list as the cell output (performs a network request)
fetch_episodes_data('https://feeds.acast.com/public/shows/floodcast', '240-242')
# fetch_episodes_data('https://feeds.acast.com/public/shows/floodcast', '240-250')
# test_url_episode = 'https://sphinx.acast.com/p/open/s/5ffe3facad3e633276e9ea57/e/tag%3Asoundcloud%2C2010%3Atracks%2F285183974/media.mp3#__youtubedl_smuggle=%7B%22force_videoid%22%3A+%22tag%3Asoundcloud%2C2010%3Atracks%2F285183974%22%7D'
# download_episode(test_url_episode, './output', 'panchour')
# extract_season_episode('S22E46 - Postiche de Fouffe')



[{'title': 'S02E02 - Postiche de Fouffe',
  'description': '<p>Avec Maud Givert, Sophie Riche et Sophie-Marie Larrouy.</p><br><p>Présenté par Florent Bernard et Adrien Ménielle.</p><br><p>Dans ce podcast, après le traditionnel tour de table de ce qu\'on a kiffé récemment et une longue parenthèse sur le film "Pattaya", nous parlons de nos ratés estivaux, nos vacances gâchés, bref que c\'était bien de la merde nos étés.</p><hr><p style=\'color:grey; font-size:0.75em;\'> Hébergé par Acast. Visitez <a style=\'color:grey;\' target=\'_blank\' rel=\'noopener noreferrer\' href=\'https://acast.com/privacy\'>acast.com/privacy</a> pour plus d\'informations.</p>',
  'url': 'https://sphinx.acast.com/p/open/s/5ffe3facad3e633276e9ea57/e/tag%3Asoundcloud%2C2010%3Atracks%2F285183974/media.mp3#__youtubedl_smuggle=%7B%22force_videoid%22%3A+%22tag%3Asoundcloud%2C2010%3Atracks%2F285183974%22%7D',
  'upload_date': '20160928',
  'playlist_index': 240,
  'season_number': 2,
  'episode_number': 2},
 {'title': 

##### create episode into db

In [8]:
# insert episode into db
def create_episode_in_db(episode_data):
    """Insert one episode row and return its id.

    Args:
        episode_data: dict read with `.get()` for the keys 'title',
            'upload_date', 'url', 'description', 'season_number',
            'episode_number' and 'playlist_index'. Missing keys are stored
            as NULL.

    Returns:
        The id of the inserted row, or None when the insert failed. The
        error is printed rather than raised (best-effort behaviour).
    """
    try:
        db = get_connection()
        try:
            cursor = db.cursor()
            cursor.execute("""
                INSERT INTO episodes (title, date, url_path, description, season_number, episode_number, index_number)
                VALUES (?,?,?,?,?,?,?)
            """, (
                episode_data.get('title'),
                episode_data.get('upload_date'),
                episode_data.get('url'),
                episode_data.get('description'),
                episode_data.get('season_number'),
                episode_data.get('episode_number'),
                episode_data.get('playlist_index')
            ))

            db.commit()
            episode_id = cursor.lastrowid
        finally:
            # the sqlite3 context manager never closes the connection, so close explicitly
            db.close()

        # f-string instead of passing `{episode_id}`, which printed a set such as "{22}"
        print(f'episode created in db with id : {episode_id}')
        return episode_id
    except Exception as e:
        print(f'Error creating episode in DB: {e}')
        return None

### process episode - create episode transcriptions & store segments into sqlite db

##### transcription function

for testing whispercpp parameters

In [9]:
def transcribe(file_path:str, model_name):
    """Transcribe an audio file and return the segments produced by the model.

    A model is built with `get_model(model_name)` on every call, then its
    `transcribe` method is run on `file_path` with language 'fr',
    temperature 0.0, progress printing on and probability extraction off.
    The full result is also printed once transcription has finished.
    """
    whisper_model = get_model(model_name=model_name)

    # fixed transcription settings used for every file
    whisper_options = {
        'language': 'fr',
        'temperature': 0.0,
        'print_progress': True,
        'extract_probability': False,
    }

    print('starting transcription...')
    segments = whisper_model.transcribe(file_path, **whisper_options)
    print('end transcription',segments)
    return segments

# help(Model.transcribe)
# ?Model.transcribe

##### clean transcriptions

##### store transcription segments into db

In [None]:
# print la transcription dans un fichier
# def transcribe_into_file(file:str):

#     # directory to store the transcriptions into 
#     output_directory = 'transcriptions_output'
#     os.makedirs(f'./{output_directory}', exist_ok=True)

#     # name of the transcription output file
#     # (using the name of the file given as params)
#     base_name = os.path.splitext(file)[0]
#     output_file_name = f"{base_name}.txt"

#     # writing transcriptions in file
#     with open(f'./{output_directory}/{output_file_name}','w',encoding='utf-8') as output_file:
#         for segment in transcribe(file):
#             output_file.write(f'{segment.text}')
#             # print(segment)

In [10]:
# def transcribe_into_db(input_files_dir):
    
#     # for each files in the input audio files folder, it get the transcription, 
#     # and then stores it in the db
#     for audio_file in input_files_dir.glob('*.mp3'):

#         print(f"file being processed : {audio_file.name}")
        
#         # create episode and get its id
#         episode_id = create_episode_in_db(audio_file)

#         # store transcription segment of the episode into db
#         with get_connection() as db:
#             cursor = db.cursor()

#             for transcription_segment in transcribe(audio_file.name):
#                 start = transcription_segment.t0
#                 end = transcription_segment.t1
#                 text = transcription_segment.text.strip()

#                 cursor.execute("""
#                     INSERT INTO transcription_segments (episode_id, start_time, end_time, text)
#                     VALUES (?,?,?,?)
#                 """, (episode_num, start, end, text))

#             db.commit()

# store transcription segment of the episode in db
def store_transcripts_in_db(transcript_segments, episode_id):
    """Insert every transcription segment of an episode into the database.

    Args:
        transcript_segments: iterable of objects exposing `t0`, `t1` and
            `text` attributes (as returned by `transcribe`).
        episode_id: value stored in the `episode_id` column of each row.

    `t0` / `t1` are stored unchanged in `start_time` / `end_time`.
    NOTE(review): their unit is whatever pywhispercpp reports; confirm before
    treating the stored values as seconds.

    All rows are committed together, and the connection is always closed.
    """
    print('store_transcript_in_db - start')
    db = get_connection()
    try:
        cursor = db.cursor()
        # one batched call instead of one execute() per segment
        cursor.executemany("""
            INSERT INTO transcription_segments (episode_id, start_time, end_time, text)
            VALUES (?,?,?,?)
        """, (
            (episode_id, segment.t0, segment.t1, segment.text.strip())
            for segment in transcript_segments
        ))

        db.commit()
    finally:
        # the sqlite3 context manager never closes the connection, so close explicitly
        db.close()
    print('store_transcript_in_db - end')

### process episode - create embeddings & store into vector db

##### create embeddings

##### store embedding into vector db

### main function

In [11]:
# - get the data of the episode(s)
# - create the episode in the db
# - download the audio file (optional)
# - transcribe it and store the segments in the db (optional)
def process_episodes(
        feed_url,
        episodes_items,
        download=False,
        download_path='./files_to_transcribe',
        transcription=False,
        model_name='base.en'
    ):
    """Run the pipeline for the selected episodes of a feed.

    Args:
        feed_url: URL of the podcast feed.
        episodes_items: items to fetch, e.g. '4' or '1-200'.
        download: when True, download each episode's audio file.
        download_path: directory the audio files are saved into.
        transcription: when True (and `download` is True), transcribe each
            downloaded file and store its segments in the db.
        model_name: model name handed to `transcribe`.
            NOTE(review): the default 'base.en' looks like an English-only
            model while `transcribe` forces language='fr'. Confirm, and
            consider passing a multilingual model explicitly.

    An episode whose db row could not be created is skipped, so no audio is
    downloaded and no segments are stored without a valid episode id.
    """
    # transcription needs the downloaded file; this does not depend on the
    # episode, so warn once up front instead of once per episode
    if transcription and not download:
        print('download must be true in order to be able to make transcriptions')

    episodes = fetch_episodes_data(feed_url, episodes_items)

    for episode in episodes:

        # format file name with season and episode number; fall back to the
        # playlist index when the title carries no SxxEyy marker, otherwise
        # every such episode would share the file name 'sNoneeNone'
        season_number = episode['season_number']
        episode_number = episode['episode_number']
        if season_number is None or episode_number is None:
            episode_filename = f"episode_{episode.get('playlist_index')}"
        else:
            episode_filename = f"s{season_number}e{episode_number}"
        print('processing episode :', episode_filename)

        # create episode in db
        episode_id = create_episode_in_db(episode)
        print('episode id in db :',episode_id)
        if episode_id is None:
            # the insert failed (already reported by create_episode_in_db)
            print('skipping episode, it could not be created in db :', episode_filename)
            continue

        if not download:
            continue

        # download the episode file
        episode_file_path = download_episode(episode['url'], download_path, episode_filename)

        # transcribe the episode file and store its transcription segments in the db
        if transcription and episode_file_path:

            print('start transcription of episode with filename:',episode_file_path)

            # getting the transcription segments generated by whisper
            transcript_segments = transcribe(episode_file_path, model_name)
            # store the gathered segments in db
            store_transcripts_in_db(transcript_segments, episode_id)

### usage

parameters to provide :
- feed_url : the url for the podcast feed
- episodes_items : the episode(s) to get (example : '1' for episode 1, '1-200' for the range from episode 1 to 200)
- download : turn download on/off
- download_path : directory where the audio files are saved (defaults to './files_to_transcribe'; only used when download is on)
- transcription : turn transcription on/off
- model_name : the model to use for the transcription

In [None]:
# run the whole pipeline on playlist item '4' of the feed:
# fetch metadata, create the db row, download the audio, transcribe and store segments.
# NOTE(review): with model_name left commented out, the default 'base.en' is used
# while transcribe() forces language='fr'; confirm this is intended.
process_episodes(
    feed_url='https://feeds.acast.com/public/shows/floodcast',
    episodes_items='4',
    transcription=True,
    download=True,
    # model_name='large-v3'
)



processing episode : s10e40
episode created in db with id : {22}
episode id in db : 22
start transcription of episode with filename: ./files_to_transcribe/s10e40.mp3
starting transcription...


Progress:   0%
Progress:   0%
Progress:   1%
Progress:   1%
Progress:   2%
Progress:   3%
Progress:   3%
Progress:   4%
Progress:   4%
Progress:   5%
Progress:   6%
Progress:   6%
Progress:   7%
Progress:   8%
Progress:   8%
Progress:   9%
Progress:   9%
Progress:  10%
Progress:  11%
Progress:  11%
Progress:  12%
Progress:  13%
Progress:  13%
Progress:  14%
Progress:  14%
Progress:  15%
Progress:  16%
Progress:  16%
Progress:  17%
Progress:  18%
Progress:  18%
Progress:  19%
Progress:  19%
Progress:  20%
Progress:  21%
Progress:  21%
Progress:  22%
Progress:  23%
Progress:  23%
Progress:  24%
Progress:  24%
Progress:  25%
Progress:  26%
Progress:  26%
Progress:  27%
Progress:  28%
Progress:  28%
Progress:  29%
Progress:  30%
Progress:  30%
Progress:  31%
Progress:  31%
Progress:  32%
Progress:  33%
Progress:  33%
Progress:  34%
Progress:  35%
Progress:  35%
Progress:  36%
Progress:  36%
Progress:  37%
Progress:  38%
Progress:  38%
Progress:  39%
Progress:  40%
Progress:  40%
Progress: 

end transcription [t0=0, t1=332, text=The corruption of James Patterson and Michael Christian, probability=nan, t0=332, t1=472, text=will be the first to get back., probability=nan, t0=472, t1=768, text=The only way to get to the corruption is to get to the corruption., probability=nan, t0=768, t1=1076, text=The corruption of John McGregor will be the first to get to the corruption, probability=nan, t0=1076, t1=1300, text=and the first to get to the corruption., probability=nan, t0=1300, t1=1452, text=The corruption of James, probability=nan, t0=1452, t1=1700, text=will be the first to get to the corruption., probability=nan, t0=1700, t1=2080, text=The corruption of James and the other two of James' citizens will be the first to get to the corruption., probability=nan, t0=2080, t1=2300, text=The corruption of James' citizens will be the first to get to the corruption., probability=nan, t0=2300, t1=2600, text=The corruption of James' citizens will be the first to get to the corruption.,

Progress: 100%


In [12]:
# one-off run: transcribe a local file with the 'large-v3' model
transcript_segments = transcribe('s10e43_tiny_benchmark.mp3','large-v3')
# store the gathered segments in db
# NOTE(review): 31 is a hard-coded episode id, presumably an existing row in
# `episodes`; confirm before re-running
store_transcripts_in_db(transcript_segments, 31)

whisper_init_from_file_with_params_no_state: loading model from '/Users/quentin/dev/podcast_audio_extractor/whisper.cpp/models/ggml-large-v3.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 3
whisper_init_with_params_no_state: backends   = 3
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51866
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 1280
whisper_model_load: n_audio_head  = 20
whisper_model_load: n_audio_layer = 32
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 1280
whisper_model_load: n_text_head   = 20
whisper_model_load: n_text_layer  = 32
whisper_model_load: n_mels        = 128
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 5 (larg

starting transcription...


Progress:   0%
Progress:  36%
Progress:  78%


end transcription [t0=0, t1=140, text=Message publicitaire., probability=nan, t0=140, t1=1022, text=Le médicament Daflon 500 mg, indiqué pour soulager les jambes lourdes et douloureuses dues à l'insuffisance veineuse, agit en renforçant le tonus veineux et en protégeant les vaisseaux sanguins., probability=nan, t0=1022, t1=1874, text=Son efficacité a été cliniquement démontrée dans le traitement des troubles de la circulation veineuse, jambes lourdes, douleurs, impatience, en complément des mesures hygiéno-diététiques., probability=nan, t0=1874, t1=2034, text=Rendez-vous sur Daflon.fr., probability=nan, t0=2034, t1=2624, text=Daflon 500 mg, composé de fractions flavonoïques purifiées micronisées et réservées à l'adulte, est disponible en pharmacie sans ordonnance., probability=nan, t0=2624, t1=3006, text=Tout médicament peut exposer à des risques. Parlez-en à votre pharmacien et lisez attentivement la notice., probability=nan, t0=3006, t1=3230, text=Si les symptômes persistent, consult

Progress: 100%
ggml_metal_free: deallocating
