In [21]:
import os
from pywhispercpp.model import Model
import sqlite3

### config

In [25]:
db_path = 'data/podcast.db'

input_files_dir = './input_files/'

### database

##### init database

In [None]:
# create a folder data/ if doesn't exist yet
os.makedirs('data', exist_ok=True)

# create/connect to sqlite database
def get_connection():
    return sqlite3.connect(db_path)

def init_db():
    with get_connection() as db:
        cursor = db.cursor()

        # episodes table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS episodes (
                id INTEGER PRIMARY KEY,
                title TEXT,
                date TEXT,
                url_path TEXT,
                description TEXT
            )
        """)

        # participants table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS participants (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL
            )
        """)

        # transcript segments table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS transcription_segments (
                id INTEGER PRIMARY KEY,
                episode_id INTEGER REFERENCES episodes(id),
                start_time REAL,
                end_time REAL,
                text TEXT,
                participant_id INTEGER REFERENCES participants(id) -- speaker for diarisation
            )
        """)

        # --- junction table for a participant in an episode
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS episodes_participants (
                episode_id INTEGER REFERENCES episodes(id),
                participant_id INTEGER REFERENCES participants(id),
                role TEXT,
                PRIMARY KEY (episode_id, participant_id)
            )
        """)

        db.commit()

init_db()

##### check the structure of the tables

In [24]:
def inspect_table_structure(cursor, table_name):
    """Inspect and print table structure"""
    cursor.execute(f'PRAGMA table_info({table_name})')
    print(f'\n{table_name}')
    for col in cursor.fetchall():
        print(col)

# Usage
tables = ['episodes', 'participants', 'transcription_segments', 'episodes_participants']
for table in tables:
    inspect_table_structure(cursor, table)


episodes
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'title', 'TEXT', 0, None, 0)
(2, 'number', 'INTEGER', 0, None, 0)
(3, 'date', 'TEXT', 0, None, 0)
(4, 'url_path', 'TEXT', 0, None, 0)
(5, 'description', 'TEXT', 0, None, 0)

participants
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'name', 'TEXT', 1, None, 0)

transcription_segments
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'episode_id', 'INTEGER', 0, None, 0)
(2, 'start_time', 'REAL', 0, None, 0)
(3, 'end_time', 'REAL', 0, None, 0)
(4, 'text', 'TEXT', 0, None, 0)
(5, 'participant_id', 'INTEGER', 0, None, 0)

episodes_participants
(0, 'episode_id', 'INTEGER', 0, None, 1)
(1, 'participant_id', 'INTEGER', 0, None, 2)
(2, 'role', 'TEXT', 0, None, 0)


### model

In [4]:
model = Model(model='large-v3', models_dir='./whisper.cpp/models')

whisper_init_from_file_with_params_no_state: loading model from '/Users/quentin/dev/podcast_audio_extractor/whisper.cpp/models/ggml-large-v3.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 3
whisper_init_with_params_no_state: backends   = 3
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51866
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 1280
whisper_model_load: n_audio_head  = 20
whisper_model_load: n_audio_layer = 32
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 1280
whisper_model_load: n_text_head   = 20
whisper_model_load: n_text_layer  = 32
whisper_model_load: n_mels        = 128
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 5 (larg

### code

##### files

In [None]:
file = './s10e43_tiny_benchmark.mp3'

def gathering_mp3_files_in_directory() -> list[str]:
    print('test')

##### transcription functions

In [None]:
def transcribe(file:str):
    transcription = model.transcribe(
        file, 
        language='fr',
        temperature=0.0,
        print_progress=True,
        extract_probability=False
    )
    print(transcription)
    return transcription

In [None]:
def transcribe_into_file(file:str):

    # directory to store the transcriptions into 
    output_directory = 'transcriptions_output'
    os.makedirs(f'./{output_directory}', exist_ok=True)

    # name of the transcription output file
    # (using the name of the file given as params)
    base_name = os.path.splitext(file)[0]
    output_file_name = f"{base_name}.txt"

    # writing transcriptions in file
    with open(f'./{output_directory}/{output_file_name}','w',encoding='utf-8') as output_file:
        for segment in transcribe(file):
            output_file.write(f'{segment.text}')
            # print(segment)

def format_output():
    print('format')

def transcribe_into_db(input_dir, num_episode):
    
    with get_connection() as db:
        cursor = db.cursor()
        for transcription_segment in transcribe(file)
            start = transcription_segment.start
            end = transcription_segment.end
            text = transcription_segment.text.strip()

            cursor.execute("""
                INSERT INTO transcription_segments (episode_id, start_time, end_time, text)
                VALUES (?,?,?,?)
            """, (num_episode, start, end, text))

        db.commit()

def vector_store():
    print('vector')

##### usage

In [35]:
# transcribe_into_file(file)
transcribe(file)

Progress:   0%
Progress:  36%
Progress:  78%


[t0=0, t1=140, text=Message publicitaire., probability=0.688292920589447, t0=140, t1=1022, text=Le médicament Daflon 500 mg, indiqué pour soulager les jambes lourdes et douloureuses dues à l'insuffisance veineuse, agit en renforçant le tonus veineux et en protégeant les vaisseaux sanguins., probability=0.9342695474624634, t0=1022, t1=1874, text=Son efficacité a été cliniquement démontrée dans le traitement des troubles de la circulation veineuse, jambes lourdes, douleurs, impatience, en complément des mesures hygiéno-diététiques., probability=0.9753648638725281, t0=1874, t1=2034, text=Rendez-vous sur Daflon.fr., probability=0.901023805141449, t0=2034, t1=2624, text=Daflon 500 mg, composé de fractions flavonoïques purifiées micronisées et réservées à l'adulte, est disponible en pharmacie sans ordonnance., probability=0.9891102910041809, t0=2624, t1=3006, text=Tout médicament peut exposer à des risques. Parlez-en à votre pharmacien et lisez attentivement la notice., probability=0.9831526

Progress: 100%


[t0=0, t1=140, text=Message publicitaire., probability=0.688292920589447,
 t0=140, t1=1022, text=Le médicament Daflon 500 mg, indiqué pour soulager les jambes lourdes et douloureuses dues à l'insuffisance veineuse, agit en renforçant le tonus veineux et en protégeant les vaisseaux sanguins., probability=0.9342695474624634,
 t0=1022, t1=1874, text=Son efficacité a été cliniquement démontrée dans le traitement des troubles de la circulation veineuse, jambes lourdes, douleurs, impatience, en complément des mesures hygiéno-diététiques., probability=0.9753648638725281,
 t0=1874, t1=2034, text=Rendez-vous sur Daflon.fr., probability=0.901023805141449,
 t0=2034, t1=2624, text=Daflon 500 mg, composé de fractions flavonoïques purifiées micronisées et réservées à l'adulte, est disponible en pharmacie sans ordonnance., probability=0.9891102910041809,
 t0=2624, t1=3006, text=Tout médicament peut exposer à des risques. Parlez-en à votre pharmacien et lisez attentivement la notice., probability=0.98

##### unload model from vram

In [None]:
# unload model from vram (mac only)
del model

NameError: name 'model' is not defined

### embeddings