#### Import Libraries

In [None]:
import os
import deepspeech
# Show the version reported by the deepspeech package as this cell's output
deepspeech.version()

#### Configuration Settings

In [72]:
# Project root: every relative path below is resolved against it.
# Set the SPEAKNOW_PROJECT_DIR environment variable to run the notebook on
# another machine; without it the original hard-coded location is used.
os.chdir(os.environ.get("SPEAKNOW_PROJECT_DIR", "C:/Users/danie/Speaknow/projects"))

# Input and output locations, all relative to the project root
audio_folder_dir = './Data/audio'                                  # source recordings (wav and mp3)
audio_training_dir = "./Data/training/audio"                       # wav files used for train/dev/test
training_csvs = "./Data/training/training_csvs"                    # where dev/test/train CSVs are written
transcript_output_dir = "./Data/training/transcripts"
deepspeech_transcripts = "./Data/training/deepspeech_transcripts"  # where model transcripts are written
output_models_dir = "./Deepspeech/output_models"
fine_tuning_checkpoints = "./Deepspeech/fine_tuning_checkpoints"

# os.listdir returns entries in arbitrary order; sort so that anything that
# slices these lists by position (e.g. a dev/test/train split) is reproducible.
devtest_files = sorted(os.listdir(audio_training_dir))
# Keep only the wav files from the training directory
wav_devtest_files = [name for name in devtest_files if name.endswith('.wav')]

# Same for the general audio directory, split by extension
audio_files = sorted(os.listdir(audio_folder_dir))
wav_files = [name for name in audio_files if name.endswith('.wav')]
mp3_files = [name for name in audio_files if name.endswith('.mp3')]

#### Build the training CSVs and generate transcripts using the base DeepSpeech model

In [56]:
from pathlib import Path
import pandas as pd

# Number of recordings written to dev.csv and test.csv; the remainder goes to
# train.csv. NOTE(review): with DEV_SIZE + TEST_SIZE or fewer recordings,
# train.csv ends up with a header and no rows.
DEV_SIZE = 5
TEST_SIZE = 5

# Collect one record per training wav file, then build the table in a single
# step (growing a DataFrame with pd.concat inside the loop is quadratic).
records = []
for wav_file in wav_devtest_files:
    wav_path = Path(audio_training_dir) / wav_file

    # The reference transcript sits next to the audio file, same name, .txt extension
    with open(wav_path.with_suffix(".txt"), 'r') as transcript_file:
        transcript = transcript_file.read()

    records.append({
        "wav_filename": wav_file,
        "wav_filesize": wav_path.stat().st_size,
        "transcript": transcript,
    })

# Fixed column list keeps the CSV header stable even when no wav file was found
file_data_df = pd.DataFrame(records, columns=["wav_filename", "wav_filesize", "transcript"])

# Positional split: first DEV_SIZE rows -> dev, next TEST_SIZE rows -> test, rest -> train
os.makedirs(training_csvs, exist_ok=True)
test_end = DEV_SIZE + TEST_SIZE
file_data_df.iloc[:DEV_SIZE].to_csv(os.path.join(training_csvs, "dev.csv"), index=False)
file_data_df.iloc[DEV_SIZE:test_end].to_csv(os.path.join(training_csvs, "test.csv"), index=False)
file_data_df.iloc[test_end:].to_csv(os.path.join(training_csvs, "train.csv"), index=False)


In [50]:
# Preview up to the first 20 rows of the assembled file table
file_data_df.head(20)

Unnamed: 0,wav_filename,wav_filesize,transcript
0,1655476060404937-1.wav,3603500,"Um no, I don't want to be on a reality show. A..."
1,1655476060404937-2.wav,5789996,um my favorite social media platform that I pr...
2,1655476060404937-3.wav,3255596,"Ah, I prefer to work every day in a few hours ..."
3,1655476060404937-4.wav,2907692,I prefer to invert to invert or a drug because...
4,1655476060404937-5.wav,2949164,"wow, that's a very hard question because I lik..."
5,1687976114519971-1.wav,2529836,"A life in Panama. Um, um, my house is big and ..."
6,1687976114519971-2.wav,2354732,"I'm going to talk about, of my mom and my mom ..."
7,1687976114519971-3.wav,3161132,And my favorite vacation is when I go to Cancu...
8,1687976114519971-4.wav,4370732,"Ok. My, my friend is, um, his wife is a small ..."
9,1687976114519971-5.wav,2587436,"Ok. In my next summer I will play, er, I love ..."


In [78]:
import librosa
import numpy as np

# Load MP3 file
def load_mp3_file(file_path, sr=16000):
    """Load an audio file with librosa at the requested sample rate.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate passed to ``librosa.load``. Defaults to 16000, the
            value this function previously hard-coded.

    Returns:
        The ``(audio, sample_rate)`` pair returned by ``librosa.load``.
    """
    audio, sample_rate = librosa.load(file_path, sr=sr)
    return audio, sample_rate

# Build a DeepSpeech model with its external scorer attached
def initialize_deepspeech(model_path, scorer_path):
    """Create a ``deepspeech.Model`` and enable an external scorer on it.

    Args:
        model_path: Path handed to ``deepspeech.Model``.
        scorer_path: Path handed to ``enableExternalScorer``.

    Returns:
        The model instance, after the scorer has been enabled.
    """
    ds_model = deepspeech.Model(model_path)
    ds_model.enableExternalScorer(scorer_path)
    return ds_model

# Convert audio to text using DeepSpeech
def audio_to_text(audio, sr, model):
    """Convert a floating-point waveform to int16 and transcribe it.

    Args:
        audio: NumPy array of float samples. The scaling below assumes the
            samples are normalised to roughly [-1.0, 1.0] — TODO confirm
            against the loader in use.
        sr: Sample rate of ``audio``. Not used by this function; kept so
            existing callers keep working.
        model: Object exposing ``stt(int16_array)``.

    Returns:
        Whatever ``model.stt`` returns for the converted samples.
    """
    # Scale to the 16-bit range and clip before casting: a sample of +1.0
    # scales to 32768, which does not fit in int16 and would otherwise wrap.
    pcm = np.clip(audio * 32768, -32768, 32767).astype(np.int16)  # DeepSpeech expects int16 audio
    return model.stt(pcm)

# Main function
def run_deepspeech_model(
    model_path="./Model/deepspeech-0.9.3-models.pbmm",
    scorer_path="./Model/deepspeech-0.9.3-models.scorer",
    max_files=5,
):
    """Transcribe MP3 files with DeepSpeech and write one text file per input.

    Reads file names from the module-level ``mp3_files`` list, loads each file
    from ``audio_folder_dir`` and writes ``<name>.txt`` into
    ``deepspeech_transcripts``.

    Args:
        model_path: Path of the DeepSpeech model file.
        scorer_path: Path of the external scorer file.
        max_files: How many entries of ``mp3_files`` to process, taken from
            the start of the list. ``None`` processes all of them. Defaults
            to 5, the limit this function previously hard-coded.
    """
    model = initialize_deepspeech(model_path, scorer_path)

    # Create the output folder up front so the first write cannot fail on a missing directory
    os.makedirs(deepspeech_transcripts, exist_ok=True)

    for audio_file in mp3_files[:max_files]:
        mp3_file_path = os.path.join(audio_folder_dir, audio_file)

        audio, sr = load_mp3_file(mp3_file_path)

        transcription = audio_to_text(audio, sr, model)

        # Same base name as the audio file, with a .txt extension
        text_file_path = os.path.join(deepspeech_transcripts, os.path.splitext(audio_file)[0] + ".txt")
        # The with-block closes the file; explicit encoding keeps the output
        # independent of the platform default.
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcription)

    print("transcription job completed")

# Run the transcription job; prints a completion message when it finishes
run_deepspeech_model()


transcription job completed
