In [None]:
%%capture
# Environment setup: Drive helpers, NeMo checkout, gdown cookies, working
# directories and system/Python dependencies.
# NOTE: %%capture hides all output of this cell, including errors printed by
# the shell commands -- remove it temporarily when debugging the setup.

# Authenticate to Google Drive to upload the dataset
! wget -O colab_utils.py https://raw.githubusercontent.com/saeedzou/Speech-Dataset/main/colab_utils.py
from colab_utils import initialize_drive, upload_file_to_drive, get_or_create_folder
DRIVE = initialize_drive()


! git clone https://github.com/saeedzou/NeMo.git
%cd NeMo

# Download cookies for gdown
# NOTE(review): the cookies file is fetched from a shared Drive ID; if it holds
# real session cookies it should be treated as a credential -- confirm.
! gdown 11X7uwxGepuz62m_9A0o8XSqJjxeL_7OR
# The gdown cache directory may not exist yet on a fresh runtime; without it
# the copy below fails (silently, because of %%capture).
! mkdir -p ~/.cache/gdown
! cp cookies.txt ~/.cache/gdown/cookies.txt

# All paths below are relative to the NeMo checkout entered by `%cd NeMo`.
NEMO_DIR_PATH = "./"
TOOLS_DIR = f'{NEMO_DIR_PATH}/tools/ctc_segmentation/scripts'
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
OUTPUT_DIR = WORK_DIR + "/output"

import os
# Create the input layout expected by the later cells: DATA/audio and DATA/text
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DATA_DIR + '/audio', exist_ok=True)
os.makedirs(DATA_DIR + '/text', exist_ok=True)

! bash colab_install.sh
! apt-get install -y sox libsox-fmt-mp3 ffmpeg mediainfo
! pip install -q pymediainfo

Prepare your data in the following layout:
- Audio files must be placed under `{DATA_DIR}/audio`
- Transcript text files must be placed under `{DATA_DIR}/text`, each sharing the base name of its audio file

In [None]:
# Wipe the audio files, transcripts, segmentation output and CSV files left in
# WORK_DIR by a previous run (paths are absolute, i.e. Colab-specific).
!rm -rf /content/NeMo/WORK_DIR/DATA/audio/* \
        /content/NeMo/WORK_DIR/DATA/text/* \
        /content/NeMo/WORK_DIR/output \
        /content/NeMo/WORK_DIR/*.csv

In [None]:
import json
import os
import gdown
import os
import re
import random
import time
import subprocess
import ast
import shutil
import librosa
import soundfile as sf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio, display
from pymediainfo import MediaInfo
from tqdm.notebook import tqdm

# ---------------------------------------------------------------------------
# ASR model identifiers
# ---------------------------------------------------------------------------
NVIDIA_FASTCONFORMER = "nvidia/stt_fa_fastconformer_hybrid_large"
WAV2VEC2_FA = "masoudmzb/wav2vec2-xlsr-multilingual-53-fa"
WAV2VEC2_V3 = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
WHISPER_TINY = "openai/whisper-tiny"
WHISPER_BASE = "openai/whisper-base"
WHISPER_SMALL = "openai/whisper-small"
WHISPER_MEDIUM = "openai/whisper-medium"
WHISPER_LARGE = "openai/whisper-large"
HEZAR = 'hezarai/whisper-small-fa'
VOSK_SMALL = 'vosk-model-small-fa-0.42'
VOSK_BIG = 'vosk-model-fa-0.42'

# Maps the underscore-only form of each model identifier to a short label;
# the plotting cell applies it to the `model_name` column of metadata.csv.
MODELS_DICT = dict(
    nvidia_stt_fa_fastconformer_hybrid_large="NVIDIA_FASTCONFORMER",
    masoudmzb_wav2vec2_xlsr_multilingual_53_fa="WAV2VEC2_FA",
    m3hrdadfi_wav2vec2_large_xlsr_persian_v3="WAV2VEC2_V3",
    openai_whisper_tiny="WHISPER_TINY",
    openai_whisper_base="WHISPER_BASE",
    openai_whisper_small="WHISPER_SMALL",
    openai_whisper_medium="WHISPER_MEDIUM",
    openai_whisper_large="WHISPER_LARGE",
    hezarai_whisper_small_fa="HEZAR",
    vosk_model_small_fa_0_42="VOSK_SMALL",
    vosk_model_fa_0_42="VOSK_BIG",
)

# Space-separated list of the models handed to run_filter_multiple.sh
MODELS = " ".join((NVIDIA_FASTCONFORMER, WAV2VEC2_V3, HEZAR))

# ---------------------------------------------------------------------------
# Dataset selection
# ---------------------------------------------------------------------------
DATASET_DIR = ""            # channel name: must match a 'Channel' value in the Summary sheet and a sheet name
IDX = 0                     # batch index: rows [IDX * MAX_DOWNLOADS_GDRIVE, (IDX + 1) * MAX_DOWNLOADS_GDRIVE) are processed
MAX_DOWNLOADS_GDRIVE = 30   # number of recordings per batch

# ---------------------------------------------------------------------------
# Segmentation / filtering parameters (forwarded to the shell scripts below)
# ---------------------------------------------------------------------------
MODE = ""                        # passed as --MODE to run_prepare_dataset.sh
LANG_ID = 'fa'                   # passed as --LANGUAGE to run_segmentation.sh
OFFSET = 0                       # not referenced by the cells visible in this notebook
THRESHOLD = -2                   # passed as --MIN_SCORE to run_segmentation.sh
WINDOW = 8000                    # not referenced by the cells visible in this notebook
CER_THRESHOLD = 40
WER_THRESHOLD = 75
CER_EDGE_THRESHOLD = 75
LEN_DIFF_RATIO_THRESHOLD = 0.4
MIN_DURATION = 1
MAX_DURATION = 20
EDGE_LEN = 7
OUTPUT_FORMAT = 'wav'            # passed as --OUTPUT_FORMAT to run_prepare_dataset.sh

# Text clean-up and sentence-splitting switches for run_segmentation.sh
REMOVE_BRACKETS = True
REMOVE_ASTERISKS = True
REMOVE_PARENTHESES = True
REMOVE_SPEAKER_LABELS = True
SPLIT_USING_PATTERN = False
SPLIT_ON_QUOTES = False
SPLIT_ON_VERBS = False
# Extra split symbols, separated by | ; write \| before ? and ! and . and |
# because they are special characters in regex.
ADDITIONAL_SPLIT_SYMBOLS = ""

In [None]:
def is_persian(text, threshold=0.5):
    """Tell whether `text` is predominantly written in Arabic script.

    The share of characters falling in U+0600-U+06FF or U+FB50-U+FDFF is
    measured against the total length of `text` (whitespace, digits and
    punctuation included in the total).

    Args:
        text: String to inspect.
        threshold: Ratio that must be strictly exceeded.

    Returns:
        True when the ratio is greater than `threshold`; False otherwise,
        including for an empty string.
    """
    script_hits = len(re.findall(r'[\u0600-\u06FF\uFB50-\uFDFF]', text))
    length = len(text)

    # An empty string has no ratio to compute
    if not length:
        return False

    return script_hits / length > threshold

# Download the Excel workbook from Google Drive: it holds a 'Summary' sheet
# plus one sheet per channel. DATASET_DIR selects the channel to process.
sheets_path = gdown.download(url='https://drive.google.com/uc?id=1qVw60j8Xhb1l1g4W8q0d8WlS5CaenNVMZH2q4AuLPjY', output=f"{WORK_DIR}/sheets.xlsx", quiet=False)
# sheet_name=None loads every sheet into a dict keyed by sheet name
sheets = pd.read_excel(sheets_path, sheet_name=None)

# Look up the selected channel in the summary and fetch its metadata CSV
summary = sheets['Summary']
channel_rows = summary[summary['Channel'] == DATASET_DIR]
if channel_rows.empty:
    # Fail with an actionable message instead of an IndexError on .values[0]
    raise ValueError(
        f"Channel {DATASET_DIR!r} not found in the 'Summary' sheet; "
        "set DATASET_DIR to one of the values in its 'Channel' column."
    )
metadata_csv_path = gdown.download(url=channel_rows['Gdrive Link'].values[0], output=f"{WORK_DIR}/df_{IDX}.csv", quiet=False)
# Per-recording metadata; the next step reads its 'voice_name' and 'transcript' columns
channel_metadata = pd.read_csv(metadata_csv_path)

# Write one transcript file per recording of the current batch. A batch is
# the IDX-th run of MAX_DOWNLOADS_GDRIVE rows of the metadata table; only
# transcripts detected as Persian are saved.
batch_start = IDX * MAX_DOWNLOADS_GDRIVE
batch_stop = min(batch_start + MAX_DOWNLOADS_GDRIVE, len(channel_metadata))
for row_pos in range(batch_start, batch_stop):
    record = channel_metadata.iloc[row_pos]
    # File stem = everything before the first dot of the audio file name
    stem = record['voice_name'].split('.')[0]
    # The transcript column stores a Python-literal list of segment dicts
    segments = pd.json_normalize(ast.literal_eval(record['transcript']))
    joined_text = "\n".join(segments['text'])
    if not is_persian(joined_text):
        continue
    with open(os.path.join(f"{DATA_DIR}/text", stem + '.txt'), 'w', encoding='utf-8') as out_file:
        out_file.write(joined_text)

# The channel sheet lists 'Audio Name' and 'GDrive Public Link' per recording.
channel = sheets[DATASET_DIR]
# Fetch only recordings that are not on disk yet and whose transcript file
# already exists in the text folder.
for _, entry in channel.iterrows():
    audio_name = entry['Audio Name']
    audio_link = entry['GDrive Public Link']
    audio_path = f"{DATA_DIR}/audio/{audio_name}"
    if os.path.exists(audio_path) or not os.path.exists(f"{DATA_DIR}/text/{audio_name.split('.')[0]}.txt"):
        continue
    gdown.download(url=audio_link, output=audio_path, quiet=False)

# Repair files that carry a .wav extension but whose detected format is not
# WAV/PCM: re-encode them in place as mono 16-bit PCM WAV with ffmpeg.
audio_dir = f"{DATA_DIR}/audio"
for file_name in tqdm(os.listdir(audio_dir)):
    if not file_name.endswith(".wav"):
        continue

    file_path = os.path.join(audio_dir, file_name)
    # Build the scratch path from the extension only; str.replace(".wav", ...)
    # would also rewrite any other ".wav" occurring earlier in the path.
    temp_file_path = os.path.splitext(file_path)[0] + "_temp.wav"
    conversion_started = False

    try:
        # Get media information
        media_info = MediaInfo.parse(file_path)
        # The first track's format may be missing; treat that as unknown
        format_detected = (media_info.tracks[0].format if media_info.tracks else None) or "Unknown"

        # NOTE(review): MediaInfo may report WAV containers as "Wave", in
        # which case every file is re-encoded here -- confirm and extend the
        # accepted list if that is not intended.
        if format_detected.lower() in ("wav", "pcm"):
            continue

        conversion_started = True
        # -y: overwrite a stale scratch file instead of waiting for a prompt
        subprocess.run(
            ["ffmpeg", "-y", "-i", file_path, "-ac", "1", "-c:a", "pcm_s16le", temp_file_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

        # Swap the converted file in over the mislabelled original in one step
        os.replace(temp_file_path, file_path)

    except subprocess.CalledProcessError as e:
        print(f"FFmpeg error processing {file_name}: {e.stderr.decode()}")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")
    finally:
        # A failed conversion must not leave a partial *_temp.wav behind in
        # the audio folder, where later steps would pick it up as input.
        if conversion_started and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

The script below performs the following steps:
1. Prepares data in the right format for CTC segmentation i.e. newline separated text (roughly an utterance) and 16000 Hz mono audio in `.wav` format for the NeMo ASR model

2. Runs CTC segmentation on the processed data and outputs segments text file for each audio file containing utterance start, end timings and alignment score

3. Verifies the segments created in step 2

4. Cuts the audios into utterances and creates a json manifest file (NeMo format) of the information of each utterance

In [None]:
# Start from a clean output directory, then run the CTC segmentation pipeline.
# $NAME tokens are expanded by IPython from the notebook's Python variables
# before the command reaches the shell. ADDITIONAL_SPLIT_SYMBOLS is quoted
# because its value is a |-separated list, which the shell would otherwise
# interpret as pipes.
! rm -rf $OUTPUT_DIR

! bash $TOOLS_DIR/../run_segmentation.sh \
--MODEL_NAME_OR_PATH=$NVIDIA_FASTCONFORMER \
--DATA_DIR=$DATA_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--SCRIPTS_DIR=$TOOLS_DIR \
--REMOVE_BRACKETS=$REMOVE_BRACKETS \
--REMOVE_ASTERISKS=$REMOVE_ASTERISKS \
--REMOVE_PARANTHESES=$REMOVE_PARENTHESES \
--REMOVE_SPEAKER_LABELS=$REMOVE_SPEAKER_LABELS \
--SPLIT_USING_PATTERN=$SPLIT_USING_PATTERN \
--SPLIT_ON_QUOTES=$SPLIT_ON_QUOTES \
--SPLIT_ON_VERBS=$SPLIT_ON_VERBS \
--ADDITIONAL_SPLIT_SYMBOLS="$ADDITIONAL_SPLIT_SYMBOLS" \
--LANGUAGE=$LANG_ID \
--MIN_SCORE=$THRESHOLD \
--USE_NEMO_NORMALIZATION=False

Steps:

1. Transcribes the segments created by `run_segmentation.sh`, using the generated manifest, with each model in `MODELS`; calculates metrics such as WER and CER; and writes a new manifest file for each model

2. Filters out segments that don't meet the minimum requirements of any of the models

In [None]:
# Run run_filter_multiple.sh (one level above TOOLS_DIR) on the manifest written
# by the segmentation step. $NAME tokens are filled in from the notebook's
# Python variables; MODELS is quoted because it is a space-separated list.
! bash $TOOLS_DIR/../run_filter_multiple.sh \
--MODEL_NAME_OR_PATH="$MODELS" \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--CER_THRESHOLD=$CER_THRESHOLD \
--WER_THRESHOLD=$WER_THRESHOLD \
--CER_EDGE_THRESHOLD=$CER_EDGE_THRESHOLD \
--LEN_DIFF_RATIO_THRESHOLD=$LEN_DIFF_RATIO_THRESHOLD \
--MIN_DURATION=$MIN_DURATION \
--MAX_DURATION=$MAX_DURATION \
--EDGE_LEN=$EDGE_LEN

Analyze some of the results

In [None]:
import random
import json
from IPython.display import Audio

# Filtered manifest written by the filtering step: one JSON object per line
MANIFEST_FILE = f'{OUTPUT_DIR}/manifests/manifest_transcribed_metrics_filtered.json'

with open(MANIFEST_FILE, 'r', encoding='utf-8') as manifest:
    manifest_lines = manifest.readlines()

# Listen to and print up to 10 randomly chosen entries
sample_size = min(10, len(manifest_lines))
for raw_entry in random.sample(manifest_lines, sample_size):
    entry = json.loads(raw_entry.strip())
    display(Audio(entry['audio_filepath']))
    time.sleep(1)
    print('Original: ', entry['text_no_preprocessing'], sep='\n')
    print('Ground Truth: ', entry['text'], sep='\n')
    print(f'Best hypothesis from {entry["model_name"]}', entry['pred_text'], sep='\n')
    metric_parts = (
        f"WER : {entry['WER']}",
        f"CER: {entry['CER']}",
        f"Start CER: {entry['start_CER']}",
        f"End CER: {entry['end_CER']}",
        f"Alignment score: {entry['score']}",
    )
    print(", ".join(metric_parts))
    print('*' * 20)

If the results were satisfactory, run the following script to upsample the clips to 44.1 kHz and create a metadata.csv file for the dataset, then zip the folder

In [None]:
# Name the output dataset "<channel>_<batch index>" and derive the archive path.
# WARNING: this rebinds DATASET_DIR from its own value, so running the cell twice
# appends the suffix twice, and the download cell no longer finds the channel
# afterwards. Re-run the configuration cell before re-running this one.
DATASET_DIR = "{}_{}".format(DATASET_DIR, IDX)
ZIP_PATH = "/content/NeMo/" + DATASET_DIR + ".zip"

In [None]:
# Run run_prepare_dataset.sh (one level above TOOLS_DIR) on the filtered manifest.
# $NAME tokens are filled in from the notebook's Python variables; DATASET_DIR is
# the "<channel>_<IDX>" name set in the previous cell.
! bash $TOOLS_DIR/../run_prepare_dataset.sh \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest_transcribed_metrics_filtered.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--OUTPUT_FORMAT=$OUTPUT_FORMAT \
--MODE=$MODE \
--DATASET_DIR="$DATASET_DIR"

In [None]:
# Load the dataset metadata and replace model identifiers by their short labels
CSV_FILE = f"{DATASET_DIR}/metadata.csv"
df = pd.read_csv(CSV_FILE)
df['model_name'] = df['model_name'].map(MODELS_DICT)

# Set the theme for the plots
plt.style.use('seaborn-v0_8-darkgrid')


def _draw_histogram(ax, column, color, label=None, title=None):
    """Draw a 10-bin histogram with KDE of `df[column]` on `ax`.

    `label` is the x-axis label (defaults to the column name); `title`
    defaults to 'Distribution of <label>'.
    """
    if label is None:
        label = column
    if title is None:
        title = f'Distribution of {label}'
    sns.histplot(df[column], bins=10, kde=True, color=color, ax=ax, alpha=0.7)
    ax.set_xlabel(label, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(title, fontsize=14)


# Figure 1: CTC alignment score
fig, axs = plt.subplots(1, 1, figsize=(5, 5))
_draw_histogram(axs, 'score', "lightgreen", title='Distribution of CTC alignment score')

# Figure 2: clip duration and number of clips per model
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
_draw_histogram(axs[0], 'duration', "orchid")

sns.countplot(x='model_name', data=df, color='salmon', ax=axs[1])
axs[1].set_xlabel('model_name', fontsize=12)
axs[1].set_ylabel('Frequency', fontsize=12)
axs[1].set_title('Distribution of model_name', fontsize=8)
axs[1].tick_params(axis='x', rotation=60)

# Figure 3: CER / WER and edge CER histograms
fig, axs = plt.subplots(1, 4, figsize=(12, 5))
error_panels = (
    ('CER', 'CER', "skyblue"),
    ('WER', 'WER', "salmon"),
    ('start_CER', 'Start CER', "lightgreen"),
    ('end_CER', 'End CER', "gold"),
)
for ax, (column, label, color) in zip(axs, error_panels):
    _draw_histogram(ax, column, color, label=label)

# Figure 4: insertion / deletion / substitution rate histograms
fig, axs = plt.subplots(1, 3, figsize=(12, 5))
rate_panels = (
    ('ins_rate', 'Insertion Rate', "lightgreen"),
    ('del_rate', 'Deletion Rate', "gold"),
    ('sub_rate', 'Substitution Rate', "orchid"),
)
for ax, (column, label, color) in zip(axs, rate_panels):
    _draw_histogram(ax, column, color, label=label)

plt.tight_layout()
plt.show()

In [None]:
# Upload the zipped dataset into the 'Youtube_Dataset' Drive folder
# (obtained via get_or_create_folder) and print its identifiers and links.
parent_folder_id = get_or_create_folder(DRIVE, 'root', 'Youtube_Dataset')
zip_id = upload_file_to_drive(DRIVE, parent_folder_id, ZIP_PATH, os.path.basename(ZIP_PATH))
upload_report = (
    f"Dataset {DATASET_DIR} created",
    f"Drive ID: {zip_id}",
    f"Drive link: https://drive.google.com/file/d/{zip_id}/view?usp=sharing",
    f"Direct download link: https://drive.google.com/uc?id={zip_id}",
)
for message in upload_report:
    print(f"*** {message} ***")