In [None]:
%%capture
NEMO_DIR_PATH = "./NeMo"
TOOLS_DIR = f'{NEMO_DIR_PATH}/tools/ctc_segmentation/scripts'
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
OUTPUT_DIR = WORK_DIR + "/output"

import os
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DATA_DIR + '/audio', exist_ok=True)
os.makedirs(DATA_DIR + '/text', exist_ok=True)

! git clone https://github.com/saeedzou/NeMo.git
!cd NeMo && bash colab_install.sh && apt-get install -y sox libsox-fmt-mp3 ffmpeg

! pip install -q wtpsplit

Prepare your data in the following layout before running the next cells:
- Audio files must be placed under `{DATA_DIR}/audio`
- Transcripts must be placed under `{DATA_DIR}/text` as `.txt` files

In [None]:
import json
import os
import gdown
import os
import re
import random
import time
import subprocess
import ast
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wtpsplit import SaT
from IPython.display import Audio, display
from tqdm.notebook import tqdm

def initialize_sat(model_name="sat-12l-sm"):
    """Create a wtpsplit ``SaT`` model, using the GPU when one is available.

    Args:
        model_name: Name of the SaT checkpoint handed to ``SaT(...)``.

    Returns:
        The ``SaT`` instance. When CUDA is available it has been switched to
        half precision and moved to the ``"cuda"`` device; otherwise it is
        returned exactly as constructed.
    """
    splitter = SaT(model_name)
    if not torch.cuda.is_available():
        return splitter
    splitter.half().to("cuda")
    return splitter

# Identifiers of the ASR models that can be handed to the pipeline scripts
# through their --MODEL_NAME_OR_PATH argument.
NVIDIA_FASTCONFORMER = "nvidia/stt_fa_fastconformer_hybrid_large"
WAV2VEC2_FA = "masoudmzb/wav2vec2-xlsr-multilingual-53-fa"
WAV2VEC2_V3 = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
WHISPER_TINY = "openai/whisper-tiny"
WHISPER_BASE = "openai/whisper-base"
WHISPER_SMALL = "openai/whisper-small"
WHISPER_MEDIUM = "openai/whisper-medium"
WHISPER_LARGE = "openai/whisper-large"
HEZAR = 'hezarai/whisper-small-fa'
# Maps each model identifier, with "/" and "-" replaced by "_", to the *name*
# of the constant above (a plain string, not the constant's value).
# NOTE(review): this dict is not referenced by any visible cell; confirm whether
# the values are meant to be the constant names or the identifiers themselves.
MODELS_DICT = {
    "nvidia_stt_fa_fastconformer_hybrid_large": "NVIDIA_FASTCONFORMER",
    "masoudmzb_wav2vec2_xlsr_multilingual_53_fa": "WAV2VEC2_FA",
    "m3hrdadfi_wav2vec2_large_xlsr_persian_v3": "WAV2VEC2_V3",
    "openai_whisper_tiny": "WHISPER_TINY",
    "openai_whisper_base": "WHISPER_BASE",
    "openai_whisper_small": "WHISPER_SMALL",
    "openai_whisper_medium": "WHISPER_MEDIUM",
    "openai_whisper_large": "WHISPER_LARGE",
    "hezarai_whisper_small_fa": "HEZAR",
}
# Space-separated list of the models passed to run_filter_multiple.sh.
MODELS = f"{NVIDIA_FASTCONFORMER} {WAV2VEC2_V3} {HEZAR}"

# Passed as --MODE to run_prepare_dataset.sh.
MODE = ""
# Passed as --LANGUAGE to run_segmentation.sh.
LANG_ID = 'fa'
# NOTE(review): OFFSET and WINDOW are not referenced by any visible cell.
OFFSET = 0
# Passed as --MIN_SCORE to run_segmentation.sh.
THRESHOLD = -2
WINDOW = 8000

# Filtering limits passed to run_filter_multiple.sh.
CER_THRESHOLD = 40
WER_THRESHOLD = 75
CER_EDGE_THRESHOLD = 75
LEN_DIFF_RATIO_THRESHOLD = 0.4
MIN_DURATION = 1
MAX_DURATION = 20
EDGE_LEN = 7

# Passed as --OUTPUT_FORMAT to run_prepare_dataset.sh.
OUTPUT_FORMAT = 'wav'

# Text clean-up and splitting switches passed to run_segmentation.sh.
REMOVE_BRACKETS = True
REMOVE_ASTERISKS = True
REMOVE_PARENTHESES = True
REMOVE_SPEAKER_LABELS = False
SPLIT_USING_PATTERN = False
SPLIT_ON_QUOTES = False
SPLIT_ON_VERBS = True
SPLIT_ON_VERBS_MIN_WORDS = 10
SPLIT_ON_VERBS_MAX_WORDS = 24
# Extra split symbols, separated by `\|`. Add new symbols the same way and put a
# backslash before ?, !, . and | because they are special characters.
# Written as a raw string: `\|`, `\?` and `\!` are not valid Python escape
# sequences, so a plain literal triggers an invalid-escape warning on recent
# Python versions. The resulting value is unchanged.
ADDITIONAL_SPLIT_SYMBOLS = r":\|\?\|\!\|\؟\|\?”\|\!”"

The segmentation script (`run_segmentation.sh`) used below does the following:
1. Prepares the data in the format required for CTC segmentation, i.e. newline-separated text (each line is roughly one utterance) and 16,000 Hz mono audio in `.wav` format for the NeMo ASR model

2. Runs CTC segmentation on the processed data and outputs, for each audio file, a segments text file containing the start time, end time and alignment score of every utterance

3. Verifies the segments created in step 2

4. Cuts the audio files into utterances and creates a JSON manifest file (NeMo format) with the information of each utterance

In [None]:
# Re-segment every transcript so that each output line holds one sentence.
text_dir = f"{DATA_DIR}/text"

text_names = []  # full paths of the transcripts, in the same order as `texts`
texts = []       # transcript contents with line breaks flattened to spaces
for file in sorted(os.listdir(text_dir)):
    if not file.endswith('.txt'):
        continue
    text_path = os.path.join(text_dir, file)
    # Read inside a context manager so the file handle is closed right away.
    with open(text_path, 'r', encoding='utf-8') as f:
        texts.append(f.read().replace("\n", " "))
    text_names.append(text_path)

sat = initialize_sat()
# `text_names` already holds complete paths, so each file is rewritten in place.
# Joining them onto the text directory a second time would point at a folder
# that does not exist.
for text_path, sentences in zip(text_names, sat.split(texts)):
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(sentences))

# Drop the splitter and release cached GPU memory before the ASR models load.
del sat
torch.cuda.empty_cache()

In [None]:
# Remove the output directory of any previous run before segmenting again.
! rm -rf $OUTPUT_DIR

# Run run_segmentation.sh with NVIDIA_FASTCONFORMER as the model.
# Every $NAME below refers to the notebook variable of the same name.
# NOTE(review): the flag is spelled REMOVE_PARANTHESES while the variable is
# REMOVE_PARENTHESES; confirm which spelling the script expects.
! bash $TOOLS_DIR/../run_segmentation.sh \
--MODEL_NAME_OR_PATH=$NVIDIA_FASTCONFORMER \
--DATA_DIR=$DATA_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--SCRIPTS_DIR=$TOOLS_DIR \
--REMOVE_BRACKETS=$REMOVE_BRACKETS \
--REMOVE_ASTERISKS=$REMOVE_ASTERISKS \
--REMOVE_PARANTHESES=$REMOVE_PARENTHESES \
--REMOVE_SPEAKER_LABELS=$REMOVE_SPEAKER_LABELS \
--SPLIT_USING_PATTERN=$SPLIT_USING_PATTERN \
--SPLIT_ON_QUOTES=$SPLIT_ON_QUOTES \
--SPLIT_ON_VERBS=$SPLIT_ON_VERBS \
--SPLIT_ON_VERBS_MIN_WORDS=$SPLIT_ON_VERBS_MIN_WORDS \
--SPLIT_ON_VERBS_MAX_WORDS=$SPLIT_ON_VERBS_MAX_WORDS \
--ADDITIONAL_SPLIT_SYMBOLS=$ADDITIONAL_SPLIT_SYMBOLS \
--LANGUAGE=$LANG_ID \
--MIN_SCORE=$THRESHOLD  \
--USE_NEMO_NORMALIZATION=False

Steps:

1. Transcribes the segments created by `run_segmentation.sh`, using the generated manifest, with each model in `MODELS`; calculates metrics such as WER and CER; and outputs a new manifest file for each model

2. Filters out segments that don't meet the minimum requirements of any of the models

In [None]:
# Run run_filter_multiple.sh on the segmentation manifest with every model in
# MODELS and the thresholds from the configuration cell.
# NOTE(review): the trailing `|| exit` runs inside the shell started by `!`, so
# it presumably does not stop notebook execution on failure; confirm.
! bash $TOOLS_DIR/../run_filter_multiple.sh \
--MODEL_NAME_OR_PATH="$MODELS" \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--CER_THRESHOLD=$CER_THRESHOLD \
--WER_THRESHOLD=$WER_THRESHOLD \
--CER_EDGE_THRESHOLD=$CER_EDGE_THRESHOLD \
--LEN_DIFF_RATIO_THRESHOLD=$LEN_DIFF_RATIO_THRESHOLD \
--MIN_DURATION=$MIN_DURATION \
--MAX_DURATION=$MAX_DURATION \
--EDGE_LEN=$EDGE_LEN || exit

Analyze some of the results

In [None]:
# Manifest written by the filtering step: one JSON object per line.
MANIFEST_FILE = f'{OUTPUT_DIR}/manifests/manifest_transcribed_metrics_filtered.json'

# Load every manifest line into memory.
with open(MANIFEST_FILE, 'r', encoding='utf-8') as f:
    lines = list(f)

# Draw up to 10 entries at random (fewer when the manifest is shorter).
sample_size = min(10, len(lines))
random_samples = random.sample(lines, sample_size)

# Play each sampled clip, then print its texts and metrics.
for raw_line in random_samples:
    entry = json.loads(raw_line.strip())
    display(Audio(entry['audio_filepath']))
    time.sleep(1)
    for label, key in (('Original: ', 'text_no_preprocessing'), ('Ground Truth: ', 'text')):
        print(label)
        print(entry[key])
    print(f'Best hypothesis from {entry["model_name"]}')
    print(entry['pred_text'])
    print(f"WER : {entry['WER']}, CER: {entry['CER']}, Start CER: {entry['start_CER']}, End CER: {entry['end_CER']}, Alignment score: {entry['score']}")
    print('*' * 20)

If the results are satisfactory, run the following script to upsample the clips to 44.1 kHz and create a `metadata.csv` file for the dataset, then zip the folder.

In [None]:
# Folder name of the final dataset, and the name of the zip archive derived from it.
DATASET_DIR = "CTC_Dataset"
ZIP_PATH = DATASET_DIR + ".zip"

In [None]:
# Run run_prepare_dataset.sh on the filtered manifest to build the dataset
# folder DATASET_DIR in the requested OUTPUT_FORMAT.
# NOTE(review): MODE is an empty string, so the script receives `--MODE=`;
# confirm that the script treats an empty mode as its default.
! bash $TOOLS_DIR/../run_prepare_dataset.sh \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest_transcribed_metrics_filtered.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--OUTPUT_FORMAT=$OUTPUT_FORMAT \
--MODE=$MODE \
--DATASET_DIR=$DATASET_DIR || exit