In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
# Environment setup: fetch a Drive file, clone the NeMo fork, create the
# working directories and install dependencies. %%capture (which must stay on
# the first line of the cell) hides this cell's output, including any errors
# printed by the shell commands below.

# Download a Google Drive file by id. The next line expects a file named
# cookies.txt in the current directory and copies it into gdown's cache
# directory -- presumably this download is that file; TODO confirm.
!gdown 11X7uwxGepuz62m_9A0o8XSqJjxeL_7OR
!cp cookies.txt ~/.cache/gdown/cookies.txt
!git clone https://github.com/saeedzou/NeMo.git
%cd NeMo

import os

# All paths below are relative, so they resolve against the NeMo checkout
# entered by the %cd above.
NEMO_DIR_PATH = "./"
TOOLS_DIR = f'{NEMO_DIR_PATH}/tools/ctc_segmentation/scripts'
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
OUTPUT_DIR = WORK_DIR + "/output"

# Create the input layout: DATA_DIR/audio and DATA_DIR/text.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DATA_DIR + '/audio', exist_ok=True)
os.makedirs(DATA_DIR + '/text', exist_ok=True)

# Run the install script found in the current directory (the NeMo checkout),
# then install the sox / ffmpeg system packages.
! bash colab_install.sh
! apt-get install -y sox libsox-fmt-mp3 ffmpeg

Prepare your data in the following layout:
- Audio files must be placed under `{DATA_DIR}/audio`
- Text files must be placed under `{DATA_DIR}/text`

In [None]:
import json
import os
from IPython.display import Audio, display
import gdown
import os
import pandas as pd
import ast
import shutil
import librosa
import soundfile as sf
from tqdm.notebook import tqdm

# --- Model identifiers -------------------------------------------------------
# NVIDIA_FASTCONFORMER is the model handed to run_segmentation.sh; the others
# are candidates for the MODELS list used by the filtering step.
NVIDIA_FASTCONFORMER = "nvidia/stt_fa_fastconformer_hybrid_large"
WAV2VEC2_FA = "masoudmzb/wav2vec2-xlsr-multilingual-53-fa"
WAV2VEC2_V3 = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
WHISPER_TINY = "openai/whisper-tiny"
WHISPER_BASE = "openai/whisper-base"
WHISPER_SMALL = "openai/whisper-small"
WHISPER_MEDIUM = "openai/whisper-medium"
WHISPER_LARGE = "openai/whisper-large"
HEZAR = 'hezarai/whisper-small-fa'
VOSK_SMALL = 'vosk-model-small-fa-0.42'
VOSK_BIG = 'vosk-model-fa-0.42'
# Space-separated list of models passed (quoted) to run_filter_multiple.sh.
MODELS = f"{NVIDIA_FASTCONFORMER} {WAV2VEC2_FA} {HEZAR}"

# Passed to run_prepare_dataset.sh as --DATASET_DIR; empty as written here.
DATASET_DIR = ""

# Passed to run_prepare_dataset.sh as --MODE; empty as written here.
# NOTE(review): valid values are defined by that script -- confirm before running.
MODE = ""
# --- Segmentation settings (run_segmentation.sh) -----------------------------
# LANG_ID is passed as --LANGUAGE and THRESHOLD as --MIN_SCORE.
# OFFSET and WINDOW are not referenced by any cell visible in this notebook.
LANG_ID='fa'
OFFSET = 0
THRESHOLD = -2
WINDOW = 8000
# --- Filtering thresholds (run_filter_multiple.sh) ---------------------------
CER_THRESHOLD = 40
WER_THRESHOLD = 75
CER_EDGE_THRESHOLD = 75
LEN_DIFF_RATIO_THRESHOLD = 0.4
MIN_DURATION = 1
MAX_DURATION = 20
EDGE_LEN = 7
# Passed to run_prepare_dataset.sh as --OUTPUT_FORMAT.
OUTPUT_FORMAT = 'mp3'
# --- Text clean-up / splitting switches (run_segmentation.sh) ----------------
REMOVE_BRACKETS=True
REMOVE_ASTERISKS=True
REMOVE_PARENTHESES=True
REMOVE_SPEAKER_LABELS=True
SPLIT_USING_PATTERN=False
SPLIT_ON_QUOTES=False
SPLIT_ON_VERBS=False
ADDITIONAL_SPLIT_SYMBOLS=":"

In [None]:
# Switch to /content: the next cell reads youtube_links.csv through a relative
# path and builds the channel directory path under /content.
%cd /content

In [None]:
# Locations used for the download: the Colab root and the CSV of channels.
root_dir = '/content'
channels_csv_path = 'youtube_links.csv'

# Pick one channel (row) from the CSV and download its Drive folder.
channels_df = pd.read_csv(channels_csv_path)
idx = 10 # index of the channel in the csv
channel_row = channels_df.loc[idx]  # look the row up once, reuse it below
gdown.download_folder(url=channel_row['link'], quiet=False, remaining_ok=True)

# Paths inside the channel folder, plus the audio/text directories that the
# segmentation scripts read from.
# NOTE(review): channel_dir assumes the downloaded folder is named after the
# CSV's 'name' column and lands under root_dir -- confirm.
channel_dir = os.path.join(root_dir, channel_row['name'])
channel_audio_dir = os.path.join(channel_dir, 'Audios')
audio_dir = "/content/NeMo/WORK_DIR/DATA/audio/"
text_dir = "/content/NeMo/WORK_DIR/DATA/text"
df_file = os.path.join(channel_dir, "df.csv")

In [None]:
# Copy every regular file from the channel's Audios folder into audio_dir;
# entries that are not files (e.g. sub-folders) are skipped.
for entry in os.listdir(channel_audio_dir):
    src_path = os.path.join(channel_audio_dir, entry)
    if not os.path.isfile(src_path):
        continue
    shutil.copy(src_path, audio_dir)

# Decode each .wav file in audio_dir and write it back to the same path as WAV.
for file_name in tqdm(os.listdir(audio_dir)):
    # Anything that is not a .wav file is left untouched.
    if not file_name.endswith(".wav"):
        continue

    wav_path = os.path.join(audio_dir, file_name)
    try:
        # sr=None preserves the original sampling rate instead of resampling.
        samples, sample_rate = librosa.load(wav_path, sr=None)
        # Overwrite the source file in place with the decoded samples.
        sf.write(wav_path, samples, sample_rate, format="WAV")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {file_name}: {e}")

In [None]:
# Load the channel table; each row supplies a `voice_name` (audio file name)
# and a `transcript` (a Python-literal string holding records with a 'text' key).
df = pd.read_csv(df_file)

# Write one newline-separated transcript file per row into text_dir.
for _, row in df.iterrows():
    voice_name = row['voice_name']
    # Strip only the final extension so names containing extra dots
    # (e.g. "ep.1.wav" -> "ep.1") keep the same base name as the audio file.
    basename = os.path.splitext(voice_name)[0]
    text_name = basename + '.txt'

    # literal_eval parses the stored literal without executing arbitrary code.
    segments = ast.literal_eval(row['transcript'])
    segments = pd.json_normalize(segments)
    text = "\n".join(segments['text'])

    with open(os.path.join(text_dir, text_name), 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Enter the NeMo checkout (relative to the current directory): WORK_DIR,
# DATA_DIR, OUTPUT_DIR and TOOLS_DIR are all relative paths.
%cd NeMo

The script below performs the following steps:
1. Prepares data in the right format for CTC segmentation i.e. newline separated text (roughly an utterance) and 16000 Hz mono audio in `.wav` format for the NeMo ASR model

2. Runs CTC segmentation on the processed data and outputs segments text file for each audio file containing utterance start, end timings and alignment score

3. Verifies the segments created in step 2

4. Cuts the audios into utterances and creates a json manifest file (NeMo format) of the information of each utterance

In [None]:
! rm -rf $OUTPUT_DIR

! bash $TOOLS_DIR/../run_segmentation.sh \
--MODEL_NAME_OR_PATH=$NVIDIA_FASTCONFORMER \
--DATA_DIR=$DATA_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--SCRIPTS_DIR=$TOOLS_DIR \
--REMOVE_BRACKETS=$REMOVE_BRACKETS \
--REMOVE_ASTERISKS=$REMOVE_ASTERISKS \
--REMOVE_PARANTHESES=$REMOVE_PARANTHESES \
--REMOVE_SPEAKER_LABELS=$REMOVE_SPEAKER_LABELS \
--SPLIT_USING_PATTERN=$SPLIT_USING_PATTERN \
--SPLIT_ON_QUOTES=$SPLIT_ON_QUOTES \
--SPLIT_ON_VERBS=$SPLIT_ON_VERBS \
--ADDITIONAL_SPLIT_SYMBOLS=$ADDITIONAL_SPLIT_SYMBOLS \
--LANGUAGE=$LANG_ID \
--MIN_SCORE=$THRESHOLD  \
--USE_NEMO_NORMALIZATION=False

Steps:

1. Transcribes the segments created by `run_segmentation.sh`, using the generated manifest, with each model in `MODELS`; calculates metrics such as WER and CER; and outputs a new manifest file for each model

2. Filters out segments that don't meet the minimum requirements of any of the models

In [None]:
# Run the multi-model filtering script on the manifest written under
# OUTPUT_DIR/manifests. "$MODELS" is quoted because the value is a
# space-separated list and must reach the script as a single argument.
# The trailing `|| exit` runs inside the shell spawned by `!`.
! bash $TOOLS_DIR/../run_filter_multiple.sh \
--MODEL_NAME_OR_PATH="$MODELS" \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--CER_THRESHOLD=$CER_THRESHOLD \
--WER_THRESHOLD=$WER_THRESHOLD \
--CER_EDGE_THRESHOLD=$CER_EDGE_THRESHOLD \
--LEN_DIFF_RATIO_THRESHOLD=$LEN_DIFF_RATIO_THRESHOLD \
--MIN_DURATION=$MIN_DURATION \
--MAX_DURATION=$MAX_DURATION \
--EDGE_LEN=$EDGE_LEN || exit

Analyze some of the results

In [None]:
# Specify the path to the JSON file
# Path to the filtered manifest (one JSON object per line).
MANIFEST_FILE = f'{OUTPUT_DIR}/manifests/manifest_transcribed_metrics_filtered.json'

# Read all lines up front so the total count is known before sampling.
with open(MANIFEST_FILE, 'r', encoding='utf-8') as f:
    lines = f.readlines()

num_lines = len(lines)  # Get the number of lines

# Preview roughly every 10% of the entries. The step is clamped to at least 1
# so manifests with fewer than 10 lines do not trigger a modulo-by-zero error;
# in that case every entry is shown.
step = max(1, num_lines // 10)

for i, line in enumerate(lines):
    if i % step != 0:
        continue
    x = json.loads(line.strip())
    display(Audio(x['audio_filepath']))
    print('Ground Truth: ')
    print(x['text'])
    print(f'Best hypothesis from {x["model_name"]}')
    print(x['pred_text'])
    print(f"WER : {x['WER']}, CER: {x['CER']}, Start CER: {x['start_CER']}, End CER: {x['end_CER']}, Alignment score: {x['score']}")
    print('*' * 20)

If the results were satisfactory, run the following script to upsample the clips to 44.1 kHz and create a metadata.csv file for the dataset, then zip the folder

In [None]:
# Run the dataset preparation script on the filtered manifest.
# MODE and DATASET_DIR are empty strings in the configuration cell as written,
# so --MODE and --DATASET_DIR receive empty values unless they are set first.
# NOTE(review): presumably both must be filled in before running -- confirm
# against run_prepare_dataset.sh.
! bash $TOOLS_DIR/../run_prepare_dataset.sh \
--INPUT_AUDIO_DIR=$DATA_DIR/audio \
--MANIFEST=$OUTPUT_DIR/manifests/manifest_transcribed_metrics_filtered.json \
--SCRIPTS_DIR=$TOOLS_DIR \
--OUTPUT_DIR=$OUTPUT_DIR \
--OUTPUT_FORMAT=$OUTPUT_FORMAT \
--MODE=$MODE \
--DATASET_DIR=$DATASET_DIR || exit