In [None]:
from google.colab import drive

# Directory under which Google Drive becomes visible inside the Colab VM.
# Later cells copy their results to /content/drive/MyDrive/...
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
%%capture
# Python cell to clone the repository
!git clone https://github.com/saeedzou/NeMo.git
%cd NeMo

import os

NEMO_DIR_PATH = "./"
TOOLS_DIR = f'{NEMO_DIR_PATH}/tools/ctc_segmentation/scripts'
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
OUTPUT_DIR = WORK_DIR + "/output"

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DATA_DIR + '/audio', exist_ok=True)
os.makedirs(DATA_DIR + '/text', exist_ok=True)

! bash colab_install.sh
! apt-get install -y sox libsox-fmt-mp3 ffmpeg

Prepare your data in the following layout:
- Audio files must be placed under `{DATA_DIR}/audio`
- Text files must be placed under `{DATA_DIR}/text`

In [None]:
import json
import os
import time
import requests
from tqdm import tqdm
from IPython.display import Audio, display

# ---------------------------------------------------------------------------
# ASR models
# ---------------------------------------------------------------------------
NVIDIA_FASTCONFORMER = "nvidia/stt_fa_fastconformer_hybrid_large"
WAV2VEC2_FA = "masoudmzb/wav2vec2-xlsr-multilingual-53-fa"
HEZAR = 'hezarai/whisper-small-fa'
# The Vosk names are defined here but are not included in MODELS.
VOSK_SMALL = 'vosk-model-small-fa-0.42'
VOSK_BIG = 'vosk-model-fa-0.42'
# Space-separated model list handed to run_filter_multiple.sh.
MODELS = " ".join((NVIDIA_FASTCONFORMER, WAV2VEC2_FA, HEZAR))

# ---------------------------------------------------------------------------
# Datasets to process: (DATASET_DIR, GDRIVE_ID) pairs
# ---------------------------------------------------------------------------
dataset_pairs = [
    ("dataset1", "GDRIVE_ID_1"),
    # ("dataset2", "GDRIVE_ID_2"),
    # ("dataset3", "GDRIVE_ID_3"),
]

# File that collects the output of every pipeline step.
log_file = "/content/processing_output.log"

# ---------------------------------------------------------------------------
# Segmentation options (forwarded to run_segmentation.sh)
# ---------------------------------------------------------------------------
LANG_ID = 'fa'      # sent as --LANGUAGE
THRESHOLD = -2      # sent as --MIN_SCORE
REMOVE_BRACKETS = False
REMOVE_ASTERISKS = False
REMOVE_PARENTHESES = False
REMOVE_SPEAKER_LABELS = False
SPLIT_USING_PATTERN = True
SPLIT_ON_QUOTES = False
SPLIT_ON_VERBS = True
SPLIT_ON_VERBS_MIN_WORDS = 5
SPLIT_ON_VERBS_MAX_WORDS = 24
# Extra split symbols, separated by "|". Per the original note, put a backslash
# before "?", "!", "." and "|" because the value is used as a regex.
ADDITIONAL_SPLIT_SYMBOLS = ""

# ---------------------------------------------------------------------------
# Filtering options (forwarded to run_filter_multiple.sh)
# ---------------------------------------------------------------------------
CER_THRESHOLD = 40
WER_THRESHOLD = 75
CER_EDGE_THRESHOLD = 75
LEN_DIFF_RATIO_THRESHOLD = 0.4
MIN_DURATION = 1
MAX_DURATION = 20
EDGE_LEN = 7

# ---------------------------------------------------------------------------
# Dataset preparation options (forwarded to run_prepare_dataset.sh)
# ---------------------------------------------------------------------------
OUTPUT_FORMAT = 'wav'
MODE = "ganjoor"

In [None]:
def get_text_api(id, timeout=30):
    """Fetch the verses attached to a Ganjoor audio recording.

    Args:
        id: Identifier interpolated into the
            ``https://api.ganjoor.net/api/audio/verses/{id}`` URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        Otherwise a string starting with ``"Error:"`` that carries the status
        code, so callers must check the type of the result before using it.

    Raises:
        Whatever ``requests.get`` raises for transport problems, including a
        timeout; these are not converted into the error string.
    """
    url = f'https://api.ganjoor.net/api/audio/verses/{id}'
    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Non-OK answers are reported as a message rather than an exception.
        return f"Error: Unable to fetch poem. Status code: {response.status_code}"
    return response.json()


# Function to run the bash scripts for each dataset
def process_dataset(dataset_dir, gdrive_id):
    zip_path = f"/content/NeMo/{dataset_dir}.zip"
    WORK_DIR = 'WORK_DIR'
    data_dir = WORK_DIR + '/DATA'
    output_dir = WORK_DIR + "/output"
    
    # Writing the separator for the current dataset pair
    print(f"{'='*20}\nProcessing dataset: {dataset_dir} with GDrive ID: {gdrive_id}\n{'='*20}")
    with open(log_file, 'a') as f:
        f.write(f"\n{'='*20}\nProcessing dataset: {dataset_dir} with GDrive ID: {gdrive_id}\n{'='*20}\n")

    # Download and unzip the dataset
    !gdown $gdrive_id -O $data_dir/audio.zip >> $log_file 2>&1
    !unzip -q $data_dir/audio.zip -d $data_dir >> $log_file 2>&1


    # Remove the output directory if it exists
    !rm -rf $output_dir
    
    # remove texts because it doesn't contain punctuation
    !rm -rf $DATA_DIR/text

    os.makedirs(DATA_DIR + '/text', exist_ok=True)
    ids = [int(f.split('.')[0]) for f in os.listdir(f'{DATA_DIR}/audio') if f.endswith('.mp3')]
    with open(log_file, 'a') as f:
        f.write(f"Fetching text for {len(ids)} audio files\n")
    for id in tqdm(ids):
        text = get_text_api(id)
        text = "\n".join([x['verseText'] for x in text[1:]])
        with open(f'{DATA_DIR}/text/{id}.txt', 'w') as f:
            f.write(text)

    # Run the segmentation script
    segmentation_command = f"""
    !bash $TOOLS_DIR/../run_segmentation.sh \
    --MODEL_NAME_OR_PATH={NVIDIA_FASTCONFORMER} \
    --DATA_DIR={data_dir} \
    --OUTPUT_DIR={output_dir} \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --REMOVE_BRACKETS={REMOVE_BRACKETS} \
    --REMOVE_ASTERISKS={REMOVE_ASTERISKS} \
    --REMOVE_PARANTHESES={REMOVE_PARENTHESES} \
    --REMOVE_SPEAKER_LABELS={REMOVE_SPEAKER_LABELS} \
    --SPLIT_USING_PATTERN={SPLIT_USING_PATTERN} \
    --SPLIT_ON_QUOTES={SPLIT_ON_QUOTES} \
    --SPLIT_ON_VERBS={SPLIT_ON_VERBS} \
    --SPLIT_ON_VERBS_MIN_WORDS={SPLIT_ON_VERBS_MIN_WORDS} \
    --SPLIT_ON_VERBS_MAX_WORDS={SPLIT_ON_VERBS_MAX_WORDS} \
    --ADDITIONAL_SPLIT_SYMBOLS={ADDITIONAL_SPLIT_SYMBOLS} \
    --LANGUAGE={LANG_ID} \
    --MIN_SCORE={THRESHOLD} \
    --USE_NEMO_NORMALIZATION=False
    """
    print(f"{'-'*10}\n Running CTC segmentation for {dataset_dir}\n{'-'*10}")
    with open(log_file, 'a') as f:
        f.write(segmentation_command + "\n")
    !bash $TOOLS_DIR/../run_segmentation.sh \
    --MODEL_NAME_OR_PATH=$NVIDIA_FASTCONFORMER \
    --DATA_DIR=$data_dir \
    --OUTPUT_DIR=$output_dir \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --REMOVE_BRACKETS=$REMOVE_BRACKETS \
    --REMOVE_ASTERISKS=$REMOVE_ASTERISKS \
    --REMOVE_PARANTHESES=$REMOVE_PARENTHESES \
    --REMOVE_SPEAKER_LABELS=$REMOVE_SPEAKER_LABELS \
    --SPLIT_USING_PATTERN=$SPLIT_USING_PATTERN \
    --SPLIT_ON_QUOTES=$SPLIT_ON_QUOTES \
    --SPLIT_ON_VERBS=$SPLIT_ON_VERBS \
    --SPLIT_ON_VERBS_MIN_WORDS=$SPLIT_ON_VERBS_MIN_WORDS \
    --SPLIT_ON_VERBS_MAX_WORDS=$SPLIT_ON_VERBS_MAX_WORDS \
    --ADDITIONAL_SPLIT_SYMBOLS=$ADDITIONAL_SPLIT_SYMBOLS \
    --LANGUAGE=$LANG_ID \
    --MIN_SCORE=$THRESHOLD \
    --USE_NEMO_NORMALIZATION=False >> $log_file 2>&1

    # Run the filtering script
    filtering_command = f"""
    !bash $TOOLS_DIR/../run_filter_multiple.sh \
    --MODEL_NAME_OR_PATH="{MODELS}" \
    --INPUT_AUDIO_DIR={data_dir}/audio \
    --MANIFEST={output_dir}/manifests/manifest.json \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --CER_THRESHOLD={CER_THRESHOLD} \
    --WER_THRESHOLD={WER_THRESHOLD} \
    --CER_EDGE_THRESHOLD={CER_EDGE_THRESHOLD} \
    --LEN_DIFF_RATIO_THRESHOLD={LEN_DIFF_RATIO_THRESHOLD} \
    --MIN_DURATION={MIN_DURATION} \
    --MAX_DURATION={MAX_DURATION} \
    --EDGE_LEN={EDGE_LEN}
    """
    print(f"{'-'*10}\n Running filtering by ASRs {MODELS} for {dataset_dir}\n{'-'*10}")
    with open(log_file, 'a') as f:
        f.write(filtering_command + "\n")
    !bash $TOOLS_DIR/../run_filter_multiple.sh \
    --MODEL_NAME_OR_PATH="$MODELS" \
    --INPUT_AUDIO_DIR=$data_dir/audio \
    --MANIFEST=$output_dir/manifests/manifest.json \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --CER_THRESHOLD=$CER_THRESHOLD \
    --WER_THRESHOLD=$WER_THRESHOLD \
    --CER_EDGE_THRESHOLD=$CER_EDGE_THRESHOLD \
    --LEN_DIFF_RATIO_THRESHOLD=$LEN_DIFF_RATIO_THRESHOLD \
    --MIN_DURATION=$MIN_DURATION \
    --MAX_DURATION=$MAX_DURATION \
    --EDGE_LEN=$EDGE_LEN >> $log_file 2>&1

    # Process the JSON file
    manifest_file = f'{output_dir}/manifests/manifest_transcribed_metrics_filtered.json'
    with open(manifest_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    try:
        num_lines = len(lines)
        for i, line in enumerate(lines):
            if i % (num_lines // 10) == 0:
                x = json.loads(line.strip())
                display(Audio(x['audio_filepath']))
                time.sleep(1)
                print('Ground Truth: ')
                print(x['text'])
                print(f'Best hypothesis from {x["model_name"]}')
                print(x['pred_text'])
                print(f"WER : {x['WER']}, CER: {x['CER']}, Start CER: {x['start_CER']}, End CER: {x['end_CER']}, Alignment score: {x['score']}")
                print('*' * 20)
    except:
        print("Error in displaying the audio files")

    # Prepare the dataset
    preparation_command = f"""
    !bash $TOOLS_DIR/../run_prepare_dataset.sh \
    --INPUT_AUDIO_DIR={data_dir}/audio \
    --MANIFEST={output_dir}/manifests/manifest_transcribed_metrics_filtered.json \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --OUTPUT_DIR={output_dir} \
    --OUTPUT_FORMAT={OUTPUT_FORMAT} \
    --MODE={MODE} \
    --DATASET_DIR={dataset_dir}
    """
    print(f"{'-'*10}\n Preparing dataset {dataset_dir}\n{'-'*10}")
    with open(log_file, 'a') as f:
        f.write(preparation_command + "\n")
    !bash $TOOLS_DIR/../run_prepare_dataset.sh \
    --INPUT_AUDIO_DIR=$data_dir/audio \
    --MANIFEST=$output_dir/manifests/manifest_transcribed_metrics_filtered.json \
    --SCRIPTS_DIR=$TOOLS_DIR \
    --OUTPUT_DIR=$output_dir \
    --OUTPUT_FORMAT=$OUTPUT_FORMAT \
    --MODE=$MODE \
    --DATASET_DIR=$dataset_dir >> $log_file 2>&1

    # Save dataset to Google Drive
    %cd /content
    !mkdir -p /content/drive/MyDrive/Ganjoor_Dataset
    !cp $zip_path /content/drive/MyDrive/Ganjoor_Dataset >> $log_file 2>&1
    # copy logfile
    !cp -f $log_file /content/drive/MyDrive/Ganjoor_Dataset >> $log_file 2>&1
    print(f"Dataset is saved in /content/drive/MyDrive/Ganjoor_Dataset/{dataset_dir}.zip")

In [None]:
# Iterate over the dataset pairs and process each
for i, (dataset_dir, gdrive_id) in enumerate(dataset_pairs):
    if i != 0:
        !rm -rf /content/NeMo/WORK_DIR/DATA/audio/*
        !rm -rf /content/NeMo/WORK_DIR/DATA/text/*
        !rm -rf /content/NeMo/WORK_DIR/DATA/audio.zip
        %cd NeMo
    process_dataset(dataset_dir, gdrive_id)