In [1]:
import torch

# Report whether PyTorch can reach a CUDA device before any model is loaded.
print(
    "CUDA is available."
    if torch.cuda.is_available()
    else "CUDA is not available. Check if your GPU drivers are properly installed."
)


CUDA is available.


In [2]:
import os
# Work from the project root so the relative paths used later in the notebook
# (e.g. "./tokens/HF_token.txt") resolve against it.
# NOTE(review): presumably this is also what lets the project-local imports further
# down be found — confirm.
# NOTE(review): '/app/' is a hard-coded absolute path; the notebook only runs where
# the project lives at that location.
os.chdir('/app/')


import librosa
import random
import numpy as np
import IPython.display as ipd
import pickle
import pandas as pd
from datasets import Dataset
from datasets import Audio
from torch.utils.data import ConcatDataset
from transformers import WhisperProcessor
import mutagen.mp3
from tqdm import tqdm
import json
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, RoomSimulator
import srt
import re
from IPython.display import clear_output
from transformers import WhisperForConditionalGeneration
from huggingface_hub import login



#our libraries
from global_variables.training_vars import *
from global_variables.folders import *
from parashat_hashavua_dataset import *
from nikud_and_teamim import just_teamim,remove_nikud


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Authenticate against the Hugging Face Hub with a token kept in a text file
# outside the notebook.
with open("./tokens/HF_token.txt", "r") as token_file:
    # strip() drops the trailing "\n" (and any other surrounding whitespace)
    HF_TOKEN = token_file.read().strip()
login(token=HF_TOKEN)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Batch speech-to-text samples, padding audio features and label ids separately.

    Attributes are documented inline. Calling the instance on a list of samples
    (each a mapping with "input_features" and "labels") returns the padded audio
    batch produced by ``processor.feature_extractor.pad`` with an extra "labels"
    entry in which padded positions are set to -100.
    """

    # Object exposing ``feature_extractor.pad``, ``tokenizer.pad`` and
    # ``tokenizer.bos_token_id``.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and labels have different lengths and padding rules, so each goes
        # through its own padder.
        audio_items = []
        for sample in features:
            audio_items.append({"input_features": sample["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label token ids are padded by the tokenizer up to the longest sequence.
        token_items = []
        for sample in features:
            token_items.append({"input_ids": sample["labels"]})
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so the loss skips them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every row already starts with the BOS token (added during tokenization),
        # drop that column because it gets prepended again later.
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [5]:
import concurrent.futures
import evaluate
import time
import cantilLocations_evaluation


# # possible metrics : "wer", "cer", "bleu", "rouge", "sacrebleu", "sari":
# # 1. `wer`: Word Error Rate.
# # 2. `cer`: Character Error Rate.
# # 3. `bleu`: Bilingual Evaluation Understudy.
# # 4. `rouge`: Recall-Oriented Understudy for Gisting Evaluation.
# # 5. `sacrebleu`: A standardized BLEU score implementation for more consistent machine translation evaluation.
# # 6. `sari`: System output Against References and against the Input sentence (a text-simplification metric).

# Word-error-rate metric, loaded once at module level so compute_metrics() can
# reuse the same object on every evaluation call.
WER_CALCULATOR = evaluate.load("wer")
def compute_metrics(pred):
    """Compute WER and cantillation precision/recall/F1 statistics for one evaluation pass.

    Relies on the module-level ``processor`` (for decoding) and ``WER_CALCULATOR``.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` token-id arrays.
            Entries equal to -100 are treated as padding in both arrays. The
            arrays on ``pred`` are not modified.

    Returns:
        dict with, in this order:
          * "wer" — word error rate in percent;
          * "avg_precision_<m>", "avg_recall_<m>", "avg_f1_<m>" for every method m;
          * "precision_median_exact", "recall_median_exact", "f1_median_exact";
          * "precision_max_exact", "recall_max_exact", "f1_max_exact";
          * "precision_min_<m>", "recall_min_<m>", "f1_min_<m>" for every method m.

    Side effects:
        Prints the best sample (by "Exact" F1), the worst sample for every method,
        and the time spent on decoding and on the WER computation.
    """
    eval_list = cantilLocations_evaluation.calculate_precision_recall_f1_for_string_list_with_method_list

    # method to calculate the metrics (method can be "Exact", "Letter_Shift", "Word_Level", "Word_Shift")
    methods = ["Exact", "Letter_Shift", "Word_Level", "Word_Shift"]

    tokenizer = processor.tokenizer
    pad_id = tokenizer.pad_token_id

    # -100 is the ignore/padding sentinel and is not a decodable token id, so map it to
    # the pad token. np.where builds new arrays, leaving the caller's arrays untouched.
    # Predictions get the same treatment because -100 can also be used as the fill
    # value when prediction arrays of different widths are gathered.
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    pred_ids = np.where(pred.predictions == -100, pad_id, pred.predictions)

    decode_started = time.time()
    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    decode_time = time.time() - decode_started

    # results[i] holds the per-sample (precision, recall, f1) lists for methods[i]
    results = eval_list(pred_str, label_str, methods)
    scores_by_method = {method: results[i] for i, method in enumerate(methods)}

    # Median, maximum and the best sample are reported for the "Exact" method only.
    exact_scores = scores_by_method["Exact"]
    precision_list_exact = exact_scores[0]
    recall_list_exact = exact_scores[1]
    f1_list_exact = exact_scores[2]
    f1_max_exact = np.max(f1_list_exact)
    best_index = np.argmax(f1_list_exact)

    wer_started = time.time()
    wer = 100 * WER_CALCULATOR.compute(predictions=pred_str, references=label_str)
    wer_time = time.time() - wer_started

    # best:
    print(f"best f1 for {methods[0]}: {f1_max_exact}\nbest pred: {pred_str[best_index]}\nbest label: {label_str[best_index]}\n")

    # worst (the worst for each method):
    for method in methods:
        f1_list = scores_by_method[method][2]
        worst_index = np.argmin(f1_list)
        print(f"worst f1 for {method}: {np.min(f1_list)}\nworst pred: {pred_str[worst_index]}\nworst label: {label_str[worst_index]}\n")

    print("Time taken for each part:")
    print(f"Decode calculation: {decode_time} seconds")
    print(f"WER calculation: {wer_time} seconds")

    # Assemble the result; insertion order matches the documented key order.
    metrics = {"wer": wer}
    for method in methods:
        precisions, recalls, f1s = scores_by_method[method][0], scores_by_method[method][1], scores_by_method[method][2]
        metrics["avg_precision_" + method] = np.mean(precisions)
        metrics["avg_recall_" + method] = np.mean(recalls)
        metrics["avg_f1_" + method] = np.mean(f1s)
    metrics["precision_median_exact"] = np.median(precision_list_exact)
    metrics["recall_median_exact"] = np.median(recall_list_exact)
    metrics["f1_median_exact"] = np.median(f1_list_exact)
    metrics["precision_max_exact"] = np.max(precision_list_exact)
    metrics["recall_max_exact"] = np.max(recall_list_exact)
    metrics["f1_max_exact"] = f1_max_exact
    for method in methods:
        precisions, recalls, f1s = scores_by_method[method][0], scores_by_method[method][1], scores_by_method[method][2]
        metrics["precision_min_" + method] = np.min(precisions)
        metrics["recall_min_" + method] = np.min(recalls)
        metrics["f1_min_" + method] = np.min(f1s)
    return metrics

In [6]:
# NOTE(review): this flag is not read by any cell visible in this part of the notebook;
# presumably it toggles re-initialising the model's output layer — confirm or remove.
INIT_OUTPUT_LAYER = False

In [7]:
# Descriptive metadata for this fine-tune: language, model name, base checkpoint, task and tags.
# NOTE(review): not consumed by any cell visible in this part of the notebook;
# presumably intended for a later push-to-hub call — confirm.
kwargs = dict(
    # dataset_args="config: he, split: test",
    language="he",
    model_name="he-cantillation",
    finetuned_from=BASE_MODEL_NAME,
    tasks="automatic-speech-recognition-cantillation",
    tags="hf-asr-leaderboard",
)

# Test the model

In [8]:
# Identifier of the fine-tuned checkpoint to evaluate; it is what gets passed to
# from_pretrained() for both the model and the processor.
model_name = "cantillation/Teamim-large-v2_WeightDecay-0.05_Augmented_Combined-Data_date-14-07-2024_18-24"

In [9]:
# Fetch the fine-tuned checkpoint and its matching processor
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)

# Build the test split only (train and validation are switched off).
# NOTE(review): augment=AUGMENT is forwarded for the test split as well — confirm the
# dataset class does not augment evaluation audio.
test_data = parashat_hashavua_dataset(
    new_data="other",
    few_data=FASTTEST,
    train=False,
    validation=False,
    test=True,
    random=False,
    num_of_words_in_sample=1,
    augment=AUGMENT,
    processor=processor,
)

# Collator that pads audio features and label ids for each batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


Downloading shards: 100%|██████████| 2/2 [02:10<00:00, 65.45s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dict_keys(['text', 'audio'])


Loading audio nusach (1/1): 100%|██████████| 13/13 [00:00<00:00, 35.74it/s]

Num of missing files in audio nusach:  0





In [10]:
from transformers import Seq2SeqTrainingArguments

# All trainer settings are collected in one mapping and unpacked into
# Seq2SeqTrainingArguments in a single call.
training_arg_values = {
    # --- output / hub ---
    "output_dir": MODEL_NAME,  # change to a repo name of your choice
    # NOTE(review): push_to_hub stays enabled although this notebook only evaluates — confirm this is intended.
    "push_to_hub": True,
    "report_to": ["tensorboard"],
    # "run_name": RUN_NAME,  # it doesn't work

    # --- optimisation ---
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": 1,  # increase by 2x for every 2x decrease in batch size
    "learning_rate": LR,  # was 1e-5
    "warmup_steps": WARMUP_STEPS,  # was 500
    "max_steps": MAX_STEPS,  # was 4000
    "weight_decay": WEIGHT_DECAY,
    "gradient_checkpointing": True,
    # set explicitly to silence the UserWarning: "The default value of use_reentrant will be updated to be False in the future."
    "gradient_checkpointing_kwargs": {'use_reentrant': False},
    # fp16 cannot be used on CPU, only on CUDA
    "fp16": torch.cuda.is_available(),

    # --- evaluation / generation ---
    "per_device_eval_batch_size": 32,
    "predict_with_generate": True,
    "eval_strategy": "steps",
    "generation_max_length": 225,  # without this, it crashes TensorBoard
    "eval_steps": EVAL_STEPS,
    "save_steps": SAVE_STEPS,
    "logging_steps": 1,

    # --- best-model selection ---
    "load_best_model_at_end": True,
    "metric_for_best_model": "avg_f1_Exact",  # any "avg_f1_..." key, e.g. "avg_f1_Exact"
    "greater_is_better": True,  # the selection metric is an F1 score, so greater is better

    # --- data loading ---
    # dataloader_prefetch_factor is set explicitly to support newer versions of torch
    # (it must now be an int and not None; the default is 2).
    "dataloader_prefetch_factor": 2,
    "dataloader_num_workers": 1,  # parallelize the data loading
}

training_args = Seq2SeqTrainingArguments(**training_arg_values)


In [11]:
from transformers import Seq2SeqTrainer
# Evaluation-only trainer: no train_dataset is supplied, only the test split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# evaluate the model; evaluate() returns the metrics dictionary
results = trainer.evaluate()
print(results)

# Save the metrics as JSON. The file is named after the checkpoint that was actually
# evaluated (model_name) rather than the training-config MODEL_NAME, so results from
# different checkpoints are neither mislabeled nor written over each other.
results_path = f"results_{model_name.split('/')[-1]}.json"
with open(results_path, 'w') as f:
    json.dump(results, f, indent=4)


max_steps is given, it will override any value given in num_train_epochs
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


best f1 for Exact: 0.6829268292682927
best pred: קט֔ן לממש֣לת הל֔ילה וא֖ת הכֽוכב֑ים וית֥ן את֛ם אלה֖ים ברק֣יע השמ֑ים להא֖יר על־ הא֑רץ ונמשל֙ בי֣ום ובל֔ילה וֽלהבד֔יל
best label:  הקטן֙ לממש֣לת הל֔ילה וא֖ת הכוכבֽים׃ וית֥ן את֛ם אלה֖ים ברק֣יע השמ֑ים להא֖יר על־ האֽרץ׃ ולמשל֙ בי֣ום ובל֔ילה וֽלהבד֔יל ב֥ין הא֖ור וב֣ין הח֑שך וי֥רא אלה֖ים 

worst f1 for Exact: 0.0
worst pred: י֣רב בא֑רץ וי֣רם בב֡קר י֣ום חמיש֗י וי֣אמר אלהים֒ תוצ֨א הא֜רץ נ֤פש חי֣ה למינה֙
worst label: י֥רב באֽרץ׃ וֽיהי־ ע֥רב וֽיהי־ ב֖קר י֥ום חמישֽי׃ וי֣אמר אלה֗ים תוצ֨א הא֜רץ נ֤פש חיה֙ למינ֔ה 

worst f1 for Letter_Shift: 0.0
worst pred: י֣רב בא֑רץ וי֣רם בב֡קר י֣ום חמיש֗י וי֣אמר אלהים֒ תוצ֨א הא֜רץ נ֤פש חי֣ה למינה֙
worst label: י֥רב באֽרץ׃ וֽיהי־ ע֥רב וֽיהי־ ב֖קר י֥ום חמישֽי׃ וי֣אמר אלה֗ים תוצ֨א הא֜רץ נ֤פש חיה֙ למינ֔ה 

worst f1 for Word_Level: 0.0
worst pred: י֣רב בא֑רץ וי֣רם בב֡קר י֣ום חמיש֗י וי֣אמר אלהים֒ תוצ֨א הא֜רץ נ֤פש חי֣ה למינה֙
worst label: י֥רב באֽרץ׃ וֽיהי־ ע֥רב וֽיהי־ ב֖קר י֥ום חמישֽי׃ וי֣אמר אלה֗ים תוצ֨א הא֜רץ נ֤פש חיה֙ למ

In [12]:
# NOTE: these numbers come from a small test set only; the models still need to be
# evaluated on a larger test set before drawing firm conclusions.

# Model trained on a single dataset (the one whose style is slightly closer to the test data):
{'eval_loss': 0.7734131813049316, 'eval_wer': 61.811023622047244, 'eval_avg_precision_Exact': 0.46710057318055453, 'eval_avg_recall_Exact': 0.5216817130186114, 'eval_avg_f1_Exact': 0.4904648408291106, 'eval_avg_precision_Letter_Shift': 0.4789416732569487, 'eval_avg_recall_Letter_Shift': 0.5334662248031231, 'eval_avg_f1_Letter_Shift': 0.5022712162718498, 'eval_avg_precision_Word_Level': 0.500396535917345, 'eval_avg_recall_Word_Level': 0.5445642177192979, 'eval_avg_f1_Word_Level': 0.518676288124462, 'eval_avg_precision_Word_Shift': 0.5926319891153651, 'eval_avg_recall_Word_Shift': 0.6415232658013406, 'eval_avg_f1_Word_Shift': 0.6130332240673503, 'eval_precision_median_exact': 0.5185185185185185, 'eval_recall_median_exact': 0.5714285714285714, 'eval_f1_median_exact': 0.5384615384615384, 'eval_precision_max_exact': 0.7857142857142857, 'eval_recall_max_exact': 0.8181818181818182, 'eval_f1_max_exact': 0.8, 'eval_precision_min_Exact': 0.0, 'eval_recall_min_Exact': 0.0, 'eval_f1_min_Exact': 0.0, 'eval_precision_min_Letter_Shift': 0.0, 'eval_recall_min_Letter_Shift': 0.0, 'eval_f1_min_Letter_Shift': 0.0, 'eval_precision_min_Word_Level': 0.0, 'eval_recall_min_Word_Level': 0.0, 'eval_f1_min_Word_Level': 0.0, 'eval_precision_min_Word_Shift': 0.25, 'eval_recall_min_Word_Shift': 0.2777777777777778, 'eval_f1_min_Word_Shift': 0.2631578947368421}

# Model trained on two datasets (the second dataset differs much more from the test data):
{'eval_loss': 1.015082836151123, 'eval_wer': 61.811023622047244, 'eval_avg_precision_Exact': 0.49305580535527055, 'eval_avg_recall_Exact': 0.4932712533026013, 'eval_avg_f1_Exact': 0.49279243354847346, 'eval_avg_precision_Letter_Shift': 0.4963025586020238, 'eval_avg_recall_Letter_Shift': 0.4964060495408457, 'eval_avg_f1_Letter_Shift': 0.49598222621195037, 'eval_avg_precision_Word_Level': 0.5137405627001068, 'eval_avg_recall_Word_Level': 0.5019255300603264, 'eval_avg_f1_Word_Level': 0.5073072310311691, 'eval_avg_precision_Word_Shift': 0.6237927402365905, 'eval_avg_recall_Word_Shift': 0.613736202842786, 'eval_avg_f1_Word_Shift': 0.6182032164583927, 'eval_precision_median_exact': 0.5294117647058824, 'eval_recall_median_exact': 0.5416666666666666, 'eval_f1_median_exact': 0.5142857142857143, 'eval_precision_max_exact': 0.7142857142857143, 'eval_recall_max_exact': 0.6818181818181818, 'eval_f1_max_exact': 0.6976744186046512, 'eval_precision_min_Exact': 0.05, 'eval_recall_min_Exact': 0.05, 'eval_f1_min_Exact': 0.05000000000000001, 'eval_precision_min_Letter_Shift': 0.05, 'eval_recall_min_Letter_Shift': 0.05, 'eval_f1_min_Letter_Shift': 0.05000000000000001, 'eval_precision_min_Word_Level': 0.05, 'eval_recall_min_Word_Level': 0.05, 'eval_f1_min_Word_Level': 0.05000000000000001, 'eval_precision_min_Word_Shift': 0.3, 'eval_recall_min_Word_Shift': 0.3, 'eval_f1_min_Word_Shift': 0.3}

# Comparison of the two runs above: the two-dataset model has a marginally higher
# average F1 for Exact (0.4928 vs 0.4905) and Word_Shift (0.6182 vs 0.6130), but a
# lower one for Letter_Shift (0.4960 vs 0.5023) and Word_Level (0.5073 vs 0.5187).
# Its eval_loss is higher (1.015 vs 0.773) and eval_wer is identical in both runs (61.81).

{'eval_loss': 1.015082836151123,
 'eval_wer': 61.811023622047244,
 'eval_avg_precision_Exact': 0.49305580535527055,
 'eval_avg_recall_Exact': 0.4932712533026013,
 'eval_avg_f1_Exact': 0.49279243354847346,
 'eval_avg_precision_Letter_Shift': 0.4963025586020238,
 'eval_avg_recall_Letter_Shift': 0.4964060495408457,
 'eval_avg_f1_Letter_Shift': 0.49598222621195037,
 'eval_avg_precision_Word_Level': 0.5137405627001068,
 'eval_avg_recall_Word_Level': 0.5019255300603264,
 'eval_avg_f1_Word_Level': 0.5073072310311691,
 'eval_avg_precision_Word_Shift': 0.6237927402365905,
 'eval_avg_recall_Word_Shift': 0.613736202842786,
 'eval_avg_f1_Word_Shift': 0.6182032164583927,
 'eval_precision_median_exact': 0.5294117647058824,
 'eval_recall_median_exact': 0.5416666666666666,
 'eval_f1_median_exact': 0.5142857142857143,
 'eval_precision_max_exact': 0.7142857142857143,
 'eval_recall_max_exact': 0.6818181818181818,
 'eval_f1_max_exact': 0.6976744186046512,
 'eval_precision_min_Exact': 0.05,
 'eval_recall_m