## **4. Combine an *n-gram* with Wav2Vec2**

In a final step, we want to wrap the *5-gram* into a `Wav2Vec2ProcessorWithLM` object to make the *5-gram* boosted decoding as seamless as shown in Section 1.
We start by downloading the currently "LM-less" processor of [`xls-r-300m-sv`](https://huggingface.co/hf-test/xls-r-300m-sv).

In [1]:
import os
# Switch to the studio root so the relative 'FinalProject/...' and 'models/...' paths used in later cells resolve.
# NOTE(review): this absolute path is specific to one environment — adjust it when running elsewhere.
os.chdir('/teamspace/studios/this_studio/')

In [2]:
!sudo apt-get install ffmpeg

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:4.2.7-0ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


In [3]:
#run inference on test dataset first example
import soundfile as sf
import torch
from IPython.display import Audio
import numpy as np
from transformers import Wav2Vec2CTCTokenizer, SeamlessM4TFeatureExtractor, Wav2Vec2BertForCTC, Wav2Vec2ProcessorWithLM, Wav2Vec2BertProcessor


In [4]:
!pip install pydub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
os.getcwd()

'/teamspace/studios/this_studio'

In [6]:
# Extract the dafyomi audio-clips archive (first run only) and load every clip with pydub.
import zipfile
import os
import shutil
from pydub import AudioSegment

# Single source of truth for the clip directory; the archive sits next to it as '<dir>.zip'.
AUDIO_CLIPS_DIR = 'FinalProject/wav2vec-kenlm/data/dafyomi/audio_clips'

if not os.path.exists(AUDIO_CLIPS_DIR):
    with zipfile.ZipFile(AUDIO_CLIPS_DIR + '.zip', 'r') as zip_ref:
        zip_ref.extractall(AUDIO_CLIPS_DIR)

# os.listdir() returns entries in arbitrary order, so sort the names once here: index 0 is then
# the same "first" clip as the first key of `audio_files`. Hidden entries and sub-directories
# (e.g. archive metadata) are skipped because AudioSegment.from_file cannot decode them.
audio_files_paths = sorted(
    name
    for name in os.listdir(AUDIO_CLIPS_DIR)
    if not name.startswith('.') and os.path.isfile(os.path.join(AUDIO_CLIPS_DIR, name))
)

# Map file name -> decoded AudioSegment; insertion order follows the sorted name list.
audio_files = {
    name: AudioSegment.from_file(os.path.join(AUDIO_CLIPS_DIR, name))
    for name in audio_files_paths
}
audio_files

{'batra_155_01.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbac0>,
 'batra_155_02.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583f9a80>,
 'metzia_02_93_01.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbca0>,
 'metzia_02_93_02.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbb80>,
 'metzia_155_04.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fa980>,
 'metzia_91_01.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fb760>,
 'metzia_91_02.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fa5f0>,
 'metzia_91_03.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbaf0>,
 'metzia_93_01.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbb50>,
 'metzia_93_02.mp3': <pydub.audio_segment.AudioSegment at 0x7fb4583fbc10>}

In [7]:
# Listen to the first clip (by file name) as a quick sanity check.
from IPython.display import Audio
# The directory listing has no guaranteed order, so pick the alphabetically-first name explicitly;
# this is the same clip that comes first in the name-sorted `audio_files` dict.
first_clip_name = min(audio_files_paths)
Audio(filename='FinalProject/wav2vec-kenlm/data/dafyomi/audio_clips/' + first_clip_name)

In [8]:
# Prepare every clip for the acoustic models: 16 kHz, mono, float32 samples scaled into [-1, 1].

def _peak_normalize(samples):
    """Return `samples` as float32, divided by their largest absolute value.

    The cast to float32 happens before taking the absolute value so that an int16 sample of
    -32768 does not wrap around. Empty or all-zero input is returned unscaled instead of
    raising or producing NaNs from a division by zero.
    """
    samples = np.asarray(samples, dtype=np.float32)
    peak = np.abs(samples).max() if samples.size else 0.0
    if peak == 0:
        return samples
    return samples / peak


# Resample to 16 kHz and downmix to one channel. Without the downmix, a stereo clip yields
# interleaved left/right samples from get_array_of_samples(), which is not a valid waveform.
audio_files_converted_frame_rate = {
    file: audio.set_frame_rate(16000).set_channels(1)
    for file, audio in audio_files.items()
}
# Raw integer PCM samples per file.
audio_numpy_arrays = {
    file: np.array(audio.get_array_of_samples())
    for file, audio in audio_files_converted_frame_rate.items()
}
# Peak-normalised float32 waveforms per file; this is what the models consume.
audio_cleaned = {file: _peak_normalize(samples) for file, samples in audio_numpy_arrays.items()}


In [9]:
# #get first 10 seconds of the audio
# from pydub import AudioSegment
# from pydub.utils import make_chunks

# import math

# # Load your MP3 file
# audio = AudioSegment.from_mp3("FinalProject/wav2vec-kenlm/data/dafyomi/batra_155.mp3")
# # Define the length of each chunk in milliseconds
# chunk_length_ms = 10000  # 10 seconds * 1000 ms/sec
# chunks = make_chunks(audio, chunk_length_ms) 
# chunks = [chunk.set_frame_rate(16000).set_channels(1) for chunk in chunks]
# chunks = [np.array(chunk.get_array_of_samples()) for chunk in chunks]
# chunks = [chunk.astype(np.float32) / np.abs(chunk).max() for chunk in chunks]
# # Calculate the number of chunks to split the file into
# # num_chunks = math.ceil(len(audio) / chunk_length_ms)
# # chunks = []
# # Split the audio and save each chunk
# # for i in range(num_chunks):
# #     start_ms = i * chunk_length_ms
# #     end_ms = min((i + 1) * chunk_length_ms, len(audio))
# #     chunk = audio[start_ms:end_ms]
# #     chunks.append(chunk)

In [10]:
from transformers import AutoProcessor, AutoModelForCTC, WhisperForConditionalGeneration, WhisperProcessor
import librosa
class ASRModel:
    """Pairs a speech-recognition model with its processor and turns a waveform into text.

    Two families are handled: CTC models (greedy argmax decoding, or decoding through an
    optional language-model processor passed as `lm_model`) and Whisper models
    (`whisper=True`, decoded with `generate`).

    Args:
        model_name: Checkpoint name or path handed to `from_pretrained`. Required unless both
            `model` and a processor (or feature extractor + tokenizer) are supplied.
        model: Optional already-loaded model. When given, nothing is loaded from `model_name`.
        processor: Optional already-loaded processor.
        feature_extractor, tokenizer: Optional pair from which a processor is assembled. When
            both are given they take precedence over `processor`.
        lm_model: Optional object exposing `batch_decode(logits, **kwargs)` whose result has a
            `.text` attribute; used instead of argmax decoding. Any falsy value disables it.
        lm_params: Optional dict of keyword arguments forwarded to `lm_model.batch_decode`.
        alpha: Stored on the instance as `self.alpha`; not read anywhere in this class.
        whisper: Select the Whisper code path instead of the CTC one.

    Raises:
        ValueError: If no processor or no model can be resolved from the arguments.
    """

    def __init__(self, model_name=None, model=None, processor=None, feature_extractor=None, tokenizer=None,
                 lm_model=False,
                 lm_params=None,
                 alpha = 0.5,
                 whisper=False):
        self.model_name = model_name
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.lm_model = lm_model
        self.lm_params = lm_params
        self.alpha = alpha
        self.whisper = whisper
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Resolve the processor: explicit parts > explicit processor > checkpoint default.
        if feature_extractor is not None and tokenizer is not None:
            self.processor = self._build_processor(feature_extractor, tokenizer, whisper)
        elif processor is not None:
            self.processor = processor
        elif model_name is not None:
            self.processor = AutoProcessor.from_pretrained(model_name)
        else:
            raise ValueError(
                "ASRModel needs a processor, a feature_extractor + tokenizer pair, or a model_name."
            )

        print('Getting Model...')
        if model is not None:
            # Honour a pre-loaded model instead of loading from `model_name`.
            self.model = model
        elif model_name is None:
            raise ValueError("ASRModel needs either a loaded model or a model_name.")
        elif whisper:
            self.model = WhisperForConditionalGeneration.from_pretrained(model_name)
        else:
            # The same CTC checkpoint is used with or without a language model;
            # the LM only changes how the logits are decoded.
            self.model = AutoModelForCTC.from_pretrained(model_name)
        self.model.to(self.device)

    @staticmethod
    def _build_processor(feature_extractor, tokenizer, whisper=False):
        """Assemble a concrete processor from its two parts.

        `AutoProcessor` cannot be instantiated directly, so a concrete class is chosen:
        `WhisperProcessor` for Whisper, `Wav2Vec2BertProcessor` when the feature extractor is a
        `SeamlessM4TFeatureExtractor`, and `Wav2Vec2Processor` otherwise.
        """
        # Local import keeps this class independent of which names other cells imported.
        from transformers import (
            SeamlessM4TFeatureExtractor,
            Wav2Vec2BertProcessor,
            Wav2Vec2Processor,
            WhisperProcessor,
        )
        if whisper:
            return WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        if isinstance(feature_extractor, SeamlessM4TFeatureExtractor):
            return Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        return Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def process_data(self, example):
        """Return the waveform in `example["audio"]["array"]` resampled to 16 kHz.

        `example` must provide `example["audio"]["array"]` and `example["audio"]["sampling_rate"]`.
        """
        audio = example["audio"]
        return librosa.resample(audio["array"], orig_sr=audio["sampling_rate"], target_sr=16000)

    def get_prediction(self, inputs):
        """Run the CTC model on `inputs` and decode its logits into a list of transcripts.

        The logits are kept on `self.logits`. With a language model configured they are decoded
        by `lm_model.batch_decode` (receiving `lm_params` as keyword arguments); otherwise the
        argmax token ids are decoded by the processor.
        """
        with torch.no_grad():
            self.logits = self.model(inputs).logits
        if self.lm_model:
            decode_kwargs = self.lm_params or {}
            return self.lm_model.batch_decode(self.logits.cpu().numpy(), **decode_kwargs).text
        predicted_ids = torch.argmax(self.logits, dim=-1)
        return self.processor.batch_decode(predicted_ids)

    def predict(self, audio_tensor):
        """Transcribe a single 16 kHz waveform and return the transcript.

        For CTC models the processor output is read from `input_features` when present and
        from `input_values` otherwise. For Whisper the transcript is generated with 5 beams and
        the language fixed to Hebrew ('he').
        """
        if self.whisper:
            input_features = self.processor(audio_tensor, sampling_rate=16000, return_tensors="pt")
            input_features = input_features.to(self.device)
            predicted_ids = self.model.generate(input_features.input_features, language='he', num_beams=5)
            transcript = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return transcript[0]

        encoded = self.processor([audio_tensor], return_tensors="pt", sampling_rate=16_000)
        # Pick the key the processor actually produced instead of probing with a bare except,
        # which would also hide unrelated errors.
        if "input_features" in encoded:
            features = encoded["input_features"]
        else:
            features = encoded["input_values"]
        features = features.to(self.device)  # inputs must live on the same device as the model

        prediction = self.get_prediction(features)
        return prediction[0]
        


In [11]:
#wav2vec2bert-finetuned

# processor = Wav2Vec2BertProcessor.from_pretrained('models/facebook/w2v-bert-2.0-finetuned', 
#                                             unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# models['wav2vec2Bert-finetuned'] = ASRModel(model_name="models/facebook/w2v-bert-2.0-finetuned", processor=processor)

# #wav2vec2bert-finetuned with MesivtaLm
# bertLM = Wav2Vec2ProcessorWithLM.from_pretrained("models/wav2vec2bert-MesivtaLm")
# models['wav2vec2Bert-finetuned-MesivtaLm'] = ASRModel(model_name="models/facebook/w2v-bert-2.0-finetuned", lm_model=bertLM)
# # # sample = chunks[10]

#wav2vec2-xls-r-300m-hebrew
# models['wav2vec2-xls-r-300m-hebrew'] = ASRModel("models/imvladikon/wav2vec2-xls-r-300m-hebrew")
# #wav2vec2-xls-r-300m-MesivtaLM
# lm = Wav2Vec2ProcessorWithLM.from_pretrained("models/KenLM-Wav2Vec2-imvladikon-300m-Hebrew-Mesivta")
# models['wav2vec2-xls-r-300m-hebrew-MesivtaLM'] = ASRModel("imvladikon/wav2vec2-xls-r-300m-hebrew", lm_model=lm)

#load whisper
# model_path = 'models/openai/whisper-large-v2'
# processor = WhisperProcessor.from_pretrained(model_path)
# model = WhisperForConditionalGeneration.from_pretrained(model_path)
# models['Whisper-large-v2'] = ASRModel(model=model, processor=processor, whisper=True)

#whisper-ivrit-ai
# model_path = 'models/ivrit-ai/whisper-large-v2-tuned'
# processor = WhisperProcessor.from_pretrained(model_path)
# model = WhisperForConditionalGeneration.from_pretrained(model_path)
# models['Whisper-ivrit-ai'] = ASRModel(model=model, processor=processor, whisper=True)

# Registry of the model setups to evaluate, keyed by a display name.
# Each entry is read by load_and_predict and by the evaluation loop:
#   "model_name" - checkpoint path/name passed to ASRModel
#   "lm"         - optional LM processor passed to ASRModel as `lm_model` (None = no LM decoding)
#   "processor"  - optional pre-loaded processor; None makes ASRModel load one from "model_name"
#   "whisper"    - True selects ASRModel's Whisper code path
#   "lm_params"  - keyword arguments forwarded to the LM's batch_decode
#   "alphas"     - optional list; the evaluation loop runs once per value, writing it to lm_params["alpha"]
# Entries that are commented out are disabled setups.
model_configs = {
    # "wav2vec2-finetuned": {
    #     "model_name": "imvladikon/wav2vec2-xls-r-300m-hebrew",
    #     "lm": None,
    #     "processor": AutoProcessor.from_pretrained('imvladikon/wav2vec2-xls-r-300m-hebrew', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"),
    #     "whisper": False
    # },
    # "wav2vec2-finetuned-MesivtaLm": {
    #     "model_name": "imvladikon/wav2vec2-xls-r-300m-hebrew",
    #     "lm": Wav2Vec2ProcessorWithLM.from_pretrained("models/imvladikon/wav2vec2-hebrew-MesivtaLm",unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"),
    #     "processor": None,
    #     "whisper": False
    # },
    # "wav2vec2Bert-finetuned": {
    #     "model_name": "models/facebook/w2v-bert-2.0-finetuned",
    #     "lm": None,
    #     "processor": Wav2Vec2BertProcessor.from_pretrained('models/facebook/w2v-bert-2.0-finetuned', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"),
    #     "whisper": False
    # },
    "wav2vec2Bert-finetuned-MesivtaLm": {
        "model_name": "models/facebook/w2v-bert-2.0-finetuned",
        # The LM processor is loaded eagerly here, when this cell runs.
        "lm": Wav2Vec2ProcessorWithLM.from_pretrained("models/wav2vec2bert-MesivtaLm",unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"),
        "processor": None,
        "whisper": False,
        # "lm_params": {"alpha": 0.7},
        # Starts empty; the evaluation loop fills in "alpha" for each value listed in "alphas".
        "lm_params": {},
        "alphas" : [0.45, 0.5, 0.6, 0.65, 0.7]
    },
#     # "whisper-large-v2": {
#     #     "model_name": "models/openai/whisper-large-v2",
#     #     "lm": None,
#     #     "processor": WhisperProcessor.from_pretrained('models/openai/whisper-large-v2'),
#     #     "whisper": True
#     # },
#     # "whisper-ivrit-ai": {
#     #     "model_name": "models/ivritai/whisper-large-v2-tuned",
#     #     "lm": None,
#     #     "processor": WhisperProcessor.from_pretrained('models/ivritai/whisper-large-v2-tuned'),
#     #     "whisper": True
#     # }
}
        


Loading the LM will be faster if you build a binary file.
Reading /teamspace/studios/this_studio/models/wav2vec2bert-MesivtaLm/language_model/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [12]:
import gc

def load_and_predict(model_config, audio_data, sampling_rate=16000):
    """Load the model described by `model_config`, transcribe every clip, then free the model.

    Args:
        model_config: Dict with a required "model_name" and optional "processor", "whisper",
            "lm" and "lm_params" entries, all forwarded to `ASRModel`.
        audio_data: Mapping of file name -> waveform array.
        sampling_rate: Sampling rate of the arrays in `audio_data`, passed on to
            `ASRModel.process_data`. Defaults to 16000.

    Returns:
        Dict mapping each file name to its predicted transcript.
    """
    model = ASRModel(
        model_name=model_config["model_name"],
        # Optional keys fall back to ASRModel's own defaults rather than raising KeyError.
        processor=model_config.get("processor"),
        whisper=model_config.get("whisper", False),
        lm_model=model_config.get("lm", False),
        lm_params=model_config.get("lm_params", None)
    )

    results = {}
    try:
        for file_name, audio in audio_data.items():
            processed_audio = model.process_data({"audio": {"array": audio, "sampling_rate": sampling_rate}})
            results[file_name] = model.predict(processed_audio)
    finally:
        # Release the model even when inference fails, so a failed run does not keep
        # accelerator memory pinned for the next model.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return results

In [13]:
import pandas as pd
# Run every configured model over all clips. A config with an "alphas" list is run once per
# alpha value, each run labelled "<model>-alpha-<value>"; other configs are run once.
results = []

for model_name, config in model_configs.items():
    # [None] stands for "one run with the config exactly as written".
    for alpha in (config.get("alphas") or [None]):
        if alpha is None:
            print(model_name)
            run_label, run_config = model_name, config
        else:
            print(f"Running model {model_name} with alpha={alpha}")
            run_label = f"{model_name}-alpha-{alpha}"
            # Build a per-run copy instead of writing into the shared config: the registry stays
            # unchanged between runs, and a config without "lm_params" no longer raises KeyError.
            run_config = {**config, "lm_params": {**(config.get("lm_params") or {}), "alpha": alpha}}

        predictions = load_and_predict(run_config, audio_cleaned)  # dict: file name -> transcript
        results.extend(
            {"Model": run_label, "Audio": file_name, "Prediction": prediction}
            for file_name, prediction in predictions.items()
        )

# Name the columns explicitly so they exist even when no run produced any rows.
results_df = pd.DataFrame(results, columns=["Model", "Audio", "Prediction"])
# Wide view: one row per audio file, one column per model/alpha run.
pivoted_df = results_df.pivot(index='Audio', columns='Model', values='Prediction').reset_index()

print(results_df)

Running model wav2vec2Bert-finetuned-MesivtaLm with alpha=0.45
Getting Model...
deleting the previous model
Running model wav2vec2Bert-finetuned-MesivtaLm with alpha=0.5
Getting Model...
Keeping the previous model for next prediction
Running model wav2vec2Bert-finetuned-MesivtaLm with alpha=0.6
Getting Model...
Keeping the previous model for next prediction
Running model wav2vec2Bert-finetuned-MesivtaLm with alpha=0.65
Getting Model...
Keeping the previous model for next prediction
Running model wav2vec2Bert-finetuned-MesivtaLm with alpha=0.7
Getting Model...
Keeping the previous model for next prediction
                                          Model                Audio  \
0   wav2vec2Bert-finetuned-MesivtaLm-alpha-0.45     batra_155_01.mp3   
1   wav2vec2Bert-finetuned-MesivtaLm-alpha-0.45     batra_155_02.mp3   
2   wav2vec2Bert-finetuned-MesivtaLm-alpha-0.45  metzia_02_93_01.mp3   
3   wav2vec2Bert-finetuned-MesivtaLm-alpha-0.45  metzia_02_93_02.mp3   
4   wav2vec2Bert-finetuned-

In [14]:
# Save the wide comparison table (an 'Audio' column plus one column per model/alpha run) as CSV,
# written relative to the current working directory; the DataFrame index is not written.
pivoted_df.to_csv('test_results_alphas.csv', index=False)