In [1]:
from datasets import load_from_disk, load_metric, load_dataset
from transformers import Wav2Vec2BertProcessor, Wav2Vec2BertForCTC, BatchFeature
from transformers import AutoModelForCTC, Wav2Vec2Processor
from torch.utils.data.dataloader import DataLoader
import torch
from tqdm import tqdm
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field
import utils
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoProcessor
import torch
import os
from datasets import load_from_disk
import numpy as np

# Build an n-gram with KenLM


While large language models based on the Transformer architecture have become the standard in NLP, it is still very common to use an n-gram LM to boost speech recognition systems - as shown in Section 1.

Looking again at Table 9 of Appendix C of the official Wav2Vec2 paper, it can be noticed that using a Transformer-based LM for decoding clearly yields better results than using an n-gram model, but the difference between n-gram and Transformer-based LM is much less significant than the difference between n-gram and no LM.

E.g., for the large Wav2Vec2 checkpoint that was fine-tuned on 10min only, an n-gram reduces the word error rate (WER) compared to no LM by ca. 80% while a Transformer-based LM only reduces the WER by another 23% compared to the n-gram. This relative WER reduction becomes less, the more data the acoustic model has been trained on. E.g., for the large checkpoint a Transformer-based LM reduces the WER by merely 8% compared to an n-gram LM whereas the n-gram still yields a 21% WER reduction compared to no language model.

The reason why an n-gram is preferred over a Transformer-based LM is that n-grams come at a significantly smaller computational cost. For an n-gram, retrieving the probability of a word given previous words is almost only as computationally expensive as querying a look-up table or tree-like data storage - i.e. it's very fast compared to modern Transformer-based language models that would require a full forward pass to retrieve the next word probabilities.

For more information on how n-grams function and why they are (still) so useful for speech recognition, the reader is advised to take a look at this excellent summary from Stanford.

Great, let's see step-by-step how to build an n-gram. We will use the popular KenLM library to do so. Let's start by installing the Ubuntu library prerequisites:

In [2]:
import os
path = os.getcwd()
file = 'ivritai.txt'
finetuned_model_path = "imvladikon/wav2vec2-xls-r-300m-hebrew"

In [2]:
dataset_for_wav2vec2=load_dataset("ivrit-ai/whisper-training")
dataset = dataset_for_wav2vec2['train']

In [6]:
#subsample 10 percent of data
data = dataset.select(np.random.permutation(dataset.shape[0])[:int(len(dataset)*0.2)])
data = utils.standardize_dataset(data)
data = utils.drop_english_samples(data)

Removing unecessary columns
Removing Special Characters


Filter:   0%|          | 0/11049 [00:00<?, ? examples/s]

In [17]:
utils.show_random_elements(data.remove_columns(["audio"]), num_examples=10)

Unnamed: 0,transcription
0,גם בשר ודם יכול תמיד לשמוע ולראות אותנו ואולי אין שום דבר שהוא פרטי ו...
1,"זאת אומרת, אם אנחנו מסתכלים על הטלפון האדום שהיה בין מוסקבה לוושינגטון,"
2,הרי מדובר פה בבזבוז זמן מוחלט של שני הצדדים.
3,הוא אומר במפורש
4,יצא לי לדבר עם יעל געל שהיא בסאב והיא עושה את זה כבר הרבה מאוד שנים והרבה לפניי. אז אם כבר מדברים על חלוצות בתחום בישראל אז...
5,ועכשיו יש לפרק הזה עשרת אלפים הורדות
6,"באיזשהו מקום, הציבור ה...ישראלי,"
7,"במדע וביזמות, שווה מאוד. כן, המלצתי עליו פה פעם אחת, אז אני אוסיף את המלצה הזו ואת טרו בלאד, מה עוד?"
8,תראה יש לי הרבה מה להגיד על גיימסטופ קודם כל אחד הדברים שבאמת.
9,"נראה שחקן בוגר, כאילו לא ילד, ילד כזה שעלה מהנוער. הוא יוביל את מכבי פתח תקווה בשנה הבאה. מקף תחתית."


In [21]:
import re
def split_into_sentences(text):
    # This regex handles '.', '!', '?', '...' as sentence boundaries
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)(?<!\.\.\.) +')
    sentences = sentence_endings.split(text)
    return sentences
def clean_text(text):
    # Keep only Hebrew letters, punctuation marks (.,!?,;:), and spaces
    return re.sub(r'[^א-ת \n]', '', text)

# Open a text file to write the sentences
with open(file, "w") as file:
    for example in data:  # Adjust if your dataset has a different split
        transcriptions = example["transcription"]
        # Split the transcription into sentences
        sentences = split_into_sentences(transcriptions)
        for sentence in sentences:
            sentence = clean_text(sentence)
            file.write(sentence.strip() + "\n")
            
with 

In [22]:
#show the first 10 rows of no punctuation text
with open('ivritai.txt', 'r') as f:
    for i in range(10):
        print(f.readline())

שצריך ללמוד תורה ואחד לא אומר שצריך ללמוד תורה אחד



אוקיי קודם כל הנושא של הביזנס מודל ו

רעיון ששמעתי היה ש היה איזושהי קריצה לזה ש

אבל יש להם חיסרון מרכזי וזו כמות עצומה של חישובים וזיכרון שנדרשים לביצוע הרשת

זה היה פשוט איוולת הנושא הזה כן

אתה מספר שם את הסיפור שמגיע שם היזם של פולגת

הקדיש פרק שלם בספר שלו על למה הוא לא שמרן

כלומר לא רק שהוא לא ישתעמם מהתנך אלא הוא ירצה לרוץ ולקרוא אותו עד הסוף יהיה לו קשה לסגור

הייתי יכול ללמד את הגוף להתמודד עם אין סוף מחלות או שזה לא עובד ככה



In [4]:
!pip install pyctcdecode



In [6]:
!apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev

The operation couldn’t be completed. Unable to locate a Java Runtime.
Please visit http://www.java.com for information on installing Java.



before downloading and unpacking the KenLM repo.

In [25]:
import platform
import os

if platform.system() == 'Darwin':  # Darwin stands for MacOS
    !curl -L https://kheafield.com/code/kenlm.tar.gz | tar xz
elif platform.system() == 'Linux':
    !wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
else:
    print("Unsupported operating system")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  480k  100  480k    0     0  79643      0  0:00:06  0:00:06 --:--:--  109k


KenLM is written in C++, so we'll make use of `cmake` to build the binaries.

In [52]:
!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
!ls kenlm/build/bin

mkdir: kenlm/build: File exists


In [45]:
!pip install pyctcdecode
# !pip install kenlm -U



Great, as we can see, the executable functions have successfully been built under `kenlm/build/bin/`.

KenLM by default computes an *n-gram* with [Kneser-Ney smooting](https://en.wikipedia.org/wiki/Kneser%E2%80%93Ney_smoothing). All text data used to create the *n-gram* is expected to be stored in a text file.
We download our dataset and save it as a `.txt` file.

Now, we just have to run KenLM's `lmplz` command to build our *n-gram*, called `"5gram.arpa"`. As it's relatively common in speech recognition, we build a *5-gram* by passing the `-o 5` parameter.
For more information on the different *n-gram* LM that can be built
with KenLM, one can take a look at the [official website of KenLM](https://kheafield.com/code/kenlm/).

Executing the command below might take a minute or so.

In [46]:
!kenlm/build/bin/lmplz -o 5 <"{file}" > "5gram.arpa"

zsh:1: no such file or directory: kenlm/build/bin/lmplz


In [51]:
#show current directory
!ls kenlm/build/bin

In [44]:
os.getcwd()

'/Users/shua/Desktop/FinalProject/demo'

Great, we have built a *5-gram* LM! Let's inspect the first couple of lines.

In [39]:
!head -20 5gram.arpa

There is a small problem that 🤗 Transformers will not be happy about later on.
The *5-gram* correctly includes a "Unknown" or `<unk>`, as well as a *begin-of-sentence*, `<s>` token, but no *end-of-sentence*, `</s>` token.
This sadly has to be corrected currently after the build.

We can simply add the *end-of-sentence* token by adding the line `0 </s>  -0.11831701` below the *begin-of-sentence* token and increasing the `ngram 1` count by 1. Because the file has roughly 100 million lines, this command will take *ca.* 2 minutes.

In [40]:
with open(f"5gram.arpa", "r") as read_file, open(f"5gram_correct.arpa", "w") as write_file:
  has_added_eos = False
  for line in read_file:
    if not has_added_eos and "ngram 1=" in line:
      count=line.strip().split("=")[-1]
      write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
    elif not has_added_eos and "<s>" in line:
      write_file.write(line)
      write_file.write(line.replace("<s>", "</s>"))
      has_added_eos = True
    else:
      write_file.write(line)

Let's now inspect the corrected *5-gram*.

In [41]:
!head -20 5gram_correct.arpa

## Create the Processor

In [34]:
# from transformers import AutoProcessor
from transformers import AutoProcessor, Wav2Vec2BertProcessor
processor = AutoProcessor.from_pretrained(finetuned_model_path,
                                            unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Next, we extract the vocabulary of its tokenizer as it represents the `"labels"` of `pyctcdecode`'s `BeamSearchDecoder` class.

In [35]:
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

The `"labels"` and the previously built `5gram_correct.arpa` file is all that's needed to build the decoder.

In [42]:
from pyctcdecode import build_ctcdecoder

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram_correct.arpa",
    
)

Loading the LM will be faster if you build a binary file.
Reading /Users/shua/Desktop/FinalProject/demo/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100


OSError: Cannot read model '5gram_correct.arpa' (End of file Byte: 0)

We can safely ignore the warning and all that is left to do now is to wrap the just created `decoder`, together with the processor's `tokenizer` and `feature_extractor` into a `Wav2Vec2ProcessorWithLM` class.

In [None]:
from transformers import Wav2Vec2ProcessorWithLM

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

## **Combine an *n-gram* with Wav2Vec2**

In a final step, we want to wrap the *5-gram* into a `Wav2Vec2ProcessorWithLM` object to make the *5-gram* boosted decoding.

In [None]:
model = Wav2Vec2ForCTC.from_pretrained(finetuned_model_path)
normal_processor = AutoProcessor.from_pretrained(finetuned_model_path)

In [None]:
# sample = chunks[46]
inputs = normal_processor(sample, sampling_rate=16000, return_tensors="pt")
# sf.write("test.wav", sample, 16000)
with torch.no_grad():
  logits = model(**inputs).logits
#prediction with kenlm
kenlm = processor_with_lm.batch_decode(logits.numpy()).text

In [None]:
#regular prediction
predicted_ids = torch.argmax(logits, dim=-1)
word2vec = processor.batch_decode(predicted_ids)