In [1]:
from datasets import load_from_disk, load_metric
from transformers import Wav2Vec2BertProcessor, Wav2Vec2BertForCTC, BatchFeature
from transformers import AutoModelForCTC, Wav2Vec2Processor
from torch.utils.data.dataloader import DataLoader
import torch
from tqdm import tqdm
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field

import os
from datasets import load_from_disk
import numpy as np

In [2]:
# Evaluation subset configuration. A fixed seed keeps the sampled subset the
# same on every run, so WER numbers of different models stay comparable.
EVAL_SEED = 42
EVAL_SAMPLES = 500

dataset_for_wav2vec2 = load_from_disk("/teamspace/studios/this_studio/datasets/ivritai")
dataset = dataset_for_wav2vec2['test']
shuffled_dataset = dataset.shuffle(seed=EVAL_SEED)
# Cap at the split size so select() is never asked for indices past the end.
dataset = shuffled_dataset.select(range(min(EVAL_SAMPLES, len(shuffled_dataset))))

In [3]:
from transformers import AutoProcessor, AutoModelForCTC


In [9]:
import torch
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCTC
import torchaudio
import librosa
import numpy as np
import re
import string
from jiwer import wer

# Punctuation and symbols stripped from the reference transcriptions.
# Raw string: every backslash here is a regex escape; in a normal string literal
# sequences such as "\," are invalid string escapes that newer Python versions
# warn about.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\]\[\{\}\־]'

# Longest clip, in seconds, kept by filter_long_samples.
FILTER_THRESHOLD = 10

# Normalise one example's reference transcription
def remove_special_characters(batch):
    """Strip the characters matched by ``chars_to_remove_regex`` from
    ``batch["transcription"]`` and lower-case what is left.

    The example is modified in place and the same object is returned.
    """
    stripped = re.sub(chars_to_remove_regex, '', batch["transcription"])
    batch["transcription"] = stripped.lower()
    return batch

# Drop every sample whose transcription contains an ASCII letter or digit
def drop_english_samples(dataset):
    """Return ``dataset`` without the examples whose ``'transcription'``
    contains an ASCII letter (checked after lower-casing) or an ASCII digit.

    ``dataset`` only needs a ``filter(predicate)`` method; the result is
    whatever that method returns.
    """
    latin_alphanumerics = frozenset(string.ascii_lowercase + string.digits)

    def has_no_latin_or_digit(example):
        lowered = example['transcription'].lower()
        # Disjoint means no character of the text is an ASCII letter or digit.
        return latin_alphanumerics.isdisjoint(lowered)

    return dataset.filter(has_no_latin_or_digit)

# Function to filter out long audio samples
def filter_long_samples(dataset, max_duration=None):
    """Return ``dataset`` without the clips longer than ``max_duration`` seconds.

    Args:
        dataset: object with a ``filter(predicate)`` method whose examples hold
            ``example['audio']['array']`` (the samples) and
            ``example['audio']['sampling_rate']``.
        max_duration: longest clip to keep, in seconds (inclusive). When
            omitted, the module-level ``FILTER_THRESHOLD`` is used.

    Returns:
        Whatever ``dataset.filter`` returns for the duration predicate.
    """
    # Resolve the default at call time so an edited FILTER_THRESHOLD is honoured.
    limit_seconds = FILTER_THRESHOLD if max_duration is None else max_duration

    def is_shorter_than_max_duration(example):
        audio = example['audio']
        duration_seconds = len(audio['array']) / audio['sampling_rate']
        return duration_seconds <= limit_seconds

    return dataset.filter(is_shorter_than_max_duration)

# Clean the transcriptions first, then drop the samples containing ASCII
# letters/digits, and finally drop the over-long clips.
dataset = filter_long_samples(
    drop_english_samples(
        dataset.map(remove_special_characters)
    )
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [23]:
import torch
from tqdm import tqdm

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

def predict(audio_tensor, device, model, processor, whisper=False):
    """Transcribe one waveform with a CTC model or, optionally, a Whisper model.

    Args:
        audio_tensor: torch tensor holding the waveform. It is declared to the
            processor as 16 kHz audio, so it should already be at that rate.
        device: torch device the model inputs are moved to before inference.
        model: for the CTC path, a callable returning an object with
            ``.logits``; for the Whisper path, an object exposing ``generate``.
        processor: callable producing the model inputs, also used to decode the
            predicted ids via ``batch_decode``.
        whisper: when True, use the generation-based Whisper path.

    Returns:
        The decoded transcription of the first item in the batch.
    """
    # The processor is handed a host-side copy of the audio in both paths.
    audio_cpu = audio_tensor.cpu()

    if whisper:
        inputs = processor(audio_cpu.numpy(), sampling_rate=16000, return_tensors="pt")
        # Follow the `device` argument rather than a `has_cuda` global that only
        # exists once another cell has been run.
        inputs = inputs.to(device)
        predicted_ids = model.generate(inputs.input_features, language='he', num_beams=5)
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    inputs = processor(audio_cpu, return_tensors="pt", padding="longest", sampling_rate=16000)
    # Processors name their output either `input_features` or `input_values`.
    # Pick whichever is present instead of re-running the processor inside a
    # bare `except`, which also hid genuine errors.
    model_inputs = getattr(inputs, "input_features", None)
    if model_inputs is None:
        model_inputs = inputs.input_values
    model_inputs = model_inputs.to(device)  # inputs must sit on the model's device

    with torch.no_grad():
        logits = model(model_inputs).logits

    # Greedy decoding: most likely token per frame, then decode ids to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

def evaluate_model(dataset, model, processor, whisper=False, device=None):
    """Return the mean per-example word error rate of ``model`` on ``dataset``.

    Each example is resampled to 16 kHz, transcribed with ``predict`` and scored
    against ``example["transcription"]``. The result is the arithmetic mean of
    the per-example WERs, not a corpus-level WER pooled over all words.

    Args:
        dataset: sized iterable of examples holding
            ``example["audio"]["array"]``, ``example["audio"]["sampling_rate"]``
            and ``example["transcription"]``.
        model: model forwarded to ``predict``.
        processor: processor forwarded to ``predict``.
        whisper: forwarded to ``predict``.
        device: torch device forwarded to ``predict``. Defaults to the device of
            the model's parameters, so the function no longer depends on a
            ``device`` global defined in another cell.

    Returns:
        Mean WER over the scored examples, or ``float('inf')`` if none was scored.
    """
    from jiwer import wer

    if device is None:
        device = next(model.parameters()).device

    total_wer = 0
    count = 0  # examples that were actually scored
    for example in tqdm(dataset, desc="Evaluating model", unit=" samples"):
        reference_text = example["transcription"]
        if not reference_text.strip():
            # WER is normalised by the number of reference words, so an empty
            # reference cannot be scored; leave it out of the average.
            continue

        audio = example["audio"]
        # Resampling happens on the host, so the audio stays a NumPy array until
        # it is handed to predict (no device round trip needed).
        speech_array = np.asarray(audio["array"], dtype=np.float32)
        speech_array = librosa.resample(speech_array, orig_sr=audio["sampling_rate"], target_sr=16_000)
        speech_tensor = torch.tensor(speech_array, dtype=torch.float32)

        predicted_text = predict(speech_tensor, device=device, model=model, processor=processor, whisper=whisper)

        total_wer += wer(reference_text, predicted_text)
        count += 1

    if count == 0:
        return float('inf')  # nothing could be scored
    return total_wer / count




Using device: cuda


In [20]:
# Load the imvladikon wav2vec2 XLS-R Hebrew checkpoint, put the model on
# `device` and switch it to inference mode (both calls return the module).
processor = AutoProcessor.from_pretrained("imvladikon/wav2vec2-xls-r-300m-hebrew")
model = AutoModelForCTC.from_pretrained("imvladikon/wav2vec2-xls-r-300m-hebrew").to(device).eval()

# Score the model on the prepared evaluation subset
average_wer = evaluate_model(dataset=dataset, model=model, processor=processor)
print(f"Average WER: {average_wer:.2f}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at imvladikon/wav2vec2-xls-r-300m-hebrew were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at imvladikon/wav2vec2-xls-r-300m-hebrew and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametri

Average WER: 0.44





In [24]:
# Fine-tuned W2V-BERT 2.0 checkpoint (presumably a local directory, given the
# relative path).
model_path = 'models/facebook/w2v-bert-2.0-finetuned'
processor = Wav2Vec2BertProcessor.from_pretrained(
    model_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
# Pass the tokenizer's padding id and vocabulary size through to from_pretrained,
# then move the model to `device` and switch it to inference mode.
model = Wav2Vec2BertForCTC.from_pretrained(
    model_path,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
).to(device).eval()

# Score the model on the prepared evaluation subset
average_wer = evaluate_model(dataset=dataset, model=model, processor=processor)
print(f"Average WER: {average_wer:.2f}")

Evaluating model: 100%|██████████| 457/457 [00:38<00:00, 11.77 samples/s]

Average WER: 0.44





In [10]:
# Re-score whichever `model` / `processor` pair is currently loaded.
# NOTE(review): the recorded NameError shows this cell was run while `model`
# was undefined - presumably before any model-loading cell had been executed.
average_wer = evaluate_model(dataset=dataset, model=model, processor=processor)
print(f"Average WER: {average_wer:.2f}")

NameError: name 'model' is not defined

In [6]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Whisper checkpoint and audio settings
SAMPLING_RATE = 16000
model_path = 'ivrit-ai/whisper-large-v2-tuned'

# Load the model, and place it on the first GPU when CUDA is available
has_cuda = torch.cuda.is_available()
model = WhisperForConditionalGeneration.from_pretrained(model_path)
if has_cuda:
    model = model.to('cuda:0')

# The processor comes from the same checkpoint as the model
processor = WhisperProcessor.from_pretrained(model_path)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Evaluating model:   1%|          | 12/1000 [00:40<55:36,  3.38s/ samples] 


KeyboardInterrupt: 

In [8]:
# Score the Whisper model; whisper=True selects the generation-based path
average_wer = evaluate_model(dataset=dataset, model=model, processor=processor, whisper=True)
print(f"Average WER: {average_wer:.2f}")

Evaluating model: 100%|██████████| 500/500 [23:13<00:00,  2.79s/ samples]

Average WER: 0.29



