<a href="https://colab.research.google.com/github/rfclara/fa_xhosa/blob/main/evaluation_asr_xhosa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Set Up the Environment
# Install required libraries
!pip install torchaudio transformers jiwer

import json
import os

import torch
import torchaudio
from jiwer import wer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor



In [2]:
!mkdir "temp_dir"
!git clone https://github.com/pytorch/fairseq
!pwd
%cd "/content/fairseq"
!pip install --editable ./
!pip install tensorboardX

  Installing build dependencies ... [?25l[?25hcanceled[31mERROR: Operation cancelled by user[0m[31m


In [None]:
# Step 2: Load the Model
# Fetch Facebook's MMS checkpoint and its processor, both configured for Xhosa.
from transformers import AutoProcessor, Wav2Vec2ForCTC

target_lang = "xho"  # Xhosa language code
model_id = "facebook/mms-1b-fl102"

processor = AutoProcessor.from_pretrained(
    model_id,
    target_lang=target_lang,
)
model = Wav2Vec2ForCTC.from_pretrained(
    model_id,
    ignore_mismatched_sizes=True,
    target_lang=target_lang,
)


In [4]:
# Step 3: Copy the aligned Audio Files from drive
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/aligned.zip /content
!unzip /content/aligned.zip -d /

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/aligned.zip
replace /content/xhosa/aligned/LM180625S_a/segment3.flac? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
def load_and_process_audio(file_path):
    """Load an audio file as a mono, 1-D waveform sampled at 16 kHz.

    Args:
        file_path: Path of the audio file, passed to `torchaudio.load`.

    Returns:
        A 1-D tensor of samples; resampled to 16 kHz when the file uses a
        different sample rate.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    # Average over the channel axis: this drops the channel dimension for mono
    # files and down-mixes stereo/multi-channel files, so the result is always
    # 1-D (a plain squeeze() left multi-channel audio 2-D).
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform

# Step 4: Iterate through directories and process each manifest file
base_path = "/content/xhosa/aligned"
all_transcriptions = []
all_references = []


def _transcribe(waveform):
    """Return the greedy CTC transcription of one waveform.

    Uses the notebook-level `processor` and `model`.
    """
    # Down-mix before feature extraction so the model input is always
    # [batch_size, sequence_length], whatever the channel count.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


for root, dirs, files in os.walk(base_path):
    # Sorting in place makes os.walk visit sub-directories in a stable order,
    # so the printed results are reproducible between runs.
    dirs.sort()
    if "manifest.json" not in files:
        continue

    manifest_path = os.path.join(root, "manifest.json")
    # The manifest is read as JSON Lines: one JSON object per line.
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON; skip them.
                continue

            entry = json.loads(line)
            audio_path = entry["audio_filepath"]
            reference_text = entry["normalized_text"]

            # Load the audio and transcribe it
            waveform = load_and_process_audio(audio_path)
            transcription = _transcribe(waveform)

            # Collect transcriptions and references
            all_transcriptions.append(transcription)
            all_references.append(reference_text)

            # Print the result for each segment
            print(f"Transcription: {transcription}")
            print(f"Reference: {reference_text}")
            print()

# Step 5: Calculate Word Error Rate (WER)
if not all_references:
    # Fail with an explicit message instead of an opaque error from the metric.
    raise RuntimeError(f"No manifest entries found under {base_path}")
error_rate = wer(all_references, all_transcriptions)
print(f"Word Error Rate (WER): {error_rate}")

Transcription: mikitomi st wathi daitisitudayi 31 31 ash 2016uzukwenziza ndoni ngempela veki ndizavuka ekuseni
Reference: ngempelaveki ndizovuka ek u seni

Transcription: la   le
Reference: ndihlambe

Transcription: a mamm ma la  alalaba abatwana bama aa
Reference: ndihlambe abantwana bam

Transcription: sitye isidlo sasekuseni
Reference: sitye isidlo sasek u seni

Transcription: a a a malta ki e pandle
Reference: sihlale phandle

Transcription: emveni koko ndingene egadini ndihlakule
Reference: emveni koko ndingene egadini ndihlakule

Transcription: ndiphume ndiphumle endlini
Reference: ndiphume ndiphumle endlini talking in the background

Transcription: anamuzana
Reference: ungabuza kaloku

Transcription: a ophezolo bekusezekani kupha kwamashezi
Reference: so phezolo u ee bekusenzeka ni o pha kwamashezi

Transcription: abe ni clani eyio
Reference: benikhona

Transcription: aa aa a a tawe ba ta pima a
Reference: yho yho into e be ipha

Transcription: we umnyi adala
Reference: ibengumy

In [None]:
# Optional: Calculate Word Error Rate (WER) if ground truth is available
from jiwer import wer
ground_truth = "ungabuza kaloku so  phezolo u+ ee bekusenzeka ni o+ pha kwaMashezi benikhona ? Yho yho into {e}be ipha ibengumyadala Eeh ndiyakutshela ngoba kaloku ee besithenjiswe kuthwa kuzofika akutshiwongo ukuba kuzofika umlungu kuthwe  kuza abelungu  abesuka ePitoli Ee ndakhalaza ke  mna hii kuthwa hayi bafun{a} ukubona indlela esigqoka ngayo ee indlela  esenza ngayo izinto nxa kulola suku njeng{oku}ba kulusuku lwamaGugube Oh ok oh b+ bebenxibile njengo{ku}ba bekulusuku lwamaGugube benxibile isintu sabo ? Hayi ke Uhm ya bebenxibile qha into eye yasiphazamisa kukuba siye safikela apho koli joyinti khona {Ooh} ngoku {h}ayi bayadla ngoku awukwazi ukumbona {ingumntu} nxe {i}nguye enormal ngoba ngoku u+ usoloko ebonakala edlile Ee si+ siye sahlangana ke  phaya bagiya ke shame hayi bayayenza bon{a} into qha ingathi bangayenza benormal uhm konakona hayi bezibuya"
# print("Ground Truth:", ground_truth)
# `transcription` is a decoded string when it comes from the evaluation loop,
# so indexing it with [0] would score a single character. Only unwrap it when
# it is still a batch (list) of decoded strings.
hypothesis = transcription if isinstance(transcription, str) else transcription[0]
# NOTE(review): the ground truth looks like a whole recording with annotation
# marks ("+", "{}", "?", capitals) while the hypothesis is one lowercase
# segment — confirm which texts should be compared and whether the ground
# truth should be normalised first.
error_rate = wer(ground_truth, hypothesis)
print("Word Error Rate (WER):", error_rate)