In [None]:
# Cell 1: Install Dependencies
!apt-get -qq update
!apt-get -qq install -y ffmpeg
!pip -q install transformers torch datasets evaluate jiwer librosa soundfile
print("Installation complete.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstallation complete.


In [None]:
# Cell 2: Imports & Configuration
import torch
import librosa
import numpy as np
import os
import re
import tempfile
import subprocess
import evaluate
from datasets import load_dataset, Audio
from transformers import HubertForCTC, Wav2Vec2Processor

# Configuration: every tunable value used by the later cells lives here.
TARGET_SAMPLING_RATE = 16000
DATASET_ID = "sukumbasar/ASR_EchoBase_Raw"
MODEL_ID = "facebook/hubert-large-ls960-ft" # English Pre-trained Model

# Pick the compute device: use CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device set to: {device}")

Device set to: cuda


In [None]:
# Cell 3: Utility Functions

def convert_bytes_to_array(bytes_data):
    """
    Converts raw m4a audio bytes to a mono numpy array using ffmpeg.

    The bytes are written to a scratch file, transcoded by ffmpeg to a
    single-channel WAV at TARGET_SAMPLING_RATE, and read back with librosa.

    Args:
        bytes_data: Encoded audio content (bytes-like). Must be non-empty.

    Returns:
        The waveform returned by ``librosa.load`` for the transcoded file.

    Raises:
        ValueError: If ``bytes_data`` is None or empty.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not bytes_data:
        raise ValueError("convert_bytes_to_array received empty audio data.")

    # A TemporaryDirectory guarantees both scratch files are removed on every
    # exit path, including a failure while writing the input file.
    with tempfile.TemporaryDirectory() as workdir:
        in_path = os.path.join(workdir, "input.m4a")
        out_path = os.path.join(workdir, "output.wav")

        with open(in_path, "wb") as fin:
            fin.write(bytes_data)

        # -ac 1 downmixes to mono; -ar resamples to the target rate;
        # check=True turns an ffmpeg failure into an exception.
        subprocess.run(["ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                        "-i", in_path, "-ac", "1", "-ar", str(TARGET_SAMPLING_RATE), out_path],
                       check=True)
        speech_array, _ = librosa.load(out_path, sr=TARGET_SAMPLING_RATE)

    return speech_array

def normalize_text_strict(text: str) -> str:
    """
    Normalizes text: lowercase, remove punctuation, keep Turkish chars.

    Args:
        text: The string to normalize. Any non-string value yields "".

    Returns:
        The lowercased text in which every character outside a-z, the Turkish
        letters ç ğ ı ö ş ü, digits and whitespace has been replaced by a
        space, with whitespace runs collapsed and the ends trimmed.
    """
    if not isinstance(text, str):
        return ""
    # str.lower() maps "İ" (U+0130) to "i" followed by a combining dot
    # (U+0307). The filter below would turn that mark into a space and split
    # the word ("İstanbul" -> "i stanbul"), so fold the pair back to plain "i".
    # NOTE: "I" still lowercases to "i" (not Turkish "ı"), as before.
    text = text.lower().replace("i\u0307", "i")
    # Keep standard letters, numbers and Turkish characters
    text = re.sub(r"[^a-zçğıöşü0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Cell 4: Load Dataset
print(f"Loading dataset from Hugging Face: {DATASET_ID}...")

# Load the train split and switch the audio column to decode=False so each
# clip stays as raw bytes; the m4a decoding is done manually via ffmpeg.
ds = load_dataset(DATASET_ID, split="train").cast_column(
    "audio", Audio(decode=False)
)

print(f"Dataset loaded. Total samples: {len(ds)}")

Loading dataset from Hugging Face: sukumbasar/ASR_EchoBase_Raw...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/461 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/10.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset loaded. Total samples: 50


In [None]:
# Cell 5: Load Model and Processor
print(f"Loading model: {MODEL_ID}...")
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = HubertForCTC.from_pretrained(MODEL_ID)

model.to(device)
# Make inference mode explicit (disables dropout). from_pretrained is
# documented to return the model in eval mode already, so this is a safeguard.
model.eval()
# Report the device actually in use; the run may have fallen back to the CPU.
print(f"Model loaded successfully on {device}.")

Loading model: facebook/hubert-large-ls960-ft...


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Model loaded successfully on GPU.


In [None]:
# Cell 6: Run Inference Loop
# Transcribes every sample in `ds` with the loaded model and collects the
# normalized prediction/reference pairs used by the metrics cell.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

predictions = []
references = []
failed_indices = []  # samples excluded from the metrics, reported at the end

print("Starting inference process...")

for i, example in enumerate(ds):
    try:
        audio_bytes = example["audio"]["bytes"]
        ref_text = example["text"]

        # WER/CER are computed relative to the reference length, so a sample
        # whose reference is empty after normalization cannot be scored.
        norm_ref = normalize_text_strict(ref_text)
        if not norm_ref:
            raise ValueError("reference text is empty after normalization")

        # 1. Preprocess Audio
        speech_input = convert_bytes_to_array(audio_bytes)

        # 2. Model Input Preparation
        inputs = processor(speech_input, sampling_rate=TARGET_SAMPLING_RATE, return_tensors="pt", padding=True)
        input_values = inputs.input_values.to(device)

        # 3. Prediction (No Gradients)
        with torch.no_grad():
            logits = model(input_values).logits

        # 4. Decode: take the highest-scoring token per frame and let the
        # processor turn the id sequence into text.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        # 5. Normalize
        norm_pred = normalize_text_strict(transcription)

        # Append both lists together so they always stay aligned.
        predictions.append(norm_pred)
        references.append(norm_ref)

    except Exception as e:
        # Best effort: keep going, but remember which samples were dropped.
        failed_indices.append(i)
        print(f"Error at index {i}: {e}")

    # Progress Log (outside the try block so failures do not hide progress)
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1}/{len(ds)} samples...")

print("Inference finished.")
if failed_indices:
    print(f"WARNING: {len(failed_indices)}/{len(ds)} samples were skipped "
          f"(indices: {failed_indices}); metrics will cover the remaining {len(predictions)}.")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Starting inference process...
Processed 10/50 samples...
Processed 20/50 samples...
Processed 30/50 samples...
Processed 40/50 samples...
Processed 50/50 samples...
Inference finished.


In [None]:
# Cell 7: Calculate and Display Metrics
# Fail early with a clear message instead of an opaque error from the metric
# implementation when there is nothing (or mismatched data) to score.
if not predictions:
    raise RuntimeError("No predictions available: run the inference cell and check its error log.")
if len(predictions) != len(references):
    raise RuntimeError(
        f"Mismatched lengths: {len(predictions)} predictions vs {len(references)} references."
    )

print("\n" + "="*40)
print(f"=== EVALUATION RESULTS ({MODEL_ID}) ===")
print("="*40)

final_wer = wer_metric.compute(predictions=predictions, references=references)
final_cer = cer_metric.compute(predictions=predictions, references=references)

# The sample count shows how many samples the scores are based on.
print(f"Samples evaluated: {len(predictions)}")
print(f"Overall WER: {final_wer:.4f}")
print(f"Overall CER: {final_cer:.4f}")

# Show the first few pairs so the numbers can be sanity-checked by eye.
print("\n--- Sample Predictions vs References ---")
for ref, pred in zip(references[:5], predictions[:5]):
    print(f"Ref : {ref}")
    print(f"Pred: {pred}")
    print("-" * 20)


=== EVALUATION RESULTS (facebook/hubert-large-ls960-ft) ===
Overall WER: 1.0635
Overall CER: 0.4904

--- Sample Predictions vs References ---
Ref : bugün hava oldukça sakin
Pred: bogunhava oldukcasykin
--------------------
Ref : toplantı saatini üç buçuğa erteledim
Pred: top land to sigtin it wihwould try it a little
--------------------
Ref : yoğun trafik nedeniyle otobüs yarım saatten fazla gecikti
Pred: yon treficnedenile otobserum satan fata gijicte
--------------------
Ref : bu dosyayı ne zaman teslim etmemiz gerekiyor
Pred: budosn as amantis smatmos getacure
--------------------
Ref : e posta adresimi yanlış yazmış olabilirim tekrar kontrol eder misin
Pred: e porsa dismiant chasen cholleblir tecra contrele darms
--------------------
