# SONAR MODEL EVALUATION

In [1]:
!which pip3
!which python3

/Users/nihesh/Nihesh/sonar/.venv/bin/pip3
/Users/nihesh/Nihesh/sonar/.venv/bin/python3


In [1]:
# Install dependencies step by step to avoid conflicts
%pip install --upgrade pip setuptools wheel
%pip install torch torchaudio
%pip install numpy matplotlib pydub sacrebleu
%pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sentencepiece
  Using cached sentencepiece-0.2.0.tar.gz (2.6 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sentencepiece
[33m  DEPRECATION: Building 'sentencepiece' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'sentencepiece'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for sentencepiece (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×

In [12]:
# Try installing fairseq2 with specific compatible versions
# First try installing fairseq2n directly (the native component)
%pip install fairseq2n

# Then try fairseq2
%pip install fairseq2

# If that fails, try with specific versions
# %pip install fairseq2==0.2.1 fairseq2n==0.2.1

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Installing SONAR & its Dependencies

In [13]:
# First, make sure we're in the correct directory
import os
os.chdir('/Users/nihesh/nihesh/sonar')

# Remove any previous failed installation
import shutil
if os.path.exists('SONAR'):
    shutil.rmtree('SONAR')

# Clone fresh SONAR repository
%cd /Users/nihesh/nihesh/sonar
!git clone https://github.com/facebookresearch/SONAR.git

# Change to SONAR directory
%cd SONAR

# Install SONAR without dependencies first to avoid fairseq2 conflict
%pip install -e . --no-deps

# Install the core dependencies manually (skip fairseq2 for now)
%pip install numpy torch torchaudio soundfile tqdm typing_extensions

# Try installing sox (might fail on some systems, that's OK)
%pip install sox || echo "Sox installation failed - this is OK for basic functionality"

# Verify installation
%pip list | grep sonar

/Users/nihesh/Nihesh/sonar
Cloning into 'SONAR'...
remote: Enumerating objects: 1287, done.[K
remote: Counting objects: 100% (254/254), done.[K
remote: Compressing objects: 100% (182/182), done.[K
remote: Total 1287 (delta 120), reused 118 (delta 67), pack-reused 1033 (from 2)[K
Receiving objects: 100% (1287/1287), 5.69 MiB | 4.08 MiB/s, done.
Resolving deltas: 100% (350/350), done.
/Users/nihesh/Nihesh/sonar/SONAR
Obtaining file:///Users/nihesh/Nihesh/sonar/SONAR
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: sonar-space
  Building editable for sonar-space (pyproject.toml) ... [?25ldone
[?25h  Created wheel for sonar-space: filename=sonar_space-0.4.0-py3-none-any.whl size=8370 sha256=c39c65abff3618535a84458f13b65e2ae682b1e31a841b07dca

## Import SONAR

In [1]:
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import torchaudio
import zipfile
import glob
from pathlib import Path
from IPython.display import Audio, display

# Test SONAR imports
try:
    import sonar
    print("✓ SONAR imported successfully")
    
    # Test if we can access basic functionality
    print(f"SONAR version: {sonar.__version__ if hasattr(sonar, '__version__') else 'Unknown'}")
    
except ImportError as e:
    print(f"❌ Failed to import SONAR: {e}")

✓ SONAR imported successfully
SONAR version: 0.4.0


## For advanced users, to set defaults to GPU or CPU

In [2]:
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")


Using device: mps


## Sonar Models Downloading and Importing

In [3]:
import os
import zipfile
import glob
import torch
import torchaudio
import time
import urllib.request
import ssl
from pathlib import Path

# Set device
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ====== Step 1: Check Cache and Configure Network ======
def check_cache_and_configure():
    """Check what's already downloaded and configure network settings"""
    
    # Check fairseq2 cache
    cache_dir = Path.home() / ".cache" / "fairseq2"
    print(f"Checking cache directory: {cache_dir}")
    
    if cache_dir.exists():
        print("Found fairseq2 cache:")
        for item in cache_dir.rglob("*"):
            if "sonar" in str(item).lower():
                print(f"  {item}")
    else:
        print("No fairseq2 cache found")
    
    # Create SSL context that's more permissive
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    
    # Set global SSL context
    ssl._create_default_https_context = lambda: ssl_context
    
    print("✓ Network configuration updated")

# ====== Step 2: Initialize SONAR with Enhanced Retry Logic ======
def initialize_sonar_enhanced(max_retries=2, delay=2):
    """Enhanced initialization with better error handling"""
    
    check_cache_and_configure()
    
    last_error = None  # Store the last error for potential re-raising
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries} to initialize SONAR pipeline...")
            
            # Try with increased timeout and better error handling
            from sonar.inference_pipelines.speech import SpeechToTextModelPipeline
            
            s2t_model = SpeechToTextModelPipeline(
                encoder="sonar_speech_encoder_eng",
                decoder="text_sonar_basic_decoder",
                tokenizer="text_sonar_basic_decoder"
            ).to(device)
            
            print("✓ SONAR pipeline initialized successfully!")
            return s2t_model
            
        except Exception as e:
            last_error = e  # Store the error for potential re-raising
            print(f"❌ Attempt {attempt + 1} failed: {str(e)[:200]}...")
            
            if "Connection" in str(e) or "download" in str(e).lower():
                print("  Network issue detected - trying longer delay...")
                delay_time = delay * (attempt + 1)  # Exponential backoff
            else:
                delay_time = delay
                
            if attempt < max_retries - 1:
                print(f"  Waiting {delay_time} seconds before retry...")
                time.sleep(delay_time)
            else:
                print("All attempts failed. Trying alternative approach...")
                break
    
    # If all retries failed, try a different model configuration
    print("\n====== Trying Alternative Model Configuration ======")
    try:
        # Try with just the basic text model first
        from sonar.inference_pipelines.text import TextToTextModelPipeline
        print("Trying text-only pipeline first...")
        
        text_model = TextToTextModelPipeline(
            encoder="text_sonar_basic_encoder",
            decoder="text_sonar_basic_decoder",
            tokenizer="text_sonar_basic_encoder"
        )
        
        print("✓ Text pipeline works - network issue is specific to speech models")
        print("You may need to download the speech model manually or try a VPN")
        return None
        
    except Exception as e2:
        print(f"❌ Even text pipeline failed: {e2}")
        print("\nTroubleshooting suggestions:")
        print("1. Check your internet connection")
        print("2. Try using a VPN to change your IP location")
        print("3. Check if your firewall/antivirus is blocking downloads")
        print("4. Try running this on a different network")
        raise last_error if last_error else e2

# Run the enhanced initialization
s2t_model = initialize_sonar_enhanced()
s2t_model.to(device)
print(f"🔧 Moved model to device: {next(s2t_model.parameters()).device}")

Using device: mps
Checking cache directory: /Users/nihesh/.cache/fairseq2
Found fairseq2 cache:
  /Users/nihesh/.cache/fairseq2/assets/dd6b263a791ecff9b7dd5bff/sonar_text_decoder.pt
✓ Network configuration updated
Attempt 1/2 to initialize SONAR pipeline...


  from .autonotebook import tqdm as notebook_tqdm


✓ SONAR pipeline initialized successfully!
🔧 Moved model to device: mps:0


## Sonar Running and Evaluation

In [14]:
import json
import pickle
from pathlib import Path
import os
import datetime
import torch
import torchaudio
import sacrebleu

device = torch.device("cuda" if torch.cuda.is_available()
                      else ("mps" if torch.backends.mps.is_available() else "cpu"))

def translate_speech_to_text_optimized(audio_path, target_lang="eng"):
    try:
        waveform, sr = torchaudio.load(audio_path)
        waveform = waveform.to(device)
        print(f"Device type:      {device.type}")
        print(f"Waveform device:  {waveform.device}")
        print(f"Model device:     {next(s2t_model.parameters()).device}")

        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000).to(device)(waveform)
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if device.type in ("cuda", "mps"):
            with torch.amp.autocast(device_type=device.type, enabled=True):
                out = s2t_model.predict([waveform], f"{target_lang}_Latn", max_seq_len=256)
        else:
            out = s2t_model.predict([waveform], f"{target_lang}_Latn", max_seq_len=256)

        return out[0] if out else "ERROR: No output"
    except Exception as e:
        return f"ERROR: {e}"


cwd = Path.cwd()
test_folder = cwd / "test"
audio_folder = test_folder / "audio"
cache_dir = cwd / ".sonar_cache"
cache_dir.mkdir(exist_ok=True)
progress_file = cache_dir / "progress.json"
results_cache = cache_dir / "results.pkl"
output_file = cwd / "sonar_translated_output.txt"
transcript_file = test_folder / "transcript.txt"
tgt_lang = "eng"

print("🔍 Loading progress…")
try:
    with open(progress_file, 'r') as f:
        data = json.load(f)
    processed_files = set(data.get('processed_files', []))
    print(f"📂 Found progress.json — {len(processed_files)} files already done")
except (FileNotFoundError, json.JSONDecodeError):
    processed_files = set()
    print("📂 No valid progress.json found; starting fresh")

print("🔍 Loading cache…")
try:
    with open(results_cache, 'rb') as f:
        cached_results = pickle.load(f)
    processed_files.update(cached_results.keys())
    print(f"🗄️  Loaded {len(cached_results)} cached results")
except (FileNotFoundError, EOFError):
    cached_results = {}
    print("🗄️  No cache file found; starting with empty results")

if not audio_folder.exists():
    new_path = input(f"❌ Audio folder not found at {audio_folder}. Enter test folder path: ")
    test_folder = Path(new_path)
    audio_folder = test_folder / "audio"
    transcript_file = test_folder / "transcript.txt"

audio_files = sorted(audio_folder.glob("*.wav"))
total = len(audio_files)
remaining = [p for p in audio_files if p.name not in processed_files]
print(f"📊 Total .wav files: {total}")
print(f"✅ Already processed: {total - len(remaining)}")
print(f"⏳ Remaining: {len(remaining)}")

with open(output_file, "a", encoding="utf-8") as out_f:
    try:
        for i, path in enumerate(remaining, 1):
            name = path.name
            print(f"🔄 Processing {i}/{len(remaining)}: {name}")
            text = translate_speech_to_text_optimized(path, target_lang=tgt_lang)
            cached_results[name] = text
            processed_files.add(name)
            out_f.write(f"{name}\t{text}\n")
            out_f.flush()

            if i % 5 == 0 or i == len(remaining):
                print(f"💾 Saving progress at file {i}…")
                with open(progress_file, 'w') as pf:
                    json.dump({
                        'processed_files': list(processed_files),
                        'last_updated': datetime.datetime.now().isoformat()
                    }, pf, indent=2)
                with open(results_cache, 'wb') as rf:
                    pickle.dump(cached_results, rf)

        print("✅ Done processing current batch")

    except KeyboardInterrupt:
        print("\n⏸️  Interrupted by user — saving progress…")
        with open(progress_file, 'w') as pf:
            json.dump({
                'processed_files': list(processed_files),
                'last_updated': datetime.datetime.now().isoformat(),
                'interrupted': True
            }, pf, indent=2)
        with open(results_cache, 'wb') as rf:
            pickle.dump(cached_results, rf)
        print(f"💾 Final save complete: {len(processed_files)}/{total} files")
        # Exit the cell here; do not run evaluation
        raise

# Only run evaluation if we processed everything:
if len(processed_files) == total:
    print("🎉 All files processed! Running evaluation…")
    if transcript_file.exists():
        ref = {}
        with open(transcript_file, 'r', encoding='utf-8') as rf:
            for line in rf:
                k, v = line.strip().split('|', 1)
                ref[k.lower()] = v
        hyps, refs = [], []
        for fn, hyp in cached_results.items():
            if fn.lower() in ref and not hyp.startswith("ERROR:"):
                hyps.append(hyp)
                refs.append(ref[fn.lower()])
        if hyps:
            bleu = sacrebleu.corpus_bleu(hyps, [refs])
            chrf = sacrebleu.corpus_chrf(hyps, [refs])
            print(f"🔹 BLEU score: {bleu.score:.2f}", flush=True)
            print(f"🔹 ChrF++ score: {chrf.score:.2f}", flush=True)
            with open(cache_dir / "evaluation_results.json", 'w') as ef:
                json.dump({
                    'bleu_score': bleu.score,
                    'chrf_score': chrf.score,
                    'matched_files': len(hyps),
                    'total_files': total,
                    'timestamp': datetime.datetime.now().isoformat()
                }, ef, indent=2)
        else:
            print("⚠️  No matching transcripts for evaluation.")
    else:
        print(f"❌ transcript.txt not found at {transcript_file}; skipping evaluation.")
else:
    print(f"⌛ {len(processed_files)}/{total} done. Run again to resume.")



# Clear cache command (uncomment to reset)
# import shutil
# shutil.rmtree(cache_dir)
# print("🗑️  Cache cleared")

🔍 Loading progress…
📂 Found progress.json — 2936 files already done
🔍 Loading cache…
🗄️  Loaded 2936 cached results
📊 Total .wav files: 2936
✅ Already processed: 2936
⏳ Remaining: 0
✅ Done processing current batch
🎉 All files processed! Running evaluation…
🔹 BLEU score: 1.00
🔹 ChrF++ score: 10.88
🔹 BLEU score: 1.00
🔹 ChrF++ score: 10.88


### Checking if GPU or CPU

In [6]:
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")


Using device: mps


### Checking the files for errors

In [10]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
  Downloading pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl (10.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.7 MB[0m [31m?[0m eta [36m-:--:--[0mCollecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp312-cp312-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached 

In [12]:
from pathlib import Path
import pandas as pd
import re

def check_files():
    cwd = Path.cwd()
    test_folder = cwd / "test"
    transcript_file = test_folder / "transcript.txt"
    output_file = cwd / "sonar_translated_output.txt"
    
    # Check if files exist
    print(f"Checking transcript file: {transcript_file}")
    if not transcript_file.exists():
        print(f"❌ Transcript file not found at {transcript_file}")
        return
    
    print(f"Checking output file: {output_file}")
    if not output_file.exists():
        print(f"❌ Output file not found at {output_file}")
        return
    
    # Read transcript file
    transcript_data = {}
    transcript_separator = None
    print("\n🔍 Analyzing transcript.txt format...")
    
    with open(transcript_file, 'r', encoding='utf-8') as f:
        sample_lines = [next(f) for _ in range(min(5, sum(1 for _ in open(transcript_file))))]
    
    # Detect separator in transcript file
    for line in sample_lines:
        if '|' in line:
            transcript_separator = '|'
            break
        elif '\t' in line:
            transcript_separator = '\t'
            break
    
    print(f"Detected separator in transcript file: {repr(transcript_separator)}")
    
    # Read transcript file with correct separator
    with open(transcript_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                if transcript_separator:
                    k, v = line.strip().split(transcript_separator, 1)
                    transcript_data[k.lower()] = v
                else:
                    print(f"⚠️ Could not parse line: {line.strip()}")
            except ValueError:
                print(f"⚠️ Error parsing line: {line.strip()}")
    
    print(f"Read {len(transcript_data)} entries from transcript.txt")
    
    # Read output file
    output_data = {}
    output_separator = None
    output_keys = []
    print("\n🔍 Analyzing sonar_translated_output.txt format...")
    
    with open(output_file, 'r', encoding='utf-8') as f:
        sample_lines = [next(f) for _ in range(min(5, sum(1 for _ in open(output_file))))]
    
    # Detect separator in output file
    for line in sample_lines:
        if '\t' in line:
            output_separator = '\t'
            break
        elif ' ' in line and not line.startswith(' '):
            output_separator = ' '
            break
    
    print(f"Detected separator in output file: {repr(output_separator)}")
    
    # Read output file with correct separator
    with open(output_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                if output_separator:
                    k, v = line.strip().split(output_separator, 1)
                    output_data[k.lower()] = v
                    output_keys.append(k.lower())
                else:
                    print(f"⚠️ Could not parse line: {line.strip()}")
            except ValueError:
                print(f"⚠️ Error parsing line: {line.strip()}")
    
    print(f"Read {len(output_data)} entries from sonar_translated_output.txt")
    
    # Check for duplicates in output file
    duplicates = [x for x in output_keys if output_keys.count(x) > 1]
    unique_duplicates = set(duplicates)
    
    if unique_duplicates:
        print(f"\n⚠️ Found {len(unique_duplicates)} duplicate keys in output file:")
        for dup in unique_duplicates:
            print(f"  - {dup} appears {output_keys.count(dup)} times")
    else:
        print("\n✅ No duplicates found in output file")
    
    # Compare files
    trans_keys = set(transcript_data.keys())
    out_keys = set(output_data.keys())
    
    print("\n📊 Comparison Summary:")
    print(f"Transcript entries: {len(trans_keys)}")
    print(f"Output entries: {len(out_keys)}")
    print(f"Entries in both files: {len(trans_keys & out_keys)}")
    print(f"Entries only in transcript: {len(trans_keys - out_keys)}")
    print(f"Entries only in output: {len(out_keys - trans_keys)}")
    
    # Print sample entries
    print("\n📝 Sample transcript entries:")
    for k in list(transcript_data.keys())[:3]:
        print(f"  {k}: {transcript_data[k]}")
    
    print("\n📝 Sample output entries:")
    for k in list(output_data.keys())[:3]:
        print(f"  {k}: {output_data[k]}")

# Run the check
check_files()

Checking transcript file: /Users/nihesh/Nihesh/sonar/test/transcript.txt
Checking output file: /Users/nihesh/Nihesh/sonar/sonar_translated_output.txt

🔍 Analyzing transcript.txt format...
Detected separator in transcript file: '|'
Read 2936 entries from transcript.txt

🔍 Analyzing sonar_translated_output.txt format...
Detected separator in output file: '\t'
Read 2936 entries from sonar_translated_output.txt

✅ No duplicates found in output file

📊 Comparison Summary:
Transcript entries: 2936
Output entries: 2936
Entries in both files: 2936
Entries only in transcript: 0
Entries only in output: 0

📝 Sample transcript entries:
  iiith_tdbc_yt_set16_96_96_0018.wav: Right from my childhood I am an actor.At the time of shool I had acted on the stage located in the school.
  iiith_dbc_pe_tel_ca_1021_51054.wav: Talking about the Ahobilam Temple in Anantapur — one word isn’t enough. So many people keep visiting because the temple is so beautiful that two eyes aren’t enough to take it all in
  i