# ASR RNN-T Demo Notebook

## 1. Repository Setup and Installation

In [None]:
!git clone https://github.com/rerum-nn/asr-rnn-t.git
!cd asr-rnn-t

In [None]:
# %cd (unlike !cd) changes the kernel's own working directory, so the change
# persists for every later cell.
%cd asr-rnn-t

# Show where we are and what the repository contains.
!pwd
!ls -la

In [None]:
# Install the repository's dependencies; %pip targets the environment of the
# running kernel rather than whatever `pip` happens to be first on PATH.
%pip install -r requirements.txt

In [1]:
import torch
import sys
import os

# Quick environment report: GPU availability and the interpreter version.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Python version: {sys.version}")

# Link to a zip archive with a custom dataset; it is downloaded and extracted
# in section 5. Replace the placeholder with a real URL before running that section.
CUSTOM_DATASET_URL = "PLACEHOLDER"

CUDA available: True
Python version: 3.10.4 | packaged by conda-forge | (main, Mar 24 2022, 17:39:04) [GCC 10.3.0]


## 2. Download Pre-trained Model Checkpoints

In [3]:
import os
from huggingface_hub import hf_hub_download

os.makedirs("saved/conformer-rnn-t-small", exist_ok=True)

model_files = [
    "model_best.pth",
    "config.yaml"
]

for file_name in model_files:
    file_path = hf_hub_download(
        repo_id="Rerumnn/conformer-rnn-t",
        filename=file_name,
        local_dir="saved/conformer-rnn-t-small",
        local_dir_use_symlinks=False
    )
    print(f"Downloaded: {file_path}")

!ls -la saved/conformer-rnn-t-small/


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Downloaded: saved/conformer-rnn-t-small/model_best.pth
Downloaded: saved/conformer-rnn-t-small/config.yaml
total 342792
drwxrwxr-x  3 ubuntu ubuntu      4096 Oct 18 11:16 .
drwxrwxr-x 20 ubuntu ubuntu      4096 Oct 18 11:16 ..
drwxrwxr-x  3 ubuntu ubuntu      4096 Oct 18 11:15 .cache
-rw-rw-r--  1 ubuntu ubuntu 116987623 Oct 17 18:59 checkpoint-epoch11.pth
-rw-rw-r--  1 ubuntu ubuntu 116984994 Oct 17 09:34 checkpoint-epoch5.pth
-rw-rw-r--  1 ubuntu ubuntu      4117 Oct 18 11:16 config.yaml
-rw-rw-r--  1 ubuntu ubuntu        41 Oct 17 09:37 git_commit.txt
-rw-rw-r--  1 ubuntu ubuntu       460 Oct 17 09:37 git_diff.patch
-rw-rw-r--  1 ubuntu ubuntu     23180 Oct 17 18:59 info.log
-rw-rw-r--  1 ubuntu ubuntu 116979327 Oct 18 11:16 model_best.pth


## 3. Sample Dataset

In [5]:
from src.datasets.librispeech_dataset import LibrispeechDataset
from src.text_encoder import RNNTTextEncoder
from src.transforms import NormalizeRMS
from torch import nn
import torchaudio

# Build the LibriSpeech "test-other" split with the transforms passed below.
# NOTE(review): the meaning of the "get_spectrogram" / "audio" keys is defined
# by the project's dataset class — confirm there.
text_encoder = RNNTTextEncoder()
dataset = LibrispeechDataset(
    "test-other",
    text_encoder=text_encoder,
    instance_transforms={
        "get_spectrogram": torchaudio.transforms.MelSpectrogram(
            sample_rate=16000, n_fft=400, hop_length=160, n_mels=80
        ),
        "audio": NormalizeRMS(),
    },
)

print(f"Dataset loaded with {len(dataset)} samples")

# Index the dataset once and reuse the item instead of calling dataset[0] per
# printed field (presumably each lookup loads and transforms the audio — TODO confirm).
first_sample = dataset[0]
print(f"First sample: {first_sample['text']}")
print(f"Audio path: {first_sample['audio_path']}")


Dataset loaded with 2939 samples
First sample: well
Audio path: /home/ubuntu/asr-rnn-t/data/datasets/librispeech/test-other/2414/128291/2414-128291-0020.flac


In [26]:
import subprocess
from pathlib import Path
import shutil
import os

# Target layout: <dataset_dir>/audio/*.flac and <dataset_dir>/transcriptions/*.txt
dataset_dir = Path("data/datasets/librispeech_inference")
audio_dir = dataset_dir / "audio"
transcriptions_dir = dataset_dir / "transcriptions"

for target_dir in (audio_dir, transcriptions_dir):
    target_dir.mkdir(parents=True, exist_ok=True)

# Copy at most the first ten utterances together with their reference text.
sample_count = min(10, len(dataset))
for idx in range(sample_count):
    item = dataset[idx]
    source_audio, transcript = Path(item['audio_path']), item['text']
    stem = f"sample_{idx:03d}"

    audio_copy = audio_dir / f"{stem}.flac"
    if audio_copy.exists():
        # A copy left by an earlier run may be read-only; make it writable
        # before removing it so the fresh copy below cannot fail.
        audio_copy.chmod(0o644)
        audio_copy.unlink()

    shutil.copy2(source_audio, audio_copy)
    (transcriptions_dir / f"{stem}.txt").write_text(transcript)

copied_files = list(audio_dir.glob('*'))
print(f"Created inference dataset with {len(copied_files)} samples")

# Run the project's inference script on the folder prepared above.
# sys.executable is the interpreter running this kernel, so the script sees
# the packages installed with %pip; a bare "python" is looked up on PATH and
# may belong to a different environment.
inference_cmd = [
    sys.executable, "inference.py",
    "++inferencer.save_path=sample_inference",
    f"++dir={dataset_dir}",
    "writer=console",
]

# Capture output so it can be reported below; give up after five minutes.
result = subprocess.run(inference_cmd, capture_output=True, text=True, timeout=300)

if result.returncode == 0:
    print("Inference completed successfully")
    print(result.stdout)
else:
    print("Inference failed:")
    print(result.stderr)


Created inference dataset with 10 samples
Inference completed successfully
Logging git commit and patch...
Run name: conformer-rnn-t-small-inference
Run ID: g46rl6b6
ConformerRNNT(
  (conformer): Conformer(
    (conv_subsampling): Sequential(
      (0): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (3): ReLU()
    )
    (proj): Sequential(
      (0): Linear(in_features=2560, out_features=256, bias=True)
      (1): Dropout(p=0.1, inplace=False)
    )
    (conformer_blocks): ModuleList(
      (0-3): 4 x ConformerBlock(
        (ff1): FeedForwardModule(
          (ff_module): Sequential(
            (0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
            (1): Linear(in_features=256, out_features=1024, bias=True)
            (2): SiLU()
            (3): Dropout(p=0.1, inplace=False)
            (4): Linear(in_features=1024, out_features=256, bias=True)
 

## 4. Calculate Metrics

Now let's calculate the WER (Word Error Rate) and CER (Character Error Rate) metrics on our sample dataset to evaluate the model's performance.


In [27]:
# Score the saved predictions against the reference transcriptions.
# sys.executable keeps the script on the same interpreter (and installed
# packages) as this kernel; a bare "python" is resolved via PATH and may differ.
metrics_cmd = [
    sys.executable, "calc_metrics.py",
    "--dataset_dir", "data/datasets/librispeech_inference",
    "--predictions", "data/saved/sample_inference",
    "--verbose",
]

# Capture output so it can be reported below; give up after fifteen minutes.
result = subprocess.run(metrics_cmd, capture_output=True, text=True, timeout=900)

if result.returncode == 0:
    print("Metrics calculation completed successfully:")
    print(result.stdout)
else:
    print("Metrics calculation failed:")
    print(result.stderr)


Metrics calculation completed successfully:
Dataset dir: data/datasets/librispeech_inference
Found 10 ground truth transcriptions out of 10 audio files
Results:
Number of samples: 10
	Average WER: 0.3200 (32.00%)
	Average CER: 0.1838 (18.38%)

Individual sample results:
Utterance ID	WER	CER
----------------------------------------
sample_000	0.0000	0.0000
sample_001	1.0000	0.6250
sample_002	0.0000	0.0000
sample_003	0.0000	0.0000
sample_004	1.0000	0.5714
sample_005	0.8000	0.5217
sample_006	0.0000	0.0000
sample_007	0.0000	0.0000
sample_008	0.0000	0.0000
sample_009	0.4000	0.1200



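## 5. Custom Dataset

To evaluate the model on your own recordings, set `CUSTOM_DATASET_URL` (defined in section 1) to a direct or Google Drive link to a `.zip` archive. The archive must unpack to a single top-level folder with the following layout: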

```
YourDataset/
├── audio/
│   ├── utterance1.wav  # can be .flac, .mp3, .m4a, .ogg
│   ├── utterance2.wav
│   └── ...
└── transcriptions/  # ground truth
    ├── utterance1.txt
    ├── utterance2.txt
    └── ...
```


In [30]:
import gdown
import zipfile
import os

# Echo the configured archive location before attempting the download.
print(f"Current URL: {CUSTOM_DATASET_URL}")

def download_and_extract_dataset(url, extract_to="data/datasets/custom_dataset"):
    """Download a zip archive and unpack it into ``extract_to``.

    Google Drive links, both ``.../file/d/<id>[/...]`` and ``...?id=<id>``,
    are rewritten to the direct ``https://drive.google.com/uc?id=<id>`` form
    before being handed to ``gdown``; any other URL is passed through as is.

    Args:
        url: Location of the zip archive.
        extract_to: Directory that receives the extracted files. It is created
            if missing and also holds the temporary ``dataset.zip``.

    Returns:
        ``Path(extract_to)`` on success, or ``None`` when the download or the
        extraction failed (the error is printed rather than raised).
    """
    extract_path = Path(extract_to)
    extract_path.mkdir(parents=True, exist_ok=True)

    print(f"Downloading dataset from: {url}")
    zip_path = extract_path / "dataset.zip"

    try:
        if "drive.google.com" in url:
            if "/file/d/" in url:
                # The ID is the path segment right after /file/d/. Taking it
                # positionally from the end breaks for links that have no
                # trailing "/view" part, so cut it out explicitly.
                file_id = url.split("/file/d/", 1)[1].split("/", 1)[0].split("?", 1)[0]
            else:
                file_id = url.split("id=")[1].split("&")[0]
            download_url = f"https://drive.google.com/uc?id={file_id}"
        else:
            download_url = url

        gdown.download(download_url, str(zip_path), quiet=False)
        if not zip_path.is_file():
            # Some failures are reported by gdown without raising; surface
            # them here instead of as a confusing zipfile error.
            raise FileNotFoundError(f"download produced no file at {zip_path}")

        print(f"Extracting to: {extract_path}")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        print(f"Dataset extracted to: {extract_path}")
        return extract_path

    except Exception as e:
        # Deliberately broad: the notebook continues and later cells check for None.
        print(f"Error downloading/extracting dataset: {e}")
        return None

    finally:
        # Best-effort removal of the archive on success and on failure, so a
        # broken download never lingers next to the extracted data.
        try:
            zip_path.unlink()
        except OSError:
            pass


custom_dataset_path = download_and_extract_dataset(CUSTOM_DATASET_URL)

# Locate the dataset folder inside the extraction directory.
# - When the download failed (None) there is nothing to look at; listing None
#   would silently enumerate the current working directory instead.
# - The result is a full pathlib.Path, because later cells join it with "/".
# - Only real directories count; hidden entries and the "__MACOSX" folder that
#   macOS adds to zip archives are skipped. Sorting makes the choice deterministic.
custom_dataset_first_dir = None
if custom_dataset_path is not None:
    custom_dataset_first_dir = next(
        (
            entry
            for entry in sorted(Path(custom_dataset_path).iterdir())
            if entry.is_dir() and not entry.name.startswith((".", "__MACOSX"))
        ),
        None,
    )

Current URL: PLACEHOLDER
Downloading dataset from: PLACEHOLDER
Error downloading/extracting dataset: Invalid URL 'PLACEHOLDER': No scheme supplied. Perhaps you meant https://PLACEHOLDER?


In [None]:
if custom_dataset_first_dir:
    print(f"Running inference on custom dataset: {custom_dataset_first_dir}")

    # custom_dataset_first_dir has to be a pathlib.Path: the "/" joins and the
    # .exists() checks below rely on it.
    audio_dir = custom_dataset_first_dir / "audio"
    transcriptions_dir = custom_dataset_first_dir / "transcriptions"

    if audio_dir.exists() and transcriptions_dir.exists():
        print(f"Found {len(list(audio_dir.glob('*')))} audio files")
        print(f"Found {len(list(transcriptions_dir.glob('*.txt')))} transcription files")

        # sys.executable keeps the script on the same interpreter (and installed
        # packages) as this kernel; a bare "python" is resolved via PATH and may differ.
        # NOTE(review): the sample run earlier in this notebook passes the folder
        # as "++dir=...", while this one uses "++datasets.test.dir=..." — confirm
        # which override the inference config actually reads.
        custom_inference_cmd = [
            sys.executable, "inference.py",
            "++inferencer.save_path=custom_inference",
            f"++datasets.test.dir={custom_dataset_first_dir}",
            "writer=console",
        ]

        # Capture output so it can be reported below; give up after ten minutes.
        result = subprocess.run(custom_inference_cmd, capture_output=True, text=True, timeout=600)

        if result.returncode == 0:
            print("Custom dataset inference completed successfully")
            print(result.stdout)
        else:
            print("Custom dataset inference failed:")
            print(result.stderr)
    else:
        print("Error: Dataset structure is incorrect. Missing 'audio' or 'transcriptions' directories.")
        print(f"Audio dir exists: {audio_dir.exists()}")
        print(f"Transcriptions dir exists: {transcriptions_dir.exists()}")
else:
    print("Skipping custom dataset inference - no dataset downloaded")


In [None]:
if custom_dataset_first_dir:
    print("Calculating metrics")

    # Score the custom-dataset predictions against its reference transcriptions.
    # sys.executable keeps the script on the same interpreter (and installed
    # packages) as this kernel; a bare "python" is resolved via PATH and may differ.
    custom_metrics_cmd = [
        sys.executable, "calc_metrics.py",
        "--dataset_dir", str(custom_dataset_first_dir),
        "--predictions", "data/saved/custom_inference",
        "--verbose",
    ]

    # Capture output so it can be reported below; give up after two minutes.
    result = subprocess.run(custom_metrics_cmd, capture_output=True, text=True, timeout=120)

    if result.returncode == 0:
        print("Custom dataset metrics calculation completed successfully:")
        print(result.stdout)
    else:
        print("Custom dataset metrics calculation failed:")
        print(result.stderr)
else:
    print("Skipping custom dataset metrics calculation - no dataset available")
