# ASR RNN-T Demo Notebook

## 1. Repository Setup and Installation

In [1]:
!git clone https://github.com/rerum-nn/asr-rnn-t.git
!cd asr-rnn-t

Cloning into 'asr-rnn-t'...
remote: Enumerating objects: 501, done.[K
remote: Counting objects: 100% (501/501), done.[K
remote: Compressing objects: 100% (308/308), done.[K
remote: Total 501 (delta 268), reused 402 (delta 169), pack-reused 0 (from 0)[K
Receiving objects: 100% (501/501), 9.38 MiB | 6.31 MiB/s, done.
Resolving deltas: 100% (268/268), done.


In [2]:
# `%cd` changes the working directory of the kernel itself, so every later
# cell (and every `!` command) runs from inside the cloned repository.
%cd asr-rnn-t

# Sanity check: show where we are and what the checkout contains.
!pwd
!ls -la

/content/asr-rnn-t
/content/asr-rnn-t
total 108
drwxr-xr-x  4 root root  4096 Oct 18 20:43 .
drwxr-xr-x  1 root root  4096 Oct 18 20:43 ..
-rw-r--r--  1 root root  3238 Oct 18 20:43 augmentation_demo.py
-rw-r--r--  1 root root  2260 Oct 18 20:43 bpe.model
-rw-r--r--  1 root root  8617 Oct 18 20:43 calc_metrics.py
-rw-r--r--  1 root root 22017 Oct 18 20:43 demo.ipynb
-rw-r--r--  1 root root   147 Oct 18 20:43 .flake8
drwxr-xr-x  8 root root  4096 Oct 18 20:43 .git
-rw-r--r--  1 root root  1318 Oct 18 20:43 .gitignore
-rw-r--r--  1 root root  2482 Oct 18 20:43 inference.py
-rw-r--r--  1 root root  1073 Oct 18 20:43 LICENSE
-rw-r--r--  1 root root  4487 Oct 18 20:43 normalization_params_clean.json
-rw-r--r--  1 root root  4466 Oct 18 20:43 normalization_params_other.json
-rw-r--r--  1 root root   771 Oct 18 20:43 .pre-commit-config.yaml
-rw-r--r--  1 root root  3233 Oct 18 20:43 README.md
-rw-r--r--  1 root root   255 Oct 18 20:43 requirements.txt
drwxr-xr-x 12 root root  4096 Oct 18 20:4

In [3]:
# Install the dependencies listed in requirements.txt; `%pip` installs into
# the environment of the running kernel.
%pip install -r requirements.txt



In [None]:
!pip install -e .

In [4]:
import torch
import sys
import os

# Google Drive share link of the zipped dataset; it is read by the
# "Custom Dataset" cells further down.
CUSTOM_DATASET_URL = "https://drive.google.com/file/d/1JRq9zOKZ9HmsMpsEeGnu8M6mL_tN3FXF/view?usp=sharing"

# Report the runtime: whether torch can see a CUDA device, and which
# interpreter the kernel is running.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Python version: {sys.version}")

CUDA available: True
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]


## 2. Download Pre-trained Model Checkpoints

In [2]:
import os
from huggingface_hub import hf_hub_download

os.makedirs("saved/conformer-rnn-t-small-other", exist_ok=True)

model_files = [
    "model_best.pth",
    "config.yaml"
]

for file_name in model_files:
    file_path = hf_hub_download(
        repo_id="Rerumnn/conformer-rnn-t",
        filename=file_name,
        local_dir="saved/conformer-rnn-t-small-other",
        local_dir_use_symlinks=False
    )
    print(f"Downloaded: {file_path}")

!ls -la saved/conformer-rnn-t-small/


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Downloaded: saved/conformer-rnn-t-small-other/model_best.pth
Downloaded: saved/conformer-rnn-t-small-other/config.yaml
total 222352
drwxrwxr-x 3 ubuntu ubuntu      4096 Oct 19 20:13 .
drwxrwxr-x 9 ubuntu ubuntu      4096 Oct 19 20:14 ..
drwxrwxr-x 3 ubuntu ubuntu      4096 Oct 19 20:13 .cache
-rw-rw-r-- 1 ubuntu ubuntu 110666434 Oct 17 02:26 checkpoint-epoch1.pth
-rw-rw-r-- 1 ubuntu ubuntu      4117 Oct 19 20:13 config.yaml
-rw-rw-r-- 1 ubuntu ubuntu        41 Oct 17 02:11 git_commit.txt
-rw-rw-r-- 1 ubuntu ubuntu       677 Oct 17 02:11 git_diff.patch
-rw-rw-r-- 1 ubuntu ubuntu      3762 Oct 17 02:26 info.log
-rw-rw-r-- 1 ubuntu ubuntu 116979327 Oct 19 20:13 model_best.pth


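## 3. Custom Dataset

The archive behind `CUSTOM_DATASET_URL` must unpack into a single top-level folder containing an `audio/` directory and a `transcriptions/` directory, with one `.txt` transcription per audio file: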

```
YourDataset/
├── audio/
│   ├── utterance1.wav  # can be .flac, .mp3, .m4a, .ogg
│   ├── utterance2.wav
│   └── ...
└── transcriptions/  # ground truth
    ├── utterance1.txt
    ├── utterance2.txt
    └── ...
```


In [8]:
import gdown
import zipfile
from pathlib import Path
import os

# Echo the configured link so the reader can see which archive will be fetched.
print(f"Current URL: {CUSTOM_DATASET_URL}")

def download_and_extract_dataset(url, extract_to="data/datasets/custom_dataset"):
    """Download a zip archive and unpack it into ``extract_to``.

    Google Drive links of the form ``.../file/d/<id>[/...]`` or ``...?id=<id>``
    are rewritten to the direct ``https://drive.google.com/uc?id=<id>`` form
    before being handed to ``gdown``; any other URL is passed through unchanged.

    Args:
        url: Location of the zip archive.
        extract_to: Directory the archive is extracted into (created if missing).

    Returns:
        ``pathlib.Path`` of the extraction directory on success, or ``None`` if
        the download or extraction failed (the error is printed, not raised).
    """
    import re  # local import: only needed to parse Drive links

    extract_path = Path(extract_to)
    extract_path.mkdir(parents=True, exist_ok=True)

    print(f"Downloading dataset from: {url}")
    zip_path = extract_path / "dataset.zip"

    try:
        if "drive.google.com" in url:
            # Match the id itself instead of relying on its position between
            # slashes, so links without a trailing "/view" are handled too.
            id_match = re.search(r"/file/d/([^/?#]+)|[?&]id=([^&#]+)", url)
            if id_match is None:
                raise ValueError(f"no Google Drive file id found in {url}")
            file_id = id_match.group(1) or id_match.group(2)
            download_url = f"https://drive.google.com/uc?id={file_id}"
        else:
            download_url = url

        downloaded = gdown.download(download_url, str(zip_path), quiet=False)
        # Defensive: treat a None result as a failed download rather than
        # carrying on to open a zip file that was never written.
        if downloaded is None:
            raise RuntimeError(f"download produced no file for {download_url}")

        print(f"Extracting to: {extract_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        print(f"Dataset extracted to: {extract_path}")
        return extract_path

    except Exception as e:
        # Best effort by design: report the problem and let the caller decide
        # what to do with a missing dataset.
        print(f"Error downloading/extracting dataset: {e}")
        return None

    finally:
        # Remove the archive on success and on failure alike, so a broken or
        # partial download is never left behind.
        zip_path.unlink(missing_ok=True)


custom_dataset_path = download_and_extract_dataset(CUSTOM_DATASET_URL)

if custom_dataset_path is None:
    # os.listdir(None) would silently list the current working directory, so
    # never look for a dataset folder when the download failed.
    custom_dataset_first_dir = None
else:
    # Take the first extracted *directory* in a deterministic (sorted) order,
    # skipping hidden entries and archive metadata folders such as __MACOSX.
    extracted_dirs = sorted(
        entry.name
        for entry in custom_dataset_path.iterdir()
        if entry.is_dir() and not entry.name.startswith((".", "__"))
    )
    custom_dataset_first_dir = extracted_dirs[0] if extracted_dirs else None

Current URL: https://drive.google.com/file/d/1JRq9zOKZ9HmsMpsEeGnu8M6mL_tN3FXF/view?usp=sharing
Downloading dataset from: https://drive.google.com/file/d/1JRq9zOKZ9HmsMpsEeGnu8M6mL_tN3FXF/view?usp=sharing


Downloading...
From: https://drive.google.com/uc?id=1JRq9zOKZ9HmsMpsEeGnu8M6mL_tN3FXF
To: /content/asr-rnn-t/data/datasets/custom_dataset/dataset.zip
100%|██████████| 238k/238k [00:00<00:00, 3.84MB/s]

Extracting to: data/datasets/custom_dataset
Dataset extracted to: data/datasets/custom_dataset





In [10]:
# Name of the entry picked from the extraction folder by the download cell.
custom_dataset_first_dir

'librispeech_inference'

In [12]:
# Path of the dataset folder whose audio/ and transcriptions/ subfolders the
# inference cell checks for.
custom_dataset_path / custom_dataset_first_dir

PosixPath('data/datasets/custom_dataset/librispeech_inference')

In [13]:
import subprocess
import sys

# Both values come from the download cell and either one can be None (failed
# download, or an archive without a top-level folder); `Path / None` would
# raise a TypeError, so require both before touching the filesystem.
if custom_dataset_path and custom_dataset_first_dir:
    print(f"Running inference on custom dataset: {custom_dataset_first_dir}")

    audio_dir = custom_dataset_path / custom_dataset_first_dir / "audio"
    transcriptions_dir = custom_dataset_path / custom_dataset_first_dir / "transcriptions"

    if audio_dir.exists() and transcriptions_dir.exists():
        print(f"Found {len(list(audio_dir.glob('*')))} audio files")
        print(f"Found {len(list(transcriptions_dir.glob('*.txt')))} transcription files")

        # sys.executable is the interpreter running this kernel, so the script
        # sees the packages installed above; a bare "python" may resolve to a
        # different interpreter on PATH.
        # NOTE(review): datasets.test.dir receives only the folder name, not the
        # full extraction path — confirm that is what inference.py expects.
        custom_inference_cmd = [
            sys.executable, "inference.py",
            "++inferencer.save_path=custom_inference",
            f"++datasets.test.dir={custom_dataset_first_dir}",
            "writer=console"
        ]

        try:
            result = subprocess.run(custom_inference_cmd, capture_output=True, text=True, timeout=600)
        except subprocess.TimeoutExpired as timeout_error:
            # Report a hung run instead of letting the cell end in a traceback.
            print("Custom dataset inference failed:")
            print(f"Timed out after {timeout_error.timeout} seconds")
        else:
            if result.returncode == 0:
                print("Custom dataset inference completed successfully")
                print(result.stdout)
            else:
                print("Custom dataset inference failed:")
                print(result.stderr)
    else:
        print("Error: Dataset structure is incorrect. Missing 'audio' or 'transcriptions' directories.")
        print(f"Audio dir exists: {audio_dir.exists()}")
        print(f"Transcriptions dir exists: {transcriptions_dir.exists()}")
else:
    print("Skipping custom dataset inference - no dataset downloaded")


Running inference on custom dataset: librispeech_inference
Found 10 audio files
Found 10 transcription files
Custom dataset inference failed:
Traceback (most recent call last):
  File "/content/asr-rnn-t/inference.py", line 8, in <module>
    from src.datasets.data_utils import get_dataloaders
  File "/content/asr-rnn-t/src/datasets/__init__.py", line 1, in <module>
    from src.datasets.common_voice import CommonVoiceDataset
  File "/content/asr-rnn-t/src/datasets/common_voice.py", line 9, in <module>
    from src.datasets.base_dataset import BaseDataset
  File "/content/asr-rnn-t/src/datasets/base_dataset.py", line 9, in <module>
    from src.text_encoder import RNNTTextEncoder
  File "/content/asr-rnn-t/src/text_encoder/__init__.py", line 1, in <module>
    from src.text_encoder.rnnt_text_encoder import RNNTTextEncoder, RNNTTextEncoderBPE
ImportError: cannot import name 'RNNTTextEncoderBPE' from 'src.text_encoder.rnnt_text_encoder' (/content/asr-rnn-t/src/text_encoder/rnnt_text_enco

In [None]:
# Imported here as well so the cell does not depend on the inference cell
# having been executed first in this kernel session.
import subprocess
import sys

if custom_dataset_first_dir:
    print("Calculating metrics")

    # sys.executable keeps the script on the same interpreter (and installed
    # packages) as the notebook kernel.
    # NOTE(review): --dataset_dir receives only the folder name and
    # --predictions a path under data/saved — confirm both match what
    # calc_metrics.py and the inference save_path actually use.
    custom_metrics_cmd = [
        sys.executable, "calc_metrics.py",
        "--dataset_dir", str(custom_dataset_first_dir),
        "--predictions", "data/saved/custom_inference",
        "--verbose"
    ]

    try:
        result = subprocess.run(custom_metrics_cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired as timeout_error:
        # Report a hung run instead of letting the cell end in a traceback.
        print("Custom dataset metrics calculation failed:")
        print(f"Timed out after {timeout_error.timeout} seconds")
    else:
        if result.returncode == 0:
            print("Custom dataset metrics calculation completed successfully:")
            print(result.stdout)
        else:
            print("Custom dataset metrics calculation failed:")
            print(result.stderr)
else:
    print("Skipping custom dataset metrics calculation - no dataset available")
