In [5]:
import nemo.collections.asr as nemo_asr
import os
import numpy as np

squeezeformer_checkpoint_path = "./checkpoints/squeezeformer-xs-ctc-bpe"
if not os.path.exists(squeezeformer_checkpoint_path):
    asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_squeezeformer_ctc_xsmall_ls")
    asr_model.save_to(squeezeformer_checkpoint_path)
asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(squeezeformer_checkpoint_path)


[NeMo I 2024-07-06 16:34:09 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2024-07-06 16:34:09 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/tarred_train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: true
    tarred_audio_filepaths: /data/tarred_train/audio__OP_0..511_CL_.tar
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-07-06 16:34:09 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data

[NeMo I 2024-07-06 16:34:09 features:305] PADDING: 0
[NeMo I 2024-07-06 16:34:10 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/robert/uni/ASRTest/checkpoints/squeezeformer-xs-ctc-bpe.


In [14]:
from omegaconf import OmegaConf, open_dict
with open("asr_model_cfg.yml", 'w') as fout:
    fout.write(OmegaConf.to_yaml(asr_model.cfg))

In [8]:
import os

def read_timit_test_txt_files(base_path):
    txt_audio_contents = []
    
    for data_folder in os.listdir(base_path):
        data_folder_path = os.path.join(base_path, data_folder)
        # Iterate over each speaker folder in DR1
        for speaker_folder in os.listdir(data_folder_path):
            speaker_folder_path = os.path.join(data_folder_path, speaker_folder)
            
            if os.path.isdir(speaker_folder_path):
                # Iterate over each file in the speaker's folder
                for file_name in os.listdir(speaker_folder_path):
                    if file_name.endswith('.TXT'):
                        txt_path = os.path.join(speaker_folder_path, file_name)
                        
                        # Read the content of the TXT file
                        with open(txt_path, 'r') as file:
                            content = file.read()
                            # Process the content to remove leading numbers and newlines
                            content = ' '.join(content.split()[2:]).strip()
                        

                        # Get the corresponding WAV file path
                        wav_file_name = file_name.replace('.TXT', '.WAV.wav')
                        wav_path = os.path.join(speaker_folder_path, wav_file_name)
                        
                        if os.path.exists(wav_path):
                            # Append the content and file path to the result list
                            txt_audio_contents.append((content, wav_path))
        
    return txt_audio_contents




In [9]:
import json
def create_manifest(manifest_out_path, base_data_path):
    txt_files = read_timit_test_txt_files(base_data_path)
    with open(manifest_out_path, 'w') as fout:
        
        for transcript, audiofilepath in txt_files:
            duration = librosa.core.get_duration(filename=audiofilepath)
            metadata = {
                "audio_filepath": audiofilepath,
                "duration": duration,
                "text": transcript
            }
            json.dump(metadata, fout)
            fout.write('\n')

In [10]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\…\{\}\【\】\・\。\『\』\、\ー\〜]' 

def remove_special_characters(text):
    cleaned = re.sub(chars_to_ignore_regex, '', text).lower().strip()
    return cleaned

def clean_manifest(manifest_path, clean_manifest_out_path):
    with open(manifest_path, 'r') as fin, open(clean_manifest_out_path, 'w') as fout:
        for line in fin:
            data = json.loads(line)
            # remove special characters and make lowercase
            cleaned_text = remove_special_characters(data["text"])
            metadata = {
                "audio_filepath": data["audio_filepath"],
                "duration": data["duration"],
                "text": cleaned_text
            }
            json.dump(metadata, fout)
            fout.write('\n')

In [11]:
test_base_path = '/mnt/c/Users/Robert/Downloads/timit-dataset/data/TEST'
test_manifest_path = "./timit-dataset/test-manifest.json"
test_clean_manifest_path = "./timit-dataset/test-manifest.clean.json"


if not os.path.exists(test_manifest_path):
    create_manifest(test_manifest_path, test_base_path)
if not os.path.exists(test_clean_manifest_path):
    clean_manifest(test_manifest_path, test_clean_manifest_path)
    
    
    
train_base_path = '/mnt/c/Users/Robert/Downloads/timit-dataset/data/TRAIN'
train_manifest_path = "./timit-dataset/train-manifest.json"
train_clean_manifest_path = "./timit-dataset/train-manifest.clean.json"
            
    
if not os.path.exists(train_manifest_path):
    create_manifest(train_manifest_path, train_base_path)
if not os.path.exists(train_clean_manifest_path):
    clean_manifest(train_manifest_path, train_clean_manifest_path)
    
    


In [12]:
import json
from nemo.collections.asr.metrics.wer import word_error_rate

# Function to read the manifest file
def read_manifest(manifest_path):
    with open(manifest_path, 'r') as f:
        lines = f.readlines()
    return [json.loads(line.strip()) for line in lines]

run_pre_check = False
# WER without finetunting
if run_pre_check: 
    # Read the test manifest file
    test_data = read_manifest(test_clean_manifest_path)
    
    # Extract audio file paths and reference texts
    audio_filepaths = [item['audio_filepath'] for item in test_data]
    references = [item['text'].lower() for item in test_data]
    
    # Run inference on the entire batch
    predictions = asr_model.transcribe(audio_filepaths)
    
    for prediction, reference_text in zip(predictions, references):
        print(f"""
                reference : {reference_text}         
                prediction: {prediction}
        """)
    
    # Calculate WER
    wer = word_error_rate(hypotheses=predictions, references=references)
    print(f"Word Error Rate (WER): {wer}")


In [13]:
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
test_data = read_manifest(test_clean_manifest_path)

In [16]:
data_points = 4620
batch_size = 8
epochs = 10

cfg = asr_model.cfg
with open_dict(cfg):
    cfg.train_ds.manifest_filepath = train_clean_manifest_path
    cfg.train_ds.batch_size = batch_size
    cfg.train_ds.num_workers = 8
    cfg.train_ds.is_tarred = False
    #   cfg.train_ds.pin_memory=True \
    
    cfg.spec_augment.freq_mask: 0
    cfg.spec_augment.time_mask: 0

    # Validation dataset  (Use test dataset as validation, since we train using train + dev)
    cfg.validation_ds.manifest_filepath = test_clean_manifest_path
    cfg.validation_ds.batch_size = batch_size
    cfg.validation_ds.num_workers = 8
    cfg.train_ds.is_tarred = False
    
    # Optimizer and Scheduler
    cfg.optim.name = 'adamw'
    cfg.optim.lr = 1.0e-4
    cfg.optim.betas = [0.9, 0.98]
    cfg.optim.weight_decay = 0.0005
    cfg.optim.sched.max_steps = (data_points // batch_size) * epochs
    cfg.optim.sched.name = "CosineAnnealing"
    cfg.optim.sched.min_lr = 1.0e-7
    cfg.optim.sched.warmup_steps = 0
        # Remove unnecessary parameters
    if 'warmup_ratio' in cfg.optim.sched:
        del cfg.optim.sched['warmup_ratio']
    if 'hold_ratio' in cfg.optim.sched:
        del cfg.optim.sched['hold_ratio']
    if 'hold_steps' in cfg.optim.sched:
        del cfg.optim.sched['hold_steps']
    if 'decay_rate' in cfg.optim.sched:
        del cfg.optim.sched['decay_rate']

In [18]:
asr_model.setup_training_data(cfg.train_ds)
asr_model.setup_validation_data(cfg.validation_ds)
asr_model.setup_optimization(cfg.optim)
asr_model.spec_augmentation = asr_model.from_config_dict(cfg.spec_augment)

[NeMo I 2024-07-03 18:03:46 collections:196] Dataset loaded with 4620 files totalling 3.94 hours
[NeMo I 2024-07-03 18:03:46 collections:197] 0 files were filtered totalling 0.00 hours
[NeMo I 2024-07-03 18:03:47 collections:196] Dataset loaded with 1680 files totalling 1.44 hours
[NeMo I 2024-07-03 18:03:47 collections:197] 0 files were filtered totalling 0.00 hours


[NeMo W 2024-07-03 18:03:47 modelPT:652] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2024-07-03 18:03:47 modelPT:770] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.0005
    )
[NeMo I 2024-07-03 18:03:47 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7ff07fcef310>" 
    will be used during training (effective maximum steps = 5770) - 
    Parameters : 
    (warmup_steps: 0
    min_lr: 1.0e-07
    max_steps: 5770
    )


In [19]:
print(OmegaConf.to_yaml(asr_model.cfg))

sample_rate: 16000
log_prediction: true
ctc_reduction: mean_batch
skip_nan_grad: false
train_ds:
  manifest_filepath: ./timit-dataset/train-manifest.clean.json
  sample_rate: 16000
  batch_size: 8
  shuffle: true
  num_workers: 8
  pin_memory: true
  use_start_end_token: false
  trim_silence: false
  max_duration: 16.7
  min_duration: 0.1
  is_tarred: false
  tarred_audio_filepaths: /data/tarred_train/audio__OP_0..511_CL_.tar
  shuffle_n: 2048
  bucketing_strategy: synced_randomized
  bucketing_batch_size: null
validation_ds:
  manifest_filepath: ./timit-dataset/test-manifest.clean.json
  sample_rate: 16000
  batch_size: 8
  shuffle: false
  num_workers: 8
  pin_memory: true
  use_start_end_token: false
test_ds:
  manifest_filepath: null
  sample_rate: 16000
  batch_size: 16
  shuffle: false
  num_workers: 8
  pin_memory: true
  use_start_end_token: false
tokenizer:
  dir: /tokenizers/librispeech_tokenizer_spe_unigram_v128/
  type: bpe
  model_path: nemo:7d362afca0cc487bab74d46332fc644

In [20]:
import torch
import pytorch_lightning as ptl

if torch.cuda.is_available():
  accelerator = 'gpu'
else:
  accelerator = 'cpu'

EPOCHS = epochs

trainer = ptl.Trainer(devices=1,
                      accelerator=accelerator,
                      max_epochs=EPOCHS,
                      accumulate_grad_batches=1,
                      enable_checkpointing=False,
                      logger=False,
                      log_every_n_steps=5,
                      check_val_every_n_epoch=2)

# Setup model with the trainer
asr_model.set_trainer(trainer)

# Finally, update the model's internal config
asr_model.cfg = asr_model._cfg

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [21]:
from nemo.utils import exp_manager

# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
os.environ.pop('NEMO_EXPM_VERSION', None)
LANGUAGE = "en"
config = exp_manager.ExpManagerConfig(
    exp_dir=f'experiments/lang-{LANGUAGE}/',
    name=f"ASR-Model-Language-{LANGUAGE}",
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        always_save_nemo=True,
        save_best_model=True,
    ),
)

config = OmegaConf.structured(config)

logdir = exp_manager.exp_manager(trainer, config)

[NeMo I 2024-07-03 18:03:47 exp_manager:396] Experiments will be logged at experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47
[NeMo I 2024-07-03 18:03:47 exp_manager:856] TensorboardLogger has been set up


In [22]:
trainer.fit(asr_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-07-03 18:03:47 modelPT:770] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.0005
    )
[NeMo I 2024-07-03 18:03:47 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7ff0682dde70>" 
    will be used during training (effective maximum steps = 5770) - 
    Parameters : 
    (warmup_steps: 0
    min_lr: 1.0e-07
    max_steps: 5770
    )



  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | SqueezeformerEncoder              | 9.0 M  | train
2 | decoder           | ConvASRDecoder                    | 18.7 K | train
3 | loss              | CTCLoss                           | 0      | train
4 | spec_augmentation | SpectrogramAugmentation           | 0      | train
5 | wer               | WER                               | 0      | train
--------------------------------------------------------------------------------
9.0 M     Trainable params
0         Non-trainable params
9.0 M     Total params
36.128    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:03:49 wer:334] 
    
[NeMo I 2024-07-03 18:03:49 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:03:49 wer:336] predicted:she had your dark suit and greasy wash water all year
[NeMo I 2024-07-03 18:03:49 wer:334] 
    
[NeMo I 2024-07-03 18:03:49 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:03:49 wer:336] predicted:her wardrobe consists of only skirts and blouses


Training: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:03:49 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption
[NeMo I 2024-07-03 18:03:52 wer:334] 
    
[NeMo I 2024-07-03 18:03:52 wer:335] reference:don't ask me to carry an oily rag like that
[NeMo I 2024-07-03 18:03:52 wer:336] predicted:don ask me a heoy weg like that
[NeMo I 2024-07-03 18:03:54 wer:334] 
    
[NeMo I 2024-07-03 18:03:54 wer:335] reference:gus saw pine trees and redwoods on his walk through sequoia national forest
[NeMo I 2024-07-03 18:03:54 wer:336] predicted:thusaw pine trees and red woods on his walk through the quoya national forst
[NeMo I 2024-07-03 18:03:55 wer:334] 
    
[NeMo I 2024-07-03 18:03:55 wer:335] reference:the old shop adage still holds a good mechanic is usually a bad boss
[NeMo I 2024-07-03 18:03:55 wer:336] predicted:the old shop atag still holds a good mecchanic is usually a bad boss
[NeMo I 2024-07-03 18:03:57 wer:334] 
    
[NeMo I 2024-07-03 18:03:57 wer:335] reference:she had you

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:10:23 wer:334] 
    
[NeMo I 2024-07-03 18:10:23 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:10:23 wer:336] predicted:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:10:23 wer:334] 
    
[NeMo I 2024-07-03 18:10:23 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:10:23 wer:336] predicted:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:10:23 wer:334] 
    
[NeMo I 2024-07-03 18:10:23 wer:335] reference:the morning dew on the spider web glistened in the sun
[NeMo I 2024-07-03 18:10:23 wer:336] predicted:the morning dew on the spder web glistened in the sun
[NeMo I 2024-07-03 18:10:24 wer:334] 
    
[NeMo I 2024-07-03 18:10:24 wer:335] reference:materials ceramic modeling clay red white or buff
[NeMo I 2024-07-03 18:10:24 wer:336] predicted:materials syremic modeling clay red white or buff
[NeMo I 2024-07-03 18:10:24 wer:334] 
 

Epoch 1, global step 1156: 'val_wer' reached 0.08821 (best 0.08821), saving model to '/home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0882-epoch=1.ckpt' as top 3


[NeMo I 2024-07-03 18:11:06 nemo_model_checkpoint:219] New best .nemo model saved to: /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo
[NeMo I 2024-07-03 18:11:10 wer:334] 
    
[NeMo I 2024-07-03 18:11:10 wer:335] reference:there's no war on now
[NeMo I 2024-07-03 18:11:10 wer:336] predicted:there's no war on now
[NeMo I 2024-07-03 18:11:11 wer:334] 
    
[NeMo I 2024-07-03 18:11:11 wer:335] reference:don't ask me to carry an oily rag like that
[NeMo I 2024-07-03 18:11:11 wer:336] predicted:don't ask me to carry an oily rag like that
[NeMo I 2024-07-03 18:11:13 wer:334] 
    
[NeMo I 2024-07-03 18:11:13 wer:335] reference:civilization is what man has made of himself
[NeMo I 2024-07-03 18:11:13 wer:336] predicted:civilization is what man has made of himself
[NeMo I 2024-07-03 18:11:14 wer:334] 
    
[NeMo I 2024-07-03 18:11:14 wer:335] reference:this he added brought about petty jealousies and petty personal g

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:17:32 wer:334] 
    
[NeMo I 2024-07-03 18:17:32 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:17:32 wer:336] predicted:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:17:32 wer:334] 
    
[NeMo I 2024-07-03 18:17:32 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:17:32 wer:336] predicted:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:17:32 wer:334] 
    
[NeMo I 2024-07-03 18:17:32 wer:335] reference:the morning dew on the spider web glistened in the sun
[NeMo I 2024-07-03 18:17:32 wer:336] predicted:the morning dew on the spite of web glistened in the sun
[NeMo I 2024-07-03 18:17:32 wer:334] 
    
[NeMo I 2024-07-03 18:17:32 wer:335] reference:materials ceramic modeling clay red white or buff
[NeMo I 2024-07-03 18:17:32 wer:336] predicted:materials syremic modelling clay red white or buff
[NeMo I 2024-07-03 18:17:32 wer:334

Epoch 3, global step 2312: 'val_wer' reached 0.08662 (best 0.08662), saving model to '/home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0866-epoch=3.ckpt' as top 3


[NeMo I 2024-07-03 18:18:10 nemo_model_checkpoint:299] /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo already exists, moving existing checkpoint to /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:18:10 nemo_model_checkpoint:219] New best .nemo model saved to: /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo
[NeMo I 2024-07-03 18:18:10 nemo_model_checkpoint:228] Removing old .nemo backup /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:18:13 wer:334] 
    
[NeMo I 2024-07-03 18:18:13 wer:335] reference:cooperation along with understanding alleviate dispute
[NeMo I 2024-07-03 18:18:13 wer:336] predicted:cooperation along with underst

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:24:20 wer:334] 
    
[NeMo I 2024-07-03 18:24:20 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:24:20 wer:336] predicted:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:24:20 wer:334] 
    
[NeMo I 2024-07-03 18:24:20 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:24:20 wer:336] predicted:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:24:20 wer:334] 
    
[NeMo I 2024-07-03 18:24:20 wer:335] reference:the morning dew on the spider web glistened in the sun
[NeMo I 2024-07-03 18:24:20 wer:336] predicted:the morning dew on the spit of web glistened in the sun
[NeMo I 2024-07-03 18:24:20 wer:334] 
    
[NeMo I 2024-07-03 18:24:20 wer:335] reference:materials ceramic modeling clay red white or buff
[NeMo I 2024-07-03 18:24:20 wer:336] predicted:materials syremic modelling clay red white or buff
[NeMo I 2024-07-03 18:24:21 wer:334]

Epoch 5, global step 3468: 'val_wer' reached 0.08483 (best 0.08483), saving model to '/home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0848-epoch=5.ckpt' as top 3


[NeMo I 2024-07-03 18:24:57 nemo_model_checkpoint:299] /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo already exists, moving existing checkpoint to /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:24:58 nemo_model_checkpoint:219] New best .nemo model saved to: /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo
[NeMo I 2024-07-03 18:24:58 nemo_model_checkpoint:228] Removing old .nemo backup /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:25:01 wer:334] 
    
[NeMo I 2024-07-03 18:25:01 wer:335] reference:one of these is the solidarity and the confidential relationship of marriage
[NeMo I 2024-07-03 18:25:01 wer:336] predicted:one of t

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:31:30 wer:334] 
    
[NeMo I 2024-07-03 18:31:30 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:31:30 wer:336] predicted:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:31:30 wer:334] 
    
[NeMo I 2024-07-03 18:31:30 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:31:30 wer:336] predicted:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:31:31 wer:334] 
    
[NeMo I 2024-07-03 18:31:31 wer:335] reference:the morning dew on the spider web glistened in the sun
[NeMo I 2024-07-03 18:31:31 wer:336] predicted:the morning dew on the spita web glistened in the sun
[NeMo I 2024-07-03 18:31:31 wer:334] 
    
[NeMo I 2024-07-03 18:31:31 wer:335] reference:materials ceramic modeling clay red white or buff
[NeMo I 2024-07-03 18:31:31 wer:336] predicted:materials syremic modelling clay red white or buff
[NeMo I 2024-07-03 18:31:31 wer:334] 


Epoch 7, global step 4624: 'val_wer' reached 0.08518 (best 0.08483), saving model to '/home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0852-epoch=7.ckpt' as top 3


[NeMo I 2024-07-03 18:32:11 wer:334] 
    
[NeMo I 2024-07-03 18:32:11 wer:335] reference:he looked over at him lying there asleep and he felt a wave of revulsion
[NeMo I 2024-07-03 18:32:11 wer:336] predicted:he looked overt he lying there asleep and he felt a wave of revulsion
[NeMo I 2024-07-03 18:32:13 wer:334] 
    
[NeMo I 2024-07-03 18:32:13 wer:335] reference:merely to satisfy the author's curiosity
[NeMo I 2024-07-03 18:32:13 wer:336] predicted:merely to satisfy the author's curiosy
[NeMo I 2024-07-03 18:32:14 wer:334] 
    
[NeMo I 2024-07-03 18:32:14 wer:335] reference:the body was heavier than he had anticipated
[NeMo I 2024-07-03 18:32:14 wer:336] predicted:the body was heavier than he had anticipated
[NeMo I 2024-07-03 18:32:16 wer:334] 
    
[NeMo I 2024-07-03 18:32:16 wer:335] reference:there are people who travel long distances to assure my continued existence
[NeMo I 2024-07-03 18:32:16 wer:336] predicted:there are people to travel long distances to assure my conontin

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2024-07-03 18:38:15 wer:334] 
    
[NeMo I 2024-07-03 18:38:15 wer:335] reference:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:38:15 wer:336] predicted:she had your dark suit in greasy wash water all year
[NeMo I 2024-07-03 18:38:15 wer:334] 
    
[NeMo I 2024-07-03 18:38:15 wer:335] reference:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:38:15 wer:336] predicted:her wardrobe consists of only skirts and blouses
[NeMo I 2024-07-03 18:38:15 wer:334] 
    
[NeMo I 2024-07-03 18:38:15 wer:335] reference:the morning dew on the spider web glistened in the sun
[NeMo I 2024-07-03 18:38:15 wer:336] predicted:the morning dew on the spite of web glistened in the sun
[NeMo I 2024-07-03 18:38:15 wer:334] 
    
[NeMo I 2024-07-03 18:38:15 wer:335] reference:materials ceramic modeling clay red white or buff
[NeMo I 2024-07-03 18:38:15 wer:336] predicted:materials syremic modelling clay red white or buff
[NeMo I 2024-07-03 18:38:15 wer:334

Epoch 9, global step 5780: 'val_wer' reached 0.08449 (best 0.08449), saving model to '/home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0845-epoch=9.ckpt' as top 3


[NeMo I 2024-07-03 18:38:52 nemo_model_checkpoint:299] /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo already exists, moving existing checkpoint to /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:38:52 nemo_model_checkpoint:219] New best .nemo model saved to: /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo
[NeMo I 2024-07-03 18:38:52 nemo_model_checkpoint:228] Removing old .nemo backup /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo


`Trainer.fit` stopped: `max_epochs=10` reached.
Restoring states from the checkpoint path at /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0845-epoch=9.ckpt
Restored all states from the checkpoint at /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en--val_wer=0.0845-epoch=9.ckpt


[NeMo I 2024-07-03 18:38:55 nemo_model_checkpoint:299] /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en.nemo already exists, moving existing checkpoint to /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo
[NeMo I 2024-07-03 18:38:55 nemo_model_checkpoint:271] Removing old .nemo backup /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_18-03-47/checkpoints/ASR-Model-Language-en-v1.nemo


tensorboard --logdir /content/experiments/lang-en/ASR-Model-Language-en/

In [23]:
checkpoint_path = "./experiments/lang-en/ASR-Model-Language-en/2024-07-03_17-04-06/checkpoints/ASR-Model-Language-en.nemo"
asr_model2 = nemo_asr.models.EncDecCTCModelBPE.restore_from(checkpoint_path)


[NeMo I 2024-07-03 18:38:56 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2024-07-03 18:38:56 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: ./timit-dataset/train-manifest.clean.json
    sample_rate: 16000
    batch_size: 8
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: /data/tarred_train/audio__OP_0..511_CL_.tar
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-07-03 18:38:56 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: ./timit-dataset

[NeMo I 2024-07-03 18:38:56 features:305] PADDING: 0
[NeMo I 2024-07-03 18:38:57 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/robert/uni/ASRTest/experiments/lang-en/ASR-Model-Language-en/2024-07-03_17-04-06/checkpoints/ASR-Model-Language-en.nemo.


In [24]:

run_after_check = True

if run_after_check: 
    # Read the test manifest file
    test_data = read_manifest(test_clean_manifest_path)
    
    # Extract audio file paths and reference texts
    audio_filepaths = [item['audio_filepath'] for item in test_data]
    references = [item['text'].lower() for item in test_data]
    
    # Run inference on the entire batch
    predictions = asr_model2.transcribe(audio_filepaths)
    
    for prediction, reference_text in zip(predictions, references):
        print(f"""
                reference : {reference_text}         
                prediction: {prediction}
        """)
    
    # Calculate WER
    wer = word_error_rate(hypotheses=predictions, references=references)
    print(f"Word Error Rate (WER): {wer}")


Transcribing: 100%|██████████| 420/420 [01:01<00:00,  6.83it/s]


                reference : she had your dark suit in greasy wash water all year         
                prediction: she had your dark suit in greasy wash water all year
        

                reference : don't ask me to carry an oily rag like that         
                prediction: don't ask me to carry an oily rag like that
        

                reference : his captain was thin and haggard and his beautiful boots were worn and shabby         
                prediction: his captain was thin and haggard and his beautiful boots were worn and schhabby
        

                reference : the reasons for this dive seemed foolish now         
                prediction: the reasons for this stive seemed foolish now
        

                reference : production may fall far below expectations         
                prediction: production may fall far below expectations
        

                reference : pizzerias are convenient for a quick lunch         
               


