# Task: `Automatic Speech Recognition`

Given an audio file of a turret command instruction, return a transcription of the instruction.

Note that noise simulating the corruption of radio transmissions will be present in the audio datasets provided to **both Novice and Advanced teams.**

- Audio files are provided in .WAV format with a sample rate of 16 kHz. Images are provided as 1520x870 JPG files.
- In the **audio datasets** provided to both the Novice and Advanced Guardians, noise will be present. Guardians who wish to fine-tune their models on additional data are free to use the (clean, unaugmented) National Speech Corpus data present in the `til-ai-24-data` bucket on Google Cloud Storage.

_Insert Code Here_

In [None]:
!pip install -q transformers librosa jiwer torchaudio jsonlines datasets accelerate audiomentations # Audio Augmentation
!pip install -q Cython
!pip install -q nemo_toolkit[all]

In [12]:
import audiomentations
import jsonlines
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, DataCollatorWithPadding
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer
import os
from torch.utils.data import DataLoader
import json
import re

In [3]:
# Climb three directory levels from the current working directory, then
# descend into the 'novice' data folder and its 'audio' subfolder.
cur_dir = os.getcwd()
asr_dir = os.path.split(cur_dir)[0]
til_dir = os.path.split(asr_dir)[0]
home_dir = os.path.split(til_dir)[0]

data_dir = os.path.join(home_dir, 'novice')
audio_dir = os.path.join(home_dir, 'novice', 'audio')

# Bare expression: the notebook echoes the resolved audio folder.
audio_dir

'/home/jupyter/novice/audio'

In [2]:
import logging

# Calling getLogger() with no name returns the root logger.
root_logger = logging.getLogger()

# Effective level of the root logger, as an integer.
current_log_level = root_logger.getEffectiveLevel()

# Show the level by its symbolic name rather than the raw integer.
print(f"Current logging level: {logging.getLevelName(current_log_level)}")




In [None]:
# NOTE(review): nemo_asr is imported here but not referenced by any cell shown
# in this notebook — confirm whether it is still needed.
import nemo.collections.asr as nemo_asr
# Raise the root logger threshold to ERROR. Relies on `logging` having been
# imported by the earlier logging cell.
logging.getLogger().setLevel(logging.ERROR)

# Load the pre-trained Wav2Vec2 processor and CTC model from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
from jiwer import wer

def calculate_wer(actual_sentences, predicted_sentences):
    """Print the WER of every sentence pair and return their mean.

    Each reference is scored against the prediction at the same position,
    so both inputs must have the same length.

    Args:
        actual_sentences: Reference transcripts.
        predicted_sentences: Predicted transcripts, aligned by position with
            ``actual_sentences``.

    Returns:
        float: The average WER.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    actual_sentences = list(actual_sentences)
    predicted_sentences = list(predicted_sentences)

    # zip() would silently drop the unmatched tail and skew the average.
    if len(actual_sentences) != len(predicted_sentences):
        raise ValueError(
            f"Got {len(actual_sentences)} actual sentences but "
            f"{len(predicted_sentences)} predicted sentences."
        )
    # An empty input has no meaningful average (it used to divide by zero).
    if not actual_sentences:
        raise ValueError("Cannot compute WER for an empty list of sentences.")

    wer_values = [wer(actual, predicted) for actual, predicted in zip(actual_sentences, predicted_sentences)]
    average_wer = sum(wer_values) / len(wer_values)

    for i, wer_value in enumerate(wer_values):
        print(f"Sentence {i+1} WER: {wer_value:.2f}")

    print(f"Average WER: {average_wer:.2f}")
    return average_wer

In [None]:
# Number of manifest records transcribed for the quick baseline check.
baseline_sample_count = 10

# Read the first few records of the manifest into parallel lists.
data = {'key': [], 'audio': [], 'transcript': []}
data_path = os.path.join(data_dir, "asr.jsonl")
with jsonlines.open(data_path) as reader:
    for obj in reader:
        if len(data['key']) >= baseline_sample_count:
            break
        for key, value in obj.items():
            data[key].append(value)

actual_sentences = []
predicted_sentences = []

# Use the GPU when one is available instead of failing on CPU-only hosts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# NOTE(review): references are compared exactly as stored in the manifest,
# while the fine-tuning cell strips punctuation and upper-cases transcripts.
# Confirm whether the same normalisation is wanted for this baseline.
for file_name, transcript in zip(data['audio'], data['transcript']):
    actual_sentences.append(transcript)

    file_path = os.path.join(audio_dir, file_name)
    # Resample to 16 kHz on load, the rate passed on to the processor.
    audio_input, sample_rate = librosa.load(file_path, sr=16000)
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits  # Forward pass

    # Greedy decoding: most likely token per frame, then decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentences.append(processor.batch_decode(predicted_ids)[0])

In [None]:
# Report per-sentence and average WER for the baseline predictions collected above.
calculate_wer(actual_sentences, predicted_sentences)

## Fine Tuning

In [None]:
# Punctuation stripped from transcripts before they are turned into labels.
# Raw string so the backslashes reach the regex engine unchanged, without
# Python's "invalid escape sequence" warning.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'
# Batch size shared by preprocessing, the data loaders and the trainer.
batch_size = 64
total_size = None # set to None if use all

In [12]:
def preprocess_data(examples):
    """Convert one batch of raw records into padded model inputs and labels.

    Written for ``Dataset.map(..., batched=True)``: every list in
    ``examples`` holds one entry per record.

    Args:
        examples: Batch mapping with an ``'audio'`` list of file names
            (joined onto ``audio_dir``) and a ``'transcript'`` list of
            reference texts.

    Returns:
        The same mapping with three added lists of tensors, each padded to
        the longest item of this batch: ``'input_values'`` (padded with the
        feature extractor's padding value), ``'attention_mask'`` (1 for
        audio, 0 for padding) and ``'labels'`` (token ids padded with -100).
    """
    audio_paths = examples['audio']
    transcripts = examples['transcript']

    # Decode each file exactly once; torchaudio.load returns (waveform, rate).
    input_values = []
    for path in audio_paths:
        waveform, rate = torchaudio.load(os.path.join(audio_dir, path))
        # assumes single-channel audio, so squeeze(0) yields a 1-D signal — TODO confirm
        speech = waveform.squeeze(0)
        # the following line calls processor.feature_extractor
        processed = processor(speech, sampling_rate=rate, return_tensors="pt", padding=True)
        input_values.append(processed.input_values.squeeze(0))

    # Each clip is processed on its own, so nothing is padding at this point:
    # the mask starts as all ones and only the padding added below is zero.
    # (Comparing audio samples with a token id would mask real audio.)
    attention_masks = [torch.ones_like(values) for values in input_values]

    # Tokenise the cleaned, upper-cased transcripts into label ids.
    processed_labels = []
    for transcript in transcripts:
        transcript = re.sub(chars_to_ignore_regex, '', transcript).upper()
        label = processor.tokenizer(transcript, return_tensors="pt", padding=True)
        processed_labels.append(label.input_ids.squeeze(0))

    max_input_length = max((values.size(0) for values in input_values), default=0)
    max_label_length = max((label.size(0) for label in processed_labels), default=0)

    # Audio is padded with the feature extractor's padding value, not a token id.
    audio_pad_value = processor.feature_extractor.padding_value
    padded_input_values = [torch.nn.functional.pad(values, (0, max_input_length - values.size(0)), value=audio_pad_value) for values in input_values]
    padded_attention_masks = [torch.nn.functional.pad(mask, (0, max_input_length - mask.size(0)), value=0) for mask in attention_masks]

    # -100 marks label padding.
    padded_labels = [torch.nn.functional.pad(label, (0, max_label_length - label.size(0)), value=-100) for label in processed_labels]

    examples['input_values'] = padded_input_values
    examples['attention_mask'] = padded_attention_masks
    examples['labels'] = padded_labels

    return examples

In [16]:
# Read the manifest into parallel lists, one entry per record.
data = {'key': [], 'audio': [], 'transcript': []}
data_path = os.path.join(data_dir, "asr.jsonl")
with jsonlines.open(data_path) as reader:
    for obj in reader:
        # Stop once exactly total_size records are collected; a falsy
        # total_size (None) means no cap. Using '>' here kept one extra record.
        if total_size and len(data['key']) >= total_size:
            break
        for key, value in obj.items():
            data[key].append(value)

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Shuffle with a fixed seed so the split is reproducible
dataset = dataset.shuffle(seed=42)

# 80% train, 10% validation, the remainder test
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

# Replace the raw columns with model inputs; preprocess_data pads within each
# batch of batch_size records.
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=batch_size, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_data, batched=True, batch_size=batch_size, remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, batched=True, batch_size=batch_size, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]



Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

## Save the datasets

In [17]:
# train_dataset.save_to_disk('./data/train_dataset')
# val_dataset.save_to_disk('./data/val_dataset')
# test_dataset.save_to_disk('./data/test_dataset')

Saving the dataset (0/9 shards):   0%|          | 0/2800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/350 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/350 [00:00<?, ? examples/s]

## Load the datasets

In [6]:
from datasets import load_from_disk

# Reload the three preprocessed splits from disk, in train/val/test order.
train_dataset, val_dataset, test_dataset = map(
    load_from_disk,
    ('./data/train_dataset', './data/val_dataset', './data/test_dataset'),
)

In [7]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves preprocessed records as typed tensors.

    Wraps any indexable collection whose items are mappings with
    ``'input_values'``, ``'attention_mask'`` and ``'labels'`` entries.
    """

    def __init__(self, hf_dataset):
        """Store the underlying dataset; nothing is converted up front."""
        self.dataset = hf_dataset

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of tensors.

        ``input_values`` becomes float32; ``attention_mask`` and ``labels``
        become int64.
        """
        item = self.dataset[idx]
        # torch.tensor already builds a fresh tensor, so no further
        # clone()/detach() is needed.
        return {
            'input_values': torch.tensor(item['input_values'], dtype=torch.float32),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.int64),
            'labels': torch.tensor(item['labels'], dtype=torch.int64),
        }

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

In [8]:
# Wrap each split so that indexing yields dicts of tensors.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split) for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# # DEBUG: Check the first few samples from the preprocessed training dataset
# for i in range(3):
#     sample = train_dataset[i]
#     print(f"Sample {i+1}:")
#     print(f"  input_values: {sample['input_values'][:10]}... (length: {len(sample['input_values'])})")  # Print first 10 values
#     print(f"  attention_mask: {sample['attention_mask'][:10]}... (length: {len(sample['attention_mask'])})")
#     print(f"  labels: {sample['labels'][:10]}... (length: {len(sample['labels'])})")
#     print()

In [None]:
# Plain DataLoaders over the wrapped splits; no collate_fn is passed, so the
# default collation is used.
# NOTE(review): preprocess_data only pads within each map() batch, so items
# from different map() batches may differ in length. This presumably works
# only because shuffle=False and batch_size matches the map() call — confirm
# before enabling shuffling or changing batch_size.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) # TODO check if need true
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# # DEBUG last 3 batches of train_dataloader
# start_idx = len(train_dataloader) - 3
# print(start_idx)

# for batch_idx, batch in enumerate(train_dataloader):
    
#     if batch_idx < start_idx:  # Only print shapes for the first 3 batches to avoid excessive output
#         continue
    
#     print(f"Batch {batch_idx}:")
#     input_values_shape = batch['input_values'].shape
#     attention_mask_shape = batch['attention_mask'].shape
#     labels_shape = batch['labels'].shape

#     print("Batch input values shape:", input_values_shape)
#     print("Batch attention mask shape:", attention_mask_shape)
#     print("Batch labels shape:", labels_shape)

#     # Print the shapes of each item within the batch
#     for item_idx in range(input_values_shape[0]):
#         print(f"  Item {item_idx} input values shape:", batch['input_values'][item_idx].shape)
#         print(f"  Item {item_idx} attention mask shape:", batch['attention_mask'][item_idx].shape)
#         print(f"  Item {item_idx} labels shape:", batch['labels'][item_idx].shape)

In [None]:
class CustomDataCollator:
    """Pad a list of per-example feature dicts into one batch of tensors."""

    def __init__(self, processor):
        """Keep the processor whose feature extractor supplies the audio padding value."""
        self.processor = processor

    def __call__(self, features):
        """Collate ``features`` into a batch padded to the longest example.

        Args:
            features: List of dicts, each holding 1-D tensors under
                ``'input_values'``, ``'attention_mask'`` and ``'labels'``.

        Returns:
            Dict with the same three keys, each a stacked 2-D tensor. Audio is
            padded with the feature extractor's padding value, the mask with
            0 and the labels with -100.
        """
        input_values = [feature['input_values'] for feature in features]
        attention_mask = [feature['attention_mask'] for feature in features]
        labels = [feature['labels'] for feature in features]

        # Determine the max length for padding
        max_input_length = max(len(input_value) for input_value in input_values)
        max_label_length = max(len(label) for label in labels)

        # input_values are audio samples, so pad with the feature extractor's
        # padding value rather than the tokenizer's pad *token id*. Fall back
        # to 0.0 if the processor does not expose a feature extractor.
        feature_extractor = getattr(self.processor, 'feature_extractor', None)
        audio_pad_value = getattr(feature_extractor, 'padding_value', 0.0)

        # Pad input values and attention masks
        padded_input_values = [torch.nn.functional.pad(input_value, (0, max_input_length - len(input_value)), value=audio_pad_value) for input_value in input_values]
        padded_attention_mask = [torch.nn.functional.pad(mask, (0, max_input_length - len(mask)), value=0) for mask in attention_mask]

        # Pad labels
        padded_labels = [torch.nn.functional.pad(label, (0, max_label_length - len(label)), value=-100) for label in labels]

        # Stack the tensors
        return {
            'input_values': torch.stack(padded_input_values),
            'attention_mask': torch.stack(padded_attention_mask),
            'labels': torch.stack(padded_labels),
        }

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.005,
    save_steps=100,
    eval_steps=100,
    logging_steps=100,
    load_best_model_at_end=True
)

# Initially freeze all layers except the classifier layer
for param in model.parameters():
    param.requires_grad = False
for param in model.lm_head.parameters():
    param.requires_grad = True

data_collator = CustomDataCollator(processor)
    
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use the validation dataset for evaluation
    data_collator=data_collator,
    tokenizer=processor.tokenizer
)

# Train the model
trainer.train()

In [None]:
predicted_sentences = []
actual_sentences = []

# Run on whichever device the model currently lives on instead of assuming CUDA.
device = next(model.parameters()).device
pad_token_id = processor.tokenizer.pad_token_id

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        input_values = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        outputs = model(input_values=input_values, attention_mask=attention_mask)
        logits = outputs.logits

        # Greedy decoding: most likely token per frame, then decode to text.
        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_transcripts = [processor.decode(ids, skip_special_tokens=True) for ids in predicted_ids]

        # Labels were padded with -100; swap that back to the tokenizer's pad
        # id so they can be decoded.
        label_ids = torch.as_tensor(labels).cpu()
        label_ids = label_ids.masked_fill(label_ids == -100, pad_token_id)

        # Reference labels are plain text, not frame-level CTC output, so
        # repeated tokens must not be merged: group_tokens=False keeps double
        # letters in the reference transcript intact.
        actual_transcripts = [
            processor.decode(ids, skip_special_tokens=True, group_tokens=False)
            for ids in label_ids
        ]

        # Extend the lists with the current batch results
        predicted_sentences.extend(predicted_transcripts)
        actual_sentences.extend(actual_transcripts)

# Print results
results = [{"actual": actual, "predicted": predicted} for actual, predicted in zip(actual_sentences, predicted_sentences)]
print(json.dumps(results, indent=2))

In [None]:
from jiwer import wer
# Score each reference/prediction pair from the test set individually.
wer_values = []
for actual, predicted in zip(actual_sentences, predicted_sentences):
    wer_values.append(wer(actual, predicted))

# Mean of the per-sentence scores
average_wer = sum(wer_values) / len(wer_values)

# One line per sentence, then the overall average
for i, wer_value in enumerate(wer_values):
    print(f"Sentence {i+1} WER: {wer_value:.2f}")

print(f"Average WER: {average_wer:.2f}")

In [None]:
# token_id = 3
# token = processor.tokenizer.convert_ids_to_tokens(token_id)
# print(f"Token ID {token_id} corresponds to token: {token}")

In [None]:
# print(f"Tokenizer vocabulary size: {len(processor.tokenizer)}")

In [88]:
# from torchaudio.transforms import AddNoise, SpeedPerturb, TimeStretch

# def augment_audio(audio):
#     # Add noise
#     audio = AddNoise()(audio)
#     # Speed perturbation
#     audio = SpeedPerturb()(audio)
#     # Time stretch
#     audio = TimeStretch()(audio)
#     return audio

# # Apply augmentation to your dataset
# train_dataset = [augment_audio(audio) for audio in train_dataset]