In [None]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231/assignment1'
FOLDERNAME = 'Introduction to Speech Processing/'
assert FOLDERNAME is not None, "[!] Enter the folername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it
import sys
sys.path.append('/content/drive/MyDrive/{}'.format(FOLDERNAME))

%cd /content/drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Introduction to Speech Processing


In [None]:
!pip install jiwer
!pip install tokenizers
!pip install tensorboardX

Collecting jiwer
  Downloading jiwer-3.0.2-py3-none-any.whl (21 kB)
Collecting rapidfuzz==2.13.7 (from jiwer)
  Downloading rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.2 rapidfuzz-2.13.7
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packa

In [None]:
import os

import torch
import random
import torchaudio
import torch.nn as nn
import torch.optim as optim

from librosa import effects

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from jiwer import wer, cer

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from torchaudio.models.decoder import ctc_decoder
from tokenizers.pre_tokenizers import Whitespace

from collections import defaultdict
from tensorboardX import SummaryWriter

# Vocabulary size of the tokenizer (presumably the number of sub-word tokens).
# The same value selects the tokenizer file (tokenizer-train_{VOCAB_SIZE}.json)
# and is used as the CTC blank index, which is why the model is built with
# VOCAB_SIZE + 1 output classes.
VOCAB_SIZE = 198



In [None]:
class AudioTranscriptDataset(Dataset):
    """Pairs each audio file in a folder with its same-named transcript.

    Args:
        audio_folder: directory that contains the audio files.
        transcript_folder: directory that contains one transcript per audio
            file, named like the audio file but with `transcript_extension`.
        audio_extension: suffix identifying audio files (default ".wav").
        transcript_extension: suffix of the transcript files (default ".txt").

    Indexing returns a `(waveform, transcript)` tuple where `waveform` is the
    tensor returned by `torchaudio.load` and `transcript` is the stripped
    content of the transcript file.
    """

    def __init__(self, audio_folder, transcript_folder, audio_extension=".wav", transcript_extension=".txt"):
        self.audio_folder = audio_folder
        self.transcript_folder = transcript_folder
        self.audio_extension = audio_extension
        self.transcript_extension = transcript_extension
        # os.listdir() returns entries in arbitrary order and includes every file
        # in the folder, so keep only audio files and sort them to get a stable,
        # reproducible index -> file mapping.
        self.audio_files = sorted(
            name for name in os.listdir(audio_folder) if name.endswith(audio_extension)
        )

    def __len__(self):
        return len(self.audio_files)

    def _transcript_path(self, idx):
        """Return the transcript path that belongs to the audio file at `idx`."""
        audio_name = self.audio_files[idx]
        # Swap only the trailing extension; str.replace() would also rewrite
        # occurrences of the extension in the middle of the file name.
        stem = audio_name[:len(audio_name) - len(self.audio_extension)]
        return os.path.join(self.transcript_folder, stem + self.transcript_extension)

    def __getitem__(self, idx):
        # Indexing audio_files first raises IndexError for an out-of-range idx,
        # which is what terminates plain `for example in dataset` iteration.
        audio_file = os.path.join(self.audio_folder, self.audio_files[idx])
        transcript_file = self._transcript_path(idx)
        waveform, _sample_rate = torchaudio.load(audio_file)
        with open(transcript_file, "r") as f:
            transcript = f.read().strip()

        return waveform, transcript


In [None]:
class TextTransform:
    """Converts between transcript text and tokenizer ids.

    Spaces are not passed to the tokenizer directly: they are replaced by the
    literal marker "[SPACE]" before encoding and restored after decoding.
    """

    def __init__(self):
        # Load the tokenizer file that matches the configured vocabulary size
        self.tokenizer = Tokenizer.from_file(rf"./tokenizer-train_{VOCAB_SIZE}.json")

    def text_to_int(self, text):
        """Encode `text` with the tokenizer and return the list of token ids."""
        text = text.replace(" ", "[SPACE]")
        return self.tokenizer.encode(text).ids

    def int_to_text(self, labels):
        """Decode an iterable of token ids (ints or int-valued floats) to text."""
        tokens = [self.tokenizer.id_to_token(int(i)) for i in labels]
        return ''.join(tokens).replace('[SPACE]', ' ')

    def int_to_text_with_space(self, labels):
        """Decode token ids to text, appending a space after every token.

        The result therefore ends with a trailing space; it is used to spell a
        word as space-separated tokens when building the lexicon.
        """
        pieces = []
        for i in labels:
            pieces.append(self.tokenizer.id_to_token(int(i)))
            pieces.append(' ')
        return ''.join(pieces).replace('[SPACE]', ' ')


# Shared module-level instance used by the decoding, preprocessing and lexicon
# helpers defined below. Creating it reads the tokenizer JSON from the working directory.
text_transform = TextTransform()

In [None]:
def GreedyDecoder(output, labels, label_lengths, blank_label=VOCAB_SIZE, collapse_repeated=True):
    """Greedy (best-path) CTC decoding of a batch.

    Args:
        output: model scores; the arg-max is taken over dim 2, and dim 0 is
            iterated as the batch dimension.
        labels: padded reference label ids, one row per utterance.
        label_lengths: number of valid entries in each row of `labels`.
        blank_label: id of the CTC blank, which is dropped from the output.
        collapse_repeated: when True, a frame whose id equals the id of the
            frame right before it is skipped.

    Returns:
        `(decodes, targets)`: two lists of strings with the decoded predictions
        and the decoded references.
    """
    best_paths = torch.argmax(output, dim=2)
    decodes, targets = [], []
    for row, frame_ids in enumerate(best_paths):
        # Reference text: only the first label_lengths[row] ids are considered
        reference = labels[row][:label_lengths[row]].tolist()
        targets.append(text_transform.int_to_text(reference))

        kept = []
        for pos, token in enumerate(frame_ids):
            if token == blank_label:
                continue
            # A repeat is judged against the raw previous frame (blank included),
            # so a blank between two equal ids keeps both of them.
            if collapse_repeated and pos != 0 and token == frame_ids[pos - 1]:
                continue
            kept.append(token.item())
        decodes.append(text_transform.int_to_text(kept))
    return decodes, targets


In [None]:
# Define the CTC model
class CTCModel(nn.Module):
    """Bidirectional LSTM followed by a linear layer and a log-softmax.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output classes.
        num_layers: number of stacked LSTM layers.
        dropout: dropout applied between LSTM layers.
        ckpt_path: optional path to a saved `state_dict` to initialise from.

    `forward` takes a batch-first tensor `(batch, time, input_dim)` and returns
    log-probabilities of shape `(batch, time, output_dim)`.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.3, ckpt_path=None):
        super().__init__()
        self.rnn = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            # Inter-layer dropout has no effect with a single layer; passing 0
            # there only avoids the warning PyTorch would emit.
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional
        self.softmax = nn.LogSoftmax(dim=-1)

        # Initialize the model's weights when a checkpoint path is given
        if ckpt_path is not None:
            # Load onto the CPU first so a checkpoint written on a GPU machine
            # can be restored on a CPU-only one; callers move the model with
            # `.to(device)` afterwards.
            self.load_state_dict(torch.load(ckpt_path, map_location="cpu"))

    def forward(self, x):
        rnn_out, _ = self.rnn(x)
        linear_out = self.linear(rnn_out)
        return self.softmax(linear_out)

In [None]:
# Define the dataset and data loader
def load_an4_dataset(mode='train'):
    """Return the AN4 dataset for one split.

    `mode` is the name of the split directory under ./an4/ (the notebook uses
    'train', 'val' and 'test'); audio is read from its an4/wav/ folder and
    transcripts from its an4/txt/ folder.
    """
    split_root = f"./an4/{mode}/an4"
    return AudioTranscriptDataset(f"{split_root}/wav/", f"{split_root}/txt/")


# MFCC feature extraction
# copied from fairseq/examples/hubert/simple_kmeans/dump_mfcc_feature.py on github
def get_feats(waveform, sample_rate=16000):
    """Compute MFCC + delta + delta-delta features for one utterance.

    Args:
        waveform: audio samples; flattened to a single row before feature
            extraction (assumes mono audio - TODO confirm for other inputs).
        sample_rate: sampling frequency of `waveform` in Hz.

    Returns:
        A contiguous tensor of shape (time, features), where the feature axis
        stacks the MFCCs, their deltas and their delta-deltas.
    """
    with torch.no_grad():
        # reshape (not view) so a non-contiguous waveform is accepted as well
        x = waveform.float().reshape(1, -1)

        mfccs = torchaudio.compliance.kaldi.mfcc(
            waveform=x,
            sample_frequency=sample_rate,
            use_energy=False,
        )  # (time, freq)
        # compute_deltas works along the last axis, so put time last
        mfccs = mfccs.transpose(0, 1)  # (freq, time)
        deltas = torchaudio.functional.compute_deltas(mfccs)
        ddeltas = torchaudio.functional.compute_deltas(deltas)
        concat = torch.cat([mfccs, deltas, ddeltas], dim=0)  # (3 * freq, time)
        return concat.transpose(0, 1).contiguous()  # (time, 3 * freq)


# Define the preprocessing function for MFCC feature extraction and target labels extraction
def data_processing(data, mode):
    """Collate a list of (waveform, transcript) pairs into one padded batch.

    Args:
        data: iterable of `(waveform, transcript)` samples.
        mode: waveforms are augmented only when this equals 'train'.

    Returns:
        A tuple `(mfccs, labels, input_lengths, label_lengths)`:
        - mfccs: features zero-padded along time, batch first.
        - labels: token ids padded with 1, batch first (float tensor, as
          produced by `torch.Tensor`).
        - input_lengths: number of real feature frames of each sample.
        - label_lengths: number of real tokens of each sample.
    """
    mfccs = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, transcript) in data:
        # Augment the waveform with probability
        if mode == 'train':
            waveform = augment_waveform(waveform)
        mfcc = get_feats(waveform)
        mfccs.append(mfcc)
        label = torch.Tensor(text_transform.text_to_int(transcript))
        labels.append(label)
        input_lengths.append(mfcc.shape[0])
        label_lengths.append(len(label))

    mfccs = nn.utils.rnn.pad_sequence(mfccs, batch_first=True)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=1)
    # Return the true per-sample lengths rather than the padded maximum: the
    # CTC loss and the reference decoding use them to ignore padded frames and
    # padded label ids.
    return mfccs, labels, torch.tensor(input_lengths), torch.tensor(label_lengths)


def augment_waveform(waveform, augmentation_probability=0.7):
    """
    Randomly apply at most one augmentation to a waveform.

    With probability `augmentation_probability` exactly one of the following is
    applied, each being equally likely; otherwise the samples are returned
    unchanged:
      - additive white noise (standard deviation 0.02),
      - time stretch with a random rate in [0.8, 1.2),
      - pitch shift by 2 semitones,
      - random gain with a factor in [1.0, 1.2],
      - polarity inversion.

    Args:
        waveform: CPU tensor with the audio samples.
        augmentation_probability: chance that an augmentation is applied.

    Returns:
        A new tensor holding the (possibly augmented) samples.
    """
    noise_level = 0.02
    minimum_stretch_rate = 0.8
    stretch_factor = 0.4
    sample_rate = 16000
    num_semitones = 2
    min_factor = 1.0
    max_factor = 1.2

    samples = waveform.numpy()
    probability = torch.rand(1)

    add_white_noise = lambda wave: wave + torch.randn_like(torch.tensor(wave)).numpy() * noise_level
    time_stretch = lambda wave: effects.time_stretch(wave, rate=minimum_stretch_rate + stretch_factor * torch.rand(1).item())
    pitch_scale = lambda wave: effects.pitch_shift(wave, sr=sample_rate, n_steps=num_semitones)
    random_gain = lambda wave: wave * random.uniform(min_factor, max_factor)
    invert_polarity = lambda wave: wave * -1

    functions = [add_white_noise, time_stretch, pitch_scale, random_gain, invert_polarity]
    if probability < augmentation_probability:
        # The same draw that decided to augment also selects the augmentation:
        # [0, augmentation_probability) is split into len(functions) equal buckets.
        bucket = int(probability * len(functions) / augmentation_probability)
        # Clamp defensively in case floating-point rounding reaches the upper edge.
        samples = functions[min(bucket, len(functions) - 1)](samples)

    return torch.tensor(samples)

In [None]:
# Training function
def train(model, batch_size, print_interval, save_ckpt_interval, criterion, optimizer, device):
    """Run one pass over the AN4 training split and return the mean batch loss.

    Args:
        model: network to optimise; put into training mode here.
        batch_size: number of utterances per batch.
        print_interval: the batch loss is printed every `print_interval` batches.
        save_ckpt_interval: accepted for interface compatibility; not used,
            since no checkpoint is written inside the loop.
        criterion: callable `(log_probs, targets, input_lengths, target_lengths)`.
        optimizer: optimizer stepping the model parameters.
        device: device the features and targets are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    def _collate(samples):
        return data_processing(samples, mode='train')

    loader = DataLoader(dataset=load_an4_dataset(mode='train'),
                        batch_size=batch_size,
                        shuffle=True,
                        collate_fn=_collate)

    model.train()
    running_loss = 0.0
    for step, batch in enumerate(loader):
        features, targets, input_lengths, target_lengths = batch
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        # The model is batch-first; the criterion is fed (time, batch, classes)
        log_probs = model(features).transpose(0, 1)
        loss = criterion(log_probs, targets, input_lengths, target_lengths)

        loss.backward()
        optimizer.step()

        if step % print_interval == 0:
            print(f"Batch {step}, Loss: {loss:.4f}")

        running_loss += loss.item()

    return running_loss / len(loader)

In [None]:
def test(model, device, batch_size, criterion, mode='test', std_output=False):
    # Load the AN4 test dataset
    test_dataset = load_an4_dataset(mode=mode)

    # Preprocess the dataset and create the data loader
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=lambda x: data_processing(x, mode='test'))

    if std_output:
      print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            mfccs, labels, input_lengths, label_lengths = _data
            mfccs, labels = mfccs.to(device), labels.to(device)

            output = model(mfccs)  # (batch, time, n_class)
            output = output.transpose(0, 1)  # (time, batch, n_class)

            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            # decoded_preds = tokenizer.decode_batch(torch.argmax(output.transpose(0, 1), dim=2).tolist())
            # labels = labels.tolist()
            # labels = [list(map(int, label.tolist())) for label in labels]
            # decoded_targets = tokenizer.decode_batch(labels)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)

            for j in range(len(decoded_preds)):
                if std_output:
                  print("Target: " + decoded_targets[j])
                  print("Predicted: " + decoded_preds[j])
                test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                test_wer.append(wer(decoded_targets[j], decoded_preds[j]))

    avg_cer = sum(test_cer) / len(test_cer)
    avg_wer = sum(test_wer) / len(test_wer)

    if std_output:
      print('Test set: Average loss: {:.4f}, Average CER: {:4f}% Average WER: {:.4f}%\n'.format(test_loss, avg_cer * 100,
                                                                                                avg_wer * 100))
    return avg_cer, avg_wer, test_loss

In [None]:
def generate_lexicon():
    """Write lexicon.txt with the token spelling of every training-set word.

    Every word of every training transcript is tokenized with `text_transform`;
    its spelling is the space-separated token sequence followed by "|". Each
    distinct spelling is written on its own line as "<word> <spelling>", so a
    word with several spellings occupies several lines.
    """
    dataset = load_an4_dataset(mode='train')
    lexicon = defaultdict(list)

    for example in dataset:
        # example is (waveform, transcript); only the transcript is needed
        for word in example[1].split():
            tokens = text_transform.text_to_int(word)
            spelling = text_transform.int_to_text_with_space(tokens) + "|"
            if spelling not in lexicon[word]:
                lexicon[word].append(spelling)

    # Save the lexicon to a file, one spelling per line
    lexicon_path = "lexicon.txt"
    with open(lexicon_path, 'w') as f:
        for word, spellings in lexicon.items():
            for spelling in spellings:
                f.write(f"{word} {spelling}\n")

    print("Lexicon generated and saved to", lexicon_path)

In [None]:
def loss_on_training_and_val_graph(input_dim, hidden_dim, output_dim, batch_size, print_interval,
                                   save_ckpt_interval, learning_rate, regularization, device,
                                   num_epochs=120, eval_interval=5, log_dir="logs_loss_all"):
    """Train a CTCModel and log loss/CER/WER curves for all three splits.

    Args:
        input_dim, hidden_dim, output_dim: sizes passed to `CTCModel`.
        batch_size, print_interval, save_ckpt_interval: forwarded to `train`
            (`batch_size` is also used for evaluation).
        learning_rate: Adam learning rate.
        regularization: Adam weight decay.
        device: device to train and evaluate on.
        num_epochs: number of training epochs.
        eval_interval: the splits are evaluated and logged every this many epochs.
        log_dir: directory the tensorboardX event files are written to.

    Returns:
        The trained model.
    """
    writer = SummaryWriter(log_dir=log_dir)
    series = f'hidden_dim_{hidden_dim}'
    # (tag prefix, split name) in the order the scalars are written
    splits = (('Training', 'train'), ('Validation', 'val'), ('test', 'test'))

    try:
        # Initialize ASR model with the current hidden_dim
        model = CTCModel(input_dim, hidden_dim, output_dim).to(device)

        # Loss function and optimizer
        criterion = CTCLoss(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=regularization)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=80, gamma=0.1)

        for epoch in range(num_epochs):
            avg_loss = train(model, batch_size, print_interval, save_ckpt_interval, criterion, optimizer, device)
            print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f} Hidden Dim: {hidden_dim}")
            scheduler.step()

            if epoch % eval_interval != 0:
                continue

            for tag, split in splits:
                split_cer, split_wer, split_loss = test(model, device, batch_size, criterion, split, std_output=False)
                # The training curve shows the loss averaged during the training
                # epoch itself (with augmentation), not the evaluation-mode loss.
                loss_value = avg_loss if split == 'train' else split_loss
                writer.add_scalars(f'{tag} Set Loss', {series: loss_value}, epoch)
                # test() returns fractions; the tags promise percentages
                writer.add_scalars(f'{tag} Set CER %', {series: split_cer * 100}, epoch)
                writer.add_scalars(f'{tag} Set WER %', {series: split_wer * 100}, epoch)
    finally:
        # Close the SummaryWriter even when training is interrupted or fails
        writer.close()
    return model

In [None]:
# CTC loss function
class CTCLoss(nn.Module):
    """Module wrapper around `nn.CTCLoss` that uses VOCAB_SIZE as the blank index."""

    def __init__(self, device):
        super().__init__()
        criterion = nn.CTCLoss(blank=VOCAB_SIZE)
        self.ctc_loss = criterion.to(device)

    def forward(self, log_probs, targets, input_lengths, target_lengths):
        """Return the CTC loss computed by the wrapped `nn.CTCLoss`."""
        loss = self.ctc_loss(log_probs, targets, input_lengths, target_lengths)
        return loss

In [None]:
# Hyperparameters
# Feature size fed to the model (get_feats stacks MFCCs, deltas and delta-deltas)
input_dim = 39
# LSTM hidden size per direction
hidden_dim = 1024
output_dim = VOCAB_SIZE + 1  # one class per vocabulary id plus the CTC blank (index VOCAB_SIZE)

batch_size = 16
print_interval = 15
save_ckpt_interval = 45
learning_rate = 0.0005
# Passed to Adam as weight_decay
regularization = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module-level criterion reused by the evaluation cells below
criterion = CTCLoss(device)

# Train the model; this also writes the TensorBoard logs
model = loss_on_training_and_val_graph(input_dim, hidden_dim, output_dim, batch_size, print_interval,
                                       save_ckpt_interval, learning_rate, regularization, device)
# save final weights
torch.save(model.state_dict(), "./aug_sub_token_ctc_lstm_model.pt")

Batch 0, Loss: 85.2661
Batch 15, Loss: 2.7895
Batch 30, Loss: 2.5311
Batch 45, Loss: 2.3734
Epoch 1/120, Average Loss: 8.1826 Hidden Dim: 1024
Batch 0, Loss: 1.3357
Batch 15, Loss: 1.5120
Batch 30, Loss: 1.5352
Batch 45, Loss: 1.8268
Epoch 2/120, Average Loss: 1.7225 Hidden Dim: 1024
Batch 0, Loss: 1.6629
Batch 15, Loss: 1.6920
Batch 30, Loss: 1.8105
Batch 45, Loss: 1.4948
Epoch 3/120, Average Loss: 1.6843 Hidden Dim: 1024
Batch 0, Loss: 1.5684
Batch 15, Loss: 1.3562
Batch 30, Loss: 1.5358
Batch 45, Loss: 1.4720
Epoch 4/120, Average Loss: 1.6449 Hidden Dim: 1024
Batch 0, Loss: 1.2783
Batch 15, Loss: 1.6791
Batch 30, Loss: 1.5601
Batch 45, Loss: 1.4503
Epoch 5/120, Average Loss: 1.6413 Hidden Dim: 1024
Batch 0, Loss: 1.7202
Batch 15, Loss: 1.6543
Batch 30, Loss: 1.3478
Batch 45, Loss: 1.3147
Epoch 6/120, Average Loss: 1.6049 Hidden Dim: 1024
Batch 0, Loss: 2.2184
Batch 15, Loss: 1.3027
Batch 30, Loss: 1.7502
Batch 45, Loss: 1.7166
Epoch 7/120, Average Loss: 1.6181 Hidden Dim: 1024
Batch

In [None]:
# Rebuild the model from the saved checkpoint and evaluate it on the training split.
# The cell displays the returned (avg_cer, avg_wer, loss) tuple.
model = CTCModel(input_dim, hidden_dim, output_dim, ckpt_path="./aug_sub_token_ctc_lstm_model.pt").to(device)
test(model, device, batch_size, criterion, mode='train', std_output=True)


evaluating...
Target: HELP                      
Predicted: HELP 
Target: FIVE TWO SEVEN                  
Predicted: FIVE TWO SEVEN
Target: O A K D A L E D R I V E
Predicted: O A K D A L E D R I V E
Target: NO                      
Predicted: NO 
Target: ONE TWO FOUR ONE                
Predicted: ONE TWO FOUR ONE 
Target: M I C H A E L          
Predicted: M I C H A E L 
Target: B R O S T              
Predicted: B R O S T
Target: RUBOUT F M Q N H THREE SEVENTY        
Predicted: RUBOUT F M Q N H THREE SEVENTY 
Target: GO                      
Predicted: GO 
Target: ONE FIVE TWO ZERO SEVEN              
Predicted: ONE FIVE TWO ZERO SEVEN
Target: P I T T S B U R G H    
Predicted: P I T T S B U R G H
Target: L O O F B O U R R O W  
Predicted: L O O F B O U R R O W
Target: T V A H FIFTY TWO FIFTY THREE        
Predicted: T V A H FIFTY TWO FIFTY THREE
Target: H I N I C H            
Predicted: H I N I C H
Target: P I T T S B U R G H    
Predicted: P I T T S B U R G H
Target: SEVEN FOUR

(0.0012703797746208615, 0.0010863111097577684, 0.1501303255833961)

In [None]:
# Evaluate the restored model on the validation split.
test(model, device, batch_size, criterion, mode='val', std_output=True)


evaluating...
Target: P I T T S B U R G H          
Predicted: P I T T S B U R G H
Target: ERASE O T H F I FIVE ZERO              
Predicted: ERASE O T H F I I Z 
Target: YES                            
Predicted: YES 
Target: EIGHT OH THREE TWO THREE FOUR FIVE NINE SEVEN TWO          
Predicted: EIGHT  THREE TWO THREE FOUR FIVE NINE SEVEN TWO 
Target: S H A R O N                  
Predicted: S H A R O E N
Target: ONE SIXTEEN FORTY EIGHT                      
Predicted: ONE SIXTEEN FORTY EIGHT 
Target: FOUR ONE TWO FOUR TWO TWO NINE EIGHT TWO EIGHT          
Predicted: FOUR ONE TWO FOUR TWO TWO NINE EIGHT TWO EIGHT
Target: RUBOUT S H K J FIVE SEVEN SIX              
Predicted: RUBOUT S H K J FIVE SEVEN SIX
Target: SIX EIGHT THREE ONE FIVE FIVE EIGHT                
Predicted: SIX EIGHT THREE ONE FIVE  EIGHT
Target: SEPTEMBER FIRST NINETEEN SIXTY NINE                    
Predicted: SEPTEMBER FIRST NINETEEN SIXTY NINE
Target: ONE TWO TWO TWO                      
Predicted: ONE TWO TWO 

(0.08996258706868343, 0.11497365207891526, 0.337231790026029)

In [None]:
# Evaluate the restored model on the test split (the default `mode` of test()).
test(model, device, batch_size, criterion, std_output=True)


evaluating...
Target: ONE FIVE TWO TWO SEVEN          
Predicted: ONE FIVE TWO TWO SEVEN
Target: ERASE K M H N I SIX OH FIVE  
Predicted: ERASE K M H N I SIX OH FIVE
Target: NO                  
Predicted: NO 
Target: ENTER EIGHT THIRTEEN              
Predicted: ENTER EIGHT FIFTEEN TWO
Target: ENTER ONE SEVENTY SIX            
Predicted: ENTER  SEVENTY SIX
Target: ONE FIVE TWO ONE THREE          
Predicted: ONE FIVE TWO ONE THREE
Target: M Y E R S          
Predicted: M Y E R S
Target: MAY SECOND NINETEEN SIXTY FIVE          
Predicted: MAY SECOND NINETEEN SIXTY FIVE
Target: P H I N N E Y      
Predicted: P H I N N E Y
Target: SIXTY SIX THIRTY THREE            
Predicted: SIXTY SIX THIRTY THREE
Target: ENTER TWO NINE EIGHT ONE          
Predicted: ENTER  TWO NINE EIGHT ONE
Target: M E L V I N        
Predicted: N E M V I N 
Target: P I T T S B U R G H
Predicted: P I T T S B  R G H
Target: C E D A R V I L L E
Predicted: C E D A R V I L L E
Target: L E V I S O N      
Predicted: M E V 

(0.0797497860906476, 0.10807653884576959, 0.27760335471895004)