In [1]:
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim
import torch.nn.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
from torch.utils import data
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Dict
import os
import glob
import tqdm



In [2]:
# These are the various characters in the transcripts of the datasetW
VOCAB = ['<sos>',   
         'A',   'B',    'C',    'D',    
         'E',   'F',    'G',    'H',    
         'I',   'J',    'K',    'L',       
         'M',   'N',    'O',    'P',    
         'Q',   'R',    'S',    'T', 
         'U',   'V',    'W',    'X', 
         'Y',   'Z',    "'",    ' ', 
         '<eos>']

VOCAB_MAP = {VOCAB[i]:i for i in range(0, len(VOCAB))}

SOS_TOKEN = VOCAB_MAP["<sos>"]
EOS_TOKEN = VOCAB_MAP["<eos>"]

BATCH_SIZE = 96

In [3]:
class MFCCDataset:
    def __init__(self, data_path, vocab_map, val = False, cep_norm = True):
        """
        Let's access the datapaths for the input and the labels in this sections 
        x: MFCC path
        y: Transcripts 

        1) Load all the data a-priori in the init for faster training. 
        2) Cepstral normalization :  
        """
        self.val = val
        self.cep_norm = cep_norm 
        if self.val:
            self.x =  str(data_path)+"\\dev-clean\\mfcc\\*.npy" 
            self.y =  str(data_path)+"\\dev-clean\\transcript\\*.npy"
        else: 
            self.x = str(data_path)+"\\train-clean-100\\mfcc\\*.npy"
            self.y = str(data_path)+"\\train-clean-100\\transcript\\raw\\*.npy"

        self.mfcc_list = sorted(glob.glob(self.x))
        self.transcript_list = sorted(glob.glob(self.y))
        self.alphabets = vocab_map
       
    def __len__(self):
        
        return len(self.mfcc_list)

    def __getitem__(self, index):
        """
        cepstral normalization performed here for higher SNR 
        """

        if self.val:
            mf_temp = np.load(self.mfcc_list[index], allow_pickle= True)
            tr_temp = np.load(self.transcript_list[index], allow_pickle= True)
            tr_temp = [self.alphabets[ele] for ele in tr_temp]
            if self.cep_norm:
                mf_temp = (mf_temp - np.mean(mf_temp, axis = 0))/ np.std(mf_temp)
            
            return torch.tensor(mf_temp)
        
        else: 
            mf_temp = np.load(self.mfcc_list[index], allow_pickle= True)
            tr_temp = np.load(self.transcript_list[index], allow_pickle= True)
            
            # Converting the alphabets in the labels to integers using the pre-defined map provided 
            tr_temp = [self.alphabets[ele] for ele in tr_temp]

            if self.cep_norm:
                mf_temp = (mf_temp - np.mean(mf_temp, axis = 0))/ np.std(mf_temp)
            return torch.tensor(mf_temp), torch.tensor(tr_temp)

#Collate function for uniform padding of the input sequences 

def collate_train_val(data): 
    
    (xx, yy) = zip(*data)
    x_lens = [len(x) for x in xx]
    y_lens = [len(y) for y in yy]

    xx_pad = pad_sequence(xx,batch_first=True)
    yy_pad = pad_sequence(yy,batch_first=True)

    x_lens = np.asarray(x_lens)
    y_lens = np.asarray(y_lens)
    # Some augmentation and masking here may help the network converge better. 

        
    return xx_pad, yy_pad, torch.tensor(x_lens), torch.tensor(y_lens)


In [4]:
# Dataset and dataloader sections 
data_path = "C:\\Users\\thopa\Desktop\\Assignments\\11685\\HW4\\2022Implementation\\11-785-f22-hw4p2\\hw4p2"
train_data = MFCCDataset(data_path, vocab_map= VOCAB_MAP)
val_data = MFCCDataset(data_path,vocab_map= VOCAB_MAP, val = True)

train_loader = DataLoader(train_data ,batch_size = 8 , collate_fn= collate_train_val , shuffle = True)
val_loader = DataLoader(val_data,batch_size = 8, collate_fn = collate_train_val, shuffle= False)


## Neural Network and Training 

### Silly notes for reference 
When training RNN (LSTM or GRU or vanilla-RNN), it is difficult to batch the variable length sequences. For example: if the length of sequences in a size 8 batch is [4,6,8,5,4,3,7,8], you will pad all the sequences and that will result in 8 sequences of length 8. You would end up doing 64 computations (8x8), but you needed to do only 45 computations. Moreover, if you wanted to do something fancy like using a bidirectional-RNN, it would be harder to do batch computations just by padding and you might end up doing more computations than required.

Instead, PyTorch allows us to pack the sequence, internally packed sequence is a tuple of two lists. One contains the elements of sequences. Elements are interleaved by time steps (see example below) and other contains the size of each sequence the batch size at each step. This is helpful in recovering the actual sequences as well as telling RNN what is the batch size at each time step. This has been pointed by @Aerin. This can be passed to RNN and it will internally optimize the computations.

In [5]:
class PBLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PBLSTM,self).__init__()

        self.blstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = 2, batch_first = True, bidirectional = True, dropout = 0.3)
    
    def reshape(self, x, x_lens):
        # Reshaping for concatenation / reducing dimensions
        batch, rows, cols = x.shape[0], x.shape[1], x.shape[2]

        if (rows % 2 != 0):
            x = x[:,:-1,:]
        x = x.reshape(batch, int(rows/2), cols*2)
        x_lens = x_lens//2
    
        return x, x_lens
    
    def forward(self,x):
        """
        Computational savings and original sequence recovery using the pack padded and pad packed routine 
        """
        x_pad, x_pad_lens = pad_packed_sequence(x, batch_first=True)
        x, x_lens = self.reshape(x_pad, x_pad_lens.to("cuda"))
        input = rnn_utils.pack_padded_sequence(x, lengths = x_lens.cpu(), batch_first= True, enforce_sorted= False)
        rnn_out, _ = self.blstm(input)
        #output, lens = rnn_utils.pad_packed_sequence(rnn_out, batch_first= True)
        
        return rnn_out

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_size, encoder_hidden_size):
        super(Encoder, self).__init__()
        """
        The encoder used is a pyramidal-BiLSTM for matching the input rate and the speech transcription rate which is about 8:1. 
        This model is sigificantly influenced by the LAS paper 
        LAS: Chan, William, et al. "Listen, attend and spell." arXiv preprint arXiv:1508.01211 (2015).

        [REF: B. Raj, Deep Learning Carnegie Mellon University]
        The pBLSTM is a variant of Bi-LSTMs that downsamples sequences by a factor of 2 by concatenating
        adjacent pairs of inputs before running a conventional Bi-LSTM on the reduced-length sequence. So, given
        an input vector sequence X0, X1, X2, X3, . . . XN−1, the pBLSTM first concatenates adjacent pairs of vectors
        as [X0, X1], [X2, X3], . . . [XN−2, XN−1], and then computes a regular BiLSTM on the reshaped input.

        -) Initial Bi-LSTM 
        -) 3x Pyramidal Bi-LSTM 
        """    
        self.base_lstm = nn.LSTM(input_size = input_size, hidden_size = encoder_hidden_size, num_layers = 1, batch_first = True, bidirectional = True, dropout = 0.1)
        self.pblstm = nn.Sequential(PBLSTM(4*encoder_hidden_size,encoder_hidden_size),PBLSTM(4*encoder_hidden_size,encoder_hidden_size),PBLSTM(4*encoder_hidden_size,encoder_hidden_size))
        
    def forward(self, x, x_lens):
        pack_padd_out = pack_padded_sequence(x, x_lens.to('cpu'),batch_first=True, enforce_sorted=False)
        #print(type(pack_padd_out))
        out_lstm, _  = self.base_lstm(pack_padd_out)
        encoder_outputs = self.pblstm(out_lstm)
        encoder_outputs, encoder_lens = pad_packed_sequence(encoder_outputs, batch_first=True)

        return encoder_outputs, encoder_lens




In [7]:
# from torchsummaryX import summary

# for data in train_loader:
#     x, y, lx, ly = data
#     print(x.shape, y.shape, lx.shape, ly.shape)
#     break 

# encoder = Encoder(15,256)# TODO: Initialize Listener
# out, lens = encoder.forward(x, lx)
# del encoder

In [8]:
# Attention block 
"""
Possible Efficiencies with the attention mechanism (d2l book)
1) In general, it requires that both the query and the key have the 
same vector length, say d, even though this can be addressed easily by replacing 
q⊤k with q⊤Mk where M is a suitably chosen matrix to translate
between both spaces. For now assume that the dimensions match.
2) Adding dropout weights also helps 
"""
class Attention(nn.Module):
    def __init__(self, encoder_output_size, decoder_output_size, projection):
        super(Attention, self).__init__()
        self.key_layer = nn.Linear(encoder_output_size, projection)
        self.value_layer = nn.Linear(encoder_output_size, projection)
        self.query_layer = nn.Linear(decoder_output_size, projection)
    
    def key_value_calc(self, encoder_output, encoder_len):
        _ ,encoder_max_seq_len, _ = encoder_output.shape 
        self.key = self.key_layer(encoder_output)
        self.value = self.value_layer(encoder_output)
        print("Attention Calculated")
        # Attention mask 
        # Removing the influence of padding in the raw weights, we create a boolean mask of (batchsize, timesteps)
        self.mask = (torch.arange(encoder_max_seq_len)[None, :] < encoder_len[:, None]).to("cuda")
        
    def forward(self, decoder_output_embeddings):
        print(decoder_output_embeddings.shape)
        self.query = self.query_layer(decoder_output_embeddings)
        
        energy = torch.bmm(self.key, self.query.unsqueeze(2))
        
        energy = torch.squeeze(energy, dim = 2)

        #What should the mask least value be? 
        energy.masked_fill_(self.mask, -1e9)
        
        attention = torch.nn.functional.softmax(energy, dim = 1)
        context = torch.bmm(torch.permute(self.value,[0,2,1]),attention.unsqueeze(2)).squeeze(2)
        return context, attention

In [9]:
# Decoder ~ according to the speller of the LAS paper 

class Decoder(nn.Module):
    def __init__(self, embed_dim, projection, vocab_size, decoder_hidden_size, decoder_output_size, encoder_output_size):
        super(Decoder, self).__init__()
        
        """
        A simple lookup table that stores embeddings of a fixed dictionary and size.
        This module is often used to store word embeddings and retrieve them using indices. 
        The input to the module is a list of indices, and the output is the corresponding word embeddings.
        """
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0 ).cuda()
        self.lstm_cells = nn.Sequential(nn.LSTMCell(embed_dim + projection , decoder_hidden_size) , nn.LSTMCell(decoder_hidden_size , decoder_output_size))
        self.vocab_size = vocab_size
        self.attention = Attention(2*encoder_output_size , decoder_output_size, projection)
        self.character_prob = nn.Linear(512, vocab_size)
        self.device = "cuda"
        
    def forward(self, encoder_output, encoder_len, y = None, mode = "train", teacherForcingRate = 0.1, isGumbel = False ):
       
        # batch, key_seq_max_len, key_value_size = key.shape
        batch, encoder_max_seq, _ = encoder_output.shape

        # # Attention mask for making the system autoregressive 
        # mask = torch.arange(key_seq_max_len).unsqueeze(0)>=encoder_len.unsqueeze(1)
        # mask = mask.to(self.device)

        # List to store output attention plots 
        predictions, attention_plot = [], []
        prediction = torch.full((batch,1), fill_value = 0 ,device= self.device)
        
        # Hidden states
        hidden_states= [None]*len(self.lstm_cells)
        self.attention.key_value_calc(encoder_output, encoder_len)
        
        context = self.attention.value[:,0,:]

        if mode == "train":
            max_len = y.shape[1]
            char_embedding = self.embedding(y)
        else: 
            max_len = 600

        for i in range(max_len):
            if mode == "train":
                # Teacher Forcing regime ~ Assigned and picked randomly 
                teacher_forcing = True if random.random() > teacherForcingRate else False 
                if not teacher_forcing:
                    if i != 0 : # use Gumbel noise to add noise to add variety to phoneme
                        char_embed = torch.nn.functional.gumbel_softmax(prediction).mm(self.embedding.weight)
                    else:
                        char_embed = self.embedding(prediction.argmax(dim=-1))
                else:
                    if i == 0:
                        char_embed = self.embedding(torch.zeros(batch, dtype = torch.long).fill_(VOCAB_MAP['<sos>']).to(self.device)) 
                    else: 
                        char_embed = char_embedding[:,i-1,:] # ground truth teacher forcing 
            # Validation mode 
            else: 
                if i == 0: 
                    char_embed = self.embedding(torch.zeros(batch, dtype = torch.long).fill_(VOCAB_MAP['<sos>']).to(self.device)) 
                else: 
                    char_embed = self.embedding(prediction.argmax(dim = -1)) # feed in the previous prediction as input 
            
            # Input to the decoder (prev embedding + context from attention mechanism) 
            decoder_input_embedding = torch.cat([char_embed, context.squeeze(1)], dim = 1)
           
            for i in range(len(self.lstm_cells)):
                # An LSTM Cell returns (h,c) -> h = hidden state, c = cell memory state
                # Using 2 LSTM Cells is akin to a 2 layer LSTM looped through t timesteps 
                # The second LSTM Cell takes in the output hidden state of the first LSTM Cell (from the current timestep) as Input, along with the hidden and cell states of the cell from the previous timestep
                hidden_states[i] = self.lstm_cells[i](decoder_input_embedding, hidden_states[i]) 
                decoder_input_embedding = hidden_states[i][0]

            decoder_output_embedding = hidden_states[-1][0]
            # What is the query? (same len as the key)
            # Hidden state of the LSTM 
            # 8x768 and 128x30
            # decoder_output_embeddings, mask
            context, attention = self.attention(decoder_output_embedding)
            attention_plot.append(attention[0].detach().cpu())
            
            output_context = torch.cat([self.attention.query, context], dim = 1)
            prediction = self.character_prob(output_context)
            predictions.append(prediction.unsqueeze(1))
        attentions = torch.stack(attention_plot, dim = 0)
        predictions = torch.cat(predictions, dim = 1 )

        return predictions, attentions

In [10]:
"""
Combining the pipelines of the Seq2Seq model
"""
class Seq2Seq(nn.Module):
    def __init__(self, input_size, encoder_hidden_size, vocab_size, embed_size, decoder_hidden_size, decoder_output_size, projection_size = 128 ):
        super(Seq2Seq, self).__init__()
        """
        Parameters of each of the model classes 
        Encoder : input_size, encoder_hidden_size
        Decoder : embed_dim, projection, vocab_size, decoder_hidden_size, decoder_output_size, encoder_output_size
        """

        self.encoder =  Encoder(input_size = input_size, encoder_hidden_size = encoder_hidden_size)
        #self.attention = Attention(encoder_output_size= 2*encoder_hidden_size, decoder_output_size=decoder_output_size, projection= projection_size)
        self.decoder =  Decoder(embed_dim = embed_size, projection = projection_size, vocab_size = vocab_size, decoder_hidden_size = decoder_hidden_size, decoder_output_size=decoder_output_size,encoder_output_size = encoder_hidden_size)
    def forward(self, x, x_lens, y = None, mode = "none"):
        
        encoder_outputs, encoder_lens = self.encoder(x, x_lens)
        # encoder_output, key, value, encoder_len, y = None, mode = "train", teacherForcingRate = 0.1, isGumbel = False 
        predictions, attention_map = self.decoder(encoder_outputs, encoder_lens , y, mode = mode)

        return predictions, attention_map 


In [11]:
"""
Model initialization 
"""
DEVICE = "cuda"
model  = Seq2Seq(input_size=15,encoder_hidden_size=512,vocab_size=len(VOCAB),
            embed_size=512,decoder_hidden_size=512,decoder_output_size=128,projection_size=256)
model.to(DEVICE)
# print(model)



Seq2Seq(
  (encoder): Encoder(
    (base_lstm): LSTM(15, 512, batch_first=True, dropout=0.1, bidirectional=True)
    (pblstm): Sequential(
      (0): PBLSTM(
        (blstm): LSTM(2048, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
      )
      (1): PBLSTM(
        (blstm): LSTM(2048, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
      )
      (2): PBLSTM(
        (blstm): LSTM(2048, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
      )
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(30, 512, padding_idx=0)
    (lstm_cells): Sequential(
      (0): LSTMCell(768, 512)
      (1): LSTMCell(512, 128)
    )
    (attention): Attention(
      (key_layer): Linear(in_features=1024, out_features=256, bias=True)
      (value_layer): Linear(in_features=1024, out_features=256, bias=True)
      (query_layer): Linear(in_features=128, out_features=256, bias=True)
    )
    (character_prob): Linear(in_features=512, out_f

### Hyper-parameters under consideration 
- Optimizer
    - Learning Rate
    - Weight Decay
- Learning Rate Scheduler
    - reduction 
- Loss function 
    - Reduction
    - Factor 
    - Patience 

In [12]:
optimizer = torch.optim.Adam(model.parameters(),lr = 2e-3, weight_decay = 5e-6 )
criterion = nn.CrossEntropyLoss(reduction = "none")
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.4, patience = 2)
scaler = torch.cuda.amp.GradScaler()

### Train and evaluate 

In [14]:
epochs = 100 

for epoch in range(epochs): 
    model.train()
    for i,(x, y, x_len, y_len) in enumerate(train_loader):
        optimizer.zero_grad()
        x, x_len, y, y_len = x.to(DEVICE), x_len, y.to(DEVICE), y_len
        pred, attn = model(x = x, x_lens = x_len, y = y, mode = "train")
        print("prediction:{}, label:{}".format(pred.shape, y.shape))
        break

  x_lens = x_lens//2


torch.Size([8, 190, 1024])
Attention Calculated
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.S

KeyboardInterrupt: 

In [None]:
del model

torch.cuda.empty_cache()