# 11785 HW3P2: Automatic Speech Recognition

Welcome to HW3P2. In this homework, you will be using the same data from HW1 but will be incorporating sequence models. We recommend you get familaried with sequential data and the working of RNNs, LSTMs and GRUs to have a smooth learning in this part of the homework.

Disclaimer: This starter notebook will not be as elaborate as that of HW1P2 or HW2P2. You will need to do most of the implementation in this notebook because, it is expected after 2 HWs, you will be in a position to write a notebook from scratch. You are welcomed to reuse the code from the previous starter notebooks but may also need to make appropriate changes for this homework. <br>
We have also given you 3 log files for the Very Low Cutoff (Levenshtein Distance = 30) so that you can observe how loss decreases.

Common errors which you may face


*   Shape errors: Half of the errors from this homework will account to this category. Try printing the shapes between intermediate steps to debug
*   CUDA out of Memory: When your architecture has a lot of parameters, this can happen. Golden keys for this is, (1) Reducing batch_size (2) Call *torch.cuda.empty_cache* often, even inside your training loop, (3) Call *gc.collect* if it helps and (4) Restart run time if nothing works







# Libraries

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.metrics import accuracy_score
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import csv
import time

# imports for decoding and distance calculation
import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder

from data.phonemes import *

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


Device:  cuda


# Dataset and dataloading (TODO)

In [2]:
# This cell is where your actual TODOs start
# You will need to implement the Dataset class by your own. You may also implement it similar to HW1P2 (dont require context)
# The steps for implementation given below are how we have implemented it.
# However, you are welcomed to do it your own way if it is more comfortable or efficient. 

class LibriSamples(torch.utils.data.Dataset):

    def __init__(self, data_path, partition= "train"): # You can use partition to specify train or dev

        self.X_dir = os.path.join(data_path, partition, "mfcc")
        self.Y_dir = os.path.join(data_path, partition, "transcript")
        
        self.X_files = os.listdir(self.X_dir)
        self.Y_files = os.listdir(self.Y_dir)

        # TODO: store PHONEMES from phonemes.py inside the class. phonemes.py will be downloaded from kaggle.
        # You may wish to store PHONEMES as a class attribute or a global variable as well.
        self.PHONEMES = PHONEMES

        assert(len(self.X_files) == len(self.Y_files))


    def __len__(self):
        return len(self.X_files)

    def __getitem__(self, ind):
    
        X = np.load(os.path.join(self.X_dir, self.X_files[ind])) # TODO: Load the mfcc npy file at the ind in the directory
        X = (X - X.mean(axis=0)) / X.std(axis=0) # ADD: normalize
        X = torch.FloatTensor(X)
       
        Y = np.load(os.path.join(self.Y_dir, self.Y_files[ind])) # TODO: Load the corresponding transcripts
        labels = np.asarray([self.PHONEMES.index(yy) for yy in Y[1:-1]]) # TODO: Convert sequence of  phonemes into sequence of Long tensors
        
        # Remember, the transcripts are a sequence of phonemes. Eg. np.array(['<sos>', 'B', 'IH', 'K', 'SH', 'AA', '<eos>'])
        # You need to convert these into a sequence of Long tensors
        # Tip: You may need to use self.PHONEMES
        # Remember, PHONEMES or PHONEME_MAP do not have '<sos>' or '<eos>' but the transcripts have them. 
        # You need to remove '<sos>' and '<eos>' from the trancripts. 
        # Inefficient way is to use a for loop for this. Efficient way is to think that '<sos>' occurs at the start and '<eos>' occurs at the end.
        
        Yy = torch.LongTensor(labels) # TODO: Convert sequence of  phonemes into sequence of Long tensors

        return X, Yy
    
    def collate_fn(batch):
        batch_x = [x for x,y in batch]
        batch_y = [y for x,y in batch]

        batch_x_pad = pad_sequence(batch_x, batch_first=True)# TODO: pad the sequence with pad_sequence (already imported)
        lengths_x = [sample.shape[0] for sample in batch_x] # TODO: Get original lengths of the sequence before padding

        batch_y_pad = pad_sequence(batch_y, batch_first=True) # TODO: pad the sequence with pad_sequence (already imported)
        lengths_y = [sample.shape[0] for sample in batch_y] # TODO: Get original lengths of the sequence before padding

        return batch_x_pad, batch_y_pad, torch.tensor(lengths_x), torch.tensor(lengths_y)


# You can either try to combine test data in the previous class or write a new Dataset class for test data
class LibriSamplesTest(torch.utils.data.Dataset):

    def __init__(self, data_path, test_order): # test_order is the csv similar to what you used in hw1

        test_order_list = os.path.join(data_path, "test", test_order)
        self.X_dir = os.path.join(data_path, "test", "mfcc")
        self.X_files = []

        with open(test_order_list) as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                self.X_files.append(row[0])
            self.X_files = self.X_files[1:]

    def __len__(self):
        return len(self.X_files)
    
    def __getitem__(self, ind):
        # TODOs: Need to return only X because this is the test dataset
    
        X = np.load(os.path.join(self.X_dir, self.X_files[ind]))
        
        return torch.from_numpy(X)
    
    def collate_fn(batch):
        batch_x = [x for x in batch]
        batch_x_pad = pad_sequence(batch_x, batch_first=True) # TODO: pad the sequence with pad_sequence (already imported)
        lengths_x = [x.shape[0] for x in batch_x] # TODO: Get original lengths of the sequence before padding

        return batch_x_pad, torch.tensor(lengths_x)

In [3]:
batch_size = 128

root = 'data' # TODO: Where your hw3p2_student_data folder is

train_data = LibriSamples(root, 'train')
val_data = LibriSamples(root, 'dev')
test_data = LibriSamplesTest(root, 'test_order.csv')

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, 
                            num_workers=4, collate_fn=LibriSamples.collate_fn) # TODO: Define the train loader. Remember to pass in a parameter (function) for the collate_fn argument 
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, 
                            num_workers=4, collate_fn=LibriSamples.collate_fn) # TODO: Define the val loader. Remember to pass in a parameter (function) for the collate_fn argument 
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, 
                            num_workers=4, collate_fn=LibriSamplesTest.collate_fn) # TODO: Define the test loader. Remember to pass in a parameter (function) for the collate_fn argument 

print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  128
Train dataset samples = 28539, batches = 223
Val dataset samples = 2703, batches = 22
Test dataset samples = 2620, batches = 21


In [4]:
# Optional
# Test code for checking shapes and return arguments of the train and val loaders
for data in val_loader:
    x, y, lx, ly = data # if you face an error saying "Cannot unpack", then you are not passing the collate_fn argument
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

torch.Size([128, 2267, 13]) torch.Size([128, 280]) torch.Size([128]) torch.Size([128])


# Model Configuration (TODO)

In [5]:

class Network(nn.Module):

    def __init__(self,input_size=13, hidden_size=256, num_layers=1, num_classes=41): # You can add any extra arguments as you wish

        super(Network, self).__init__()

        # Embedding layer converts the raw input into features which may (or may not) help the LSTM to learn better 
        # For the very low cut-off you dont require an embedding layer. You can pass the input directly to the  LSTM
        # self.embedding = 
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)# TODO: # Create a single layer, uni-directional LSTM with hidden_size = 256
        # Use nn.LSTM() Make sure that you give in the proper arguments as given in https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

        self.classification = nn.Linear(hidden_size, num_classes)# TODO: Create a single classification layer using nn.Linear()

    def forward(self, x, X_lens): # TODO: You need to pass atleast 1 more parameter apart from self and x
        # x is returned from the dataloader. So it is assumed to be padded with the help of the collate_fn
        packed_input = pack_padded_sequence(x, X_lens, batch_first=True, enforce_sorted=False)# TODO: Pack the input with pack_padded_sequence. Look at the parameters it requires

        out1, (out2, out3) = self.lstm(packed_input) # TODO: Pass packed input to self.lstm
        # As you may see from the LSTM docs, LSTM returns 3 vectors. Which one do you need to pass to the next function?
        out, lengths = pad_packed_sequence(out1) # TODO: Need to 'unpack' the LSTM output using pad_packed_sequence

        out = self.classification(out) # TODO: Pass unpacked LSTM output to the classification layer
        out = F.log_softmax(out, dim=2) # Optional: Do log softmax on the output. Which dimension?

        return out, lengths # TODO: Need to return 2 variables

model = Network().to(device)
print(model)
print(x.shape, lx)
summary(model, x.to(device), lx) # x and lx are from the previous cell

Network(
  (lstm): LSTM(13, 256, batch_first=True)
  (classification): Linear(in_features=256, out_features=41, bias=True)
)
torch.Size([128, 2267, 13]) tensor([ 980,  553,  636,  417,  642,  642, 1399, 1107,  400, 1023,  270,  257,
         381,  399,  427, 2267,  861,  469, 2054, 1532,  555,  855,  857, 1410,
         482,  482,  259,  276,  609,  452,  358,  596,  969,  781,  952,  294,
         767,  380,  276, 2081,  396,  278,  195,  287, 1006,  484,  248,  292,
        1480,  731,  279, 1177,  446,  729,  279, 1187,  214,  234, 1138,  353,
         442,  308,  630,  290,  681,  333,  824,  249,  405,  336,  304,  801,
         528, 1101,  219, 1419,  480,  793,  647,  734,  444,  501,  515,  217,
        2100,  245,  765,  413,  434,  953,  661,  498,  679, 1746,  542,  889,
         384,  202,  304,  465,  416, 1268,  441,  281,  796,  576,  294, 1561,
         445,  225, 1334, 1254, 1909, 1196, 1942, 1028,  570,  994, 1446,  264,
         658,  821,  428, 1521,  425, 1548,  79

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_lstm,-,"[91673, 256]",277504,275456
1_classification,"[256, 41]","[2267, 128, 41]",10537,10496


# Training Configuration (TODO)

In [6]:
criterion = nn.CTCLoss() # TODO: What loss do you need for sequence to sequence models? 
# Do you need to transpose or permute the model output to find out the loss? Read its documentation
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3) # TODO: Adam works well with LSTM (use lr = 2e-3)
decoder = CTCBeamDecoder(
                            PHONEMES,
                            model_path=None,
                            alpha=0,
                            beta=0,
                            cutoff_top_n=40,
                            cutoff_prob=1.0,
                            beam_width=100,
                            num_processes=4,
                            blank_id=0,
                            log_probs_input=False
                        ) # TODO: Intialize the CTC beam decoder
# Check out https://github.com/parlance/ctcdecode for the details on how to implement decoding
# Do you need to give log_probs_input = True or False?

In [7]:
# this function calculates the Levenshtein distance 

def calculate_levenshtein(h, y, lh, ly, decoder, PHONEME_MAP):

    # h - ouput from the model. Probability distributions at each time step 
    # y - target output sequence - sequence of Long tensors
    # lh, ly - Lengths of output and 
    # decoder - decoder object which was initialized in the previous cell
    # PHONEME_MAP - maps output to a character to find the  distance

    # TODO: You may need to transpose or permute h based on how you passed it to the criterion
    # Print out the shapes often to debug

    # TODO: call the decoder's decode method and get beam_results and out_len (Read the docs about the decode method's outputs
    # Input to the decode method will be h and its lengths lh 
    # You need to pass lh for the 'seq_lens' parameter. This is not explicitly mentioned in the git repo of ctcdecode.
    beam_result, beam_scores, timesteps, out_len = decoder.decode(h, seq_lens=lh)

    batch_size = y.shape[0]# TODO

    dist = 0

    for i in range(batch_size): # Loop through each element in the batch

        h_sliced = beam_result[i,:out_len] # TODO: Get the output as a sequence of numbers from beam_results
        # Remember that h is padded to the max sequence length and lh contains lengths of individual sequences
        # Same goes for beam_results and out_lens
        # You do not require the padded portion of beam_results - you need to slice it with out_lens 
        # If it is confusing, print out the shapes of all the variables and try to understand

        h_string = [PHONEME_MAP.index(hh) for hh in h_sliced] # TODO: MAP the sequence of numbers to its corresponding characters with PHONEME_MAP and merge everything as a single string

        y_sliced = y[i,:ly] # TODO: Do the same for y - slice off the padding with ly
        y_string = [PHONEME_MAP.index(yy) for yy in y_sliced] # TODO: MAP the sequence of numbers to its corresponding characters with PHONEME_MAP and merge everything as a single string
        
        dist += Levenshtein.distance(h_string, y_string)

    dist/=batch_size

    return dist

In [8]:
def train(epoch, model, train_loader, optimizer, criterion):
    model.train()

    sum_loss, total = 0, 0
    total_dist = 0

    start_time = time.time()
    optimizer.zero_grad()
    log_interval = 40

    for batch, (data) in enumerate(train_loader):
        x, y, lx, ly = data
        total += len(y)

        x = x.to(device)
        outputs, length = model(x, lx)

        loss = criterion(outputs, y, length, ly)
        loss.backward()
        sum_loss += loss.item()


        #total_dist += calculate_levenshtein(outputs, y, length, ly, decoder, PHONEME_MAP)

        optimizer.step()
        optimizer.zero_grad()

        if batch % log_interval == 0:
            elapsed = time.time() - start_time
            avg_loss = sum_loss / total
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches ({:5.2f}%) | lr {:.2e} | {:3.0f} ms/utter | loss/utter {:5.2f} | loss/phoneme {:5.4f}'
                .format(epoch+1, batch, len(train_loader), (100.0 * batch / len(train_loader)),
                        optimizer.param_groups[0]['lr'],
                        elapsed * 1000.0 / (log_interval * batch_size),
                        sum_loss / (log_interval * batch_size),
                        avg_loss))
            start_time = time.time()

train(0, model, train_loader, optimizer, criterion)

| epoch   1 |     0/  223 batches ( 0.00%) | lr 2.00e-03 |   0 ms/utter | loss/utter  0.01 | loss/phoneme 0.2402
| epoch   1 |    40/  223 batches (17.94%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.06 | loss/phoneme 0.0570
| epoch   1 |    80/  223 batches (35.87%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.09 | loss/phoneme 0.0421
| epoch   1 |   120/  223 batches (53.81%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.11 | loss/phoneme 0.0370
| epoch   1 |   160/  223 batches (71.75%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.14 | loss/phoneme 0.0344
| epoch   1 |   200/  223 batches (89.69%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.17 | loss/phoneme 0.0329


In [9]:
torch.cuda.empty_cache() # Use this often

# TODO: Write the model evaluation function if you want to validate after every epoch

# You are free to write your own code for model evaluation or you can use the code from previous homeworks' starter notebooks
# However, you will have to make modifications because of the following.
# (1) The dataloader returns 4 items unlike 2 for hw2p2
# (2) The model forward returns 2 outputs
# (3) The loss may require transpose or permuting

# Note that when you give a higher beam width, decoding will take a longer time to get executed
# Therefore, it is recommended that you calculate only the val dataset's Levenshtein distance (train not recommended) with a small beam width
# When you are evaluating on your test set, you may have a higher beam width

def validate(model, val_loader, criterion):
    model.eval()

    sum_loss, total_num = 0, 0
    total_dist = 0

    start_time = time.time()
    log_interval = 40

    with torch.no_grad():
        for batch, (data) in enumerate(val_loader):
            x, y, lx, ly = data
            total_num += len(y)

            x = x.to(device)
            outputs, length = model(x, lx)

            loss = criterion(outputs, y, length, ly)
            sum_loss += loss.item()


            #total_dist += calculate_levenshtein(outputs, y, length, ly, decoder, PHONEME_MAP)

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)

    elapsed = time.time() - start_time
    avg_loss = sum_loss / total_num
    print(
        '| Validation | {:5d}/{:5d} batches ({:5.2f}%) | lr {:.2e} | {:3.0f} ms/utter | loss/utter {:5.2f} | loss/phoneme {:5.4f}'
        .format(batch, len(val_loader), (100.0 * batch / len(val_loader)),
                optimizer.param_groups[0]['lr'],
                elapsed * 1000.0 / (log_interval * batch_size),
                sum_loss / (log_interval * batch_size),
                avg_loss))
                    

validate(model, val_loader, criterion)


| Validation |    21/   22 batches (95.45%) | lr 2.00e-03 |   0 ms/utter | loss/utter  0.01 | loss/phoneme 0.0278


In [10]:
torch.cuda.empty_cache()

# TODO: Write the model training code 

# You are free to write your own code for training or you can use the code from previous homeworks' starter notebooks
# However, you will have to make modifications because of the following.
# (1) The dataloader returns 4 items unlike 2 for hw2p2
# (2) The model forward returns 2 outputs
# (3) The loss may require transpose or permuting

# Tip: Implement mixed precision training
epochs = 40
for epoch in range(epochs):
    train(epoch, model, train_loader, optimizer, criterion)
    validate(model, val_loader, criterion)
    

| epoch   1 |     0/  223 batches ( 0.00%) | lr 2.00e-03 |   0 ms/utter | loss/utter  0.00 | loss/phoneme 0.0265
| epoch   1 |    40/  223 batches (17.94%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.03 | loss/phoneme 0.0265
| epoch   1 |    80/  223 batches (35.87%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.05 | loss/phoneme 0.0263
| epoch   1 |   120/  223 batches (53.81%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.08 | loss/phoneme 0.0262
| epoch   1 |   160/  223 batches (71.75%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.10 | loss/phoneme 0.0258
| epoch   1 |   200/  223 batches (89.69%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.13 | loss/phoneme 0.0254
| Validation |    21/   22 batches (95.45%) | lr 2.00e-03 |   0 ms/utter | loss/utter  0.01 | loss/phoneme 0.0232
| epoch   2 |     0/  223 batches ( 0.00%) | lr 2.00e-03 |   0 ms/utter | loss/utter  0.00 | loss/phoneme 0.0224
| epoch   2 |    40/  223 batches (17.94%) | lr 2.00e-03 |   2 ms/utter | loss/utter  0.02 | lo

# Submit to kaggle (TODO)

In [None]:
def decode(output, seq_sizes, beam_width=60):
    decoder = CTCBeamDecoder(labels=PHONEME_MAP, blank_id=0, beam_width=beam_width)
    output = torch.transpose(output, 0, 1) 
    probs = F.softmax(output, dim=2).data

    output, scores, timesteps, out_seq_len = decoder.decode(probs=probs, seq_lens=seq_sizes)


    decoded = []
    for i in range(output.size(0)):
        chrs = ""
        if out_seq_len[i, 0] != 0:
            chrs = "".join(PHONEME_MAP[o] for o in output[i, 0, :out_seq_len[i, 0]])
        decoded.append(chrs)
    return decoded

In [None]:
# TODO: Write your model evaluation code for the test dataset
# You can write your own code or use from the previous homewoks' stater notebooks
# You can't calculate loss here. Why?
csv_path = './submission.csv'
with open(csv_path, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'predictions'])
        writer.writeheader()
        cnt = 0
        with torch.no_grad():
              for batch, (x, lx) in enumerate(test_loader):
                    x = x.to(device)
                    output, length = model(x, lx)

                    decoded = decode(output, length, beam_width=100)
                    for s in decoded:
                        writer.writerow({"id": cnt, "predictions": s})
                        cnt += 1
print("done")

done
