# 11785 HW3P2: Automatic Speech Recognition

Welcome to HW3P2. In this homework, you will be using the same data from HW1 but will be incorporating sequence models. We recommend you get familiar with sequential data and the workings of RNNs, LSTMs and GRUs to have a smooth learning experience in this part of the homework.

Disclaimer: This starter notebook will not be as elaborate as those of HW1P2 or HW2P2. You will need to do most of the implementation in this notebook yourself because, after two homeworks, you are expected to be able to write a notebook from scratch. You are welcome to reuse code from the previous starter notebooks, but you may need to make appropriate changes for this homework. <br>
We have also given you 3 log files for the Very Low Cutoff (Levenshtein Distance = 30) so that you can observe how loss decreases.

Common errors which you may face


*   Shape errors: Half of the errors in this homework will fall into this category. Try printing the shapes between intermediate steps to debug
*   CUDA out of memory: This can happen when your architecture has a lot of parameters. The standard remedies are: (1) reduce batch_size, (2) call *torch.cuda.empty_cache* often, even inside your training loop, (3) call *gc.collect* if it helps, and (4) restart the runtime if nothing else works







# Preliminaries

You will need to install packages for decoding and calculating the Levenshtein distance

In [None]:
# Upgrade pip, then install the Levenshtein-distance package.
!pip install --upgrade pip
!pip install python-Levenshtein
# Clone ctcdecode together with its submodules and install it from source.
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
%cd ctcdecode
!pip install .
%cd ..

# torchsummaryX provides the `summary` helper imported further down.
!pip install torchsummaryX

# Kaggle (TODO)

You need to set up your Kaggle and download the data

In [None]:
# Force a fresh install of the latest kaggle package without reinstalling its dependencies.
! pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Mount Google Drive so the Kaggle API token stored there can be copied over.
from google.colab import drive
drive.mount('/content/drive')
# Copy kaggle.json from Drive into ~/.kaggle.
# NOTE(review): `mkdir` without -p reports an error if ~/.kaggle already exists (e.g. when this cell is re-run).
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle

In [None]:
# Download the competition files for 11-785-s22-hw3p2 with the Kaggle CLI.
! kaggle competitions download -c 11-785-s22-hw3p2

In [None]:
# Extract the downloaded archive quietly (-q) into the current directory.
! unzip -q /content/11-785-s22-hw3p2.zip

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.metrics import accuracy_score
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import csv

import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder

import warnings
warnings.filterwarnings('ignore')

# Pick the compute device once; later cells move the model and batches onto it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

# Dataset and dataloading (TODO)

In [None]:
import phonemes

# One single-character symbol per output class, in class-index order (41
# entries). The trailing comment on each entry names the phoneme the symbol
# stands for. The list is handed to CTCBeamDecoder as `labels` and is used to
# turn index sequences into strings before computing Levenshtein distance.
# NOTE(review): index 0 (" ") presumably serves as the CTC blank, since
# nn.CTCLoss() and CTCBeamDecoder are created with their default blank index
# elsewhere in this notebook -- confirm.
PHONEME_MAP = [
    " ",
    ".", #SIL
    "a", #AA
    "A", #AE
    "h", #AH
    "o", #AO
    "w", #AW
    "y", #AY
    "b", #B
    "c", #CH
    "d", #D
    "D", #DH
    "e", #EH
    "r", #ER
    "E", #EY
    "f", #F
    "g", #G
    "H", #H
    "i", #IH
    "I", #IY
    "j", #JH
    "k", #K
    "l", #L
    "m", #M
    "n", #N
    "N", #NG
    "O", #OW
    "Y", #OY
    "p", #P
    "R", #R
    "s", #S
    "S", #SH
    "t", #T
    "T", #TH
    "u", #UH
    "U", #UW
    "v", #V
    "W", #W
    "?", #Y
    "z", #Z
    "Z" #ZH
]

In [None]:
def parse_csv(filepath):
    """Return the first-column value of every data row in a CSV file.

    The first record is treated as a header and dropped. Blank lines, which
    ``csv.reader`` yields as empty rows, are skipped instead of raising
    ``IndexError``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A list of strings, one per non-empty data row, in file order. An
        empty or header-only file yields an empty list.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header record, if there is one
        return [row[0] for row in reader if row]

In [None]:
class LibriSamples(torch.utils.data.Dataset):
    """Dataset of (MFCC features, phoneme-index transcript) pairs.

    Features are read from ``<data_path>/<partition>/mfcc/`` and transcripts
    from ``<data_path>/<partition>/transcript/``; every file is loaded with
    ``np.load`` on access.
    """

    def __init__(self, data_path, partition= "train"):
        """Index the feature and transcript files of one partition.

        Args:
            data_path: Directory that contains the partition folders.
            partition: Name of the partition sub-directory.

        Raises:
            AssertionError: If the two directories hold a different number
                of files.
        """
        self.X_dir = data_path + '/' + partition + '/mfcc/'
        self.Y_dir = data_path + '/' + partition + '/transcript/'

        # os.listdir returns entries in arbitrary order, so both listings are
        # sorted to make pairing by position deterministic.
        # NOTE(review): this assumes a feature file and its transcript share
        # the same file name -- confirm against the data layout.
        self.X_files = sorted(os.listdir(self.X_dir))
        self.Y_files = sorted(os.listdir(self.Y_dir))

        self.PHONEMES = phonemes.PHONEMES

        assert len(self.X_files) == len(self.Y_files), \
            "mfcc and transcript directories contain different numbers of files"

    def __len__(self):
        """Return the number of utterances in the partition."""
        return len(self.X_files)

    def __getitem__(self, ind):
        """Load utterance ``ind``.

        Returns:
            A tuple ``(X, Y)``: the loaded feature array as a tensor and a
            ``torch.long`` tensor with the position of each transcript entry
            in ``self.PHONEMES``.
        """
        X = np.load(self.X_dir + self.X_files[ind])
        Y = np.load(self.Y_dir + self.Y_files[ind])

        # Drop the first and last transcript entries.
        # NOTE(review): presumably start/end-of-sequence markers -- confirm.
        labels = [self.PHONEMES.index(symbol) for symbol in Y[1:-1]]

        return (torch.tensor(X), torch.tensor(labels, dtype=torch.long))

    def collate_fn(self, batch):
        """Pad a list of ``(X, Y)`` samples into batch-first tensors.

        Returns:
            ``(batch_x_pad, batch_y_pad, lengths_x, lengths_y)`` where the
            padded tensors are zero-padded to the longest item in the batch
            and the length tensors hold the unpadded lengths.
        """
        batch_x = [x for x, _ in batch]
        batch_y = [y for _, y in batch]

        batch_x_pad = pad_sequence(batch_x, batch_first = True)
        lengths_x = [len(x) for x in batch_x]

        batch_y_pad = pad_sequence(batch_y, batch_first = True)
        lengths_y = [len(y) for y in batch_y]

        return batch_x_pad, batch_y_pad, torch.tensor(lengths_x), torch.tensor(lengths_y)

class LibriSamplesTest(torch.utils.data.Dataset):
    """Unlabelled test-set features, kept in the order given by test_order.csv."""

    def __init__(self, data_path):
        """Load every test feature file into memory.

        Args:
            data_path: Directory containing the ``test`` folder, which holds
                ``test_order.csv`` and the ``mfcc/`` feature files.
        """
        # Build the order-file path from data_path instead of a fixed absolute
        # location, so the dataset also works when the data lives elsewhere.
        test_order_list = parse_csv(data_path + '/test/test_order.csv')
        self.X_path = data_path + '/test/mfcc/'
        self.X = [np.load(self.X_path + file) for file in test_order_list]

    def __len__(self):
        """Return the number of test utterances."""
        return len(self.X)

    def __getitem__(self, ind):
        """Return the features of utterance ``ind`` as a tensor."""
        return torch.tensor(self.X[ind])

    def collate_fn(self, batch):
        """Zero-pad a list of feature tensors into one batch-first tensor.

        Returns:
            ``(batch_x_pad, lengths_x)``: the padded batch and a tensor of
            the unpadded lengths.
        """
        batch_x = list(batch)
        batch_x_pad = pad_sequence(batch_x, batch_first = True)
        lengths_x = [len(x) for x in batch_x]

        return batch_x_pad, torch.tensor(lengths_x)

In [None]:
# Number of utterances per batch, shared by all three loaders.
batch_size = 64

# Directory that contains the train / dev / test partitions.
root = '/content/hw3p2_student_data/hw3p2_student_data'

train_data = LibriSamples(root, 'train')
val_data = LibriSamples(root, 'dev')
test_data = LibriSamplesTest(root)

# Each loader pads variable-length samples with its dataset's own collate_fn.
# The test loader keeps file order so predictions line up with test ids.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=train_data.collate_fn,
)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=val_data.collate_fn,
)
test_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=test_data.collate_fn,
)

print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

In [None]:
# Sanity check: fetch a single validation batch and print its tensor shapes.
# The x and lx bound here are reused by the model summary call further down.
for x, y, lx, ly in val_loader:
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

# Model Configuration (TODO)

In [None]:
class LockedDropout(nn.Module):
    """Dropout whose mask is sampled once and shared along dimension 0.

    A single Bernoulli mask of shape ``(1, x.size(1), x.size(2))`` is drawn
    per call and broadcast over dimension 0 of the input, so the same
    positions are dropped in every slice along that dimension. Kept values
    are rescaled by ``1 / (1 - p)``.
    """

    def __init__(self, p=0.5):
        """
        Args:
            p: Probability of dropping a value; must lie in ``[0, 1]``.

        Raises:
            ValueError: If ``p`` is outside ``[0, 1]``.
        """
        super().__init__()
        if not 0.0 <= p <= 1.0:
            raise ValueError(
                "dropout probability has to be between 0 and 1, but got {}".format(p))
        self.p = p

    def forward(self, x):
        """Apply the shared mask to a 3-D tensor.

        Returns ``x`` unchanged in eval mode or when ``p`` is 0.
        """
        if not self.training or not self.p:
            return x

        keep_prob = 1 - self.p
        if keep_prob <= 0:
            # Dropping everything: return zeros directly rather than dividing
            # a zero mask by zero, which would produce NaNs.
            return torch.zeros_like(x)

        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False)
        mask = mask.bernoulli_(keep_prob).div_(keep_prob)
        # Multiplication already allocates a new tensor, so no clone of x is needed.
        return x * mask.expand_as(x)

In [None]:
class NextBlock(nn.Module):
    """Residual 1-D convolution block.

    The branch applies a 1x1 convolution to ``out_channels``, a depthwise
    kernel-3 convolution (``groups = out_channels``), and a 1x1 convolution
    back to ``in_channels``. Each convolution is followed by BatchNorm1d and
    GELU, and the branch ends with Dropout(0.25). The block input is added to
    the branch output.
    """

    def __init__(self, in_channels, out_channels, i):
        """
        Args:
            in_channels: Channels of the block input and output.
            out_channels: Channels used inside the branch.
            i: Not used by the block.
        """
        super().__init__()

        expand = nn.Conv1d(in_channels, out_channels, kernel_size=1,
                           stride=1, padding=0, bias=False)
        expand_norm = nn.BatchNorm1d(out_channels)

        depthwise = nn.Conv1d(out_channels, out_channels, kernel_size=3,
                              stride=1, padding=1, groups=out_channels, bias=False)
        depthwise_norm = nn.BatchNorm1d(out_channels)

        project = nn.Conv1d(out_channels, in_channels, kernel_size=1,
                            stride=1, padding=0, bias=False)
        project_norm = nn.BatchNorm1d(in_channels)

        self.embedding = nn.Sequential(
            expand, expand_norm, nn.GELU(),
            depthwise, depthwise_norm, nn.GELU(),
            project, project_norm, nn.GELU(),
            nn.Dropout(0.25),
        )

    def forward(self, x):
        """Return the branch output plus the skip connection."""
        return self.embedding(x) + x

In [None]:
class Downsample(nn.Module):
    """Kernel-2, stride-2 Conv1d that doubles the channel count."""

    def __init__(self, in_channels):
        """
        Args:
            in_channels: Channels of the input; the output has twice as many.
        """
        super().__init__()
        doubled_channels = in_channels * 2
        self.downsample = nn.Sequential(
            nn.Conv1d(in_channels, doubled_channels, kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Apply the strided convolution to ``x``."""
        return self.downsample(x)

In [None]:
class Network(nn.Module):
    """Convolutional front end, four bidirectional LSTMs and an MLP classifier.

    Pipeline, as wired in ``forward``:
      1. ``embedding_stem``: Conv1d 13 -> 64 channels (kernel 3, padding 1).
      2. ``layers``: for every ``[in_channels, out_channels, num_blocks]``
         entry of ``self.stages``, ``num_blocks`` NextBlock modules, with a
         Downsample between consecutive stages.
      3. Four bidirectional LSTMs (hidden size 512 per direction), each
         followed by LockedDropout(0.4).
      4. ``classification``: Linear 1024 -> 2048, GELU, Dropout(0.3),
         Linear 2048 -> 41, followed by LogSoftmax over the last dimension.
    """

    def __init__(self):

        super(Network, self).__init__()

        self.embedding_stem = nn.Conv1d(in_channels = 13, out_channels = 64, bias = False, kernel_size = 3, padding = 1, stride = 1)

        # One [in_channels, out_channels, num_blocks] entry per stage.
        self.stages = [
            [64, 256, 2],
            [128, 512, 2],
            [256, 1024, 2],
        ]

        self.layers = nn.Sequential(*self.make_layers())

        # make_layers inserts one Downsample (a stride-2 convolution) between
        # consecutive stages, so the time axis shrinks by 2 per stage boundary.
        self.time_reduction = 2 ** (len(self.stages) - 1)

        # Channels leaving the last stage feed the first LSTM.
        conv_out_channels = self.stages[-1][0]

        self.lstm_base = nn.LSTM(input_size = conv_out_channels, hidden_size = 512, batch_first = True, bidirectional = True, bias = True)
        self.lstm1 = nn.LSTM(input_size = 1024, hidden_size = 512, batch_first = True, bidirectional = True, bias = True)
        self.lstm2 = nn.LSTM(input_size = 1024, hidden_size = 512, batch_first = True, bidirectional = True, bias = True)
        self.lstm3 = nn.LSTM(input_size = 1024, hidden_size = 512, batch_first = True, bidirectional = True, bias = True)

        self.lockdrop = LockedDropout(0.4)

        self.classification = nn.Sequential(
            nn.Linear((512 * 2), 2048),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(2048, 41)
        )
        self.logSoftmax = nn.LogSoftmax(dim = 2)

    def make_layers(self):
        """Build the convolutional stages described by ``self.stages``.

        Returns:
            A list of modules: ``num_blocks`` NextBlock modules per stage,
            with a Downsample appended after every stage except the last.
        """
        layers = []
        last_stage = len(self.stages) - 1
        for idx, (in_channels, out_channels, num_blocks) in enumerate(self.stages):
            for j in range(num_blocks):
                layers.append(NextBlock(in_channels = in_channels, out_channels = out_channels, i = j))

            if idx != last_stage:
                layers.append(Downsample(in_channels = in_channels))

        return layers

    def forward(self, x, lx):
        """Compute per-frame log-probabilities for a padded batch.

        Args:
            x: Padded features of shape (batch, time, 13).
            lx: Tensor with the unpadded length of each sequence in ``x``.
                NOTE(review): presumably has to live on the CPU because it is
                passed to pack_padded_sequence -- confirm.

        Returns:
            ``(out, lx)``: log-probabilities of shape
            (batch, reduced_time, 41) and the lengths after down-sampling.
        """
        # Conv1d expects (batch, channels, time).
        out = torch.permute(x, (0,2,1))
        out = self.embedding_stem(out)
        out = self.layers(out)

        # Back to (batch, time, channels) for the batch-first LSTMs.
        out = torch.permute(out, (0,2,1))
        lx = lx // self.time_reduction

        packed_input = pack_padded_sequence(out, lx, batch_first = True, enforce_sorted = False)

        recurrent_layers = (self.lstm_base, self.lstm1, self.lstm2, self.lstm3)
        last_layer = len(recurrent_layers) - 1
        for depth, lstm in enumerate(recurrent_layers):
            packed_output, _ = lstm(packed_input)
            out, lengths = pad_packed_sequence(packed_output, batch_first = True)

            # LockedDropout shares its mask along dim 0, so move time to the
            # front: the same units are then dropped at every time step.
            out = torch.permute(out, (1,0,2))
            out = self.lockdrop(out)
            out = torch.permute(out, (1,0,2))

            # Only re-pack when another LSTM follows; the classifier below
            # consumes the padded tensor directly.
            if depth != last_layer:
                packed_input = pack_padded_sequence(out, lengths, batch_first = True, enforce_sorted = False)

        out = self.classification(out)
        out = self.logSoftmax(out)

        return out, lx

# Build the network and move its parameters to the selected device.
model = Network().to(device)
# Summarize the model using x and lx, the sample batch bound by the earlier
# validation-loader sanity-check cell.
summary(model, x.to(device), lx)

# Training Configuration (TODO)

In [None]:
# Initial learning rate handed to the optimizer.
lr = 1e-3

# CTC loss with its default settings.
criterion = nn.CTCLoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=2e-3)

# Multiplies the learning rate by 0.5 once the value passed to
# scheduler.step() has not improved (mode 'min') for more than 4 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=4,
    threshold=0.01,
)

# Narrow beam (width 2) used when decoding during validation.
decoder = CTCBeamDecoder(labels=PHONEME_MAP, beam_width=2, log_probs_input=True)

In [None]:
def calculate_levenshtein(h, y, lh, ly, decoder, PHONEME_MAP):
    """Return the mean Levenshtein distance between decoded and target strings.

    Args:
        h: Batch-first network outputs handed to ``decoder.decode``.
        y: Padded target indices, one row per utterance.
        lh: Output lengths, forwarded to the decoder as ``seq_lens``.
        ly: Unpadded target lengths.
        decoder: Object exposing ``decode(h, seq_lens=...)`` that returns
            ``(beam_results, beam_scores, timesteps, out_lens)``.
        PHONEME_MAP: Sequence mapping a class index to its character.

    Returns:
        The distance averaged over the batch; ``0.0`` for an empty batch.
    """
    num_utterances = h.shape[0]
    if num_utterances == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    beam_results, _, _, out_lens = decoder.decode(h, seq_lens = lh)

    dist = 0
    for i in range(num_utterances):
        # Beam 0 is taken as the hypothesis; out_lens holds its valid length.
        # tolist() converts to plain ints in one step instead of indexing
        # PHONEME_MAP with one tensor element at a time.
        hypothesis = beam_results[i][0][:out_lens[i][0]].tolist()
        h_string = "".join(PHONEME_MAP[idx] for idx in hypothesis)

        target = y[i, 0:ly[i]].tolist()
        y_string = "".join(PHONEME_MAP[idx] for idx in target)

        dist += Levenshtein.distance(h_string, y_string)

    return dist / num_utterances

In [None]:
# Gradient scaler used by train() for mixed-precision training.
scaler = torch.cuda.amp.GradScaler()

def train(model, train_loader, optimizer, criterion):
    """Run one training epoch with autocast and gradient scaling.

    Args:
        model: Network called as ``model(x, lx)`` and returning
            ``(outputs, output_lengths)`` with batch-first outputs.
        train_loader: Iterable of ``(x, y, lx, ly)`` batches.
        optimizer: Optimizer stepped through the module-level ``scaler``.
        criterion: Loss called as
            ``criterion(outputs, y, output_lengths, ly)`` with time-first
            outputs.

    Returns:
        The loss averaged over batches (0 when the loader is empty).
    """
    model.train()
    total_loss = 0
    num_batches = len(train_loader)

    # The context manager closes the progress bar even if a batch raises.
    with tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train') as bar:
        for i, (x, y, lx, ly) in enumerate(train_loader):

            optimizer.zero_grad()

            x = x.to(device)
            y = y.to(device)

            with torch.cuda.amp.autocast():
                outputs, opLength = model(x, lx)
                # The model is batch-first; the loss is given time-first outputs.
                outputs = torch.permute(outputs, (1,0,2))
                loss = criterion(outputs, y, opLength, ly)

            total_loss += loss.item()
            bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            bar.update()

    if num_batches:
        total_loss /= num_batches

    return total_loss
    
def validate(model, val_loader, optimizer, criterion):
    """Evaluate the model on the validation loader.

    Args:
        model: Network called as ``model(x, lx)`` and returning
            ``(outputs, output_lengths)`` with batch-first outputs.
        val_loader: Iterable of ``(x, y, lx, ly)`` batches.
        optimizer: Unused; kept so existing call sites keep working.
        criterion: Loss called with time-first outputs.

    Returns:
        ``(mean_loss, mean_distance)``: both averaged over batches, where the
        distance is the per-batch mean from ``calculate_levenshtein``.
    """
    model.eval()
    total_loss = 0
    distance = 0
    num_batches = len(val_loader)

    # No gradients are computed here, so the optimizer is left untouched.
    with tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Validation') as bar, \
            torch.no_grad():
        for x, y, lx, ly in val_loader:
            x = x.to(device)
            y = y.to(device)

            outputs, opLength = model(x, lx)
            # Time-first view for the loss; the decoder gets the batch-first
            # tensor directly instead of a tensor permuted back and forth.
            loss = criterion(torch.permute(outputs, (1,0,2)), y, opLength, ly)

            total_loss += loss.item()
            distance += calculate_levenshtein(outputs, y, opLength, ly, decoder, PHONEME_MAP)
            bar.update()

    if num_batches:
        total_loss /= num_batches
        distance /= num_batches
    return total_loss, distance

In [None]:
# Directory for periodic checkpoints; created up front so that the first save
# (after epoch 25) cannot fail on a missing folder.
checkpoint_dir = '/content/drive/MyDrive/hw3p2'
os.makedirs(checkpoint_dir, exist_ok=True)

for i in range(1,61):
    print('Epoch', i)

    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss, val_distance = validate(model, val_loader, optimizer, criterion)

    # The scheduler monitors validation Levenshtein distance (mode 'min').
    scheduler.step(val_distance)
    # Read the current learning rates from the optimizer instead of the
    # scheduler's private _last_lr attribute.
    print([group['lr'] for group in optimizer.param_groups])

    # Checkpoint every 5th epoch once past epoch 20.
    if i > 20 and i % 5 == 0:
        checkpoint = {
            'epoch': i,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            # Store the scheduler's state_dict rather than pickling the object
            # itself; restore with scheduler.load_state_dict(...).
            'lr_sched': scheduler.state_dict()
        }
        torch.save(checkpoint, checkpoint_dir + '/model' + str(i) + '.pth')

    print('Training Loss', train_loss)
    print('Validation Loss', val_loss)
    print('Validation distance', val_distance)

In [None]:
# Wider beam (width 20) used for decoding test-set predictions.
test_decoder = CTCBeamDecoder(labels = PHONEME_MAP, beam_width = 20, log_probs_input = True)
def getPhonemes(h, lh):
    """Decode a batch of network outputs into phoneme strings.

    Args:
        h: Batch-first network outputs handed to ``test_decoder.decode``.
        lh: Output lengths, forwarded to the decoder as ``seq_lens``.

    Returns:
        A list with one string per utterance, built by mapping beam 0 of the
        decoder output through ``PHONEME_MAP``.
    """
    beam_results, _, _, out_lens = test_decoder.decode(h, seq_lens = lh)

    h_string = []
    for i in range(h.shape[0]):
        # out_lens gives the valid length of beam 0; tolist() yields plain
        # ints for indexing PHONEME_MAP.
        best_beam = beam_results[i][0][:out_lens[i][0]].tolist()
        h_string.append("".join(PHONEME_MAP[idx] for idx in best_beam))

    return h_string

In [None]:
def test(model, test_loader):
    """Decode the test set and write predictions to a submission CSV.

    Writes ``verification_early_submission.csv`` in the current directory
    with an ``id,predictions`` header and one row per utterance. Ids are
    assigned sequentially from 0 in loader order.

    Args:
        model: Network called as ``model(x, lx)`` and returning
            ``(outputs, output_lengths)``.
        test_loader: Iterable of ``(x, lx)`` batches.
    """
    tid = 0
    model.eval()
    # torch.no_grad() avoids building autograd graphs during inference, and
    # the context managers close both the file and the progress bar.
    with open("verification_early_submission.csv", "w") as f, \
            tqdm(total=len(test_loader), dynamic_ncols=True, leave=False, position=0, desc='Test') as bar, \
            torch.no_grad():
        f.write("id,predictions\n")
        for x, lx in test_loader:
            # Use the notebook-wide device so this also runs without CUDA.
            x = x.to(device)
            output, lh = model(x, lx)
            for prediction in getPhonemes(output, lh):
                f.write("{},{}\n".format(tid, prediction))
                tid += 1
            bar.update()

In [None]:
# Generate the submission CSV from the current model.
test(model, test_loader)

# Submit to kaggle (TODO)

In [None]:
# Submit the generated CSV to the competition; -m sets the submission message.
! kaggle competitions submit -c 11-785-s22-hw3p2 -f /content/verification_early_submission.csv -m "Message"

In [None]:
# Release cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()