# Using LSTMs with CNN based embeddings for Utterances to Phoneme Mapping

### Author: Lakshay Sethi
#### Custom Architecture 1 (Designed by Lakshay Sethi through Ablations)
##### Ablations Link: https://wandb.ai/verydeeplearning/hw3p2-ablations

In [1]:
!nvidia-smi

Sat Apr  8 03:42:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# !pip install wandb

### Levenshtein

This may take a while

In [3]:
# !pip install wandb --quiet
# !pip install python-Levenshtein -q
# !git clone --recursive https://github.com/parlance/ctcdecode.git
# !pip install wget -q
# %cd ctcdecode
# !pip install . -q
# %cd ..

# !pip install torchsummaryX -q

## Imports

In [1]:
import torch
import torchaudio
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

# imports for decoding and distance calculation
import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Kaggle Setup

In [5]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"lakshaysethi","key":"32d86595e55bef36c1f649381dc3282f"}')

!chmod 600 /root/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaggle==1.5.8
  Using cached kaggle-1.5.8-py3-none-any.whl
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.8
    Uninstalling kaggle-1.5.8:
      Successfully uninstalled kaggle-1.5.8
Successfully installed kaggle-1.5.8
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [6]:
# !kaggle competitions download -c 11-785-s23-hw3p2
# '''
# This will take a couple minutes, but you should see at least the following:
# 11-785-f22-hw3p2.zip  ctcdecode  hw3p2
# '''
# !unzip -q 11-785-s23-hw3p2.zip
# !ls

# Google Drive

In [7]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# Dataset and Dataloader

In [2]:
# ARPABET PHONEME MAPPING
# DO NOT CHANGE
# This overwrites the phonetics.py file.

CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@", 
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W", 
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R", 
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w", 
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y", 
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D", 
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O", 
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())


PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

In [3]:
PHONEMES

['',
 '[SIL]',
 'NG',
 'F',
 'M',
 'AE',
 'R',
 'UW',
 'N',
 'IY',
 'AW',
 'V',
 'UH',
 'OW',
 'AA',
 'ER',
 'HH',
 'Z',
 'K',
 'CH',
 'W',
 'EY',
 'ZH',
 'T',
 'EH',
 'Y',
 'AH',
 'B',
 'P',
 'TH',
 'DH',
 'AO',
 'G',
 'L',
 'JH',
 'OY',
 'SH',
 'D',
 'AY',
 'S',
 'IH']

In [4]:
ARPAbet

[' ',
 '-',
 'G',
 'f',
 'm',
 '@',
 'r',
 'u',
 'n',
 'i',
 'W',
 'v',
 'U',
 'o',
 'a',
 'R',
 'h',
 'z',
 'k',
 'C',
 'w',
 'e',
 'Z',
 't',
 'E',
 'y',
 'A',
 'b',
 'p',
 'T',
 'D',
 'c',
 'g',
 'l',
 'j',
 'O',
 'S',
 'd',
 'Y',
 's',
 'I',
 '[SOS]',
 '[EOS]']

### Train Data

In [5]:
class AudioDataset(torch.utils.data.Dataset):

    def __init__(self, root, phonemes = PHONEMES, partition = ['train-clean-100']):
        self.mfcc_files, self.transcript_files = [], []

        for part in partition:  
            mfcc_dir = root + part + '/mfcc/'
            transcript_dir = root + part +'/transcript/'

            mfcc_temp_dir = os.listdir(mfcc_dir)
            mfcc_temp_dir.sort()
            mfcc_temp = [os.path.join(mfcc_dir, currFile) for currFile in mfcc_temp_dir]
            self.mfcc_files.extend(mfcc_temp)

            transcript_temp_dir = os.listdir(transcript_dir)
            transcript_temp_dir.sort()
            transcript_temp = [os.path.join(transcript_dir, currFile) for currFile in transcript_temp_dir]
            self.transcript_files.extend(transcript_temp)

        
        # Sanity Check
        assert len(self.mfcc_files) == len(self.transcript_files)

        # Lenght Assignment
        self.length = len(self.mfcc_files)

        self.PHONEMES = phonemes
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM?
        # HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS
        self.PHONEMES_DICT = {value: idx for idx, value in enumerate(self.PHONEMES)}


        self.mfccs, self.transcripts = [], []

        for i in range(self.length):
            mfcc = np.load(self.mfcc_files[i], allow_pickle=True)
            mfcc = (mfcc - np.mean(mfcc, axis = 0))/np.std(mfcc, axis = 0)
            transcript = np.load(self.transcript_files[i], allow_pickle=True)
            transcript = transcript[1:-1]         # Removing [SOS] and [EOS] from the transcript 

            label = np.array([self.PHONEMES_DICT[trans] for trans in transcript])

            self.mfccs.append(mfcc)
            self.transcripts.append(label)
    

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        mfcc_tensor = torch.FloatTensor(self.mfccs[ind])
        transcript_tensor = torch.tensor(self.transcripts[ind])

        return mfcc_tensor, transcript_tensor


    def collate_fn(self, batch):
        batch_mfcc       = [mfcc for mfcc, _ in batch]
        batch_transcript = [transcript for _, transcript in batch]

        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True, padding_value=0.0)
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]

        batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True, padding_value=0.0)
        lengths_transcript = [len(trans) for trans in batch_transcript]

        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)

In [6]:
# class customCollate:

#     def __init__(self, trainData = True):
#         self.trainData = trainData

#     def __call__(self, batch):
#         batch_mfcc       = [mfcc for mfcc, _ in batch]
#         batch_transcript = [transcript for _, transcript in batch]

#         # if self.trainData:
#         #     time_masking = torchaudio.transforms.TimeMasking(time_mask_param = 10, iid_masks = False, p = 1.0)
#         #     frequency_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param = 20, iid_masks = False)
            
#         #     batch_mfcc = 

#         batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True, padding_value=0.0)
#         lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]

#         batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True, padding_value=0.0)
#         lengths_transcript = [len(trans) for trans in batch_transcript]

#         return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)

### Test Data

In [7]:
# Test Dataloader
class AudioDatasetTest(torch.utils.data.Dataset):

    def __init__(self, root, partition = 'test-clean'): 

        self.mfcc_dir = root + partition + '/mfcc/'
        self.mfcc_files = os.listdir(self.mfcc_dir)
        self.mfcc_files.sort()

        self.length = len(self.mfcc_files)

        self.mfccs = []

        for i in range(self.length):
            mfcc = np.load(self.mfcc_dir + self.mfcc_files[i], allow_pickle=True)
            mfcc = (mfcc - np.mean(mfcc, axis = 0))/np.std(mfcc, axis = 0)
            self.mfccs.append(mfcc)
    
    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        mfcc = torch.FloatTensor(self.mfccs[ind])
        return mfcc

    def collate_fn(self,batch):
        batch_mfcc = batch
        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True, padding_value=0)
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]

        return batch_mfcc_pad, torch.tensor(lengths_mfcc)

### Data - Hyperparameters

In [8]:
BATCH_SIZE = 256 # Increase if your device can handle it

transforms = [] # set of tranformations
# You may pass this as a parameter to the dataset class above
# This will help modularize your implementation

root = '/content/hw3p2' 

### Data loaders

In [9]:
gc.collect()

0

In [10]:
# del train_data, val_data, test_data, train_loader, val_loader, test_loader

In [11]:
# Create objects for the dataset class
train_data = AudioDataset(root = '/content/11-785-s23-hw3p2/', phonemes = PHONEMES, partition = ['train-clean-100', 'train-clean-360'])
val_data =AudioDataset(root = '/content/11-785-s23-hw3p2/', phonemes = PHONEMES, partition = ['dev-clean'])
test_data = AudioDatasetTest(root = '/content/11-785-s23-hw3p2/', partition = 'test-clean')

# Do NOT forget to pass in the collate function as parameter while creating the dataloader
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 4,
    batch_size  = BATCH_SIZE, 
    pin_memory  = True,
    shuffle     = True,
    collate_fn  = train_data.collate_fn
)
val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 2,
    batch_size  = BATCH_SIZE,
    pin_memory  = True,
    shuffle     = False,
    collate_fn  = val_data.collate_fn
)
test_loader = torch.utils.data.DataLoader(
    dataset     = test_data, 
    num_workers = 2, 
    batch_size  = BATCH_SIZE, 
    pin_memory  = True, 
    shuffle     = False,
    collate_fn  = test_data.collate_fn
)

print("Batch size: ", BATCH_SIZE)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  256
Train dataset samples = 132552, batches = 518
Val dataset samples = 2703, batches = 11
Test dataset samples = 2620, batches = 11


In [12]:
gc.collect()

1210

In [13]:
# sanity check
for data in train_loader:
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break 

torch.Size([256, 1690, 27]) torch.Size([256, 222]) torch.Size([256]) torch.Size([256])


In [14]:
print(len(x))
print(len(lx))

256
256


# NETWORK

## Basic

This is a basic block for understanding, you can skip this and move to pBLSTM one

In [None]:
# torch.cuda.empty_cache()

# class Network(nn.Module):

#     def __init__(self):

#         super(Network, self).__init__()

#         # Adding some sort of embedding layer or feature extractor might help performance.
#         self.embedding = nn.Sequential(nn.Conv1d(27, 512, kernel_size = 9, stride = 1, padding = 4, bias = False),
#                                        nn.GELU(),
#                                        nn.BatchNorm1d(512),
                                       
#                                        nn.Conv1d(512, 512, kernel_size = 7, stride = 1, padding = 3, bias = False),
#                                        nn.GELU(),
#                                        nn.BatchNorm1d(512),

#                                        nn.Conv1d(512, 512, kernel_size = 5, stride = 1, padding = 2, bias = False),
#                                        nn.GELU(),
#                                        nn.BatchNorm1d(512),

#                                        nn.Conv1d(512, 512, kernel_size = 3, stride = 1, padding = 1, bias = False),
#                                        nn.BatchNorm1d(512)
#         )

#         self.out_features = len(PHONEMES)
        
#         # TODO : look up the documentation. You might need to pass some additional parameters.
#         self.lstm = nn.LSTM(input_size = 512, hidden_size = 512, num_layers = 3, bias = True, bidirectional = True, dropout = 0.1) 
       
#         self.classification = nn.Sequential(
#             nn.Dropout(p = 0.1, inplace=False),
#             torch.nn.Linear(512*2, 512),
#             torch.nn.Linear(512, self.out_features)
#         )

#         self.logSoftmax = torch.nn.LogSoftmax(dim = 2)

#     def forward(self, x, lx):
#         x = self.embedding(torch.transpose(x, 1, 2))
#         x = torch.transpose(x, 1, 2)

#         x = pack_padded_sequence(x, lx, batch_first=True, enforce_sorted=False)
#         x, _ = self.lstm(x)
#         del _

#         x, lens_unpacked  = pad_packed_sequence(x, batch_first=True)
#         x = self.classification(x)
#         x = F.log_softmax(x, dim=2)

#         return x, lens_unpacked

In [15]:
torch.cuda.empty_cache()

class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)

In [16]:
torch.cuda.empty_cache()

class Network(nn.Module):

    def __init__(self):

        super(Network, self).__init__()

        # Adding some sort of embedding layer or feature extractor might help performance.
        self.embedding = nn.Sequential(PermuteBlock(),
                                       nn.Conv1d(27, 128, kernel_size = 5, stride = 2, padding = 2, bias = False),
                                       nn.Dropout(p = 0.3, inplace = False),
                                       nn.GELU(),
                                       nn.BatchNorm1d(128),
                                       
                                       nn.Conv1d(128, 256, kernel_size = 3, stride = 2, padding = 1, bias = False),
                                       nn.Dropout(p = 0.3, inplace = False),
                                       nn.GELU(),
                                       nn.BatchNorm1d(256),
                                       PermuteBlock()
        )

        self.out_features = len(PHONEMES)
        
        # TODO : look up the documentation. You might need to pass some additional parameters.
        self.lstm = nn.LSTM(input_size = 256, hidden_size = 512, num_layers = 5, bias = True, bidirectional = True, dropout = 0.2) 
       
        self.classification = nn.Sequential(
            torch.nn.Linear(1024, 1024),
            nn.Dropout(p = 0.2, inplace = False),
            torch.nn.Linear(1024, self.out_features)
        )

        self.logSoftmax = torch.nn.LogSoftmax(dim = 2)

    def forward(self, x, lx):
        x = self.embedding(x)

        lx = (lx-2)//4
        x = pack_padded_sequence(x, lx, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        del _

        x, lens_unpacked  = pad_packed_sequence(x, batch_first=True)
        x = self.classification(x)
        x = F.log_softmax(x, dim=2)

        return x, lens_unpacked

# INIT
(If trying out the basic Network)

In [17]:
torch.cuda.empty_cache()

model = Network().to(device)
summary(model, x.to(device), lx) # x and lx come from the sanity check above :)

                              Kernel Shape      Output Shape      Params  \
Layer                                                                      
0_embedding.PermuteBlock_0               -   [256, 27, 1690]           -   
1_embedding.Conv1d_1          [27, 128, 5]   [256, 128, 845]      17.28k   
2_embedding.Dropout_2                    -   [256, 128, 845]           -   
3_embedding.GELU_3                       -   [256, 128, 845]           -   
4_embedding.BatchNorm1d_4            [128]   [256, 128, 845]       256.0   
5_embedding.Conv1d_5         [128, 256, 3]   [256, 256, 423]     98.304k   
6_embedding.Dropout_6                    -   [256, 256, 423]           -   
7_embedding.GELU_7                       -   [256, 256, 423]           -   
8_embedding.BatchNorm1d_8            [256]   [256, 256, 423]       512.0   
9_embedding.PermuteBlock_9               -   [256, 423, 256]           -   
10_lstm                                  -     [80671, 1024]  28.352512M   
11_classific

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embedding.PermuteBlock_0,-,"[256, 27, 1690]",,
1_embedding.Conv1d_1,"[27, 128, 5]","[256, 128, 845]",17280.0,14601600.0
2_embedding.Dropout_2,-,"[256, 128, 845]",,
3_embedding.GELU_3,-,"[256, 128, 845]",,
4_embedding.BatchNorm1d_4,[128],"[256, 128, 845]",256.0,128.0
5_embedding.Conv1d_5,"[128, 256, 3]","[256, 256, 423]",98304.0,41582592.0
6_embedding.Dropout_6,-,"[256, 256, 423]",,
7_embedding.GELU_7,-,"[256, 256, 423]",,
8_embedding.BatchNorm1d_8,[256],"[256, 256, 423]",512.0,256.0
9_embedding.PermuteBlock_9,-,"[256, 423, 256]",,


In [18]:
!nvidia-smi

Sat Apr  8 04:17:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    25W /  70W |  13487MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [19]:
torch.cuda.empty_cache()

In [20]:
!nvidia-smi

Sat Apr  8 04:17:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    32W /  70W |   1289MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Training Config

In [21]:
config = {
    "beam_width" : 5,
    "lr" : 2e-3,
    "epochs" : 50
    }

In [22]:
criterion = torch.nn.CTCLoss()
# Define CTC loss as the criterion. How would the losses be reduced?
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
# Refer to the handout for hints

optimizer =  torch.optim.AdamW(model.parameters(), lr = config['lr'], weight_decay = config['lr']/100)

# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
# CTC Beam Decoder Doc: https://github.com/parlance/ctcdecode
decoder = CTCBeamDecoder(LABELS, beam_width = config['beam_width'], log_probs_input=True)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = config['epochs'], eta_min = config['lr']/200, verbose=True)

# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

Adjusting learning rate of group 0 to 2.0000e-03.


## Decode Prediction

In [27]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP = LABELS):
    
    # TODO: look at docs for CTC.decoder and find out what is returned here. Check the shape of output and expected shape in decode.
    beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(output, seq_lens= output_lens) #lengths - list of lengths

    pred_strings    = []
    
    for i in range(beam_results.shape[0]):
        #TODO: Create the prediction from the output of decoder.decode. Don't forget to map it using PHONEMES_MAP.
        beam_temp = beam_results[i,0,:out_seq_len[i,0]]
        pred_str = "".join([PHONEME_MAP[i] for i in beam_temp])
        pred_strings.append(pred_str)
    
    return pred_strings

def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP = LABELS): # y - sequence of integers
    
    dist            = 0
    batch_size      = label.shape[0]

    pred_strings    = decode_prediction(output, output_lens, decoder, PHONEME_MAP)
    
    for i in range(batch_size):
        # TODO: Get predicted string and label string for each element in the batch
        pred_str = pred_strings[i]
        label_temp = label[i,0:label_lens[i]]
        label_str = [PHONEME_MAP[k] for k in label_temp]
        dist += Levenshtein.distance(pred_str, label_str)

    dist /= batch_size # TODO: Uncomment this, but think about why we are doing this
    # raise NotImplemented
    return dist

In [None]:
# test code to check shapes

model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)

    h = torch.permute(h, (1, 0, 2))
    print(calculate_levenshtein(h, y, lx, ly, decoder, LABELS))

    break

torch.Size([256, 733, 41])
torch.Size([733, 256, 41]) torch.Size([256, 265])
tensor(7.0678, device='cuda:0', grad_fn=<MeanBackward0>)
203.24609375


## wandb

You will need to fetch your api key from wandb.ai

In [3]:
import wandb
wandb.login(key="2a3537b39181a31bafb5eabede64d932c94e54e8")

[34m[1mwandb[0m: Currently logged in as: [33mlsethi[0m ([33mverydeeplearning[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
run = wandb.init(
    name = "RNN_HW3_v2", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw3p2-ablations", ### Project should be created in your wandb account 
    config = config ### Wandb Config for your run
)

# Train Functions

In [None]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):
    
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():     
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        del x, y, lx, ly, h, lh, loss 
        torch.cuda.empty_cache()

    batch_bar.close() # You need this to close the tqdm bar
    
    return total_loss / len(train_loader)


def validate_model(model, val_loader, decoder, phoneme_map= LABELS):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0

    for i, data in enumerate(val_loader):

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.inference_mode():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += float(loss)
        vdist += calculate_levenshtein(torch.permute(h, (1, 0, 2)), y, lh, ly, decoder, phoneme_map)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))

        batch_bar.update()
    
        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()
        
    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    val_dist = vdist/len(val_loader)
    return total_loss, val_dist

In [None]:
# test code to check shapes

model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)

    h = torch.permute(h, (1, 0, 2))
    print(calculate_levenshtein(h, y, lx, ly, decoder, LABELS))

    break

torch.Size([256, 733, 41])
torch.Size([733, 256, 41]) torch.Size([256, 265])
tensor(7.0678, device='cuda:0', grad_fn=<MeanBackward0>)
203.24609375


### Training Setup

In [None]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1], 
         'epoch'                    : epoch}, 
         path
    )

def load_model(path, model, metric= 'valid_acc', optimizer= None, scheduler= None):

    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        
    epoch   = checkpoint['epoch']
    metric  = checkpoint[metric]

    return [model, optimizer, scheduler, epoch, metric]

In [None]:
# This is for checkpointing, if you're doing it over multiple sessions

last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
epoch_model_path = 'Epoch_checkpoint_v2.pth'
best_model_path = 'Best_checkpoint_v2.pth'

In [None]:
torch.cuda.empty_cache()
gc.collect()

#TODO: Please complete the training loop

for epoch in range(0, config['epochs']):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))
    
    curr_lr = float(optimizer.param_groups[0]['lr'])
    print('Current Learning rate: ', curr_lr)

    train_loss              = train_model(model, train_loader, criterion, optimizer)
    torch.cuda.empty_cache()
    gc.collect()
    
    valid_loss, valid_dist  = validate_model(model, val_loader, decoder, LABELS)
    torch.cuda.empty_cache()
    gc.collect()
    
    scheduler.step()
    #scheduler.step(valid_dist)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Dist {:.04f}%\t Val Loss {:.04f}".format(valid_dist, valid_loss))


    wandb.log({
        'train_loss': train_loss,  
        'valid_dist': valid_dist, 
        'valid_loss': valid_loss, 
        'lr'        : curr_lr
    })
    save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
    wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to exlplore Wandb Artifcats to version your models
#run.finish()


Epoch: 1/50
Current Learning rate:  0.002




Adjusting learning rate of group 0 to 1.9980e-03.
	Train Loss 1.9066	 Learning Rate 0.0020000
	Val Dist 12.0849%	 Val Loss 0.5505
Saved epoch model
Saved best model

Epoch: 2/50
Current Learning rate:  0.0019980365947861304




Adjusting learning rate of group 0 to 1.9922e-03.
	Train Loss 0.4987	 Learning Rate 0.0019980
	Val Dist 7.3251%	 Val Loss 0.3411
Saved epoch model
Saved best model

Epoch: 3/50
Current Learning rate:  0.0019921541278079056




Adjusting learning rate of group 0 to 1.9824e-03.
	Train Loss 0.3531	 Learning Rate 0.0019922
	Val Dist 5.7559%	 Val Loss 0.2741
Saved epoch model
Saved best model

Epoch: 4/50
Current Learning rate:  0.0019823758144750453




Adjusting learning rate of group 0 to 1.9687e-03.
	Train Loss 0.2891	 Learning Rate 0.0019824
	Val Dist 4.9441%	 Val Loss 0.2361
Saved epoch model
Saved best model

Epoch: 5/50
Current Learning rate:  0.0019687402453229877




Adjusting learning rate of group 0 to 1.9513e-03.
	Train Loss 0.2490	 Learning Rate 0.0019687
	Val Dist 4.4423%	 Val Loss 0.2135
Saved epoch model
Saved best model

Epoch: 6/50
Current Learning rate:  0.0019513012337136777




Adjusting learning rate of group 0 to 1.9301e-03.
	Train Loss 0.2213	 Learning Rate 0.0019513
	Val Dist 4.1488%	 Val Loss 0.2007
Saved epoch model
Saved best model

Epoch: 7/50
Current Learning rate:  0.0019301276034588098




Adjusting learning rate of group 0 to 1.9053e-03.
	Train Loss 0.2002	 Learning Rate 0.0019301
	Val Dist 3.9309%	 Val Loss 0.1923
Saved epoch model
Saved best model

Epoch: 8/50
Current Learning rate:  0.0019053029172036893




Adjusting learning rate of group 0 to 1.8769e-03.
	Train Loss 0.1853	 Learning Rate 0.0019053
	Val Dist 3.7496%	 Val Loss 0.1851
Saved epoch model
Saved best model

Epoch: 9/50
Current Learning rate:  0.0018769251466436438




Adjusting learning rate of group 0 to 1.8451e-03.
	Train Loss 0.1705	 Learning Rate 0.0018769
	Val Dist 3.6216%	 Val Loss 0.1820
Saved epoch model
Saved best model

Epoch: 10/50
Current Learning rate:  0.0018451062858745046




Adjusting learning rate of group 0 to 1.8100e-03.
	Train Loss 0.1589	 Learning Rate 0.0018451
	Val Dist 3.4882%	 Val Loss 0.1763
Saved epoch model
Saved best model

Epoch: 11/50
Current Learning rate:  0.0018099719094030722




Adjusting learning rate of group 0 to 1.7717e-03.
	Train Loss 0.1487	 Learning Rate 0.0018100
	Val Dist 3.3298%	 Val Loss 0.1689
Saved epoch model
Saved best model

Epoch: 12/50
Current Learning rate:  0.0017716606765619098




Adjusting learning rate of group 0 to 1.7303e-03.
	Train Loss 0.1402	 Learning Rate 0.0017717
	Val Dist 3.2937%	 Val Loss 0.1687
Saved epoch model
Saved best model

Epoch: 13/50
Current Learning rate:  0.001730323784284304




Adjusting learning rate of group 0 to 1.6861e-03.
	Train Loss 0.1316	 Learning Rate 0.0017303
	Val Dist 3.2228%	 Val Loss 0.1666
Saved epoch model
Saved best model

Epoch: 14/50
Current Learning rate:  0.0016861243703990448




Adjusting learning rate of group 0 to 1.6392e-03.
	Train Loss 0.1239	 Learning Rate 0.0016861
	Val Dist 3.0918%	 Val Loss 0.1628
Saved epoch model
Saved best model

Epoch: 15/50
Current Learning rate:  0.0016392368697999457




Adjusting learning rate of group 0 to 1.5898e-03.
	Train Loss 0.1158	 Learning Rate 0.0016392
	Val Dist 3.1115%	 Val Loss 0.1627
Saved epoch model

Epoch: 16/50
Current Learning rate:  0.0015898463260310106




Adjusting learning rate of group 0 to 1.5381e-03.
	Train Loss 0.1099	 Learning Rate 0.0015898
	Val Dist 3.0440%	 Val Loss 0.1613
Saved epoch model
Saved best model

Epoch: 17/50
Current Learning rate:  0.0015381476610041012




Adjusting learning rate of group 0 to 1.4843e-03.
	Train Loss 0.1038	 Learning Rate 0.0015381
	Val Dist 2.9824%	 Val Loss 0.1582
Saved epoch model
Saved best model

Epoch: 18/50
Current Learning rate:  0.0014843449057312063




Adjusting learning rate of group 0 to 1.4287e-03.
	Train Loss 0.0982	 Learning Rate 0.0014843
	Val Dist 2.9254%	 Val Loss 0.1589
Saved epoch model
Saved best model

Epoch: 19/50
Current Learning rate:  0.001428650395107247




Adjusting learning rate of group 0 to 1.3713e-03.
	Train Loss 0.0922	 Learning Rate 0.0014287
	Val Dist 2.9179%	 Val Loss 0.1589
Saved epoch model
Saved best model

Epoch: 20/50
Current Learning rate:  0.0013712839299212542




Adjusting learning rate of group 0 to 1.3125e-03.
	Train Loss 0.0869	 Learning Rate 0.0013713
	Val Dist 2.9116%	 Val Loss 0.1586
Saved epoch model
Saved best model

Epoch: 21/50
Current Learning rate:  0.0013124719094030721




Adjusting learning rate of group 0 to 1.2524e-03.
	Train Loss 0.0818	 Learning Rate 0.0013125
	Val Dist 2.8239%	 Val Loss 0.1573
Saved epoch model
Saved best model

Epoch: 22/50
Current Learning rate:  0.0012524464377290298




Adjusting learning rate of group 0 to 1.1914e-03.
	Train Loss 0.0769	 Learning Rate 0.0012524
	Val Dist 2.8066%	 Val Loss 0.1609
Saved epoch model
Saved best model

Epoch: 23/50
Current Learning rate:  0.0011914444080127957




Adjusting learning rate of group 0 to 1.1297e-03.
	Train Loss 0.0730	 Learning Rate 0.0011914
	Val Dist 2.7837%	 Val Loss 0.1586
Saved epoch model
Saved best model

Epoch: 24/50
Current Learning rate:  0.0011297065673964825




Adjusting learning rate of group 0 to 1.0675e-03.
	Train Loss 0.0686	 Learning Rate 0.0011297
	Val Dist 2.7432%	 Val Loss 0.1588
Saved epoch model
Saved best model

Epoch: 25/50
Current Learning rate:  0.0010674765669316665




Adjusting learning rate of group 0 to 1.0050e-03.
	Train Loss 0.0642	 Learning Rate 0.0010675
	Val Dist 2.6983%	 Val Loss 0.1601
Saved epoch model
Saved best model

Epoch: 26/50
Current Learning rate:  0.0010049999999999996




Adjusting learning rate of group 0 to 9.4252e-04.
	Train Loss 0.0602	 Learning Rate 0.0010050
	Val Dist 2.7284%	 Val Loss 0.1635
Saved epoch model

Epoch: 27/50
Current Learning rate:  0.0009425234330683328




Adjusting learning rate of group 0 to 8.8029e-04.
	Train Loss 0.0567	 Learning Rate 0.0009425
	Val Dist 2.6527%	 Val Loss 0.1634
Saved epoch model
Saved best model

Epoch: 28/50
Current Learning rate:  0.000880293432603517




Adjusting learning rate of group 0 to 8.1856e-04.
	Train Loss 0.0527	 Learning Rate 0.0008803
	Val Dist 2.6569%	 Val Loss 0.1650
Saved epoch model

Epoch: 29/50
Current Learning rate:  0.0008185555919872037




Adjusting learning rate of group 0 to 7.5755e-04.
	Train Loss 0.0495	 Learning Rate 0.0008186
	Val Dist 2.6686%	 Val Loss 0.1674
Saved epoch model

Epoch: 30/50
Current Learning rate:  0.0007575535622709692




Adjusting learning rate of group 0 to 6.9753e-04.
	Train Loss 0.0464	 Learning Rate 0.0007576
	Val Dist 2.6469%	 Val Loss 0.1683
Saved epoch model
Saved best model

Epoch: 31/50
Current Learning rate:  0.0006975280905969274




Adjusting learning rate of group 0 to 6.3872e-04.
	Train Loss 0.0437	 Learning Rate 0.0006975
	Val Dist 2.6136%	 Val Loss 0.1682
Saved epoch model
Saved best model

Epoch: 32/50
Current Learning rate:  0.0006387160700787455




Adjusting learning rate of group 0 to 5.8135e-04.
	Train Loss 0.0405	 Learning Rate 0.0006387
	Val Dist 2.6026%	 Val Loss 0.1707
Saved epoch model
Saved best model

Epoch: 33/50
Current Learning rate:  0.0005813496048927525




Adjusting learning rate of group 0 to 5.2566e-04.
	Train Loss 0.0381	 Learning Rate 0.0005813
	Val Dist 2.5877%	 Val Loss 0.1736
Saved epoch model
Saved best model

Epoch: 34/50
Current Learning rate:  0.000525655094268793




Adjusting learning rate of group 0 to 4.7185e-04.
	Train Loss 0.0354	 Learning Rate 0.0005257
	Val Dist 2.5828%	 Val Loss 0.1747
Saved epoch model
Saved best model

Epoch: 35/50
Current Learning rate:  0.0004718523389958979




Adjusting learning rate of group 0 to 4.2015e-04.
	Train Loss 0.0330	 Learning Rate 0.0004719
	Val Dist 2.5667%	 Val Loss 0.1760
Saved epoch model
Saved best model

Epoch: 36/50
Current Learning rate:  0.0004201536739689891




Adjusting learning rate of group 0 to 3.7076e-04.
	Train Loss 0.0311	 Learning Rate 0.0004202
	Val Dist 2.5360%	 Val Loss 0.1799
Saved epoch model
Saved best model

Epoch: 37/50
Current Learning rate:  0.0003707631302000535




Adjusting learning rate of group 0 to 3.2388e-04.
	Train Loss 0.0293	 Learning Rate 0.0003708
	Val Dist 2.5505%	 Val Loss 0.1805
Saved epoch model

Epoch: 38/50
Current Learning rate:  0.00032387562960095454




Adjusting learning rate of group 0 to 2.7968e-04.
	Train Loss 0.0276	 Learning Rate 0.0003239
	Val Dist 2.5381%	 Val Loss 0.1831
Saved epoch model

Epoch: 39/50
Current Learning rate:  0.0002796762157156956




Adjusting learning rate of group 0 to 2.3834e-04.
	Train Loss 0.0258	 Learning Rate 0.0002797
	Val Dist 2.5268%	 Val Loss 0.1868
Saved epoch model
Saved best model

Epoch: 40/50
Current Learning rate:  0.00023833932343808966




Adjusting learning rate of group 0 to 2.0003e-04.
	Train Loss 0.0244	 Learning Rate 0.0002383
	Val Dist 2.4986%	 Val Loss 0.1882
Saved epoch model
Saved best model

Epoch: 41/50
Current Learning rate:  0.00020002809059692728




Adjusting learning rate of group 0 to 1.6489e-04.
	Train Loss 0.0231	 Learning Rate 0.0002000
	Val Dist 2.4964%	 Val Loss 0.1899
Saved epoch model
Saved best model

Epoch: 42/50
Current Learning rate:  0.00016489371412549514




Adjusting learning rate of group 0 to 1.3307e-04.
	Train Loss 0.0220	 Learning Rate 0.0001649
	Val Dist 2.5093%	 Val Loss 0.1913
Saved epoch model

Epoch: 43/50
Current Learning rate:  0.00013307485335635567




Adjusting learning rate of group 0 to 1.0470e-04.
	Train Loss 0.0211	 Learning Rate 0.0001331
	Val Dist 2.4886%	 Val Loss 0.1928
Saved epoch model
Saved best model

Epoch: 44/50
Current Learning rate:  0.00010469708279631069




Adjusting learning rate of group 0 to 7.9872e-05.
	Train Loss 0.0203	 Learning Rate 0.0001047
	Val Dist 2.4909%	 Val Loss 0.1951
Saved epoch model

Epoch: 45/50
Current Learning rate:  7.987239654118987e-05




Adjusting learning rate of group 0 to 5.8699e-05.
	Train Loss 0.0197	 Learning Rate 0.0000799
	Val Dist 2.4847%	 Val Loss 0.1952
Saved epoch model
Saved best model

Epoch: 46/50
Current Learning rate:  5.86987662863222e-05




Adjusting learning rate of group 0 to 4.1260e-05.
	Train Loss 0.0191	 Learning Rate 0.0000587
	Val Dist 2.4809%	 Val Loss 0.1966
Saved epoch model
Saved best model

Epoch: 47/50
Current Learning rate:  4.1259754677012166e-05




Adjusting learning rate of group 0 to 2.7624e-05.
	Train Loss 0.0186	 Learning Rate 0.0000413
	Val Dist 2.4691%	 Val Loss 0.1966
Saved epoch model
Saved best model

Epoch: 48/50
Current Learning rate:  2.762418552495471e-05




Adjusting learning rate of group 0 to 1.7846e-05.
	Train Loss 0.0184	 Learning Rate 0.0000276
	Val Dist 2.4705%	 Val Loss 0.1971
Saved epoch model

Epoch: 49/50
Current Learning rate:  1.784587219209462e-05




Adjusting learning rate of group 0 to 1.1963e-05.
	Train Loss 0.0181	 Learning Rate 0.0000178
	Val Dist 2.4592%	 Val Loss 0.1977
Saved epoch model
Saved best model

Epoch: 50/50
Current Learning rate:  1.1963405213869798e-05




Adjusting learning rate of group 0 to 1.0000e-05.
	Train Loss 0.0180	 Learning Rate 0.0000120
	Val Dist 2.4737%	 Val Loss 0.1980
Saved epoch model


# Generate Predictions and Submit to Kaggle

In [28]:
checkpoint = torch.load("/content/Best_checkpoint_v2.pth")

In [29]:
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [30]:
#TODO: Make predictions

# Follow the steps below:
# 1. Create a new object for CTCBeamDecoder with larger (why?) number of beams
# 2. Get prediction string by decoding the results of the beam decoder

TEST_BEAM_WIDTH = int(config['beam_width']*2)

test_decoder    = CTCBeamDecoder(LABELS, beam_width = TEST_BEAM_WIDTH, log_probs_input=True)
results = []

model.eval()
print("Testing")
for data in tqdm(test_loader):

    x, lx   = data
    x       = x.to(device)

    with torch.no_grad():
        h, lh = model(x, lx)

    prediction_string = decode_prediction(h, lh, test_decoder, LABELS)
    results.extend(prediction_string)
    
    del x, lx, h, lh
    torch.cuda.empty_cache()

Testing


100%|██████████| 11/11 [00:24<00:00,  2.25s/it]


In [31]:
data_dir = "/content/11-785-s23-hw3p2" + "/test-clean/random_submission.csv"
df = pd.read_csv(data_dir)
df.label = results
df.to_csv('submission.csv', index = False)

!kaggle competitions submit -c 11-785-s23-hw3p2 -f submission.csv -m "I made it!"

100% 210k/210k [00:00<00:00, 480kB/s]
Successfully submitted to Automatic Speech Recognition (ASR)

In [59]:
run.finish()