# HW1: Frame-Level Speech Recognition

In this homework, you will be working with MFCC data consisting of 28 features at each time step/frame. Your model should be able to recognize the phoneme that occurred in that frame.

# Libraries

In [2]:
!pip install torchsummaryX==1.1.0 wandb --quiet

In [3]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:
''' If you are using colab, you can import google drive to save model checkpoints in a folder
    If you want to use it, uncomment the two lines below
'''
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
### PHONEME LIST
# Order matters: the dataset class turns each label into its position in this list.
PHONEMES = [
    '[SIL]',
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY',
    'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY',
    'F', 'G', 'HH',
    'IH', 'IY',
    'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OY',
    'P', 'R', 'S', 'SH', 'T', 'TH',
    'UH', 'UW',
    'V', 'W', 'Y', 'Z', 'ZH',
    '[SOS]', '[EOS]',
]

# Dataset

This section covers the dataset/dataloader class for speech data. You will have to spend time writing code to create this class successfully. We have given you a lot of comments guiding you on what code to write at each stage, from top to bottom of the class. Please try and take your time figuring this out, as it will immensely help in creating dataset/dataloader classes for future homeworks.

Before running the following cells, please take some time to analyse the structure of the data. Try loading a single MFCC and its transcript, then print out the shapes and the values. Do the transcripts look like phonemes?

In [5]:
# Dataset class to load train and validation data

class AudioDataset(torch.utils.data.Dataset):
    """Frame-level dataset of (context window of MFCC frames, phoneme index) pairs.

    Reads every file in `<root>/<partition>/mfcc` and `<root>/<partition>/transcript`
    (both listed in sorted order and paired by position), normalizes each utterance,
    and concatenates all utterances into one long array of frames.

    Args:
        root: Directory that contains the partition folders.
        phonemes: Ordered phoneme list; a label's class id is its index in this list.
        context: Number of neighbouring frames included on each side of the target frame.
        partition: Name of the partition folder under `root`.
    """

    def __init__(self, root, phonemes = PHONEMES,context=0, partition= "train-clean-100"): # Feel free to add more arguments

        self.context    = context
        self.phonemes   = phonemes

        # MFCC and transcript directories of the requested partition
        self.mfcc_dir       = os.path.join(root, partition, 'mfcc')
        self.transcript_dir = os.path.join(root, partition, 'transcript')

        # Sorted listings, so that the i-th mfcc file is paired with the i-th transcript file
        mfcc_names          = sorted(os.listdir(self.mfcc_dir))
        transcript_names    = sorted(os.listdir(self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        self.mfccs, self.transcripts = [], []

        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            # Load a single mfcc
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))

            # Cepstral normalization: zero mean / unit variance per feature, per utterance.
            # A feature that is constant over the utterance has std == 0; dividing by it
            # would fill the column with NaN/inf, so such columns are divided by 1 instead
            # (they become all zeros after mean subtraction).
            std = np.std(mfcc, axis=0)
            std[std == 0] = 1.0
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / std

            # Load the corresponding transcript and drop its first and last entries
            # ([SOS] and [EOS]) with a single slice.
            transcript = np.load(os.path.join(self.transcript_dir, transcript_name))
            transcript = transcript[1:-1]

            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)

        # NOTE:
        # Each mfcc is of shape T1 x 28, T2 x 28, ...
        # Each transcript is of shape (T1+2), (T2+2) before removing [SOS] and [EOS]

        # Concatenate all utterances: frames become T x 28 and labels (T,),
        # where T = T1 + T2 + ...
        self.mfccs        = np.concatenate(self.mfccs, axis=0)
        self.transcripts  = np.concatenate(self.transcripts,axis=0)

        # Length of the dataset is the number of frames (taken before padding)
        self.length = len(self.mfccs)

        # Pad `context` rows of zeros above and below so that every frame, including the
        # first and last ones, has a full window of 2*context + 1 rows.
        pad_width = ((self.context, self.context), (0, 0))
        self.mfccs = np.pad(self.mfccs, pad_width=pad_width, mode='constant')

        # Map phoneme strings to their index in self.phonemes. A dictionary replaces the
        # two linear list scans per label (`in` + `.index`) with one hash lookup.
        # setdefault keeps the FIRST index of a repeated entry, exactly like list.index.
        # Labels that are not in the list still map to 0 ('[SIL]' in the default list).
        phoneme_to_index = {}
        for index, phoneme in enumerate(self.phonemes):
            phoneme_to_index.setdefault(phoneme, index)
        self.transcripts = np.array([phoneme_to_index.get(p, 0) for p in self.transcripts])

    def __len__(self):
        """Number of (unpadded) frames in the dataset."""
        return self.length

    def __getitem__(self, ind):
        """Return (flattened window of 2*context+1 frames around frame `ind`, its phoneme index)."""

        # Because of the `context` rows of padding on top, rows ind .. ind + 2*context of the
        # padded array are exactly the window centred on original frame `ind`.
        frames = self.mfccs[ind:ind + 2 * self.context + 1]
        # The MLP needs 1d input, so the (2*context+1) x features window is flattened.
        frames = frames.flatten()

        frames      = torch.FloatTensor(frames) # Convert to tensors
        phonemes    = torch.tensor(self.transcripts[ind])

        return frames, phonemes

In [6]:
class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level dataset of MFCC context windows for a partition without transcripts.

    Reads every file in `<root>/<partition>/mfcc` in sorted order, normalizes each
    utterance, and concatenates all utterances into one long array of frames.

    Args:
        root: Directory that contains the partition folders.
        phonemes: Ordered phoneme list (stored on the instance, not used for loading).
        context: Number of neighbouring frames included on each side of the target frame.
        partition: Name of the partition folder under `root`.
    """

    def __init__(self, root, phonemes = PHONEMES,context=0, partition= "test-clean"): # Feel free to add more arguments

        self.context    = context
        self.phonemes   = phonemes

        # MFCC directory of the requested partition, listed in sorted order so that the
        # frame order is deterministic
        self.mfcc_dir   = os.path.join(root, partition, 'mfcc')
        mfcc_names      = sorted(os.listdir(self.mfcc_dir))

        self.mfccs = []

        for mfcc_name in mfcc_names:
            # Load a single mfcc
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))

            # Cepstral normalization: zero mean / unit variance per feature, per utterance.
            # A feature that is constant over the utterance has std == 0; dividing by it
            # would fill the column with NaN/inf, so such columns are divided by 1 instead
            # (they become all zeros after mean subtraction).
            std = np.std(mfcc, axis=0)
            std[std == 0] = 1.0
            mfcc = (mfcc - np.mean(mfcc, axis=0)) / std

            self.mfccs.append(mfcc)

        # NOTE:
        # Each mfcc is of shape T1 x 28, T2 x 28, ...

        # Concatenate all utterances so that the final shape is T x 28 (T = T1 + T2 + ...)
        self.mfccs = np.concatenate(self.mfccs, axis=0)

        # Length of the dataset is the number of frames (taken before padding)
        self.length = len(self.mfccs)

        # Pad `context` rows of zeros above and below so that every frame, including the
        # first and last ones, has a full window of 2*context + 1 rows.
        pad_width = ((self.context, self.context), (0, 0))
        self.mfccs = np.pad(self.mfccs, pad_width=pad_width, mode='constant')


    def __len__(self):
        """Number of (unpadded) frames in the dataset."""
        return self.length

    def __getitem__(self, ind):
        """Return the flattened window of 2*context+1 frames around frame `ind`."""

        # Because of the `context` rows of padding on top, rows ind .. ind + 2*context of the
        # padded array are exactly the window centred on original frame `ind`.
        frames = self.mfccs[ind:ind + 2 * self.context + 1]
        # The MLP needs 1d input, so the (2*context+1) x features window is flattened.
        frames = frames.flatten()

        frames = torch.FloatTensor(frames) # Convert to tensors

        return frames


# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. It can also be used with weights and biases to log your parameters for each experiment and keep track of them across multiple experiments.

In [7]:
# Single source of truth for the run's hyperparameters (also handed to wandb as the run config).
config = dict(
    epochs     = 80,      # number of passes of the experiment loop
    batch_size = 15240,   # frames per batch for all three dataloaders
    context    = 33,      # frames of context on each side of the target frame
    init_lr    = 0.05,    # starting learning rate given to the optimizer
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
)

# Create Datasets

In [9]:
# All three partitions live under the same Kaggle input directory.
DATA_ROOT = "/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2"

# Training data (default partition of AudioDataset)
train_data = AudioDataset(DATA_ROOT, context=config["context"])

# Validation data
val_data = AudioDataset(DATA_ROOT, context=config["context"], partition="dev-clean")

# Test data (no transcripts; default partition of AudioTestDataset)
test_data = AudioTestDataset(DATA_ROOT, context=config["context"])

In [10]:
# Dataloaders for the train, val and test datasets.
# Each yields batches of config['batch_size'] samples.
# Only the training loader shuffles; val and test keep the dataset order. Why?

# Settings shared by all three loaders
shared_loader_args = dict(batch_size=config['batch_size'], pin_memory=True)

train_loader = torch.utils.data.DataLoader(train_data, num_workers=6, shuffle=True, **shared_loader_args)
val_loader   = torch.utils.data.DataLoader(val_data, num_workers=2, shuffle=False, **shared_loader_args)
test_loader  = torch.utils.data.DataLoader(test_data, num_workers=2, shuffle=False, **shared_loader_args)

print("Batch size     : ", config['batch_size'])
print("Context        : ", config['context'])
print("Input size     : ", (2*config['context']+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size     :  15240
Context        :  33
Input size     :  1876
Output symbols :  42
Train dataset samples = 36091157, batches = 2369
Validation dataset samples = 1928204, batches = 127
Test dataset samples = 1934138, batches = 127




In [11]:
# Sanity check: pull one batch from the training loader and show its shapes.
# (`frames` stays defined afterwards and is reused by the model-summary cell.)
for frames, phoneme in train_loader:
    print(frames.shape, phoneme.shape)
    break

torch.Size([15240, 1876]) torch.Size([15240])


# Network Architecture


This section defines your network architecture for the homework. We have given you a sample architecture that can easily clear the very low cutoff for the early submission deadline.

In [12]:
# This architecture will make you cross the very low cutoff
# However, you need to run a lot of experiments to cross the medium or high cutoff
class Network(torch.nn.Module):
    """MLP classifier: eight Linear -> BatchNorm1d -> GELU -> Dropout stages plus a final Linear.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output logits per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # (hidden width, dropout probability) of each stage, in forward order.
        hidden_stages = (
            (1024, 0.22),
            (2048, 0.24),
            (3000, 0.28),
            (2048, 0.28),
            (1024, 0.24),
            (924,  0.2),
            (512,  0.15),
            (256,  0.1),
        )

        layers = []
        in_features = input_size
        for width, drop_p in hidden_stages:
            layers.append(torch.nn.Linear(in_features, width))
            layers.append(torch.nn.BatchNorm1d(width))
            layers.append(torch.nn.GELU())
            layers.append(torch.nn.Dropout(p=drop_p))
            in_features = width

        # Output projection to the class logits (no activation; the loss works on logits).
        layers.append(torch.nn.Linear(in_features, output_size))

        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for input batch `x`."""
        return self.model(x)

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler.

In [13]:
# Each sample is a flattened window of (2*context + 1) frames with 28 features per frame.
window_frames = 2 * config['context'] + 1
INPUT_SIZE    = window_frames * 28 # Why is this the case?
model         = Network(INPUT_SIZE, len(train_data.phonemes))
model         = model.to(device)
# `frames` is the batch left defined by the dataloader check cell earlier in the notebook.
summary(model, frames.to(device))
# Check number of parameters of your network
# Remember, you are limited to 20 million parameters for HW1 (including ensembles)

----------------------------------------------------------------------------------------------------
Layer                   Kernel Shape         Output Shape         # Params (K)      # Mult-Adds (M)
0_Linear                [1876, 1024]        [15240, 1024]             1,922.05                 1.92
1_BatchNorm1d                 [1024]        [15240, 1024]                 2.05                 0.00
2_GELU                             -        [15240, 1024]                    -                    -
3_Dropout                          -        [15240, 1024]                    -                    -
4_Linear                [1024, 2048]        [15240, 2048]             2,099.20                 2.10
5_BatchNorm1d                 [2048]        [15240, 2048]                 4.10                 0.00
6_GELU                             -        [15240, 2048]                    -                    -
7_Dropout                          -        [15240, 2048]                    -                    -

In [15]:
# Loss function: cross-entropy, because the task is multi-class classification.
criterion = torch.nn.CrossEntropyLoss()

# Optimizer over all model parameters, starting from the configured learning rate.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config['init_lr'],
    weight_decay=0.05,
)

# Recommended : Define Scheduler for Learning Rate,
# including but not limited to StepLR, MultiStep, CosineAnnealing, CosineAnnealingWithWarmRestarts, ReduceLROnPlateau, etc.
# You can refer to Pytorch documentation for more information on how to use them.
plateau_settings = dict(mode='min', factor=0.3, patience=3, min_lr=1e-8)
scheduler = ReduceLROnPlateau(optimizer, **plateau_settings)

# Is your training time very high?
# Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it
# Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

# Training and Validation Functions

This section covers the training, and validation functions for each epoch of running your experiment with a given model architecture. The code has been provided to you, but we recommend going through the comments to understand the workflow to enable you to write these loops for future HWs.

In [16]:
# Housekeeping before training: ask PyTorch to drop its cached CUDA memory, then run a
# Python garbage-collection pass. gc.collect() is the cell's last expression, so its
# return value is what the notebook displays as this cell's output.
torch.cuda.empty_cache()
gc.collect()

17

In [17]:
import torch
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

def train(model, dataloader, optimizer, criterion):
    """Run one training pass over `dataloader` using autocast and gradient scaling.

    Batches are moved to the notebook-level `device`. A new GradScaler is created on
    every call.

    Args:
        model: Network to train (switched to train mode here).
        dataloader: Iterable of (frames, phonemes) batches.
        optimizer: Optimizer stepped once per batch through the scaler.
        criterion: Loss function called as criterion(logits, phonemes).

    Returns:
        (loss, accuracy): the mean of the per-batch losses and the mean of the per-batch
        accuracies (accuracy as a fraction, not a percentage).
    """
    model.train()
    running_loss, running_acc = 0, 0  # sums of per-batch loss / accuracy
    num_batches = len(dataloader)
    progress = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    # Scales the loss before backward and unscales inside step()
    scaler = GradScaler()

    for step, (frames, phonemes) in enumerate(dataloader, start=1):
        # Clear gradients left over from the previous batch
        optimizer.zero_grad()

        # Move the batch to the target device
        frames = frames.to(device)
        phonemes = phonemes.to(device)

        # Forward pass and loss inside the mixed-precision context
        with autocast():
            logits = model(frames)
            loss = criterion(logits, phonemes)

        # Backward pass on the scaled loss, optimizer step, then scale-factor update
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        num_correct = (torch.argmax(logits, dim=1) == phonemes).sum().item()
        running_acc += num_correct / logits.shape[0]

        # Show the running means over the batches seen so far
        progress.set_postfix(loss="{:.04f}".format(float(running_loss / step)),
                             acc="{:.04f}%".format(float(running_acc * 100 / step)))
        progress.update()

        # Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    progress.close()

    return running_loss / num_batches, running_acc / num_batches

In [18]:
def eval(model, dataloader):
    """Evaluate `model` on every batch of `dataloader` without computing gradients.

    Uses the notebook-level `criterion` for the loss and `device` for placement.
    The progress-bar total and the final averages are based on `dataloader` itself,
    so the function gives correct results for any loader passed in.

    NOTE: the name shadows Python's built-in `eval` inside this notebook; it is kept
    because the experiment cells call `eval(model, val_loader)`.

    Args:
        model: Network to evaluate (switched to evaluation mode here).
        dataloader: Iterable of (frames, phonemes) batches.

    Returns:
        (loss, accuracy): the mean of the per-batch losses and the mean of the per-batch
        accuracies (accuracy as a fraction, not a percentage).
    """

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy
    batch_bar   = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames      = frames.to(device)
        phonemes    = phonemes.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits  = model(frames)
            ### Loss Calculation
            loss    = criterion(logits, phonemes)

        vloss   += loss.item()
        vacc    += torch.sum(torch.argmax(logits, dim= 1) == phonemes).item()/logits.shape[0]

        # No loss.backward() / optimizer.step() here: evaluation must not change the weights.

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        ### Release memory
        del frames, phonemes, logits
        torch.cuda.empty_cache()

    batch_bar.close()
    # Average over the batches of the loader that was actually evaluated
    vloss   /= len(dataloader)
    vacc    /= len(dataloader)

    return vloss, vacc

# Weights and Biases Setup

This section enables logging metrics and files with Weights and Biases. Please refer to the wandb documentation and Recitation 0, which cover the use of Weights and Biases for logging, hyperparameter tuning, and monitoring your runs for your homeworks. Using this tool makes it very easy to show results when submitting your code and models for homeworks, and it is also extremely useful for study groups to organize and run ablations under a single team in wandb.

We have written code for you to make use of it out of the box, so that you start using wandb for all your HWs from the beginning.

In [19]:
# Do not hardcode the API key: anything written here is stored in the notebook file and in
# every shared or committed copy of it. The key is read from the WANDB_API_KEY environment
# variable instead (the key itself is listed at wandb.ai/settings). When the variable is
# not set, key=None is passed and wandb uses its own credential lookup.
# NOTE(review): the key that used to be hardcoded in this cell should be revoked and replaced.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [20]:
# Create your wandb run
run_settings = dict(
    name    = "10",   ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit  = True,   ### Allows reinitalizing runs when you re-run this cell
    #id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    #resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw1p2", ### Project should be created in your wandb account
    config  = config,  ### Wandb Config for your run
)
run = wandb.init(**run_settings)

[34m[1mwandb[0m: Currently logged in as: [33mshunli2021[0m ([33mshunli2021-carnegie-mellon-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
### Save your model architecture as a string with str(model)
ARCH_FILE   = "model_arch7.txt"
model_arch  = str(model)

### Save it in a txt file
# The context manager closes the file even if the write raises.
with open(ARCH_FILE, "w") as arch_file:
    file_write = arch_file.write(model_arch)

### log it in your wandb run with wandb.save()
wandb.save(ARCH_FILE)

['/kaggle/working/wandb/run-20240920_035646-qk5duqva/files/model_arch7.txt']

# Experiment

Now, it is time to finally run your ablations! Have fun!

In [None]:
# Iterate over number of epochs to train and evaluate your model
# NOTE(review): this cell never calls scheduler.step(), so the learning rate set on the
# optimizer is not changed by it, and it saves no checkpoint. The next cell repeats the
# same loop with scheduler stepping and best-model checkpointing; running both cells
# trains the same `model` for config['epochs'] epochs twice.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    # Learning rate in effect at the start of this epoch (read before training)
    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)

    # Accuracies come back as fractions; they are shown and logged as percentages
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    ### Log metrics at each epoch in your run
    # Optionally, you can log at each batch inside train/eval functions
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best


In [None]:

# Iterate over number of epochs to train and evaluate your model.
# Per epoch: train, validate, step the LR scheduler on the validation loss, checkpoint the
# model when validation accuracy improves, and log the metrics to wandb.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")

# Validation accuracy (as a fraction) that must be beaten before a checkpoint is written.
# Starting at 0.8 means nothing is saved until validation accuracy exceeds 80%.
best_acc = 0.8
# Loop-invariant, so it is set once instead of on every epoch.
model_save_path = './best_model.pth'

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    # Learning rate in effect during this epoch (read before the scheduler may lower it)
    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)

    # Accuracies come back as fractions; they are shown and logged as percentages
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    # Update the scheduler with the validation loss
    if scheduler:
        scheduler.step(val_loss)

    # Checkpoint whenever validation accuracy beats the best value seen so far
    if best_acc < val_acc:
        best_acc = val_acc
        torch.save(
            {'epoch': epoch,
             'model state dict': model.state_dict(),
             'optimizer state dict': optimizer.state_dict(),
             'scheduler state dict': scheduler.state_dict() if scheduler else None,
             'train loss': train_loss,
             'val loss': val_loss,
            }, model_save_path)
        # The value printed is the validation accuracy, so the message now says so
        # (it previously read "validation loss").
        print(f"Saved new best model with validation accuracy {val_acc:.04f}")

    ### Log metrics at each epoch in your run
    # Optionally, you can log at each batch inside train/eval functions
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best



Epoch 1/80


  scaler = GradScaler()
  with autocast():  # Mixed precision context
                                                                                     

	Train Acc 65.4573%	Train Loss 1.1415	 Learning Rate 0.0500000
	Val Acc 73.6738%	Val Loss 0.8234

Epoch 2/80


                                                                                     

	Train Acc 73.2712%	Train Loss 0.8432	 Learning Rate 0.0500000
	Val Acc 74.8637%	Val Loss 0.7773

Epoch 3/80


                                                                                     

	Train Acc 74.1129%	Train Loss 0.8137	 Learning Rate 0.0500000
	Val Acc 75.5331%	Val Loss 0.7569

Epoch 4/80


                                                                                     

	Train Acc 74.4616%	Train Loss 0.8010	 Learning Rate 0.0500000
	Val Acc 75.7787%	Val Loss 0.7454

Epoch 5/80


                                                                                     

	Train Acc 74.6499%	Train Loss 0.7942	 Learning Rate 0.0500000
	Val Acc 75.6004%	Val Loss 0.7500

Epoch 6/80


                                                                                     

	Train Acc 74.7567%	Train Loss 0.7899	 Learning Rate 0.0500000
	Val Acc 76.0728%	Val Loss 0.7382

Epoch 7/80


                                                                                     

	Train Acc 74.8448%	Train Loss 0.7868	 Learning Rate 0.0500000
	Val Acc 76.1886%	Val Loss 0.7307

Epoch 8/80


                                                                                     

	Train Acc 74.8939%	Train Loss 0.7849	 Learning Rate 0.0500000
	Val Acc 75.8685%	Val Loss 0.7429

Epoch 9/80


                                                                                     

	Train Acc 74.9500%	Train Loss 0.7830	 Learning Rate 0.0500000
	Val Acc 76.2863%	Val Loss 0.7242

Epoch 10/80


                                                                                     

	Train Acc 74.9803%	Train Loss 0.7813	 Learning Rate 0.0500000
	Val Acc 76.2077%	Val Loss 0.7303

Epoch 11/80


                                                                                     

	Train Acc 75.0140%	Train Loss 0.7801	 Learning Rate 0.0500000
	Val Acc 76.1232%	Val Loss 0.7308

Epoch 12/80


                                                                                     

	Train Acc 75.0414%	Train Loss 0.7791	 Learning Rate 0.0500000
	Val Acc 75.8643%	Val Loss 0.7403

Epoch 13/80


                                                                                     

	Train Acc 75.0680%	Train Loss 0.7782	 Learning Rate 0.0500000
	Val Acc 76.0825%	Val Loss 0.7359

Epoch 14/80


                                                                                     

	Train Acc 79.3413%	Train Loss 0.6295	 Learning Rate 0.0150000
	Val Acc 80.8894%	Val Loss 0.5709
Saved new best model with validation loss 0.8089

Epoch 15/80


                                                                                     

	Train Acc 79.6520%	Train Loss 0.6189	 Learning Rate 0.0150000
	Val Acc 81.0215%	Val Loss 0.5666
Saved new best model with validation loss 0.8102

Epoch 16/80


                                                                                     

	Train Acc 79.7245%	Train Loss 0.6166	 Learning Rate 0.0150000
	Val Acc 81.1410%	Val Loss 0.5639
Saved new best model with validation loss 0.8114

Epoch 17/80


                                                                                     

	Train Acc 79.7573%	Train Loss 0.6150	 Learning Rate 0.0150000
	Val Acc 81.2528%	Val Loss 0.5605
Saved new best model with validation loss 0.8125

Epoch 18/80


                                                                                     

	Train Acc 79.8060%	Train Loss 0.6135	 Learning Rate 0.0150000
	Val Acc 81.1905%	Val Loss 0.5621

Epoch 19/80


                                                                                     

	Train Acc 79.8354%	Train Loss 0.6123	 Learning Rate 0.0150000
	Val Acc 81.1746%	Val Loss 0.5596

Epoch 20/80


                                                                                     

	Train Acc 79.8720%	Train Loss 0.6112	 Learning Rate 0.0150000
	Val Acc 81.2549%	Val Loss 0.5610
Saved new best model with validation loss 0.8125

Epoch 21/80


                                                                                     

	Train Acc 79.8938%	Train Loss 0.6103	 Learning Rate 0.0150000
	Val Acc 81.2540%	Val Loss 0.5567

Epoch 22/80


                                                                                     

	Train Acc 79.9260%	Train Loss 0.6093	 Learning Rate 0.0150000
	Val Acc 81.2965%	Val Loss 0.5581
Saved new best model with validation loss 0.8130

Epoch 23/80


                                                                                     

	Train Acc 79.9441%	Train Loss 0.6085	 Learning Rate 0.0150000
	Val Acc 81.4396%	Val Loss 0.5535
Saved new best model with validation loss 0.8144

Epoch 24/80


                                                                                     

	Train Acc 79.9630%	Train Loss 0.6078	 Learning Rate 0.0150000
	Val Acc 81.4590%	Val Loss 0.5527
Saved new best model with validation loss 0.8146

Epoch 25/80


                                                                                     

	Train Acc 79.9875%	Train Loss 0.6072	 Learning Rate 0.0150000
	Val Acc 81.4721%	Val Loss 0.5532
Saved new best model with validation loss 0.8147

Epoch 26/80


                                                                                     

	Train Acc 79.9985%	Train Loss 0.6067	 Learning Rate 0.0150000
	Val Acc 81.4017%	Val Loss 0.5528

Epoch 27/80


                                                                                     

	Train Acc 80.0050%	Train Loss 0.6064	 Learning Rate 0.0150000
	Val Acc 81.5261%	Val Loss 0.5497
Saved new best model with validation loss 0.8153

Epoch 28/80


                                                                                     

	Train Acc 80.0221%	Train Loss 0.6059	 Learning Rate 0.0150000
	Val Acc 81.3891%	Val Loss 0.5548

Epoch 29/80


                                                                                     

	Train Acc 80.0270%	Train Loss 0.6055	 Learning Rate 0.0150000
	Val Acc 81.4790%	Val Loss 0.5507

Epoch 30/80


                                                                                     

	Train Acc 80.0462%	Train Loss 0.6051	 Learning Rate 0.0150000
	Val Acc 81.5177%	Val Loss 0.5506

Epoch 31/80


                                                                                     

	Train Acc 80.0580%	Train Loss 0.6047	 Learning Rate 0.0150000
	Val Acc 81.3828%	Val Loss 0.5547

Epoch 32/80


                                                                                     

	Train Acc 83.0057%	Train Loss 0.5045	 Learning Rate 0.0045000
	Val Acc 84.1785%	Val Loss 0.4659
Saved new best model with validation loss 0.8418

Epoch 33/80


                                                                                     

	Train Acc 83.3633%	Train Loss 0.4922	 Learning Rate 0.0045000
	Val Acc 84.2335%	Val Loss 0.4644
Saved new best model with validation loss 0.8423

Epoch 34/80


                                                                                     

	Train Acc 83.3257%	Train Loss 0.4929	 Learning Rate 0.0045000
	Val Acc 84.1744%	Val Loss 0.4656

Epoch 35/80


                                                                                     

	Train Acc 83.2779%	Train Loss 0.4945	 Learning Rate 0.0045000
	Val Acc 84.1954%	Val Loss 0.4653

Epoch 36/80


                                                                                     

	Train Acc 83.2525%	Train Loss 0.4955	 Learning Rate 0.0045000
	Val Acc 84.1824%	Val Loss 0.4651

Epoch 37/80


                                                                                     

	Train Acc 83.2444%	Train Loss 0.4957	 Learning Rate 0.0045000
	Val Acc 84.2082%	Val Loss 0.4661

Epoch 38/80


                                                                                     

	Train Acc 85.0701%	Train Loss 0.4347	 Learning Rate 0.0013500
	Val Acc 85.6527%	Val Loss 0.4222
Saved new best model with validation loss 0.8565

Epoch 39/80


                                                                                     

	Train Acc 85.5261%	Train Loss 0.4189	 Learning Rate 0.0013500
	Val Acc 85.7149%	Val Loss 0.4203
Saved new best model with validation loss 0.8571

Epoch 40/80


                                                                                     

	Train Acc 85.6425%	Train Loss 0.4147	 Learning Rate 0.0013500
	Val Acc 85.7562%	Val Loss 0.4202
Saved new best model with validation loss 0.8576

Epoch 41/80


                                                                                     

	Train Acc 85.6690%	Train Loss 0.4134	 Learning Rate 0.0013500
	Val Acc 85.7097%	Val Loss 0.4212

Epoch 42/80


                                                                                     

	Train Acc 85.6640%	Train Loss 0.4134	 Learning Rate 0.0013500
	Val Acc 85.6923%	Val Loss 0.4211

Epoch 43/80


                                                                                     

	Train Acc 85.6347%	Train Loss 0.4144	 Learning Rate 0.0013500
	Val Acc 85.7731%	Val Loss 0.4203
Saved new best model with validation loss 0.8577

Epoch 44/80


                                                                                     

	Train Acc 85.5997%	Train Loss 0.4154	 Learning Rate 0.0013500
	Val Acc 85.6885%	Val Loss 0.4232

Epoch 45/80


                                                                                     

	Train Acc 86.5260%	Train Loss 0.3847	 Learning Rate 0.0004050
	Val Acc 86.3264%	Val Loss 0.4048
Saved new best model with validation loss 0.8633

Epoch 46/80


                                                                                     

	Train Acc 86.8368%	Train Loss 0.3745	 Learning Rate 0.0004050
	Val Acc 86.3758%	Val Loss 0.4040
Saved new best model with validation loss 0.8638

Epoch 47/80


                                                                                     

	Train Acc 86.9389%	Train Loss 0.3706	 Learning Rate 0.0004050
	Val Acc 86.4039%	Val Loss 0.4036
Saved new best model with validation loss 0.8640

Epoch 48/80


                                                                                     

	Train Acc 87.0096%	Train Loss 0.3683	 Learning Rate 0.0004050
	Val Acc 86.4283%	Val Loss 0.4035
Saved new best model with validation loss 0.8643

Epoch 49/80


                                                                                     

	Train Acc 87.0485%	Train Loss 0.3666	 Learning Rate 0.0004050
	Val Acc 86.4531%	Val Loss 0.4036
Saved new best model with validation loss 0.8645

Epoch 50/80


                                                                                     

	Train Acc 87.0793%	Train Loss 0.3656	 Learning Rate 0.0004050
	Val Acc 86.4453%	Val Loss 0.4045

Epoch 51/80


                                                                                     

	Train Acc 87.1101%	Train Loss 0.3647	 Learning Rate 0.0004050
	Val Acc 86.4297%	Val Loss 0.4045

Epoch 52/80


                                                                                     

	Train Acc 87.1267%	Train Loss 0.3641	 Learning Rate 0.0004050
	Val Acc 86.4378%	Val Loss 0.4046

Epoch 53/80


                                                                                     

	Train Acc 87.4812%	Train Loss 0.3523	 Learning Rate 0.0001215
	Val Acc 86.6416%	Val Loss 0.4000
Saved new best model with validation loss 0.8664

Epoch 54/80


                                                                                     

	Train Acc 87.6083%	Train Loss 0.3482	 Learning Rate 0.0001215
	Val Acc 86.6713%	Val Loss 0.3994
Saved new best model with validation loss 0.8667

Epoch 55/80


                                                                                     

	Train Acc 87.6645%	Train Loss 0.3463	 Learning Rate 0.0001215
	Val Acc 86.6713%	Val Loss 0.4000
Saved new best model with validation loss 0.8667

Epoch 56/80


Train:  83%|████████▎ | 1958/2369 [08:32<01:26,  4.74it/s, acc=87.7176%, loss=0.3447]

# Testing and submission to Kaggle

Before we get to the following code, make sure to look at the submission format given in *sample_submission.csv*. Once you have done so, it is time to fill in the following function to complete your inference on the test data. Refer to the eval function from the previous cells to get an idea of how to go about completing this function.

In [None]:
def test(model, test_loader):
    """Run the model over the test loader and collect predicted class indices.

    Args:
        model: Callable network; it is switched to eval mode and invoked on
            each batch.
        test_loader: Iterable yielding input batches (unlabelled MFCC batches,
            going by the original variable name). Each batch must support
            ``.to(device)``.

    Returns:
        A flat list containing, for every batch in iteration order, the
        argmax over ``dim=1`` of the model output. Elements are numpy scalars.
        (Presumably one index per frame, i.e. output is (batch, classes) --
        confirm against the model definition.)
    """
    ### Inference only: put the model in evaluation mode
    model.eval()

    ### Accumulates predictions across all batches
    test_predictions = []

    ### Gradients are not needed for inference
    with torch.no_grad():
        for batch in tqdm(test_loader):
            scores = model(batch.to(device))

            ### Index of the highest-scoring class along dim 1
            best_classes = scores.argmax(dim=1)

            ### Move to host memory and append element by element
            test_predictions += list(best_classes.cpu().numpy())

    return test_predictions

In [None]:
# Define and instantiate the model again (for loading)
# NOTE: `model` must already exist with the same architecture as the saved weights;
# this cell only restores its parameters from the checkpoint file.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written from a GPU session still loads in a CPU-only session.
check_point = torch.load('/kaggle/working/best_model.pth', map_location=device)
model.load_state_dict(check_point['model state dict'])

In [None]:
### Run inference on the test set; `predictions` is the flat list returned by test()
predictions = test(model, test_loader)

In [None]:
### Map each predicted class index to its phoneme string.
### The last two PHONEMES entries ('[SOS]', '[EOS]') are sliced off before the lookup.
### Slice once up front instead of re-slicing the list for every prediction.
valid_phonemes = PHONEMES[:-2]
x = [valid_phonemes[prediction] for prediction in predictions]

In [None]:
### Create CSV file with predictions:
### a header row, then one "<row index>,<phoneme>" line per entry of x
with open("./submission1.csv", "w+") as csv_file:
    csv_file.write("id,label\n")
    csv_file.writelines(
        "{},{}\n".format(row_id, label) for row_id, label in enumerate(x)
    )

In [None]:
### Finish your wandb run
### NOTE(review): `run` is not defined in this part of the notebook -- presumably
### the object returned by wandb.init() in an earlier cell; confirm before running.
run.finish()

In [None]:
### Submit to kaggle competition using kaggle API (Uncomment below to use)
#!kaggle competitions submit -c 11785-hw1p2-f24 -f ./submission1.csv -m "Test Submission"

### However, it's always safer to download the csv file and then upload to kaggle