



# Library

In [None]:
!pip install torchsummaryX wandb --quiet

[K     |████████████████████████████████| 1.9 MB 2.1 MB/s 
[K     |████████████████████████████████| 162 kB 90.1 MB/s 
[K     |████████████████████████████████| 182 kB 81.7 MB/s 
[K     |████████████████████████████████| 63 kB 1.9 MB/s 
[K     |████████████████████████████████| 162 kB 82.6 MB/s 
[K     |████████████████████████████████| 158 kB 67.6 MB/s 
[K     |████████████████████████████████| 157 kB 71.7 MB/s 
[K     |████████████████████████████████| 157 kB 75.1 MB/s 
[K     |████████████████████████████████| 157 kB 83.1 MB/s 
[K     |████████████████████████████████| 157 kB 76.8 MB/s 
[K     |████████████████████████████████| 157 kB 82.7 MB/s 
[K     |████████████████████████████████| 157 kB 71.7 MB/s 
[K     |████████████████████████████████| 157 kB 68.3 MB/s 
[K     |████████████████████████████████| 156 kB 52.3 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
import sklearn.metrics
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:
### PHONEME LIST
# The position of a symbol in this list is its integer id; the two trailing
# entries are the sequence start/end markers.
PHONEMES = [
    'SIL',
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K',
    'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH',
    '<sos>', '<eos>',
]

# Dataset

In [None]:
# Dataset class to load train and validation data

class AudioDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset with phoneme labels (train / dev partitions).

    Every utterance found under ``<data_path>/<split>/mfcc`` is loaded,
    normalised per utterance (mean/variance normalisation of each feature
    column) and concatenated into one ``(frames, features)`` array. The
    matching transcripts have their ``<sos>``/``<eos>`` markers removed and are
    converted to integer ids (index into ``self.phonemes``). MFCC and
    transcript files are paired by their position in the sorted listings.

    Item ``i`` is the flattened window of ``2 * context + 1`` frames centred on
    frame ``i`` (zero-padded at both ends of the concatenated array) together
    with the phoneme id of frame ``i``.

    Args:
        data_path: Directory containing the ``train-clean-100`` and
            ``dev-clean`` folders.
        context: Number of frames taken on each side of the centre frame.
        offset: Stored on the instance; not used by this class.
        partition: ``"train"`` selects ``train-clean-100``; any other value
            selects ``dev-clean``.
        limit: When positive, only the first ``limit`` files (sorted order)
            are loaded. The default ``-1`` loads every file.

    Raises:
        ValueError: If an utterance has a different number of frames and
            labels, or if a transcript contains a symbol that is not in
            ``self.phonemes``.
    """

    def __init__(self, data_path, context, offset=0, partition="train", limit=-1):

        self.context = context
        self.offset = offset
        self.data_path = data_path

        # Anything that is not the training partition is read from the dev split.
        split = "train-clean-100" if partition == "train" else "dev-clean"
        self.mfcc_dir = os.path.join(data_path, split, "mfcc")
        self.transcript_dir = os.path.join(data_path, split, "transcript")

        # Sorted listings so that the i-th MFCC file pairs with the i-th transcript.
        mfcc_names = sorted(os.listdir(self.mfcc_dir))
        transcript_names = sorted(os.listdir(self.transcript_dir))
        assert len(mfcc_names) == len(transcript_names)  # same no. of mfccs and transcripts

        if limit is not None and limit > 0:
            mfcc_names = mfcc_names[:limit]
            transcript_names = transcript_names[:limit]

        # These are the available phonemes in the transcript
        self.phonemes = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']

        self.mfccs, self.transcripts = [], []
        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))

            # Per-utterance normalisation. A constant feature column has zero
            # spread; divide it by 1 instead so it becomes 0 rather than NaN.
            std = mfcc.std(axis=0)
            mfcc = (mfcc - mfcc.mean(axis=0)) / np.where(std > 0, std, 1)

            # Boolean mask drops the boundary markers without a Python loop.
            transcript = np.load(os.path.join(self.transcript_dir, transcript_name))
            transcript = transcript[(transcript != "<sos>") & (transcript != "<eos>")]

            if len(transcript) != len(mfcc):
                raise ValueError(
                    "Frame/label count mismatch for {} ({} frames) and {} ({} labels)".format(
                        mfcc_name, len(mfcc), transcript_name, len(transcript)))

            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)

        # One long (T, features) array with T = T1 + T2 + ..., and a (T,) label array.
        self.mfccs = np.concatenate(self.mfccs)
        print(len(self.mfccs))

        # Dataset length is the number of real (unpadded) frames.
        self.length = len(self.mfccs)
        self.transcripts = np.concatenate(self.transcripts)

        # Zero-pad `context` frames at both ends so every frame has a full
        # window. The padding takes its width and dtype from the loaded data.
        padding = np.zeros((self.context, self.mfccs.shape[1]), dtype=self.mfccs.dtype)
        self.mfccs = np.concatenate((padding, self.mfccs, padding))

        # Map phoneme strings to their index in self.phonemes. Only the
        # distinct symbols are looked up in Python; the per-frame ids are then
        # gathered with a single NumPy indexing operation.
        phoneme_to_index = {symbol: idx for idx, symbol in enumerate(self.phonemes)}
        unique_symbols, inverse = np.unique(self.transcripts, return_inverse=True)
        unknown = [str(symbol) for symbol in unique_symbols if symbol not in phoneme_to_index]
        if unknown:
            raise ValueError("Unknown phoneme(s) in transcripts: {}".format(unknown))
        lookup = np.array([phoneme_to_index[symbol] for symbol in unique_symbols], dtype=np.int64)
        self.transcripts = lookup[inverse]

    def __len__(self):
        """Return the number of labelled frames (padding excluded)."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(frames, phoneme)`` for the frame at position ``ind``.

        ``frames`` is a 1-D float tensor holding the ``2 * context + 1`` frame
        window around ``ind``; ``phoneme`` is a scalar tensor with its label id.
        """
        # Because of the leading padding, padded row `ind` is `context` frames
        # before the centre frame, so the window starts exactly at `ind`.
        frames = self.mfccs[ind:ind + self.context * 2 + 1]
        frames = frames.flatten()  # the MLP expects 1-D input

        frames = torch.FloatTensor(frames)
        phoneme = torch.tensor(self.transcripts[ind])

        return frames, phoneme

In [None]:
class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset for the unlabelled test partition.

    Reads every file under ``<data_path>/test-clean/mfcc`` in sorted order,
    normalises each utterance per feature column, concatenates the frames and
    zero-pads ``context`` frames at both ends. Item ``i`` is the flattened
    window of ``2 * context + 1`` frames centred on frame ``i``.

    The files are read in sorted order and must not be shuffled (here or in
    the dataloader), so predictions keep the order of the frames.

    Args:
        data_path: Directory containing the ``test-clean`` folder.
        context: Number of frames taken on each side of the centre frame.
        offset: Stored on the instance; not used by this class.
        limit: When positive, only the first ``limit`` files (sorted order)
            are loaded. The default ``-1`` loads every file.
    """

    def __init__(self, data_path, context, offset=0, limit=-1):

        self.context = context
        self.offset = offset
        self.data_path = data_path

        self.mfcc_dir = os.path.join(data_path, "test-clean", "mfcc")
        mfcc_names = sorted(os.listdir(self.mfcc_dir))
        if limit is not None and limit > 0:
            mfcc_names = mfcc_names[:limit]

        self.mfccs = []
        for mfcc_name in mfcc_names:
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))

            # Per-utterance normalisation. A constant feature column has zero
            # spread; divide it by 1 instead so it becomes 0 rather than NaN.
            std = mfcc.std(axis=0)
            mfcc = (mfcc - mfcc.mean(axis=0)) / np.where(std > 0, std, 1)
            self.mfccs.append(mfcc)

        self.mfccs = np.concatenate(self.mfccs)

        # Dataset length is the number of real (unpadded) frames.
        self.length = len(self.mfccs)

        # The padding takes its width and dtype from the loaded data.
        padding = np.zeros((self.context, self.mfccs.shape[1]), dtype=self.mfccs.dtype)
        self.mfccs = np.concatenate((padding, self.mfccs, padding))

    def __len__(self):
        """Return the number of frames (padding excluded)."""
        return self.length

    def __getitem__(self, ind):
        """Return the flattened ``2 * context + 1`` frame window around ``ind``."""
        # Because of the leading padding the window starts exactly at `ind`.
        frames = self.mfccs[ind:ind + self.context * 2 + 1]
        frames = frames.flatten()
        frames = torch.FloatTensor(frames)

        return frames
       

# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. The same dictionary can be passed to Weights & Biases so that the parameters of every run are logged and can be compared across experiments.

In [None]:
# Hyperparameters for this run, kept in one place so they can be read by the
# dataset, dataloader, model and optimizer cells.
config = dict(
    epochs=30,
    batch_size=2048,
    context=35,
    learning_rate=0.001,
    architecture='high-cutoff',
    # Add more as you need them - e.g dropout values, weight decay, scheduler parameters
)

# Create Datasets

In [None]:
# Labelled splits. Each constructor prints the number of frames it loaded.
train_data = AudioDataset(
    data_path="/content/data",
    context=config["context"],
    offset=0,
    partition="train",
)

# Any partition name other than "train" makes AudioDataset read the dev split.
val_data = AudioDataset(
    data_path="/content/data",
    context=config["context"],
    offset=0,
    partition="val",
)

# TODO: Create a dataset object using the AudioTestDataset class for the test data 

36191134
1937496


In [None]:
# A cell only displays the value of its final expression, so show both
# dataset sizes together as (train, validation).
len(train_data), len(val_data)

1937496

In [None]:
# Unlabelled test split, built with the same context as the training data.
test_data = AudioTestDataset(
    data_path="/content/data",
    context=config["context"],
    offset=0,
)

In [None]:
# Define dataloaders for train, val and test datasets
# Dataloaders will yield a batch of frames and phonemes of given batch_size at every iteration
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)


# Summary of the data pipeline configuration
print("Batch size: ", config['batch_size'])
print("Context: ", config['context'])
print("Input size: ", (2*config['context']+1)*15)
print("Output symbols: ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size:  2048
Context:  35
Input size:  1065
Output symbols:  42
Train dataset samples = 36191134, batches = 17672
Validation dataset samples = 1937496, batches = 947
Test dataset samples = 1943253, batches = 949


In [None]:
# Testing code to check if your data loaders are working:
# pull a single batch from the training loader and show its shapes.
for frames, phoneme in train_loader:
    print(frames.shape, phoneme.shape)
    break

torch.Size([2048, 1065]) torch.Size([2048])


# Network Architecture


In [None]:

class Network(torch.nn.Module):
    """MLP that classifies a flattened window of MFCC frames into a phoneme.

    The network is five hidden blocks (Linear -> [BatchNorm1d] -> ReLU ->
    Dropout) followed by a Linear output layer that returns raw logits.
    Blocks 1, 4 and 5 use batch normalisation; blocks 2 and 3 do not.

    Args:
        context: Number of frames on each side of the centre frame; the input
            width is ``(2 * context + 1) * num_features``.
        num_features: Features per frame (default 15).
        hidden_size: Width of every hidden layer (default 2048).
        output_size: Number of output logits (default 40).
        dropout: Dropout probability used after every hidden block
            (default 0.1).
    """

    def __init__(self, context, num_features=15, hidden_size=2048, output_size=40, dropout=0.1):

        super(Network, self).__init__()

        input_size = (2*context + 1) * num_features

        # The blocks are unpacked into one flat Sequential so that module
        # indices (and therefore state_dict keys) are model.0, model.1, ...
        self.model = torch.nn.Sequential(
            *self._hidden_block(input_size, hidden_size, dropout, batch_norm=True),
            *self._hidden_block(hidden_size, hidden_size, dropout, batch_norm=False),
            *self._hidden_block(hidden_size, hidden_size, dropout, batch_norm=False),
            *self._hidden_block(hidden_size, hidden_size, dropout, batch_norm=True),
            *self._hidden_block(hidden_size, hidden_size, dropout, batch_norm=True),
            torch.nn.Linear(hidden_size, output_size),
        )

    @staticmethod
    def _hidden_block(in_features, out_features, dropout, batch_norm):
        """Return the layers of one hidden block: Linear, optional BatchNorm1d, ReLU, Dropout."""
        layers = [torch.nn.Linear(in_features, out_features)]
        if batch_norm:
            layers.append(torch.nn.BatchNorm1d(out_features))
        layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Dropout(dropout))
        return layers

    def forward(self, x):
        """Return the logits produced by the layer stack for input batch ``x``."""
        out = self.model(x)

        return out

# Define Model, Loss Function and Optimizer

In [None]:
# Build the network, move it to the selected device and summarise it on one
# real training batch.
input_size = 15*(2*config['context'] + 1)
model = Network(config['context'])
model = model.to(device)
frames, phoneme = next(iter(train_loader))
# Check number of parameters of your network - Remember, you are limited to 20 million parameters for HW1 (including ensembles)
summary(model, frames.to(device))

                         Kernel Shape  Output Shape     Params  Mult-Adds
Layer                                                                    
0_model.Linear_0         [1065, 2048]  [2048, 2048]  2.183168M   2.18112M
1_model.BatchNorm1d_1          [2048]  [2048, 2048]     4.096k     2.048k
2_model.ReLU_2                      -  [2048, 2048]          -          -
3_model.Dropout_3                   -  [2048, 2048]          -          -
4_model.Linear_4         [2048, 2048]  [2048, 2048]  4.196352M  4.194304M
5_model.ReLU_5                      -  [2048, 2048]          -          -
6_model.Dropout_6                   -  [2048, 2048]          -          -
7_model.Linear_7         [2048, 2048]  [2048, 2048]  4.196352M  4.194304M
8_model.ReLU_8                      -  [2048, 2048]          -          -
9_model.Dropout_9                   -  [2048, 2048]          -          -
10_model.Linear_10       [2048, 2048]  [2048, 2048]  4.196352M  4.194304M
11_model.BatchNorm1d_11        [2048] 

  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_model.Linear_0,"[1065, 2048]","[2048, 2048]",2183168.0,2181120.0
1_model.BatchNorm1d_1,[2048],"[2048, 2048]",4096.0,2048.0
2_model.ReLU_2,-,"[2048, 2048]",,
3_model.Dropout_3,-,"[2048, 2048]",,
4_model.Linear_4,"[2048, 2048]","[2048, 2048]",4196352.0,4194304.0
5_model.ReLU_5,-,"[2048, 2048]",,
6_model.Dropout_6,-,"[2048, 2048]",,
7_model.Linear_7,"[2048, 2048]","[2048, 2048]",4196352.0,4194304.0
8_model.ReLU_8,-,"[2048, 2048]",,
9_model.Dropout_9,-,"[2048, 2048]",,


In [None]:
# Loss function: cross-entropy computed on the raw logits of the network
criterion = torch.nn.CrossEntropyLoss()
# Optimizer: AdamW over all model parameters at the configured learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['learning_rate'])
# Recommended : Define Scheduler for Learning Rate, including but not limited to StepLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau, etc. 
# You can refer to Pytorch documentation for more information on how to use them.

# Training and Validation Functions

In [None]:
# Collect garbage first so tensors that were only kept alive by reference
# cycles are returned to PyTorch's caching allocator, then release the unused
# cached blocks back to the GPU.
gc.collect()
torch.cuda.empty_cache()

60

In [None]:
def train(model, optimizer, criterion, dataloader, scheduler=None):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Network to optimise; it is switched to training mode.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(logits, phonemes)``.
        dataloader: Iterable that yields ``(mfccs, phonemes)`` batches.
        scheduler: Optional learning-rate scheduler, stepped once after the
            pass. It must be created once by the caller and passed to every
            call: a scheduler rebuilt on each call restarts its epoch counter
            and never reaches its milestones.

    Returns:
        The batch losses averaged over the number of batches, as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()

    # Send each batch to the device the model lives on; only a model without
    # parameters falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    train_loss = 0.0  # running sum of batch losses
    num_batches = 0

    for mfccs, phonemes in dataloader:

        ### Move Data to Device (Ideally GPU)
        mfccs = mfccs.to(target_device)
        phonemes = phonemes.to(target_device)

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Forward Propagation and Loss Calculation
        logits = model(mfccs)
        loss = criterion(logits, phonemes)

        ### Backward Propagation and Gradient Descent
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")

    # The schedule advances once per pass over the data, after the optimizer steps.
    if scheduler is not None:
        scheduler.step()

    return train_loss / num_batches

In [None]:
def eval(model, dataloader):
    """Return the frame-level accuracy of ``model`` on ``dataloader`` in percent.

    Note: the name shadows Python's builtin ``eval``; it is kept because other
    cells call this function by that name.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable that yields ``(frames, phonemes)`` batches.

    Returns:
        ``100 * correct / total`` over all frames, as a float.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()  # set model in evaluation mode

    # Send each batch to the device the model lives on; only a model without
    # parameters falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # Running counts instead of full prediction/label lists: the accuracy only
    # needs the number of matches.
    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.inference_mode():
        for frames, phonemes in dataloader:
            ### Move data to device (ideally GPU)
            frames = frames.to(target_device)
            phonemes = phonemes.to(target_device)

            ### Forward Propagation and most likely phoneme per frame
            logits = model(frames)
            predicted_phonemes = torch.argmax(logits, dim=1)

            correct += (predicted_phonemes == phonemes).sum().item()
            total += phonemes.numel()

            # The CUDA cache is deliberately not emptied here: doing so on
            # every batch forces the allocator to re-request the same memory.

    if total == 0:
        raise ValueError("dataloader yielded no samples")

    ### Calculate Accuracy
    return correct / total * 100

# Experiment

In [None]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()

# Make sure a Weights & Biases run is active before logging, so this cell does
# not rely on a `run` variable created in some other cell.
if wandb.run is None:
    wandb.init(config=config)

checkpoint_path = './model_checkpoint.pth'

# The schedule has to outlive the epoch loop: a scheduler that is rebuilt every
# epoch restarts its counter and never reaches a milestone.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25, 30], gamma=0.1)

best_acc = 0.0 ### Monitor best accuracy in your run

for epoch in range(config['epochs']):
    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss = train(model, optimizer, criterion, train_loader)
    accuracy = eval(model, val_loader)
    scheduler.step()  # one schedule step per completed epoch

    print("\tTrain Loss: {:.4f}".format(train_loss))
    print("\tValidation Accuracy: {:.2f}%".format(accuracy))

    ### Log metrics at each epoch in your run - Optionally, you can log at each batch inside train/eval functions (explore wandb documentation/wandb recitation)
    wandb.log({"train loss": train_loss, "validation accuracy": accuracy})

    ### Save checkpoint if accuracy is better than your current best
    if accuracy >= best_acc:
        # Remember the new best so later, worse epochs do not overwrite it.
        best_acc = accuracy

        ### Save checkpoint with information you want
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': train_loss,
                    'acc': accuracy},
                   checkpoint_path)

        ### Save checkpoint in wandb (the same file that was just written)
        wandb.save(checkpoint_path)

    # Is your training time very high? Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it 
    # Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

### Finish your wandb run
wandb.finish()


Epoch 1/30
	Train Loss: 0.6271
	Validation Accuracy: 82.74%

Epoch 2/30
	Train Loss: 0.4468
	Validation Accuracy: 84.39%

Epoch 3/30
	Train Loss: 0.3895
	Validation Accuracy: 84.84%

Epoch 4/30
	Train Loss: 0.3562
	Validation Accuracy: 85.12%

Epoch 5/30
	Train Loss: 0.3343
	Validation Accuracy: 85.24%

Epoch 6/30
	Train Loss: 0.3191
	Validation Accuracy: 85.27%

Epoch 7/30
	Train Loss: 0.3081
	Validation Accuracy: 85.35%

Epoch 8/30
	Train Loss: 0.2997
	Validation Accuracy: 85.41%

Epoch 9/30
	Train Loss: 0.2931
	Validation Accuracy: 85.44%

Epoch 10/30
	Train Loss: 0.2879
	Validation Accuracy: 85.44%

Epoch 11/30
	Train Loss: 0.2834
	Validation Accuracy: 85.45%

Epoch 12/30
	Train Loss: 0.2800
	Validation Accuracy: 85.45%

Epoch 13/30
	Train Loss: 0.2770
	Validation Accuracy: 85.45%

Epoch 14/30
	Train Loss: 0.2742
	Validation Accuracy: 85.46%

Epoch 15/30
	Train Loss: 0.2719
	Validation Accuracy: 85.50%

Epoch 16/30
	Train Loss: 0.2699
	Validation Accuracy: 85.50%

Epoch 17/30
	Tra

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation accuracy,▁▅▆▇▇▇▇▇▇▇▇▇▇▇████████████████

0,1
train loss,0.25372
validation accuracy,85.65948


# Testing and submission to Kaggle

In [None]:
def test(model, test_loader):
    """Run inference over ``test_loader`` and return the predicted phoneme ids.

    The model is put in evaluation mode and gradients are disabled. Each batch
    is cast to float, moved to the notebook-level ``device`` and reduced to its
    arg-max class. Predictions are returned as one flat Python list in the
    order the loader yields them.
    """
  ### Inference mode for layers such as dropout / batch norm
    model.eval()

  ### List to store predicted phonemes of test data
    test_predictions = []

  ### No gradients are needed for inference
    with torch.no_grad():
        for frames in tqdm(test_loader):
            logits = model(frames.float().to(device))

            ### Get most likely predicted phoneme with argmax and keep it
            test_predictions += torch.argmax(logits, dim=1).cpu().tolist()

    return test_predictions

In [None]:
# Predict a phoneme id for every test frame with the model currently in memory.
predictions = test(model=model, test_loader=test_loader)

  0%|          | 0/949 [00:00<?, ?it/s]

In [None]:
### Create CSV file with predictions
# One "id,label" row per prediction; the id is the position of the frame.
with open("./submission.csv", "w+") as f:
    f.write("id,label\n")
    for i, label in enumerate(predictions):
        f.write("{},{}\n".format(i, label))

In [None]:
### Submit to kaggle competition using kaggle API
# Uploads ./submission.csv to the 11-785-f22-hw1p2 competition with the
# message "Test Submission".
# NOTE(review): presumably needs Kaggle API credentials configured beforehand;
# that setup is not shown in this notebook - confirm.
!kaggle competitions submit -c 11-785-f22-hw1p2 -f ./submission.csv -m "Test Submission"

100% 18.6M/18.6M [00:00<00:00, 43.7MB/s]
Successfully submitted to Frame-Level Speech Recognition