# Installs

## Imports

In [4]:
# Standard library
import csv
import datetime
import gc
import os
import warnings
import zipfile

# Third-party
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from IPython.display import Audio, display
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchsummaryX import summary
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


# Dataset and Dataloader

In [5]:
# Directory holding the Audios folder of the MSP dataset. Set the MSP_AUDIO_ROOT
# environment variable to point somewhere else without editing this cell.
# Code that appends a file name directly to this value needs the trailing path
# separator, so keep it when overriding.
AUDIO_ROOT = os.environ.get(
    'MSP_AUDIO_ROOT',
    'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\Audios_fixed\\Audios\\',
)
# Path to the labels_concensus CSV file of the MSP dataset (despite the name this is
# a file path, not a directory). Override with the MSP_LABELS_CSV environment variable.
LABELS_DIR = os.environ.get(
    'MSP_LABELS_CSV',
    'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\labels\\labels\\labels_concensus.csv',
)

In [6]:
# List the audio directory once; the sorted file names are reused by later cells.
names = sorted(os.listdir(AUDIO_ROOT))
# os.path.join gives the right path whether or not AUDIO_ROOT ends with a separator.
data1 = os.path.join(AUDIO_ROOT, names[0])
# torchaudio.load requires you to install some programs if you get the
# 'No audio I/O backend is available' error:
# https://stackoverflow.com/questions/62543843/cannot-import-torch-audio-no-audio-backend-is-available
waveform, sample_rate = torchaudio.load(data1)
print(waveform.shape)
print(sample_rate)

# Load the label CSV from the configured LABELS_DIR path (not from a copy that happens
# to sit in the current working directory). newline='' is what the csv module expects
# for files it reads.
with open(LABELS_DIR, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # drop the header row; tolerate an empty file
    # Sort rows by file name so they line up with the sorted directory listing.
    labels = sorted(reader)
print(labels[0])

torch.Size([1, 167814])
16000
['MSP-PODCAST_0001_0008.wav', 'N', '2.2', '4.0', '2.6', '30', 'Male', 'Test1']


In [7]:
# Emotion tag from the label file -> integer class id handed to the model.
EMOMAP = {
    'A': 1,  # Angry
    'S': 2,  # Sad
    'H': 3,  # Happy
    'U': 4,  # Surprise
    'F': 5,  # Fear
    'D': 6,  # Disgust
    'C': 7,  # Contempt
    'N': 8,  # Neutral
    'O': 9,  # Other
}

In [13]:
class MSPDataset(torch.utils.data.Dataset):
    """One split of the MSP dataset, yielding ``(waveform, label)`` pairs.

    The constructor only records file paths and integer labels; the audio itself is
    decoded in ``__getitem__`` so the whole corpus is never held in memory at once.

    Reads the module-level ``AUDIO_ROOT``, ``LABELS_DIR``, ``names``, ``labels`` and
    ``EMOMAP`` defined in earlier cells.
    """

    # Column positions used in each row of the label CSV.
    _COL_FILE = 0
    _COL_EMOTION = 1
    _COL_SPLIT = 7
    # Rows carrying this emotion tag are left out of every split
    # (presumably clips without a usable consensus label -- TODO confirm).
    _SKIPPED_EMOTION = 'X'
    # 43 audio files from podcast 1904 seem to be broken: torchaudio.load
    # reports 'no data chunk' for them, so they are skipped.
    _BROKEN_PREFIX = 'MSP-PODCAST_1904'

    def __init__(self, train = False, valid = False, test1 = False, test2 = False):
        """Collect the audio paths and labels of one recommended split.

        Args:
            train: Accepted for readability only; 'Train' is the split used whenever
                none of the other flags is set.
            valid: Select the 'Validation' split.
            test1: Select the 'Test1' split (ignored when ``valid`` is set).
            test2: Select the 'Test2' split (ignored when ``valid`` or ``test1`` is set).

        Raises:
            AssertionError: If the audio listing and the label rows differ in length
                or do not line up name by name.
            KeyError: If a kept row has an emotion tag that is missing from ``EMOMAP``.
        """
        self.audio_dir = AUDIO_ROOT
        self.labels_dir = LABELS_DIR
        self.audio_names = sorted(names)
        self.labels_list = labels
        self.EMOMAP = EMOMAP

        self.audio = []
        self.labels = []

        # The first flag set wins, in the order valid > test1 > test2; default is Train.
        if valid:
            setType = 'Validation'
        elif test1:
            setType = 'Test1'
        elif test2:
            setType = 'Test2'
        else:
            setType = 'Train'
        print(setType)

        # Sanity check: exactly one label row per audio file.
        assert len(self.audio_names) == len(self.labels_list), (
            "found {} audio files but {} label rows".format(
                len(self.audio_names), len(self.labels_list)))

        paired = zip(self.audio_names, self.labels_list)
        for name, row in tqdm(paired, total=len(self.audio_names)):
            # Both lists are sorted, so position i must describe the same clip.
            assert name == row[self._COL_FILE], (
                "audio file {!r} does not match label row {!r}".format(
                    name, row[self._COL_FILE]))
            if row[self._COL_SPLIT] != setType or row[self._COL_EMOTION] == self._SKIPPED_EMOTION:
                continue
            if row[self._COL_FILE].startswith(self._BROKEN_PREFIX):
                continue
            # os.path.join is correct with or without a trailing separator on audio_dir.
            self.audio.append(os.path.join(self.audio_dir, name))
            self.labels.append(self.EMOMAP[row[self._COL_EMOTION]])

        self.length = len(self.audio)

    def __len__(self):
        """Return the number of clips kept for this split."""
        return self.length

    def __getitem__(self, ind):
        """Decode clip ``ind`` from disk and return ``(waveform, label)``.

        Audio is loaded here rather than in ``__init__`` because loading everything
        up front exhausted the machine's memory. The sample rate returned by
        torchaudio is discarded.
        """
        waveform, _ = torchaudio.load(self.audio[ind])
        return waveform, self.labels[ind]

    def collate_fn(self, batch):
        """Pad a list of ``(waveform, label)`` pairs into batch tensors.

        Each waveform is flattened to 1-D, then all are zero-padded on the right to
        the longest clip in the batch.

        Returns:
            A tuple ``(batch_audio, audio_lengths, batch_label)``: the padded audio of
            shape ``(batch, max_len)``, a LongTensor of the unpadded lengths, and a
            tensor of the labels.
        """
        batch_audio = [x[0].reshape(-1) for x in batch]
        audio_lengths = torch.LongTensor([len(x) for x in batch_audio])
        batch_audio = pad_sequence(batch_audio, padding_value=0.0, batch_first = True)
        batch_label = [x[1] for x in batch]

        return batch_audio, audio_lengths, torch.tensor(batch_label)


### Data loaders

In [14]:
# Free as much RAM as possible before building the datasets. `gc` is imported with
# the other modules in the imports cell at the top of the notebook. The value shown
# below is the number of unreachable objects the collector found.
gc.collect()

0

In [15]:
# Build one dataset object per split used in this notebook.
train_data = MSPDataset(train = True)
val_data = MSPDataset(valid = True)
test_data = MSPDataset(test1 = True)

# Report how many clips each split kept, in train / validation / test order.
for split_data in (train_data, val_data, test_data):
    print(len(split_data))

Train


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 1162494.65it/s]


Validation


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 2092453.85it/s]


Test1


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 1786326.49it/s]

36011
6346
12371





In [16]:
# Each loader must receive its dataset's collate function, otherwise the
# variable-length clips cannot be stacked into a batch.
BATCH_SIZE = 100

# Settings shared by all three loaders.
loader_kwargs = dict(num_workers=0, batch_size=BATCH_SIZE, pin_memory=True)

train_loader = torch.utils.data.DataLoader(
    train_data, shuffle=True, collate_fn=train_data.collate_fn, **loader_kwargs)
val_loader = torch.utils.data.DataLoader(
    val_data, shuffle=True, collate_fn=val_data.collate_fn, **loader_kwargs)
# The test loader keeps the dataset order.
test_loader = torch.utils.data.DataLoader(
    test_data, shuffle=False, collate_fn=test_data.collate_fn, **loader_kwargs)

print("Batch size: ", BATCH_SIZE)
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size:  100
Train dataset samples = 36011, batches = 361
Val dataset samples = 6346, batches = 64
Test dataset samples = 12371, batches = 124


In [17]:
# Peek at the first training batch. Every batch is a triple:
#   batch_audio   - clips padded to the longest sequence in the batch
#   audio_lengths - length of each clip before padding
#   batch_label   - the labels of the clips in the batch
for i, batch in enumerate(train_loader):
    batch_audio, audio_lengths, batch_label = batch
    for shown in (batch_audio.shape, audio_lengths, batch_label, i):
        print(shown)
    # One batch is enough for a sanity check.
    break

torch.Size([100, 326081])
tensor([ 95041, 158720,  47104, 110592,  78068, 156161,  96256, 326081, 112321,
        130721,  84321,  65281, 140480,  94208, 115681,  48161, 138401,  45056,
         65536, 174080,  65121, 173921, 157696, 118784,  46722, 117761,  66241,
        121440, 102881,  59841,  57344,  90112,  69601,  78241,  47361,  81121,
         49152, 119234,  57344,  52764,  48961,  65281, 172481,  97761,  72321,
         59361, 114257,  57601, 136640, 158080, 121281, 135361,  38912,  45440,
         68650, 121921,  81920,  63519, 115477,  76960,  47201, 129024,  53600,
         49152,  34816, 165888,  53281,  67681, 108161,  49152, 173282,  40960,
         88321,  67681,  67584, 164480,  73728,  79361,  79681,  55681,  46561,
         68161,  88064,  96481,  56321,  95041,  69632, 139361, 163840,  89601,
         63034, 111041, 133743,  47414,  74562, 122081,  59392,  98720, 116160,
        115681])
tensor([4, 7, 3, 1, 8, 2, 9, 3, 3, 3, 8, 8, 8, 3, 7, 8, 7, 8, 3, 4, 3, 3, 3, 