# Installs

## imports

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import warnings

#Dataset Imports
import csv
from IPython.display import Audio, display
import torchaudio
from transformers import Wav2Vec2FeatureExtractor
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset, Dataset

warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

  from .autonotebook import tqdm as notebook_tqdm


Device:  cpu


# Dataset and Dataloader

In [5]:
#Set directory to the Audios folder in MSP dataset
AUDIO_ROOT = 'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\Audios_fixed\\Audios\\'
#Set path to labels_consensus in MSP dataset
LABELS_DIR = 'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\labels\\labels\\labels_concensus.csv'

In [6]:
#Load the directory
names = sorted(os.listdir(AUDIO_ROOT))
data1 = AUDIO_ROOT + names[0]
#torchaudio.load requires you to install some programs if you get 'No audio I/O backend is available' error
#https://stackoverflow.com/questions/62543843/cannot-import-torch-audio-no-audio-backend-is-available
waveform, sample_rate = torchaudio.load(data1)
print(waveform.shape)
print(sample_rate)

#Load label csv file
with open('labels_concensus.csv') as f:
    reader = csv.reader(f)
    labels = sorted(list(reader)[1:])
print(labels[0])

torch.Size([1, 167814])
16000
['MSP-PODCAST_0001_0008.wav', 'N', '2.2', '4.0', '2.6', '30', 'Male', 'Test1']


In [7]:
#Emotion Classifier Map (Emotion tag to int for model)
#Angry, Sad, Happy, Surprise, Fear, Disgust, Contempt, Neutral, Other
EMOMAP = {'A':1, 'S':2, 'H':3, 'U':4, 'F':5, 'D':6, 'C':7, 'N':8, 'O':9}

In [13]:
class MSPDataset(torch.utils.data.Dataset):
    
    #Initialize the dataset based on the recommended split in MSP dataset.
    def __init__(self, train = False, valid = False, test1 = False, test2 = False): 
        
        self.audio_dir = AUDIO_ROOT
        self.labels_dir = LABELS_DIR
        self.audio_names = sorted(names)
        self.labels_list = labels
        self.EMOMAP = EMOMAP

        self.audio = []
        self.labels = []
        
        #What type of dataset are we making
        setType = 'Train'
        if valid:
            setType = 'Validation'
        elif test1:
            setType = 'Test1'
        elif test2:
            setType = 'Test2'
        print(setType)       
        
        #Sanitycheck1
        assert(len(self.audio_names) == len(self.labels_list))
        
        for i in tqdm(range(0, len(self.audio_names))):
            assert(self.audio_names[i] == self.labels_list[i][0])
            if self.labels_list[i][7] != setType or self.labels_list[i][1] == 'X':
                continue
            #43 Audio files from 1904 podcast seems to be broken. Torchaudio load returns 'no data chunk'
            if self.labels_list[i][0].startswith('MSP-PODCAST_1904'):
                continue
            self.audio.append(self.audio_dir + self.audio_names[i])
            self.labels.append(self.EMOMAP[self.labels_list[i][1]])         
        
        self.length = len(self.audio)
        
    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        audio = self.audio[ind]
        label = self.labels[ind]
        #load audio when getting the item. If we do it in init, computer blue screens.
        waveform, sample_rate = torchaudio.load(audio)
        return waveform, label
    
    def collate_fn(self, batch):
        batch_audio = [x[0].reshape(-1) for x in batch]
        audio_lengths = torch.LongTensor([len(x) for x in batch_audio])
        batch_audio = pad_sequence(batch_audio, padding_value=0.0, batch_first = True)
        batch_label = [x[1] for x in batch]
        
        return batch_audio, audio_lengths, torch.tensor(batch_label)


### Data loaders

In [5]:
# get me RAMMM!!!! 
import gc 
gc.collect()

482

In [15]:
# Create Dataset objects.
train_data = MSPDataset(train = True) 
val_data = MSPDataset(valid = True) 
test_data = MSPDataset(test1 = True)

print(train_data.__len__())
print(val_data.__len__())
print(test_data.__len__())

Train


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 1162494.65it/s]


Validation


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 2092453.85it/s]


Test1


100%|███████████████████████████████████████████████████████████████████████| 73042/73042 [00:00<00:00, 1786326.49it/s]

36011
6346
12371





In [None]:
#Set directory to the labelled_emotion folder in NSC dataset
NSC_Root = 'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\NSC_part5_labelled_emotion\\'

In [None]:
class NSCDataset(torch.utils.data.Dataset):
    
    def __init__(self): 
        
        self.audio_dir = NSC_Root
        #quick way of looping subdirectories. Dataset only has 4 categories. 
        self.subdirectory = [('Anger\\', 0), ('Sad\\', 1), ('Happy\\', 2), ('Neutral\\', 4)]
        self.audio = []
        self.labels = []
        for sub, label in subdirectory:
            NSCaudios = os.listdir(NSC_Root + sub)
            self.audio += [NSC_Root + sub + x for x in NSCaudios]
            self.labels += [label]*len(NSCaudios) 
        #Sanitycheck1
        assert(len(self.audio) == len(self.labels))
        self.length = len(self.audio)
        
    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        audio = self.audio[ind]
        label = self.labels[ind]
        #load audio when getting the item. If we do it in init, computer blue screens.
        waveform, sample_rate = torchaudio.load(audio)
        waveform = processor(waveform, sampling_rate = 16000,padding=True, device = device)
        # waveform = waveform.to(device)
        # label = label.to(device)
        waveform['labels'] = label

        return waveform
        #return waveform, label
    

In [91]:
# get me RAMMM!!!! 
import gc 
gc.collect()

2691

In [None]:
#works with hugging face dataset.
NSCTest = NSCDataset()
NSCTest[0]
NSCtest_dataset = Dataset.from_list(NSCTest)

In [None]:
#stratified sort to train/test splits. Requires encoding the columns to classes first.
NSCtest_dataset2 = NSCtest_dataset.class_encode_column('labels')
NSCtest_dataset2.train_test_split(test_size = 0.1, stratify_by_column = 'labels')

In [11]:
NSC_ASR_Root = 'C:\\Users\\Justin\\Documents\\idl\\2022\\Project\\top20k.tar\\top20k\\'

In [4]:
from transformers import Wav2Vec2Processor
model_name_or_path = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(target_sampling_rate)

16000


In [98]:
class NSCSpeechDataset(torch.utils.data.Dataset):
    
    def __init__(self): 
        
        self.audio_dir = NSC_ASR_Root + 'top20k\\'
        self.labels_dir = NSC_ASR_Root + 'text.txt'
        self.audio = []
        self.labels = []
        self.audio = sorted(os.listdir(self.audio_dir))

        #Get the correct labels for the 20000 that we have.
        with open(self.labels_dir) as f:
            lines = f.readlines()
            start = False
            for l in lines:
                idx = int(l[4:8])
                #Start at APP_4001 and take 20000 from there
                if idx >= 4001 and len(self.labels) != len(self.audio):
                    #Remove new line and extract transcript
                    self.labels.append(l[:-1].split(" ", 1))
        assert(len(self.audio) == len(self.labels))
        self.length = len(self.audio)
        #Sanity Check!
        #Could be commented out..
        for i in range(len(self.audio)):
            if(self.audio[i][:-4] != self.labels[i][0]):
                print(self.audio[i])
                print(self.labels[i][0])
                break
                
        self.labels = [x[1] for x in self.labels]

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        audio = self.audio[ind]
        #Sanity Check
        print(audio)
        label = self.labels[ind]
        #load audio when getting the item. If we do it in init, computer blue screens.
        waveform, sample_rate = torchaudio.load(self.audio_dir + audio)
        waveform = processor(waveform, sampling_rate = 16000,padding=True, device = device)
        # waveform = waveform.to(device)
        # label = label.to(device)
        waveform['labels'] = label

        return waveform
        #return waveform, label

In [99]:
NSCSpeech = NSCSpeechDataset()
for i in range(5):
    print(NSCSpeech[i])

app_4001_6001_phnd_deb-1-0000000-0003226.wav
{'input_values': [array([[ 1.5415961e-05,  1.5415961e-05,  1.5415961e-05, ...,
         7.4268505e-03,  6.5004211e-03, -5.5431598e-03]], dtype=float32)], 'labels': 'ART MING GUAN'}
app_4001_6001_phnd_deb-1-0010612-0028188.wav
{'input_values': [array([[-0.00868657, -0.01794732, -0.03357484, ...,  0.28823635,
         0.07350262, -0.15280704]], dtype=float32)], 'labels': 'OKAY SO UH I GUESS IT   S TIME FOR US TO DEBATE THE TOPIC UH TECHNOLOGY CREATES MORE PROBLEM THAN BENEFITS FOR THE SOCIETY SO PPB ERR ERR HAVE YOU UH DONE ANY UH RESEARCH ON THAT TOPIC'}
app_4001_6001_phnd_deb-1-0028188-0031663.wav
{'input_values': [array([[0.04162551, 0.03642268, 0.02750353, ..., 0.06540991, 0.11669502,
        0.08027516]], dtype=float32)], 'labels': 'SO YA THE PROPONENT OPPOSITION'}
app_4001_6001_phnd_deb-1-0034071-0048978.wav
{'input_values': [array([[-0.01377804, -0.02656972, -0.01672996, ..., -0.02952164,
        -0.03050562, -0.00098636]], dtype=float3

In [90]:
NSCSpeechDataset = Dataset.from_list(NSCSpeech)

MemoryError: Unable to allocate 24.9 MiB for an array with shape (1, 6516742) and data type float32