In [1]:
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from emoDB import EmodbDataset
from torch.nn.utils.rnn import pad_sequence
from torch import cuda
import torch.optim as optim

In [2]:
def emodb_collate_fn(batch):
    """Collate variable-length EmoDB samples into padded batch tensors.

    Args:
        batch: list of dicts, each providing 'waveform' (a tensor whose
            dimension 1 is the time axis), 'sample_rate' and 'emotion'.

    Returns:
        A 4-tuple ``(waveforms_padded, sample_rates, emotions,
        attention_masks_padded)`` where

        * ``waveforms_padded`` stacks every waveform after right-padding it
          with zeros to the longest clip in the batch,
        * ``sample_rates`` and ``emotions`` are the stacked per-item values,
        * ``attention_masks_padded`` is a bool tensor with the same shape as
          ``waveforms_padded``: True on real samples, False on padding.
    """
    waveforms = [item['waveform'] for item in batch]
    sample_rates = [item['sample_rate'] for item in batch]
    emotions = [item['emotion'] for item in batch]

    # Remember each clip's true length; it drives both padding and masking.
    lengths = [waveform.size(1) for waveform in waveforms]
    max_length = max(lengths)

    # Right-pad every waveform along the time axis, then stack.
    waveforms_padded = torch.stack([
        torch.nn.functional.pad(waveform, (0, max_length - length))
        for waveform, length in zip(waveforms, lengths)
    ])
    sample_rates = torch.stack([torch.tensor(sr) for sr in sample_rates])
    emotions = torch.stack([torch.tensor(em) for em in emotions])

    # Build the mask from the known lengths rather than from `!= 0`:
    # real audio can contain exact-zero samples (silence, zero crossings),
    # and those must not be treated as padding.
    positions = torch.arange(max_length, device=waveforms_padded.device)
    clip_lengths = torch.tensor(lengths, device=waveforms_padded.device)
    valid = positions.unsqueeze(0) < clip_lengths.unsqueeze(1)  # (batch, time)
    attention_masks_padded = valid.unsqueeze(1).expand_as(waveforms_padded).clone()
    return waveforms_padded, sample_rates, emotions, attention_masks_padded

In [3]:
# Build the dataset object from the project-local EmodbDataset class,
# pointing it at the ./emoDB/ directory (relative to the notebook's cwd).
emoDB = EmodbDataset('./emoDB/')

  self.df = pd.DataFrame(data, columns=['speaker_id', 'code', 'emotion', 'version', 'file'], dtype=np.float32)


In [4]:
from torch.utils.data import random_split

# Fixed seed so the train/test partition is identical after every
# Restart Kernel -> Run All; otherwise reported accuracies are not comparable
# between runs.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Use 80% of the data for training and the remaining 20% for testing.
train_size = int(TRAIN_FRACTION * len(emoDB))
test_size = len(emoDB) - train_size

train_dataset, test_dataset = random_split(
    emoDB,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

BATCH_SIZE = 4
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=emodb_collate_fn)
# Evaluation gains nothing from shuffling; keep the test order deterministic.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=emodb_collate_fn)

In [5]:
# Draw one collated batch from the training loader as a sanity check.
sample = next(iter(train_loader))
# Index 3 of the collate function's return tuple is the attention mask.
print(sample[3])

tensor([[[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ...,  True,  True,  True]],

        [[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ..., False, False, False]]])


In [10]:
def test(model, test_dl):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  total = 0
  correct = 0
  model.to(device)
  model.eval()
  with torch.no_grad():
    for X, rate, y, attention_mask in test_dl:
      X, y, attention_mask = X.to(device), y.to(device), attention_mask.to(device)
      outputs = model.forward(X)
      max, preds = torch.max(outputs.data,1)
      total += y.size(0)
      #print(preds)
      #print(y)
      correct += (preds == y).sum().item()
  accuracy = correct / total

  return accuracy

In [7]:
def train(model, lr, num_epochs, train_dl, test_dl):
    """Train ``model`` with Adam and cross-entropy loss, logging per epoch.

    Args:
        model: module mapping a batch ``X`` to per-class scores of shape
            (batch, num_classes).
        lr: learning rate passed to ``optim.Adam``.
        num_epochs: number of full passes over ``train_dl``.
        train_dl: iterable yielding ``(X, rate, y, attention_mask)`` tuples;
            only ``X`` and ``y`` are used here.
        test_dl: accepted for interface compatibility; it is not evaluated
            inside this function.

    Returns:
        None. Prints the device once, then the mean batch loss and training
        accuracy (in percent) after every epoch.

    Side effects:
        Moves ``model`` to CUDA when available (else CPU) and updates its
        parameters in place.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for X, _rate, y, _attention_mask in train_dl:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()

            # Clip the global gradient norm to 1 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            running_loss += loss.item()
            _, preds = outputs.max(1)
            total += y.size(0)
            correct += (preds == y).sum().item()
            num_batches += 1

        # Average over the batches actually consumed from `train_dl`; the
        # previous code divided by the length of the *global* `train_loader`,
        # which is wrong (or a NameError) for any other loader.
        mean_loss = running_loss / num_batches if num_batches else 0.0
        accuracy = 100. * correct / total if total else 0.0
        print(f"Epoch {epoch+1} | Loss: {mean_loss} | Accuracy: {accuracy}")
        # To monitor generalization per epoch, evaluate on `test_dl` here.
        torch.cuda.empty_cache()



In [8]:
# Number of output classes handed to every model constructor in this notebook.
# NOTE(review): presumably the seven emotion categories of EmoDB — confirm
# against the label encoding used by EmodbDataset.
EMO_CLASSES = 7

In [26]:
import importlib

import models

# Reload the local `models` module first so on-disk edits are picked up
# without restarting the kernel, then bind the three architectures from it.
importlib.reload(models)
from models import Baseline, ScratchModel, TransferModel

# One instance of each architecture, all sized for EMO_CLASSES outputs.
baseline_model = Baseline(EMO_CLASSES)
transfer_model = TransferModel(EMO_CLASSES)
scratch_model = ScratchModel(EMO_CLASSES)

# Hyperparameters for this run.
lr = 1e-4
num_epochs = 30

# Fit the transfer-learning model, then report its accuracy on the test split.
train(transfer_model, lr, num_epochs, train_loader, test_loader)
final_acc = test(transfer_model, test_loader)
print(f"final accuracy = {final_acc}")

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.layers.7.attention.out_proj.weight', 'encoder.layers.6.feed_forward.intermediate_dense.bias', 'encoder.layers.8.attention.q_proj.weight', 'encoder.layers.8.feed_forward.output_dense.weight', 'encoder.layers.10.layer_norm.bias', 'encoder.layers.10.attention.q_proj.weight', 'encoder.layers.7.feed_forward.intermediate_dense.weight', 'encoder.layers.7.feed_forward.output_dense.weight', 'encoder.layers.8.layer_norm.weight', 'encoder.layers.9.feed_forward.output_dense.weight', 'encoder.layers.8.attention.k_proj.weight', 'encoder.layers.8.attention.v_proj.weight', 'encoder.layers.6.attention.q_proj.bias', 'encoder.layers.10.feed_forward.output_dense.bias', 'encoder.layers.11.attention.k_proj.weight', 'encoder.layers.6.layer_norm.weight', 'encoder.layers.6.final_layer_norm.bias', 'encoder.layers.7.attention.k_proj.bias', 'encoder.layers.9.attention.out_proj.weight', 'encode

cuda
Epoch 1 | Loss: 1.5814092407159717 | Accuracy: 37.149532710280376
Test Accuracy: 0.6074766355140186
Epoch 2 | Loss: 1.170530772097757 | Accuracy: 53.97196261682243
Test Accuracy: 0.5420560747663551
Epoch 3 | Loss: 0.9197530951176849 | Accuracy: 65.18691588785046
Test Accuracy: 0.4672897196261682
Epoch 4 | Loss: 0.7689779881283502 | Accuracy: 73.13084112149532
Test Accuracy: 0.8037383177570093
Epoch 5 | Loss: 0.5844865975048498 | Accuracy: 79.67289719626169
Test Accuracy: 0.8130841121495327
Epoch 6 | Loss: 0.41631464299824195 | Accuracy: 87.14953271028037
Test Accuracy: 0.8598130841121495
Epoch 7 | Loss: 0.3090216882579098 | Accuracy: 91.82242990654206
Test Accuracy: 0.8598130841121495
Epoch 8 | Loss: 0.22858841393021084 | Accuracy: 94.1588785046729
Test Accuracy: 0.8037383177570093
Epoch 9 | Loss: 0.18205598725232286 | Accuracy: 95.32710280373831
Test Accuracy: 0.8691588785046729
Epoch 10 | Loss: 0.07247004911703901 | Accuracy: 98.59813084112149
Test Accuracy: 0.8411214953271028
E

In [None]:
# Load a HuBERT sequence-classification model from the
# "superb/hubert-base-superb-ks" checkpoint (downloaded on first use).
# NOTE(review): `hubert_model` is not referenced by any other cell visible
# in this notebook — confirm whether this cell is still needed.
from transformers import HubertForSequenceClassification
hubert_model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-ks")

Downloading (…)lve/main/config.json: 100%|██████████| 1.93k/1.93k [00:00<00:00, 644kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 378M/378M [00:37<00:00, 10.1MB/s] 
