In [10]:
import os
import random as rnd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchaudio
from pytorch_model_summary import summary
import torch.nn as nn
from torch.nn.functional import normalize
import torch.utils.data as data
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import datasets
from torchvision.transforms import ToTensor
import torchaudio.prototype.models
import torchaudio.prototype.pipelines

In [11]:
# Choose the compute device once; later cells reuse the `device` name.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# When a GPU was selected, print the name of GPU 0 (nothing is printed otherwise).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3060


In [12]:
class SpikerboxRecordings(Dataset):
    """Dataset of labelled audio files served as fixed-size, dB-scaled spectrograms.

    The annotations CSV is read with pandas. Column 0 holds the file name
    (joined onto ``audio_dir``) and column 3 holds the label.

    Each item goes through: load -> move to ``device`` -> resample to
    ``target_sample_rate`` -> cut / zero-pad to ``num_samples`` ->
    ``transformation`` -> AmplitudeToDB -> crop / zero-pad the last axis to 96.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        """Read the annotations table and store the preprocessing settings.

        Args:
            annotations_file: Path of the CSV file describing the recordings.
            audio_dir: Directory that the file names in column 0 are relative to.
            transformation: Callable with a ``.to(device)`` method that is applied
                to the fixed-length waveform (the notebook passes a MelSpectrogram).
            target_sample_rate: Sample rate every recording is resampled to.
            num_samples: Number of waveform samples kept per recording.
            device: Device the waveform and the transforms are moved to.
        """
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(signal, label)`` for the recording at row ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sample_rate = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        # Resample BEFORE cutting/padding: num_samples and the transformation are
        # both expressed in the target sample rate, so a file recorded at another
        # rate would otherwise cover the wrong time span and frequency scale.
        signal = self._resample_if_necessary(signal, sample_rate)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        signal = self._make_log_mels(signal)
        signal = self._adjust_mel_width_if_necessary(signal, 96)
        return signal, label

    def _get_audio_sample_path(self, index):
        """Build the path of the audio file from column 0 of row ``index``."""
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 0])
        return path

    def _resample_if_necessary(self, signal, sample_rate):
        """Resample ``signal`` to the target rate; return it unchanged if it already matches."""
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _get_audio_sample_label(self, index):
        """Return the label stored in column 3 of row ``index``."""
        return self.annotations.iloc[index, 3]

    def _cut_if_necessary(self, signal):
        """Truncate axis 1 of ``signal`` to at most ``num_samples`` entries."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of ``signal`` so that axis 1 has ``num_samples`` entries."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _make_log_mels(self, signal):
        """Apply torchaudio's AmplitudeToDB (default settings) to ``signal``."""
        return torchaudio.transforms.AmplitudeToDB().to(self.device)(signal)

    def _adjust_mel_width_if_necessary(self, log_mel_spectrogram, width):
        """Zero-pad or crop the last axis of the spectrogram to exactly ``width``."""
        current_width = log_mel_spectrogram.shape[-1]
        if current_width < width:
            pad_width = width - current_width
            log_mel_spectrogram = torch.nn.functional.pad(log_mel_spectrogram, (0, pad_width))
        elif current_width > width:
            # Ellipsis keeps the crop valid whatever the number of leading axes.
            log_mel_spectrogram = log_mel_spectrogram[..., :width]
        return log_mel_spectrogram

In [13]:
# Locations of the label tables and audio folders for each split.
ANNOTATIONS_FILE_TRAIN = "train_data/metadata/file_labels.csv"
AUDIO_DIR_TRAIN = "train_data/files"
ANNOTATIONS_FILE_VAL = "valid_data/metadata/file_labels.csv"
AUDIO_DIR_VAL = "valid_data/files"

# Waveform framing: 9600 samples at 10000 Hz is 0.96 s of audio per clip.
SAMPLE_RATE = 10000
NUM_SAMPLES = 9600

# One mel-spectrogram transform object, handed to both datasets below.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=400,
    hop_length=160,
    n_mels=64,
)

# Training split.
spr_train = SpikerboxRecordings(
    annotations_file=ANNOTATIONS_FILE_TRAIN,
    audio_dir=AUDIO_DIR_TRAIN,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)

# Validation split, preprocessed with exactly the same settings.
spr_valid = SpikerboxRecordings(
    annotations_file=ANNOTATIONS_FILE_VAL,
    audio_dir=AUDIO_DIR_VAL,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)

In [14]:
class VGGishNetwork(nn.Module):
    """VGG-style convolutional classifier for single-channel 2-D inputs.

    ``features`` stacks six 3x3 convolutions (padding 1, so spatial size is
    kept) with ReLU, and four 2x2 max-pools that each halve height and width.
    ``embeddings`` is a three-layer MLP whose first layer expects 12288
    flattened features, i.e. 512 channels times 24 spatial cells. A
    (batch, 1, 64, 96) input satisfies this: 64x96 -> 4x6 after four pools.

    Args:
        num_classes: Size of the output layer. Defaults to 10, which matches
            the original hard-coded value.
    """

    # Output channels of each conv layer, with "M" marking a 2x2 max-pool.
    _FEATURE_LAYOUT = (64, "M", 128, "M", 256, 256, "M", 512, 512, "M")

    def __init__(self, num_classes=10):
        super().__init__()

        # Layers are appended in the same order as the original hand-written
        # Sequential, so module indices (and state_dict keys) are unchanged.
        feature_layers = []
        in_channels = 1
        for spec in self._FEATURE_LAYOUT:
            if spec == "M":
                feature_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                feature_layers.append(
                    nn.Conv2d(in_channels, spec, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                )
                feature_layers.append(nn.ReLU(inplace=True))
                in_channels = spec
        self.features = nn.Sequential(*feature_layers)

        self.embeddings = nn.Sequential(
            nn.Linear(12288, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for input ``x``."""
        x = self.features(x)
        # Keep the batch axis, flatten channels and spatial axes together.
        x = torch.flatten(x, 1)
        x = self.embeddings(x)
        return x

In [15]:
def scaled_accuracy(output, target, max_distance, device):
    """Distance-aware accuracy: partial credit for predictions close to the target class.

    Each sample scores ``1 - |argmax(output) - target| / max_distance``,
    floored at 0, and the mean score over the batch is returned.

    Args:
        output: Class scores; the prediction is the argmax over dim 1.
        target: Integer class labels; flattened to 1-D before comparison.
        max_distance: Class distance at (and beyond) which the score is 0.
        device: Device both tensors are moved to before computing.

    Returns:
        The mean score as a Python float.
    """
    logits = output.to(device)
    labels = target.to(device)
    with torch.no_grad():
        predicted_class = logits.argmax(dim=1)
        class_gap = (predicted_class - labels.view(-1)).abs().float()
        per_sample_score = (1 - class_gap / max_distance).clamp(min=0.0)
        return per_sample_score.mean().item()

In [16]:
def _run_epoch(model, loader, loss_func, device, max_distance, optimizer=None):
    """Make one pass over ``loader``; train when ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_batch_loss, mean_batch_scaled_accuracy, exact_accuracy)`` where
        the first two are averaged over batches, the last over samples. Each
        value is ``nan`` when the loader yields nothing to average over.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()

    loss_sum = 0.0
    scaled_sum = 0.0
    n_correct = 0
    n_seen = 0
    # Gradients are tracked only for the training pass.
    with torch.set_grad_enabled(is_training):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            if is_training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_func(outputs, targets)
            if is_training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            scaled_sum += scaled_accuracy(outputs, targets, max_distance, device)
            n_correct += (outputs.argmax(dim=1) == targets).sum().item()
            n_seen += targets.size(0)

    n_batches = len(loader)
    nan = float("nan")
    mean_loss = loss_sum / n_batches if n_batches else nan
    mean_scaled = scaled_sum / n_batches if n_batches else nan
    accuracy = n_correct / n_seen if n_seen else nan
    return mean_loss, mean_scaled, accuracy


def train(model, train_dl, val_dl, optimizer, loss_func, epochs, device, max_distance=9):
    """Train ``model`` for ``epochs`` epochs, validating and printing metrics after each.

    Args:
        model: Network to optimise; left in eval mode when the function returns.
        train_dl: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        val_dl: Same, for validation; evaluated without gradient tracking.
        optimizer: Optimizer stepped once per training batch.
        loss_func: Callable ``loss_func(predictions, targets)`` returning a scalar tensor.
        epochs: Number of passes over ``train_dl``.
        device: Device every batch is moved to.
        max_distance: Distance passed to ``scaled_accuracy``. Defaults to 9,
            the value that was previously hard-coded.

    An empty loader reports ``nan`` metrics instead of raising ZeroDivisionError.
    """
    for epoch in range(epochs):
        epoch_loss, scaled_train, train_accuracy = _run_epoch(
            model, train_dl, loss_func, device, max_distance, optimizer=optimizer
        )
        val_loss, scaled_val, val_accuracy = _run_epoch(
            model, val_dl, loss_func, device, max_distance
        )

        epoch_scaled_accuracy_train = scaled_train * 100
        epoch_scaled_accuracy_val = scaled_val * 100
        print(
            f'Epoch [{epoch+1}/{epochs}]:\n'
            f'Avg. Train Loss: {epoch_loss:.7f}, Train Accuracy: {100 * train_accuracy:.2f}%, '
            f'Scaled Train Accuracy: {epoch_scaled_accuracy_train:.2f}%\n'
            f'Avg. Valid Loss: {val_loss:.7f}, Valid Accuracy: {100 * val_accuracy:.2f}%, '
            f'Scaled Valid Accuracy: {epoch_scaled_accuracy_val:.2f}%'
        )

In [17]:
# Ask PyTorch to release cached GPU memory before the run starts.
torch.cuda.empty_cache()

# Hyperparameters for this run.
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.01

# A single seeded generator is handed to both loaders.
# NOTE(review): no global torch.manual_seed is visible in this notebook, so the
# model's initial weights are presumably not fixed between runs; please confirm.
generator = torch.Generator().manual_seed(569567390)
train_dl = DataLoader(
    dataset=spr_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=generator,
)
val_dl = DataLoader(
    dataset=spr_valid,
    batch_size=BATCH_SIZE,
    shuffle=False,
    generator=generator,
)

# Model, loss and optimizer.
VGGish_Stress = VGGishNetwork().to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.AdamW(
    VGGish_Stress.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

train(
    model=VGGish_Stress,
    train_dl=train_dl,
    val_dl=val_dl,
    optimizer=optimizer,
    loss_func=loss_fn,
    epochs=EPOCHS,
    device=device,
)

Epoch [1/10]:
Avg. Train Loss: 2.6007324, Train Accuracy: 30.19%, Scaled Train Accuracy: 83.80%
Avg. Valid Loss: 4.0549359, Valid Accuracy: 31.16%, Scaled Valid Accuracy: 78.86%
Epoch [2/10]:
Avg. Train Loss: 1.2385280, Train Accuracy: 45.72%, Scaled Train Accuracy: 89.87%
Avg. Valid Loss: 8.6048568, Valid Accuracy: 0.65%, Scaled Valid Accuracy: 72.23%
Epoch [3/10]:
Avg. Train Loss: 0.8808708, Train Accuracy: 56.22%, Scaled Train Accuracy: 92.83%
Avg. Valid Loss: 13.2961718, Valid Accuracy: 2.40%, Scaled Valid Accuracy: 70.00%
Epoch [4/10]:
Avg. Train Loss: 0.6935807, Train Accuracy: 65.05%, Scaled Train Accuracy: 93.84%
Avg. Valid Loss: 11.3774577, Valid Accuracy: 1.46%, Scaled Valid Accuracy: 72.15%
Epoch [5/10]:
Avg. Train Loss: 0.6420310, Train Accuracy: 67.78%, Scaled Train Accuracy: 94.19%
Avg. Valid Loss: 10.0724328, Valid Accuracy: 10.74%, Scaled Valid Accuracy: 73.00%
Epoch [6/10]:
Avg. Train Loss: 0.5702324, Train Accuracy: 71.53%, Scaled Train Accuracy: 94.83%
Avg. Valid Los

In [18]:
# Create the output folder first: torch.save does not create missing parent
# directories, and failing here would throw away the finished training run.
os.makedirs("trained_models", exist_ok=True)
torch.save(VGGish_Stress.state_dict(), "trained_models/VGGish_Stress.pth")