In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import os
from tqdm import tqdm

from glob import glob

import librosa
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn
from torch.optim import Adam

from sklearn.model_selection import KFold
from scipy.stats import pearsonr
import time

In [None]:
# performing a check
id = 35
path = f'/DEAM_Dataset/DEAM_audio/MEMD_audio/{id}.mp3'
y, sr = librosa.load(path, sr=44100)
print(f"{len(y)} samples at {sr} Hz")


1986607 samples at 44100 Hz


In [4]:
def compute_beats(y, sr, hop_length):

    tempo, beat_frames = librosa.beat.beat_track(
        y=y,
        sr=sr,
        hop_length=hop_length,
        units="frames"
    )

    beat_times_sec = librosa.frames_to_time(
        beat_frames,
        sr=sr,
        hop_length=hop_length
    )

    return beat_times_sec, beat_frames, tempo

In [5]:
def aggregate_feature(mfcc, cens, beat_frames, beat_times):

    X = []
    durations = []

    # beat_durations = np.diff(beat_times)
    # beat_durations = np.clip(beat_durations, 1e-2, None)

    # tempo_inst = 60.0 / beat_durations
    # tempo_features = np.column_stack([
    #         tempo_inst,
    #         np.log(tempo_inst),
    #         np.diff(np.concatenate([[tempo_inst[0]], tempo_inst]))
    #     ])

    for t in range(len(beat_frames) - 1):
        start = beat_frames[t]
        end   = beat_frames[t + 1]

        if end > start:
            mfcc_t = mfcc[start:end].mean(axis=0)
            cens_t = cens[start:end].mean(axis=0)
        else:
            mfcc_t = mfcc[start]
            cens_t = cens[start:end]

        X.append(np.concatenate([mfcc_t, cens_t]))
        durations.append(beat_times[t + 1] - beat_times[t])



    X = np.vstack(X)
    return X, np.array(durations)

In [6]:
def parse_annotation_times(df):

    time_cols = [c for c in df.columns if c.startswith("sample_")]

    times_sec = np.array([int(c.replace("sample_", "").replace("ms", "")) / 1000.0
                          for c in time_cols])

    return time_cols, times_sec

In [7]:
def aggregate_annotations_to_beats(
        times_sec,
        values,
        beat_times_sec
):

    y_beats = []

    for t in range(len(beat_times_sec) - 1):
        start = beat_times_sec[t]
        end   = beat_times_sec[t + 1]

        mask = (times_sec >= start) & (times_sec < end)

        if np.any(mask):
            y_beats.append(values[mask].mean())
        else:
            # fallback: nearest neighbor
            idx = np.argmin(np.abs(times_sec - start))
            y_beats.append(values[idx])
    return np.array(y_beats)

In [8]:
def get_song_ids(filepath):

    ids = []
    for root, dirs, files in os.walk(filepath):
        for file in files:
            if ".mp3" in file:
                ids.append(file[:file.index(".")])

    return ids

In [9]:
class DEAMDataset(Dataset):

    def __init__(
            self,
            audio_dir,
            arousal_labels,
            valence_labels,
            song_ids,
            hop_length=512,
            n_mfcc=20
    ):

        self.audio_dir = audio_dir
        self.arousal_labels = pd.read_csv(arousal_labels)
        self.valence_labels = pd.read_csv(valence_labels)
        self.hop_length = hop_length
        self.n_mfcc = n_mfcc
        self.song_ids = song_ids
        self.cache = {}
        self.time_cols, self.time_sec = parse_annotation_times(self.valence_labels)

        self.valence_labels["song_id"] = self.valence_labels["song_id"].astype(str).str.strip()
        self.arousal_labels["song_id"] = self.arousal_labels["song_id"].astype(str).str.strip()

        self.valence_dict = self.valence_labels.set_index("song_id")
        self.arousal_dict = self.arousal_labels.set_index("song_id")


    def __len__(self):
        return len(self.song_ids)

    def __getitem__(self, idx):

        song_id = self.song_ids[idx]

        if song_id in self.cache:
          return self.cache[song_id]

        path = os.path.join(self.audio_dir, f"{song_id}.mp3")

        # y, sr = librosa.load(path, sr=44100)

        # beat_times_sec, beat_frames, tempo = compute_beats(y, sr, self.hop_length)

        y, sr = librosa.load(path, sr=22050, mono=True)

        tempo, beat_frames = librosa.beat.beat_track(
          y=y, sr=sr, hop_length=self.hop_length, units="frames"
        )

        beat_times_sec = librosa.frames_to_time(beat_frames, sr=sr, hop_length=self.hop_length)

        if beat_times_sec is None or len(beat_times_sec) < 2:
          return None

        mfcc = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=self.n_mfcc,
            hop_length=self.hop_length
        ).T

        # y_harmonic, _ = librosa.effects.hpss(y)

        # hpcp = librosa.feature.chroma_cqt(
        #     y=y_harmonic,
        #     sr=sr,
        #     bins_per_octave=36,
        #     n_chroma=12
        # ).T

        cens = librosa.feature.chroma_cens(
            y=y,
            sr=sr,
            hop_length=self.hop_length
        ).T

        X, durations = aggregate_feature(mfcc, cens, beat_frames, beat_times_sec)

        valence = self.valence_dict.loc[str(song_id), self.time_cols].values
        arousal = self.arousal_dict.loc[str(song_id), self.time_cols].values

        val_beats = aggregate_annotations_to_beats(self.time_sec, valence, beat_times_sec)
        aro_beats = aggregate_annotations_to_beats(self.time_sec, arousal, beat_times_sec)

        X = torch.tensor(X, dtype=torch.float32)

        y = np.column_stack([val_beats, aro_beats])

        valid = ~np.isnan(y).any(axis=1)

        X = X[valid]
        y = y[valid]
        durations = durations[valid]

        y = torch.tensor(y, dtype=torch.float32)

        durations = torch.tensor(durations, dtype=torch.float32)

        if X.shape[0] == 0 or y.shape[0] == 0:
          return None

        if not torch.isfinite(X).all():
          return None
        if not torch.isfinite(y).all():
          return None

        sample = {
            "X": X,
            "y": y,
            "durations": durations
        }

        self.cache[song_id] = sample
        return sample


In [None]:
ids = get_song_ids("/DEAM_Dataset/DEAM_audio/MEMD_audio")

dataset = DEAMDataset("/DEAM_Dataset/DEAM_audio/MEMD_audio",
                           "/DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/dynamic (per second annotations)/arousal.csv",
                           "/DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/dynamic (per second annotations)/valence.csv",
                           ids)

print("Precomputing features...")
for i in tqdm(range(len(dataset))):
    _ = dataset[i]
print("Done.")

Precomputing features...


100%|██████████| 1802/1802 [33:10<00:00,  1.10s/it]

Done.





In [11]:
class BeatRegressor(nn.Module):

    def __init__(self, input_dim):

        super().__init__()
        self.next = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        return self.next(x)

In [12]:
def flatten_collate(batch):

    batch = [b for b in batch if b is not None]

    if len(batch) == 0:
        return None, None

    X_list = [b["X"] for b in batch]
    y_list = [b["y"] for b in batch]

    X = torch.cat(X_list, dim=0)
    y = torch.cat(y_list, dim=0)

    return X, y



In [13]:
num_songs = len(dataset)
indices = np.arange(num_songs)
kf = KFold(n_splits=10, shuffle=True, random_state=42)


In [None]:
def compute_normalization(loader):
    X_all = []

    for X, _ in loader:
        if X is None:
            continue
        X_all.append(X)

    if len(X_all) == 0:
        raise RuntimeError("No valid samples found")

    X_all = torch.cat(X_all, dim=0)
    mean = X_all.mean(dim=0)
    std = X_all.std(dim=0) + 1e-6
    return mean, std

In [21]:

device = "cuda" if torch.cuda.is_available() else "cpu"

# valid = 0
# invalid = 0

# for i in range(len(dataset)):
#     sample = dataset[i]
#     if sample is None:
#         invalid += 1
#     else:
#         valid += 1

# print("Valid songs:", valid)
# print("Invalid songs:", invalid)

fold_results = []
val_pear_results = []
aro_pear_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
    print(f"===== Fold {fold + 1} of 10 =====")

    train_set = Subset(dataset, train_idx)

    # for i in range(len(train_set)):
    #     try:
    #         sample = train_set[i]
    #     except Exception as e:
    #         print(f"Error at train_set index {i}")
    #         raise


    val_set = Subset(dataset, val_idx)

    train_loader = DataLoader(
        train_set,
        batch_size=8,
        shuffle=True,
        collate_fn=flatten_collate
    )

    # for i, (X, y) in enumerate(train_loader):
    #     print(i, X.shape)

    val_loader = DataLoader(
        dataset=val_set,
        batch_size=8,
        shuffle=False,
        collate_fn=flatten_collate
    )

    # batch = next(iter(train_loader))
    # print(type(batch))
    # print(batch)

    mean, std = compute_normalization(train_loader)

    input_dim = train_set[0]["X"].shape[1]

    model = BeatRegressor(input_dim).to(device)

    optimizer = Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    best_val_loss = float("inf")
    best_val_pear = float("inf")
    best_aro_pear = float("inf")

    for epoch in range(30):
        model.train()
        train_loss = 0.0

        train_corr_valence = 0.0
        train_corr_arousal = 0.0

        n_batches = 0

        for X, y in train_loader:

            X = ((X - mean) / std).to(device)
            y = y.to(device)

            optimizer.zero_grad()
            y_hat = model(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            corr_valence, _ = pearsonr(y[:, 0].cpu().detach().numpy(), y_hat[:, 0].cpu().detach().numpy())
            corr_arousal, _ = pearsonr(y[:, 1].cpu().detach().numpy(), y_hat[:, 1].cpu().detach().numpy())
            n_batches += 1

        train_loss /= n_batches
        corr_valence /= n_batches
        corr_arousal /= n_batches

        model.eval()

        val_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            for X, y in val_loader:
                X = ((X - mean) / std).to(device)
                y = y.to(device)
                y_hat = model(X)
                val_loss += criterion(y_hat, y).item()
                n_batches += 1

        val_loss /= n_batches

        print(
            f"Epoch {epoch+1:02d} | "
            f"Train MSE: {train_loss:.4f} | "
            f"Val MSE: {val_loss:.4f} | "
            f"Var Pearson: {corr_valence:.4f} | "
            f"Aro Pearson: {corr_arousal:.4f} "
        )

        best_val_loss = min(best_val_loss, val_loss)
        best_val_pear = min(best_val_pear, corr_valence)
        best_aro_pear = min(best_aro_pear, corr_aro)

    fold_results.append(best_val_loss)
    val_pear_results.append(best_val_pear)
    aro_pear_results.append(best_aro_pear)



===== Fold 1 of 10 =====
Epoch 01 | Train MSE: 0.0518 | Val MSE: 0.0454
Epoch 02 | Train MSE: 0.0458 | Val MSE: 0.0423
Epoch 03 | Train MSE: 0.0439 | Val MSE: 0.0395
Epoch 04 | Train MSE: 0.0437 | Val MSE: 0.0405
Epoch 05 | Train MSE: 0.0428 | Val MSE: 0.0409
Epoch 06 | Train MSE: 0.0418 | Val MSE: 0.0414
Epoch 07 | Train MSE: 0.0413 | Val MSE: 0.0405
Epoch 08 | Train MSE: 0.0407 | Val MSE: 0.0410
Epoch 09 | Train MSE: 0.0405 | Val MSE: 0.0398
Epoch 10 | Train MSE: 0.0403 | Val MSE: 0.0406
Epoch 11 | Train MSE: 0.0395 | Val MSE: 0.0413
Epoch 12 | Train MSE: 0.0394 | Val MSE: 0.0404
Epoch 13 | Train MSE: 0.0392 | Val MSE: 0.0415
Epoch 14 | Train MSE: 0.0391 | Val MSE: 0.0417
Epoch 15 | Train MSE: 0.0382 | Val MSE: 0.0409
Epoch 16 | Train MSE: 0.0382 | Val MSE: 0.0413
Epoch 17 | Train MSE: 0.0378 | Val MSE: 0.0404
Epoch 18 | Train MSE: 0.0373 | Val MSE: 0.0407
Epoch 19 | Train MSE: 0.0373 | Val MSE: 0.0395
Epoch 20 | Train MSE: 0.0369 | Val MSE: 0.0422
Epoch 21 | Train MSE: 0.0368 | Val 

In [22]:
fold_results = np.array(fold_results)
val_pear_results = np.array(val_pear_results)
aro_pear_results = np.array(aro_pear_results)

print("10-Fold CV Results:")
print(f"Mean MSE: {fold_results.mean():.4f}")
print(f"Std  MSE: {fold_results.std():.4f}")

print(f"Mean Val PCC: {val_pear_results.mean():.4f}")
print(f"Std  Val PCC: {val_pear_results.std():.4f}")

print(f"Mean Aro PCC: {aro_pear_results.mean():.4f}")
print(f"Std  Aro PCC: {aro_pear_results.std():.4f}")


10-Fold CV Results:
Mean MSE: 0.0411
Std  MSE: 0.0031
