In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import librosa
import tqdm
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
annotations_1 = pd.read_csv("DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv")
annotations_2 = pd.read_csv("DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_2000_2058.csv")

annotations = pd.concat([annotations_1, annotations_2]).reset_index(drop=True)

annotations.columns = annotations.columns.str.replace(' ', '')

In [None]:
annotations = annotations.drop(columns=["valence_std", "arousal_std", 'valence_max_mean', 'valence_max_std', 'valence_min_mean',
       'valence_min_std', 'arousal_max_mean', 'arousal_max_std',
       'arousal_min_mean', 'arousal_min_std'])


In [13]:
annotations.head(5)

Unnamed: 0,song_id,valence_mean,arousal_mean
0,2,3.1,3.0
1,3,3.5,3.3
2,4,5.7,5.5
3,5,4.4,5.3
4,7,5.8,6.4


In [4]:
def get_song_info(filepath):

    song_info = {"song_id": [], "mfcc": [], "cens": []}

    file_count = sum(len(files) for _, _, files in os.walk(filepath))

    with tqdm.tqdm(total=file_count) as pbar:
        for _, _, files in os.walk(filepath):
            for file in files:
                if ".mp3" in file:
                    id = file[:file.index(".")]

                    y, sr = librosa.load(os.path.join(filepath, file), sr=22050, mono=True)

                    mfcc = librosa.feature.mfcc(
                        y=y,
                        sr=sr,
                        n_mfcc=20,
                        hop_length=512
                    ).T

                    cens = librosa.feature.chroma_cens(
                        y=y,
                        sr=sr,
                        hop_length=512
                    ).T
                    
                    song_info["song_id"].append(id)
                    song_info["mfcc"].append(mfcc)
                    song_info["cens"].append(cens)

                    pbar.update(1)

    return song_info

In [5]:
result = get_song_info("DEAM_Dataset/DEAM_audio/MEMD_audio")


100%|██████████| 1802/1802 [16:33<00:00,  1.81it/s]


In [6]:
features = pd.DataFrame.from_dict(result)

features.columns
features.head(5)

Unnamed: 0,song_id,mfcc,cens
0,746,"[[-517.20264, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[[0.23594299, 0.04124677, 0.04124677, 0.104198..."
1,1588,"[[-139.80067, 117.861465, -55.3455, -6.400688,...","[[0.20087758, 0.040090047, 0.12014431, 0.23118..."
2,1563,"[[-203.79233, 130.13647, 13.749421, 38.079147,...","[[0.613879, 0.44682395, 0.56096673, 0.04413091..."
3,1205,"[[-196.68634, 77.15793, 31.028969, 17.508139, ...","[[0.3299707, 0.27401876, 0.113287285, 0.237430..."
4,791,"[[-688.4617, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.06243906, 0.25726554, 0.0, 0.0016241824, 0..."


In [7]:
def aggregate_song(mfcc, cens):
    # mfcc: (T, n_mfcc) ; cens: (T, n_cens)
    mfcc = np.asarray(mfcc)
    cens = np.asarray(cens)
    mfcc_mean = mfcc.mean(axis=0)
    mfcc_std  = mfcc.std(axis=0)
    cens_mean = cens.mean(axis=0)
    cens_std  = cens.std(axis=0)
    return np.concatenate([mfcc_mean, mfcc_std, cens_mean, cens_std], axis=0)


In [15]:
print(annotations[annotations['song_id'] == 746])

     song_id  valence_mean  arousal_mean
584      746           3.7           4.6


In [17]:
rows = []
for _, r in features.iterrows():
    sid = r['song_id']
    # find annotation row for this song id
    ann = annotations[annotations['song_id'] == int(sid)]
    if ann.empty:
        continue
    # change these if your annotation columns are named differently
    val = ann['valence_mean'].values[0]
    aro = ann['arousal_mean'].values[0]
    vec = aggregate_song(r['mfcc'], r['cens'])
    rows.append((sid, vec, np.array([val, aro])))

song_ids, X, y = zip(*rows)
X = np.vstack(X)
y = np.vstack(y)
print("X shape:", X.shape, "y shape:", y.shape)

X shape: (1802, 64) y shape: (1802, 2)


In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)

# Convert to PyTorch tensors
X_train_t = torch.tensor(X_train_s, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
X_val_t   = torch.tensor(X_val_s, dtype=torch.float32)
y_val_t   = torch.tensor(y_val, dtype=torch.float32)

In [19]:
class SimpleRegressor(nn.Module):
    def __init__(self, input_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, hidden//2),
            nn.ReLU(),
            nn.Linear(hidden//2, 2)  # outputs: [valence, arousal]
        )
    def forward(self, x):
        return self.net(x)

In [20]:
def evaluate(model, loader, device, loss_fn):
    model.eval()
    total = 0.0
    loss_sum = 0.0
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = loss_fn(out, yb)
            loss_sum += loss.item() * xb.size(0)
            total += xb.size(0)
            preds.append(out.cpu().numpy())
            trues.append(yb.cpu().numpy())
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    return loss_sum / total, preds, trues

In [21]:
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=128, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleRegressor(X_train_t.shape[1], hidden=128).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

In [22]:
epochs = 40
best_val = float('inf')
for ep in range(epochs):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        out = model(xb)
        loss = loss_fn(out, yb)
        loss.backward()
        opt.step()
    val_loss, val_preds, val_trues = evaluate(model, val_loader, device, loss_fn)
    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), "best_regressor.pt")
    print(f"Epoch {ep+1}/{epochs}  val_mse={val_loss:.4f}")

Epoch 1/40  val_mse=9.2238
Epoch 2/40  val_mse=3.4055
Epoch 3/40  val_mse=2.3718
Epoch 4/40  val_mse=1.7994
Epoch 5/40  val_mse=1.5129
Epoch 6/40  val_mse=1.3391
Epoch 7/40  val_mse=1.2217
Epoch 8/40  val_mse=1.1747
Epoch 9/40  val_mse=1.1465
Epoch 10/40  val_mse=1.1101
Epoch 11/40  val_mse=1.0964
Epoch 12/40  val_mse=1.0363
Epoch 13/40  val_mse=0.9968
Epoch 14/40  val_mse=1.0113
Epoch 15/40  val_mse=0.9837
Epoch 16/40  val_mse=0.9643
Epoch 17/40  val_mse=0.9651
Epoch 18/40  val_mse=0.9675
Epoch 19/40  val_mse=0.9653
Epoch 20/40  val_mse=0.9342
Epoch 21/40  val_mse=0.9595
Epoch 22/40  val_mse=0.9346
Epoch 23/40  val_mse=0.9338
Epoch 24/40  val_mse=0.9672
Epoch 25/40  val_mse=0.9214
Epoch 26/40  val_mse=0.9292
Epoch 27/40  val_mse=0.9495
Epoch 28/40  val_mse=0.9136
Epoch 29/40  val_mse=0.9266
Epoch 30/40  val_mse=0.9176
Epoch 31/40  val_mse=0.9258
Epoch 32/40  val_mse=0.9102
Epoch 33/40  val_mse=0.8984
Epoch 34/40  val_mse=0.9022
Epoch 35/40  val_mse=0.8974
Epoch 36/40  val_mse=0.9143
E

In [None]:
val_loss, val_preds, val_trues = evaluate(model, val_loader, device, loss_fn)
mse_val = mean_squared_error(val_trues, val_preds)
print("Val MSE:", mse_val)

for i, name in enumerate(['valence', 'arousal']):
    r, _ = pearsonr(val_trues[:, i], val_preds[:, i])
    print(f"Pearson r ({name}): {r:.3f}")


Val MSE: 0.9206092357635498
Pearson r (valence): 0.633
Pearson r (arousal): 0.673
4.0747476
4.6
