In [None]:
import librosa
import pandas as pd
import torch
import torch.nn as nn
import glob
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import random
from sklearn.model_selection import KFold

For tomorrow:

1. Implement cross validation
2. See whether MFCC alone or MFCC + CENS works best
3. save the best state of the model
4. produce graphs showing valence and arousal mappings preds vs true
5. apply the model to covers1000
6. analyze the results of covers1000 keeping in mind # of covers per song, first release year

In [69]:
def get_song_level_predictions():

    anno_1_2000 = pd.read_csv("DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv")
    anno_2001_2058 = pd.read_csv("DEAM_Dataset/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_2000_2058.csv")

    annotations = pd.concat([anno_1_2000, anno_2001_2058])
    
    annotations = annotations.rename(columns={column: column.replace(" ", "") for column in annotations.columns})

    annotations['song_id'] = annotations['song_id'].astype(str)
    annotations = annotations.drop(columns=[column for column in annotations.columns if column not in ["song_id", "valence_mean", "arousal_mean"]])
    annotations['valence_mean'] = ((annotations['valence_mean'] - 1) / (9 - 1)) * (1 - (-1)) + (-1)
    annotations['arousal_mean'] = ((annotations['arousal_mean'] - 1) / (9 - 1)) * (1 - (-1)) + (-1)

    return annotations

In [70]:
annotations = get_song_level_predictions()
annotations.head()

Unnamed: 0,song_id,valence_mean,arousal_mean
0,2,-0.475,-0.5
1,3,-0.375,-0.425
2,4,0.175,0.125
3,5,-0.15,0.075
4,7,0.2,0.35


In [45]:
def get_mfcc_from_audio(file):

    y, sr = librosa.load(file)

    mfcc_matrix = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=20
    )

    mfcc_matrix = torch.tensor(mfcc_matrix, dtype=torch.float32)
    mean = mfcc_matrix.mean(dim=1)
    std = mfcc_matrix.std(dim=1)
    min_ = mfcc_matrix.min(dim=1).values
    max_ = mfcc_matrix.max(dim=1).values
    X = torch.cat([mean, std, min_, max_], dim = 0)

    return X

In [None]:
def get_cens_from_audio(file):
    y, sr = librosa.load(file)

    chroma_matrix = librosa.feature.chroma_cens(
        y=y,
        sr=sr,
        n_chroma=12
    )

    chroma_matrix = torch.tensor(chroma_matrix, dtype=torch.float32)
    mean = chroma_matrix.mean(dim=1)
    std = chroma_matrix.std(dim=1)
    min_ = chroma_matrix.min(dim=1).values
    max_ = chroma_matrix.max(dim=1).values
    X = torch.cat([mean, std, min_, max_], dim = 0)

    return X



In [None]:
def create_feature_tensor():

    audio_dir = sorted(glob.glob("DEAM_Dataset/DEAM_audio/MEMD_audio/*.mp3", recursive=True))
    X = torch.empty(0,80)

    for song in tqdm(audio_dir, total=len(audio_dir)):
        mfcc_tensor = get_mfcc_from_audio(song)
        reshaped = mfcc_tensor.view(1, -1)
        X = torch.cat([X, reshaped], dim=0)
    
    return X

In [52]:
X = create_feature_tensor()

['DEAM_Dataset/DEAM_audio/MEMD_audio/10.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1000.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1001.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1002.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1003.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1004.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1005.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1006.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1007.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1008.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1009.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/101.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1010.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1011.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1012.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1013.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1014.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1015.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1016.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1017.mp3', 'DEAM_Dataset/DEAM_audio/MEMD_audio/1018.mp3', 'DEAM_Dataset/D

In [101]:
def create_label_tensor(annotations_df):
    valence = annotations_df['valence_mean'].to_numpy().astype(np.float32)
    valence = torch.from_numpy(valence).view(-1,1)

    arousal = annotations_df['arousal_mean'].to_numpy().astype(np.float32)
    arousal = torch.from_numpy(arousal).view(-1,1)

    return torch.concat((valence, arousal), dim=1)

In [None]:
y = create_label_tensor(annotations.sort_values(by='song_id'))

torch.Size([1802, 2])


In [94]:
class EmotionRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.out = nn.Linear(32, 2)

    def forward(self, x):
        return self.out(self.net(x))

In [None]:
def training_pass(X, y, lr, epochs):

    model = EmotionRegressor(input_dim=X.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    X_train = (X_train - X_train.mean(dim=0)) / (X_train.std(dim=0) + 1e-8)
    X_test = (X_test - X_train.mean(dim=0)) / (X_train.std(dim=0) + 1e-8)

    best_valence_r = -1

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        preds = model(X_train)
        loss = criterion(preds, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 5 == 0:
            model.eval()
            with torch.no_grad():
                preds = model(X_test)

            valence_pred = preds[:, 0]
            arousal_pred = preds[:, 1]

            v_correlation_coefficient, _ = pearsonr(valence_pred.detach().cpu().numpy(), y_test[:, 0].detach().cpu().numpy())
            a_correlation_coefficient, _ = pearsonr(arousal_pred.detach().cpu().numpy(), y_test[:, 1].detach().cpu().numpy())

            val_mse = mean_squared_error(valence_pred.detach().cpu().numpy(), y_test[:, 0].detach().cpu().numpy())
            aro_mse = mean_squared_error(arousal_pred.detach().cpu().numpy(), y_test[:, 1].detach().cpu().numpy())

            if v_correlation_coefficient > best_valence_r:
                best_valence_r = v_correlation_coefficient
                best_state = model.state_dict()

            print(f"Epoch {epoch} | loss={loss.item():.4f} | Val MSE={val_mse:.4f} | valence pearson={v_correlation_coefficient:.4f} | arousal pearson={a_correlation_coefficient:.4f}")
    
    return best_valence_r, best_state


In [145]:
best_valence_r, best_state = training_pass(X, y, 0.0001, 60)
print(best_valence_r)
print(best_state)

Epoch 0 | loss=0.1042 | Val MSE=1.6556 | valence pearson=-0.1658 | arousal pearson=0.2413
Epoch 5 | loss=0.1004 | Val MSE=2.3260 | valence pearson=-0.1490 | arousal pearson=0.2877
Epoch 10 | loss=0.0968 | Val MSE=3.1786 | valence pearson=-0.1261 | arousal pearson=0.3261
Epoch 15 | loss=0.0936 | Val MSE=4.1163 | valence pearson=-0.1009 | arousal pearson=0.3570
Epoch 20 | loss=0.0906 | Val MSE=5.0188 | valence pearson=-0.0716 | arousal pearson=0.3811
Epoch 25 | loss=0.0880 | Val MSE=5.8025 | valence pearson=-0.0366 | arousal pearson=0.3996
Epoch 30 | loss=0.0855 | Val MSE=6.4413 | valence pearson=0.0032 | arousal pearson=0.4140
Epoch 35 | loss=0.0832 | Val MSE=6.9439 | valence pearson=0.0457 | arousal pearson=0.4257
Epoch 40 | loss=0.0812 | Val MSE=7.3601 | valence pearson=0.0884 | arousal pearson=0.4358
Epoch 45 | loss=0.0793 | Val MSE=7.7106 | valence pearson=0.1321 | arousal pearson=0.4445
Epoch 50 | loss=0.0776 | Val MSE=8.0180 | valence pearson=0.1761 | arousal pearson=0.4518
Epoch 