In [256]:
import os
import glob
import librosa
from scipy.io import loadmat
from collections import defaultdict
import torch
from tqdm import tqdm
from torch import nn
import re
import math
from pathlib import Path

In [188]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [346]:
def add_first_release_data():
    """Tag the earliest recording in each song folder under ``Metadata``.

    Every file in a folder is read as a list of lines whose third line is
    treated as the release date. That line is reduced to its digits (only the
    part from the first comma onwards is considered when a comma is present).
    If no file in the folder already has exactly four lines with "first" in
    the fourth one, the file with the smallest year gets ``First release``
    appended and all files read from that folder are written back with the
    normalised year line.

    Each directory is handled exactly once and only through the files that sit
    directly inside it, so nested folders are neither read through the wrong
    path nor mixed into their parent's bookkeeping. Folders that are empty or
    contain no usable year are skipped instead of raising. Files with fewer
    than three lines are ignored, and a date line without any digit is left
    untouched.

    Side effects:
        Rewrites metadata files in place. Running it again is a no-op for
        folders that already carry a "first" marker.
    """
    for dirpath, _, file_names in os.walk('Metadata'):
        # Only sub-folders hold songs; files directly in the root are ignored.
        if dirpath == 'Metadata':
            continue

        # One (file name, lines, year-or-None) entry per usable file, kept in
        # directory-listing order so reads and writes stay aligned.
        entries = []
        for file_name in file_names:
            with open(os.path.join(dirpath, file_name)) as f:
                lines = [line.rstrip("\n") for line in f]
            if len(lines) >= 3:
                year = lines[2]
                if "," in year:
                    year = year[year.index(","):]
                year = re.sub(r'[^0-9]', '', year)
                if year:
                    lines[2] = year
                entries.append((file_name, lines, int(year) if year else None))

        already_marked = any(
            len(lines) == 4 and "first" in lines[3].lower()
            for _, lines, _ in entries
        )
        if already_marked:
            continue

        dated = [(year, lines) for _, lines, year in entries if year is not None]
        if not dated:
            # Nothing to compare: empty folder or no parsable year at all.
            continue

        # min() returns the first of several equal years, i.e. the first file
        # in listing order wins a tie.
        _, earliest = min(dated, key=lambda pair: pair[0])
        earliest.append("First release")

        for file_name, lines, _ in entries:
            with open(os.path.join(dirpath, file_name), 'w') as f:
                f.writelines(f"{entry}\n" for entry in lines)
                

In [189]:
def get_song_info():
    """Load every ``Metadata/**/*.txt`` file into a nested dictionary.

    Returns:
        A ``defaultdict(dict)`` shaped as
        ``song_dict[folder][song] = {"Metadata": [line, ...]}`` where

        * ``folder`` is the directory path between the ``Metadata`` root and
          the file, joined with ``/``;
        * ``song`` is the file name up to its first dot;
        * the list holds the file's lines without trailing newlines.

    The keys are derived from the path components rather than from the first
    "/" and "." found in the whole string, so a dot inside a folder name no
    longer truncates the song key and the result does not depend on the
    platform's path separator.
    """
    song_dict = defaultdict(dict)

    songs = glob.glob("Metadata/**/*.txt", recursive=True)

    for file in tqdm(songs):
        path = Path(file)
        # parts[0] is the "Metadata" root and parts[-1] is the file name.
        key1 = "/".join(path.parts[1:-1])
        key2 = path.name.split(".", 1)[0]

        with open(file) as f:
            lines = [line.rstrip("\n") for line in f]
        song_dict[key1][key2] = {"Metadata": lines}

    return song_dict

In [190]:
def _summarize_frames(matrix):
    """Return per-row mean, std, min and max of ``matrix`` as one 1-D tensor."""
    matrix = torch.tensor(matrix, dtype=torch.float32)
    return torch.cat(
        [
            matrix.mean(dim=1),
            matrix.std(dim=1),
            matrix.min(dim=1).values,
            matrix.max(dim=1).values,
        ],
        dim=0,
    )


def get_audio_info(song_dict):
    """Attach a summary feature vector ``"X"`` to every song in ``song_dict``.

    For each pair of ``MFCCs/**/*.mat`` and ``CENS/**/*.mat`` files the
    ``XMFCC`` and ``XCENS`` matrices are loaded, each is reduced along its
    second axis to mean/std/min/max, and the two summaries are concatenated
    (MFCC first) into ``song_dict[folder][song]["X"]``.

    Args:
        song_dict: Nested mapping ``song_dict[folder][song] -> dict``; the
            entry for every MFCC file must already exist.

    Returns:
        The same ``song_dict`` object, updated in place.

    Raises:
        ValueError: If the number of MFCC and CENS files differs, because the
            positional pairing below would then be unreliable.
        KeyError: If a feature file has no matching entry in ``song_dict``.

    Notes:
        ``glob`` returns paths in a file-system-dependent order, so both lists
        are sorted before being zipped. This assumes the two trees mirror each
        other (same folders, same file-name prefixes) - TODO confirm against
        the dataset layout.
    """
    mfcc_files = sorted(glob.glob("MFCCs/**/*.mat", recursive=True))
    cens_files = sorted(glob.glob("CENS/**/*.mat", recursive=True))

    if len(mfcc_files) != len(cens_files):
        raise ValueError(
            f"Found {len(mfcc_files)} MFCC files but {len(cens_files)} CENS files; "
            "cannot pair them reliably"
        )

    for mfcc_file, cens_file in zip(mfcc_files, cens_files):
        mfcc_tensor = _summarize_frames(loadmat(mfcc_file)['XMFCC'].squeeze())
        cens_tensor = _summarize_frames(loadmat(cens_file)["XCENS"].squeeze())

        path = Path(mfcc_file)
        # parts[0] is the "MFCCs" root and parts[-1] is the file name; the song
        # key is the file-name prefix before its first underscore, so an
        # underscore in a folder name cannot truncate it.
        key1 = "/".join(path.parts[1:-1])
        key2 = path.name.split("_", 1)[0]
        song_dict[key1][key2]["X"] = torch.cat([mfcc_tensor, cens_tensor], dim=0)

    return song_dict



In [191]:
def create_feature_tensor(song_dict):
    """Stack all per-song feature vectors into a single tensor.

    Folders are visited in sorted order and, within each folder, songs are
    visited in sorted order; row ``i`` of the returned tensor belongs to
    ``song_list[i]``.

    Args:
        song_dict: Nested mapping where ``song_dict[folder][song]["X"]`` is a
            tensor; all of them must be stackable with ``torch.stack``.

    Returns:
        ``(X, song_list)``: the stacked tensor and the song keys in row order
        (folder names are not part of the returned keys).
    """
    ordered = [
        (song_name, song_dict[folder_name][song_name]["X"])
        for folder_name in sorted(song_dict)
        for song_name in sorted(song_dict[folder_name])
    ]
    song_list = [song_name for song_name, _ in ordered]
    X = torch.stack([features for _, features in ordered])
    return X, song_list

In [339]:
def group_songs_by_feature(song_dict):
    """Bucket songs by decade, by folder size and by original/cover role.

    A song counts as an original when its metadata list contains one of
    ``"First release"``, ``"First recording"`` or ``"First performance"``.
    The decade comes from the third metadata line: only the part from the
    first comma onwards is considered (when there is a comma), everything but
    digits is dropped, and the first three digits followed by ``0`` form the
    key.

    Args:
        song_dict: ``song_dict[folder][song]["Metadata"]`` is the list of
            metadata lines of that song.

    Returns:
        A 4-tuple of dictionaries:

        1. decade -> originals released in that decade,
        2. decade -> covers released in that decade,
        3. folder -> all songs of folders holding at least four songs,
        4. original -> the other songs of the same folder.
    """
    first_markers = ("First release", "First recording", "First performance")

    first_release_groupings = {}
    cover_release_groupings = {}
    songs_with_more_than_3_covers = {}
    first_releases_to_covers = {}

    for folder, folder_songs in song_dict.items():
        if len(folder_songs) >= 4:
            songs_with_more_than_3_covers[folder] = list(folder_songs)

        for song, info in folder_songs.items():
            metadata = info["Metadata"]
            is_original = any(marker in metadata for marker in first_markers)

            if is_original:
                first_releases_to_covers[song] = [
                    other for other in folder_songs if other != song
                ]

            # Same date clean-up for both roles; only the target dict differs.
            raw_year = metadata[2]
            if "," in raw_year:
                raw_year = raw_year[raw_year.index(","):]
            digits = re.sub(r'[^0-9]', '', raw_year)
            decade = f"{digits[:3]}0"

            target = first_release_groupings if is_original else cover_release_groupings
            target.setdefault(decade, []).append(song)

    return (
        first_release_groupings,
        cover_release_groupings,
        songs_with_more_than_3_covers,
        first_releases_to_covers,
    )


In [193]:
class EmotionRegressor(nn.Module):
    """Fully connected network: ``input_dim -> 128 -> 64 -> 2`` with ReLU between layers.

    The layers live in ``self.net`` in the order Linear, ReLU, Linear, ReLU,
    Linear, so the parameter names are ``net.0.*``, ``net.2.*`` and
    ``net.4.*``.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_sizes = (128, 64)

        layers = []
        in_features = input_dim
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection to the two regression outputs (no activation).
        layers.append(nn.Linear(in_features, 2))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x`` and return its two-column output."""
        return self.net(x)

In [194]:
def evaluate_model(X):
    """Run the saved ``EmotionRegressor`` checkpoint on a feature matrix.

    The checkpoint ``emotion_regressor.pt`` is expected to provide
    ``input_dim``, ``model_state_dict``, ``X_mean`` and ``X_std``. The input
    is standardised with the stored statistics and passed through the model
    in evaluation mode without gradient tracking.

    The model is placed on the module-level ``device``, so the input and the
    normalisation statistics are moved there as well; otherwise inference
    fails with a device mismatch whenever ``device`` is not the CPU.

    Args:
        X: Feature tensor with one row per song.

    Returns:
        The model's predictions as a CPU tensor, one row per input row.
    """
    # map_location lets a checkpoint saved on another device load on this one.
    ckpt = torch.load("emotion_regressor.pt", map_location=device)

    model = EmotionRegressor(ckpt["input_dim"]).to(device)
    model.load_state_dict(ckpt["model_state_dict"])
    model.eval()

    X = X.to(device)
    # as_tensor accepts tensors and array-likes and avoids a copy when it can.
    X_mean = torch.as_tensor(ckpt["X_mean"], dtype=X.dtype, device=device)
    X_std = torch.as_tensor(ckpt["X_std"], dtype=X.dtype, device=device)
    X = (X - X_mean) / X_std

    with torch.no_grad():
        preds = model(X)

    # Hand results back on the CPU so callers can use plain Python maths on them.
    return preds.cpu()

In [195]:
def find_cover_distances(originals, song_tensors, original_val, original_aro):
    """Average Euclidean distance from a reference point to a set of songs.

    Args:
        originals: Keys of the songs to measure (the caller passes the covers
            of one original here, despite the parameter name).
        song_tensors: Mapping from song key to an indexable pair whose
            elements 0 and 1 are the two coordinates.
        original_val: First coordinate of the reference point.
        original_aro: Second coordinate of the reference point.

    Returns:
        The arithmetic mean of the individual distances.

    Raises:
        ZeroDivisionError: If ``originals`` is empty.
    """
    gaps = [
        math.hypot(
            song_tensors[name][0] - original_val,
            song_tensors[name][1] - original_aro,
        )
        for name in originals
    ]
    return sum(gaps) / len(gaps)

In [340]:
def compute_distances(song_list,
                      preds,
                      originals_to_covers, 
                      first_release_groupings, 
                      cover_release_groupings, 
                      song_with_more_than_3):
    """Summarise how far covers sit from their originals in prediction space.

    Each row of ``preds`` is read as a two-element point (index 0 and index 1)
    belonging to the song at the same position in ``song_list``. Distances are
    Euclidean.

    Args:
        song_list: Song keys, aligned with the rows of ``preds``.
        preds: Iterable of two-element rows.
        originals_to_covers: original -> list of its covers.
        first_release_groupings: decade -> originals released in that decade.
        cover_release_groupings: decade -> covers released in that decade.
        song_with_more_than_3: folder -> all songs of that folder.

    Returns:
        A dictionary with four entries:

        * ``"overall average distance"``: mean over originals of the mean
          distance to their covers;
        * ``"dist by first release year"``: the same mean per decade of the
          original, rounded to four decimals;
        * ``"dist by cover year"``: mean cover-to-original distance per decade
          of the cover, rounded to four decimals;
        * ``"3+ covers avg"``: mean over the given folders of the mean
          cover-to-original distance.

    Songs that cannot be measured are skipped rather than mis-measured: an
    original without covers, a cover whose original is unknown, and a folder
    in which no song is a known original (previously such a folder silently
    reused the original of the folder processed before it). A group with
    nothing left to average yields ``nan``.
    """

    def _mean(values):
        # NaN marks "nothing to average" without raising ZeroDivisionError.
        return sum(values) / len(values) if values else float("nan")

    def _gap(song, original):
        return math.hypot(
            song_tensors[song][0] - song_tensors[original][0],
            song_tensors[song][1] - song_tensors[original][1],
        )

    global_distance_metrics = {}

    song_tensors = dict(zip(song_list, preds))

    # Mean distance from each original to its covers, computed once and reused
    # for both the overall figure and the per-decade breakdown.
    per_original = {}
    for original, covers in originals_to_covers.items():
        if not covers:
            continue
        per_original[original] = find_cover_distances(
            covers,
            song_tensors,
            song_tensors[original][0],
            song_tensors[original][1],
        )

    global_distance_metrics["overall average distance"] = _mean(list(per_original.values()))

    distances_by_year = {}
    for year, originals in first_release_groupings.items():
        year_distances = [per_original[org] for org in originals if org in per_original]
        distances_by_year[year] = float(f"{_mean(year_distances):.4f}")

    global_distance_metrics["dist by first release year"] = distances_by_year

    # Reverse lookup built once; setdefault keeps the first original that lists
    # a cover, matching the previous first-match behaviour.
    cover_to_original = {}
    for original, covers in originals_to_covers.items():
        for cover in covers:
            cover_to_original.setdefault(cover, original)

    covers_distances_by_year = {}
    for year, covers in cover_release_groupings.items():
        year_distances = [
            _gap(cover, cover_to_original[cover])
            for cover in covers
            if cover in cover_to_original
        ]
        covers_distances_by_year[year] = float(f"{_mean(year_distances):.4f}")

    global_distance_metrics["dist by cover year"] = covers_distances_by_year

    folder_averages = []
    for folder_songs in song_with_more_than_3.values():
        # Reset per folder so a folder can never inherit another one's original.
        folder_original = None
        folder_covers = []
        for song in folder_songs:
            if song in originals_to_covers:
                folder_original = song
            else:
                folder_covers.append(song)

        if folder_original is None or not folder_covers:
            continue

        folder_averages.append(
            _mean([_gap(cover, folder_original) for cover in folder_covers])
        )

    global_distance_metrics["3+ covers avg"] = _mean(folder_averages)

    return global_distance_metrics

In [347]:
def full_monty():
    """Run the complete pipeline and return the distance metrics.

    Steps, in order: fix up the metadata files, load metadata, attach audio
    features, build the feature tensor, group the songs, run the model, and
    compute the distance summary.

    Returns:
        The dictionary produced by ``compute_distances``.
    """
    add_first_release_data()

    song_dict = get_audio_info(get_song_info())
    X, song_list = create_feature_tensor(song_dict)

    (
        first_release_groupings,
        cover_release_groupings,
        song_with_more_than_3,
        originals_to_covers,
    ) = group_songs_by_feature(song_dict)

    preds = evaluate_model(X)

    return compute_distances(
        song_list,
        preds,
        originals_to_covers,
        first_release_groupings,
        cover_release_groupings,
        song_with_more_than_3,
    )

In [348]:
# Run the whole pipeline once and keep the resulting metrics dictionary.
# Note: full_monty() starts with add_first_release_data(), which may rewrite
# files under Metadata/ in place.
distance_dict = full_monty()

100%|██████████| 1000/1000 [00:00<00:00, 46040.15it/s]


In [349]:
# Show the metrics dictionary returned by full_monty().
print(distance_dict)

{'overall average distance': 0.21010733888295982, 'dist by first release year': {'1980': 0.2361, '1950': 0.2108, '2000': 0.1999, '1970': 0.188, '1990': 0.2349, '2010': 0.1661, '1930': 0.2266, '1960': 0.2064, '1940': 0.1706, '1920': 0.2177, '1900': 0.4132}, 'dist by cover year': {'2010': 0.2119, '1970': 0.1907, '1960': 0.2206, '1990': 0.1996, '2000': 0.2324, '1950': 0.2435, '1930': 0.1623, '1980': 0.2076, '1940': 0.26, '1900': 0.4039}, '3+ covers avg': 0.23195747739719527}


In [198]:
# Plots

# 391: No words
# 368: Cross genre (Bon Jovi to screamo Catamenia)
# 327: Grateful Dead to Courtney Barnett
# 307: Interesting piano to brass jazz example
# 313: Funny cover of Only Happy when It Rains by Richard Cheese
# 259: Different song, difficult example with very stylistically different vocals
# 261: Vocal to jazz example (neat!)
# 232: Very strange cover of Fleetwood Mac
# 207: Cross gender
# 156: Jerry Garcia studio (Deal) versus Grateful Dead live
# 182: Cross genre (R&B to reggae)
# 184: Very different instruments
# 127: Difficult example (old vocals not very distinct, needs to rely on pitch)
# 133: Difficult examples (stylistically very different)
# 76: Good Cross Genre
# 8: A good example of where the notes are quite different
# 12: Good example of screamo cross genre