# Test Set - Cover Songs
- Built from the Kaggle cover-song dataset: https://www.kaggle.com/datasets/arpanpathak/original-and-cover-song-pairs/data
- Contains 80 pairs, each made up of a song and a corresponding cover of that song.
- This notebook parses the dataset, computes a mel spectrogram for each song, and creates triplets: the song's spectrogram, the spectrogram of its corresponding cover, and the spectrogram of a randomly chosen different song.
- It outputs a pickle file, `../data/test_set_covers.pkl`, that can be read into another notebook for fine-tuning.

In [4]:
import os
import random
import librosa
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm

# turn an audio file into a dB-scaled mel spectrogram
def load_mel_spectrogram(file_path, sr=22050, n_mels=128):
    """Load an audio file and return its mel spectrogram in decibels.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        n_mels: Number of mel bands passed to ``librosa.feature.melspectrogram``.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel spectrogram,
        using the spectrogram's maximum (``ref=np.max``) as the reference.
    """
    waveform, sample_rate = librosa.load(file_path, sr=sr)
    power_spec = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=n_mels,
    )
    return librosa.power_to_db(power_spec, ref=np.max)

# pick a random file other than the excluded one and return its spectrogram
def get_random_negative(exclude_index, file_paths):
    """Return the mel spectrogram of a randomly chosen file.

    Args:
        exclude_index: Position in ``file_paths`` that must not be chosen.
        file_paths: Sequence of audio file paths to draw from.

    Returns:
        The value of ``load_mel_spectrogram`` for the chosen path.
    """
    # Every path whose position differs from the excluded one is a candidate.
    candidates = [
        path
        for position, path in enumerate(file_paths)
        if position != exclude_index
    ]
    return load_mel_spectrogram(random.choice(candidates))

# file names encode the song info as artist+album+song.<ext>
def extract_metadata(file_name):
    """Split a file name into its artist, album and song fields.

    The extension is dropped and the remainder is split on ``'+'``; it must
    yield exactly three parts, otherwise the unpacking raises ``ValueError``.

    Args:
        file_name: File name of the form ``artist+album+song.ext``.

    Returns:
        A tuple ``(artist, album, song)``.
    """
    stem = os.path.splitext(file_name)[0]
    artist, album, song = stem.split('+')
    return (artist, album, song)

In [9]:
# Go through the song folders and build one (anchor, positive, negative) triplet per song
def process_audio_files(base_dir):
    """Build a DataFrame of spectrogram triplets from a directory of song folders.

    Every sub-directory of ``base_dir`` that contains exactly two ``.mp3``
    files yields one row: the first file (in sorted order) is the anchor, the
    second is the positive, and the negative is a randomly chosen ``.mp3``
    from any *other* sub-directory. Sub-directories with a different number
    of ``.mp3`` files produce no row, but their files can still be drawn as
    negatives.

    Args:
        base_dir: Directory whose sub-directories each hold the recordings of
            one song.

    Returns:
        A pandas DataFrame with the columns ``song_title``, ``artist``,
        ``album``, ``song``, ``anchors``, ``positives`` and ``negatives``
        (empty if no folder qualifies).

    Raises:
        ValueError: If no ``.mp3`` exists outside the folder being processed,
            so no valid negative can be drawn, or if the anchor's file name
            does not have the form expected by ``extract_metadata``.
    """
    # List each folder exactly once. os.listdir returns entries in arbitrary
    # order, so sort both levels to make the anchor/positive assignment and
    # the row order reproducible across runs and file systems.
    mp3_by_folder = {}
    for folder_name in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder_name)
        if os.path.isdir(folder_path):
            mp3_by_folder[folder_name] = sorted(
                os.path.join(folder_path, f)
                for f in os.listdir(folder_path)
                if f.endswith('.mp3')
            )

    data = []
    for folder_name, files in tqdm(mp3_by_folder.items(), desc="Processing folders"):
        if len(files) != 2:
            continue

        anchor_file, positive_file = files

        # Draw the negative from other folders only. Excluding just the anchor
        # would allow the anchor's own cover (the positive) to be picked as
        # the negative, producing a contradictory triplet.
        negative_candidates = [
            path
            for other_name, other_files in mp3_by_folder.items()
            if other_name != folder_name
            for path in other_files
        ]
        if not negative_candidates:
            raise ValueError(
                f"No negative candidates available for folder {folder_name!r}: "
                "no .mp3 files found in any other folder"
            )
        negative_file = random.choice(negative_candidates)

        artist, album, song = extract_metadata(os.path.basename(anchor_file))

        data.append({
            'song_title': folder_name,
            'artist': artist,
            'album': album,
            'song': song,
            'anchors': load_mel_spectrogram(anchor_file),
            'positives': load_mel_spectrogram(positive_file),
            'negatives': load_mel_spectrogram(negative_file),
        })

    return pd.DataFrame(data)

In [10]:
# Absolute, machine-specific location of the cover-song folders; edit this
# path before running the notebook on another machine.
base_dir = '/Users/reggiebain/erdos/song-similarity-erdos-old/data/coversongs/covers32k'
# Build the triplet DataFrame from every song folder under base_dir.
df = process_audio_files(base_dir)
# Preview the first two rows.
df.head(2)

Processing folders: 100%|██████████| 83/83 [03:11<00:00,  2.31s/it]


Unnamed: 0,song_title,artist,album,song,anchors,positives,negatives
0,Claudette,everly_brothers,The_Fabulous_Style_of,01-Claudette,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -61.775627, -48.010227,...","[[-80.0, -80.0, -80.0, -53.419292, -39.97673, ..."
1,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,01-I_Don_t_Want_To_Miss_A_Thing,"[[-80.0, -80.0, -80.0, -79.25159, -56.510735, ...","[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -58.83052, -51.61307, -..."


In [11]:
# Save the full DataFrame (metadata columns plus spectrograms) to a pickle
# file; the path is relative to the notebook's working directory.
df.to_pickle('../data/test_set_covers.pkl')

#### Create Test Set
- output anchors, positives, negatives, just the spectrograms

In [None]:
# Keep only the spectrogram columns. The names must match the plural keys
# written by process_audio_files ('anchors', 'positives', 'negatives');
# the singular forms raise KeyError.
output_df = df[['anchors', 'positives', 'negatives']]
output_df