# Test Set - Cover Songs
- From cover song sets from kaggle: https://www.kaggle.com/datasets/arpanpathak/original-and-cover-song-pairs/data 
- Contains 80 pairs of songs and a corresponding cover song.
- This notebook parses this dataset, gets mel spectograms from each song, and creates triplets with a correponding spectrogram from the cover as well as the spectrogram from a random different song.
- It outputs a pickle file that can be read into another notebook for fine tuning called /data/test_set_covers.pkl

In [8]:
import os
import random
import librosa
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from pydub import AudioSegment
import io

In [15]:
# Function to clip audio to 30s (30000 ms by default)
def clip_audio(audio_data, segment_length = 60000, sr=22050):
    audio = AudioSegment.from_file(audio_data)
    # check audio is at least segment_length
    if len(audio) < segment_length:
        raise ValueError("Audio is shorter than the segment length.")

    # Randomly generate start point at least 10s from the end of the song.
    start = np.random.randint(0, len(audio) - segment_length)
    audio_segment = audio[start:start + segment_length]

    # Return clip in format importable to Librosa
    audio_segment_io = io.BytesIO()
    audio_segment.export(audio_segment_io, format="mp3")
    audio_segment_io.seek(0)
    return audio_segment_io # Export (with pointer at start of audio)

# get spectrogram data
def load_mel_spectrogram(file_path, sr=22050, n_mels=128):
    audio_data = clip_audio(file_path)
    y, sr = librosa.load(audio_data, sr=sr)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return mel_spectrogram_db

# select random other song's spectrogram
def get_random_negative(exclude_index, file_paths):
    negative_index = random.choice([i for i in range(len(file_paths)) if i != exclude_index])
    negative_file = file_paths[negative_index]
    negative_spectrogram = load_mel_spectrogram(negative_file)
    return negative_file, negative_spectrogram

# use file names to get song info
def extract_metadata(file_name):
    name, _ = os.path.splitext(file_name)
    if len(name.split('+')) == 3:
        artist, album, song = name.split('+')
        return artist, album, song
    else:
        return "need artist name", "need album name", 'need song name'


In [16]:
# Go through files to get all of the songs, similar to how we did lastfm
def process_audio_files(base_dir):
    data = []
    song_files = []

    for folder_name in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder_name)
        if os.path.isdir(folder_path):
            files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.mp3')]
            song_files.extend(files)

    for folder_name in tqdm(os.listdir(base_dir), desc="Processing folders"):
        folder_path = os.path.join(base_dir, folder_name)
        if os.path.isdir(folder_path):
            files = [f for f in os.listdir(folder_path) if f.endswith('.mp3')]
            
            if len(files) == 2:
                song_title = folder_name
                anchor_file = os.path.join(folder_path, files[0])
                positive_file = os.path.join(folder_path, files[1])

                # Clip the audio, then get mel spectrogram
                anchor_spectrogram = load_mel_spectrogram(anchor_file)
                positive_spectrogram = load_mel_spectrogram(positive_file)
                
                anchor_index = song_files.index(anchor_file)
                negative_file, negative_spectrogram = get_random_negative(anchor_index, song_files)
                neg_artist, neg_album, neg_song = extract_metadata(negative_file)
                
                artist, album, song = extract_metadata(files[0])

                data.append({
                    'song_title': song_title,
                    'artist': artist,
                    'album': album,
                    'song': song,
                    'anchors': anchor_spectrogram,
                    'positives': positive_spectrogram,
                    'negatives': negative_spectrogram,
                    'neg_song': neg_song,
                    'neg_artist': neg_artist,
                    'neg_album': neg_album,
                })

    df = pd.DataFrame(data)
    return df

In [17]:
base_dir = '/Users/reggiebain/erdos/song-similarity-erdos/data/coversongs/covers32k'
df = process_audio_files(base_dir)
df.head(2)

Processing folders:   0%|          | 0/83 [00:00<?, ?it/s]

Processing folders: 100%|██████████| 83/83 [18:18<00:00, 13.23s/it]


Unnamed: 0,song_title,artist,album,song,anchors,positives,negatives,neg_song,neg_artist,neg_album
0,Claudette,everly_brothers,The_Fabulous_Style_of,01-Claudette,"[[-49.7658, -50.9627, -44.81154, -38.146698, -...","[[-26.068644, -23.991093, -23.518074, -26.1214...","[[-39.67006, -36.35625, -30.621237, -25.635677...",12-Day_Tripper,/Users/reggiebain/erdos/song-similarity-erdos/...,1
1,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,01-I_Don_t_Want_To_Miss_A_Thing,"[[-16.02195, -15.988646, -18.057266, -15.57520...","[[-26.193792, -20.062042, -20.986141, -20.8752...","[[-14.437675, -12.7644005, -15.01837, -17.1538...",06-Enjoy_The_Silence,/Users/reggiebain/erdos/song-similarity-erdos/...,Violator


In [15]:
# save to pickle
df.to_pickle('../data/test_set_covers.pkl')

### Create Test Set
- After having made the dataset, we can work with it more here

In [16]:
df = pd.read_pickle('../data/test_set_covers.pkl')
df.head(2)

Unnamed: 0,song_title,artist,album,song,anchors,positives,negatives,neg_song,neg_artist,neg_album
0,Claudette,everly_brothers,The_Fabulous_Style_of,01-Claudette,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -61.775627, -48.010227,...","[[-80.0, -80.0, -80.0, -80.0, -76.417206, -74....",03-Addicted_To_Love,/Users/reggiebain/erdos/song-similarity-erdos/...,Riptide
1,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,01-I_Don_t_Want_To_Miss_A_Thing,"[[-80.0, -80.0, -80.0, -79.25159, -56.510735, ...","[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...",09-Summertime_Blues,/Users/reggiebain/erdos/song-similarity-erdos/...,Surfin_USA_Surfin_Safari


In [18]:
def create_pairs_from_row(row):
    # Get elements
    song_title = row['song_title']
    artist = row['artist']
    album = row['album']
    anchor = row['anchors']
    positive = row['positives']
    negative = row['negatives']
    neg_song = row['neg_song']
    neg_artist = row['neg_artist']
    neg_album = row['neg_album']

    # Create pairs with labels
    positive_pair = {'song_title_1': song_title, 'artist_1': artist, 'album_1': album, 'song_1': anchor, 
                     'song_title_2':song_title, 'artist_2': artist, 'album_2': album, 'song_2': positive, 
                     'label': 1} 
    negative_pair = {'song_title_1': song_title, 'artist_1': artist, 'album_1': album, 'song_1': anchor,
                     'song_title_2': neg_song, 'artist_2': neg_artist, 'album_2': neg_album, 'song_2': negative, 
                     'label': 0}
    
    # Return them as two rows as a dataframe instead of 1 row
    return pd.DataFrame([positive_pair, negative_pair])

def create_pairs_dataframe(df):
    # Apply the function to each row and concatenate the results, returns series object
    pairs_df = df.apply(create_pairs_from_row, axis=1)
    # Turn
    pairs_df = pd.concat(pairs_df.tolist(), ignore_index=True)
    return pairs_df


In [19]:
paired_df = create_pairs_dataframe(df)
paired_df.head()

Unnamed: 0,song_title_1,artist_1,album_1,song_1,song_title_2,artist_2,album_2,song_2,label
0,Claudette,everly_brothers,The_Fabulous_Style_of,"[[-49.7658, -50.9627, -44.81154, -38.146698, -...",Claudette,everly_brothers,The_Fabulous_Style_of,"[[-26.068644, -23.991093, -23.518074, -26.1214...",1
1,Claudette,everly_brothers,The_Fabulous_Style_of,"[[-49.7658, -50.9627, -44.81154, -38.146698, -...",12-Day_Tripper,/Users/reggiebain/erdos/song-similarity-erdos/...,1,"[[-39.67006, -36.35625, -30.621237, -25.635677...",0
2,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,"[[-16.02195, -15.988646, -18.057266, -15.57520...",I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,"[[-26.193792, -20.062042, -20.986141, -20.8752...",1
3,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,"[[-16.02195, -15.988646, -18.057266, -15.57520...",06-Enjoy_The_Silence,/Users/reggiebain/erdos/song-similarity-erdos/...,Violator,"[[-14.437675, -12.7644005, -15.01837, -17.1538...",0
4,Happiness_is_a_Warm_Gun,beatles,White_Album_Disc_1,"[[-33.150192, -37.771618, -41.097336, -43.5215...",Happiness_is_a_Warm_Gun,beatles,White_Album_Disc_1,"[[-19.768545, -16.877508, -16.295105, -17.8528...",1


In [20]:
# Save this dataframe for potential depoloyment later
paired_df.to_pickle('../data/paired_songs_spectrograms_test.pkl')

In [48]:
for idx, array in enumerate(paired_df['song_1']):
    print(f"Array {idx} size: {array.shape}")

Array 0 size: (128, 5842)
Array 1 size: (128, 5842)
Array 2 size: (128, 12797)
Array 3 size: (128, 12797)
Array 4 size: (128, 7044)
Array 5 size: (128, 7044)
Array 6 size: (128, 12379)
Array 7 size: (128, 12379)
Array 8 size: (128, 7644)
Array 9 size: (128, 7644)
Array 10 size: (128, 12498)
Array 11 size: (128, 12498)
Array 12 size: (128, 5517)
Array 13 size: (128, 5517)
Array 14 size: (128, 16270)
Array 15 size: (128, 16270)
Array 16 size: (128, 5824)
Array 17 size: (128, 5824)
Array 18 size: (128, 14311)
Array 19 size: (128, 14311)
Array 20 size: (128, 12970)
Array 21 size: (128, 12970)
Array 22 size: (128, 9568)
Array 23 size: (128, 9568)
Array 24 size: (128, 7380)
Array 25 size: (128, 7380)
Array 26 size: (128, 10210)
Array 27 size: (128, 10210)
Array 28 size: (128, 15666)
Array 29 size: (128, 15666)
Array 30 size: (128, 9120)
Array 31 size: (128, 9120)
Array 32 size: (128, 6022)
Array 33 size: (128, 6022)
Array 34 size: (128, 12745)
Array 35 size: (128, 12745)
Array 36 size: (128,

In [45]:
# Function to shuffle and split
def stratified_train_test_split(df, test_size=0.2, random_state=None):
    # Shuffle so its not all lined up
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Split while stratifying by label
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df['label'])
    return train_df, test_df

In [25]:
train_df, test_df = stratified_train_test_split(paired_df, test_size=0.2, random_state=123)
train_df.head(2)

Unnamed: 0,song_title_1,artist_1,album_1,song_1,song_title_2,artist_2,album_2,song_2,label
10,Purple_Rain,prince,Purple_Rain,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...",09-Lodi,/Users/reggiebain/erdos/song-similarity-erdos/...,Five_Man_Acoustic_Jam,"[[-80.0, -80.0, -80.0, -57.511948, -48.521374,...",0
23,Red_Red_Wine,ub40,Labour_of_Love,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...",10-Proud_Mary,/Users/reggiebain/erdos/song-similarity-erdos/...,Live_in_Europe,"[[-80.0, -80.0, -80.0, -55.84829, -45.63012, -...",0


In [29]:
train_df.loc[0, 'song_1']

array([[-80.      , -80.      , -80.      , ..., -74.13394 , -60.55564 ,
        -56.41207 ],
       [-80.      , -80.      , -80.      , ..., -75.43332 , -60.111782,
        -55.252895],
       [-80.      , -80.      , -80.      , ..., -62.552994, -59.266247,
        -55.035957],
       ...,
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ],
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ],
       [-80.      , -80.      , -80.      , ..., -80.      , -80.      ,
        -80.      ]], dtype=float32)

In [9]:
#make sure it stratified
print("Train Set Label Distribution:")
print(train_df['label'].value_counts())
print("\nTest Set Label Distribution:")
print(test_df['label'].value_counts())

Train Set Label Distribution:
label
1    62
0    61
Name: count, dtype: int64

Test Set Label Distribution:
label
0    16
1    15
Name: count, dtype: int64
