# Data Augmentation - Similar and Different Tracks
- Get dataset of songs including links
- Create dataset of augmented versions of every song (multiple if needed) to create similar songs.
- Identify different songs for each song in dataset

In [22]:
!pip install -q audiomentations tables pydub

In [1]:
import numpy as np
import pandas as pd
import requests
import io
from IPython.display import Audio, display
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import re
import json
import pickle as pkl
import base64
import time
import urllib.parse
import spotipy
from sklearn.model_selection import train_test_split
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout
from tqdm import tqdm
import sys
from tenacity import retry, wait_exponential, stop_after_attempt, RetryError, retry_if_exception_type
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddGaussianSNR, ClippingDistortion, Gain
from pydub import AudioSegment
import random

### Augmenting Audio Demo
- Load song from dataset, download and demo augmentations

In [2]:
# Get dataframe from kaggle dataset
music_info_df = pd.read_csv('/Users/reggiebain/erdos/song-similarity-erdos-old/data/music_info.csv')
music_info_df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4


In [9]:
# Function to get spotify audio from row of DF
def get_raw_audio(row, which):
    try:
        url = row[which]
        response = requests.get(url)
        audio_data = io.BytesIO(response.content)
        return audio_data
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")

# Play the audio in jupyter
def play_audio(row):
    display(Audio(get_raw_audio(row).read(), autoplay=True))

def get_audio_data(row, which):
    y, sr = librosa.load(get_raw_audio(row, which))
    return y, sr

# Function to get the mel spectrogram for each song in each row of a dataframe
def get_spectrogram(row, which):
    audio_data = get_raw_audio(row, which)
    y, sr = librosa.load(audio_data, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_DB = librosa.power_to_db(S, ref=np.max)
    return pkl.dumps((S_DB, sr))        

# Function to get chromagram. Return serialized data and sampling rate
def get_chromagram(row, which):
    audio_data = get_raw_audio(row, which)
    y, sr = librosa.load(audio_data)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    return pkl.dumps((chroma, sr))

In [6]:
play_audio(music_info_df.iloc[0,:])

In [1]:
# Define exceptions to handle with retry
class RateLimitError(Exception):
    def __init__(self, message, retry_after=None):
        super().__init__(message)
        self.retry_after = retry_after

# Function to clip audio to 10s (10000 ms by default)
def clip_audio(audio_data, segment_length = 10000, sr=22050):
    audio = AudioSegment.from_file(audio_data)
    # check audio is at least segment_length
    if len(audio) < segment_length:
        raise ValueError("Audio is shorter than the segment length.")

    # Randomly generate start point at least 10s from the end of the song.
    start = np.random.randint(0, len(audio) - segment_length)
    audio_segment = audio[start:start + segment_length]

    # Return clip in format importable to Librosa
    audio_segment_io = io.BytesIO()
    audio_segment.export(audio_segment_io, format="wav")
    audio_segment_io.seek(0)
    return audio_segment_io # Export (with pointer at start of audio)

# Function to download and process a single audio file
@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=60), retry=retry_if_exception_type(RateLimitError))
def download_and_process_audio(url, sr=22050):
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None   
     
    if response.status_code == 429:
        retry_after = response.headers.get('Retry-After', 'Not provided')
        print(f"Rate limit exceeded. Retry-After: {retry_after} seconds")
        raise RateLimitError("Rate limit exceeded", retry_after)
        
    response.raise_for_status()  # Check if the request was successful
    # get audio data from url
    audio_data = io.BytesIO(response.content)
    # feed into audio clipping function
    audio_data_clip = clip_audio(audio_data)


    y, sr = librosa.load(audio_data_clip, sr=sr)
    #mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    #mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    #time.sleep(0.5)
    return y, sr
    #return mel_spectrogram_db


# Function to apply download_and_process_audio to each row
def process_row(row, sr=22050):
    url = row['spotify_preview_url']
    if url:
        return download_and_process_audio(url, sr)
    return None

# Function to download and process all audio files in the DataFrame
def download_and_process_all_audio(df, url_column, output_dir, batch_size=00, start_batch=0, sr=22050):
    os.makedirs(output_dir, exist_ok=True)
    # Dataframe to add audio too
    df['processed_audio'] = None

    num_batches = (len(df) // batch_size)
    for batch_num in range(start_batch, num_batches):
        batch_file_path = os.path.join(output_dir, f"batch_{batch_num + 1}.pkl")

        # Check if the batch has already been processed
        if os.path.exists(batch_file_path):
            print(f"Batch {batch_num + 1} already processed. Skipping...")
            continue

        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(df))
        batch_df = df.iloc[start_idx:end_idx]

        # Pass each row to other functions with progress bar. Output to pickle
        tqdm.pandas(desc=f"Processing batch {batch_num + 1}/{num_batches}")
        batch_df['processed_audio'] = batch_df.progress_apply(lambda row: process_row(row, sr), axis=1).copy()
        batch_df.to_pickle(batch_file_path)

        # Combine progressively each batch with overall df. Pickle.
        #df.iloc[start_idx:end_idx] = batch_df
        #df.to_pickle(os.path.join(output_dir, "full_dataframe.pkl"))
        time.sleep(10)


def combine_batches(pickle_dir, output_file=None):
    dataframes = []

    # Go through all pickle files taht start with batch
    for filename in sorted(os.listdir(pickle_dir)):
        if filename.startswith('batch_') and filename.endswith('.pkl'):
            file_path = os.path.join(pickle_dir, filename)
            # Load the DataFrame and append to the list
            df = pd.read_pickle(file_path)
            dataframes.append(df)

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save the combined DataFrame to a new pickle file (optional)
    if output_file:
        combined_df.to_pickle(output_file)

    return combined_df

NameError: name 'retry' is not defined

In [4]:
# Load your DataFrame (example with a column 'spotify_preview_url' containing URLs to the audio files)
df = music_info_df.iloc[10000:20000, :].copy()

# Specify the output directory for downloaded files
output_dir = "../data/audio_download/"

# Download and process all audio files and update the DataFrame with the processed data
download_and_process_all_audio(df, 'spotify_preview_url', output_dir, batch_size=1000)

# merge all of the dfs
#combined_df = combine_batches('../audio_data/', '../audio_data/all_batches_df.pkl')
#print(combined_df.head())

Processing batch 1/11: 100%|██████████| 1000/1000 [12:28<00:00,  1.34it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['processed_audio'] = batch_df.progress_apply(lambda row: process_row(row, sr), axis=1)
Processing batch 2/11: 100%|██████████| 1000/1000 [11:46<00:00,  1.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['processed_audio'] = batch_df.progress_apply(lambda row: process_row(row, sr), axis=1)
Processing batch 3/11: 100%|██████████| 1000/1000 [12:09<00:00,  1.37it/s]
A value is trying to be set on a copy of a slice from

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


In [35]:
#df = combine_batches('/Users/reggiebain/erdos/song-similarity-erdos-old/data/audio_download', 
#                     '/Users/reggiebain/erdos/song-similarity-erdos-old/data/audio_download/all_batches_df.pkl')

#### ---------- SANDBOX FOR AUDIO AUGMENTATIONS ------

In [76]:
df.iloc[0,:]['processed_audio'][0]

array([-0.24091958, -0.30352473, -0.3382524 , ...,  0.26228857,
        0.19583827,  0.28442264], dtype=float32)

In [34]:
# Try a few different augmentations from Emelie's code
gauss_noise_augment = AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015,p=1.0)
gauss_snr_augment = AddGaussianSNR(min_snr_db=5.0, max_snr_db=10.0,p=1.0)
pitch_shift_augment = PitchShift(min_semitones=-4, max_semitones=4, p=0.2)
clip_augment = ClippingDistortion(min_percentile_threshold=10, max_percentile_threshold=60, p=1.0)
gain_augment = Gain(min_gain_in_db=-5, max_gain_in_db=5, p=1.0)

In [58]:
# Sandbox for listening to different augmentations based on Emelie's code
test_audio_row = music_info_df.iloc[0,:]
y, sr = get_audio_data(test_audio_row, 'spotify_preview_url')
#augmented_audio = gauss_noise_augment(y, sample_rate=sr)
#augmented_audio = gauss_snr_augment(y, sample_rate=sr)
#augmented_audio = pitch_shift_augment(y, sample_rate=sr)
#augmented_audio = clip_augment(y, sample_rate=sr)
augmented_sound = gain_augment(y, sample_rate=sr)
Audio(augmented_audio, rate=sr)

#### --------------------

### Script to Augment - Similar
- Write function for creating similar song augmentations
- Apply to dataframe with efficient pipeline
- Pair each song with similar/augmented song

In [5]:
# Reload data from scratch. Do 1 batch at a time
#df = pd.read_pickle('/Users/reggiebain/erdos/song-similarity-erdos-old/data/audio_download/all_batches_df.pkl')
#df.head(2)

In [7]:
# Function to read audio from dataframe and add augmented column
def augment_audio(row, which):
    # Somewhat random assortment of small augmentations
    augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Gain(min_gain_in_db=-5, max_gain_in_db=5, p=1.0),
        AddGaussianSNR(min_snr_db=5.0, max_snr_db=10.0,p=1.0),
        #Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
    ])
    # Augment: row[which][0] is y, row[which][1] is sr
    augmented_audio = augment(samples = row[which][0], sample_rate=row[which][1])
    return augmented_audio, row[which][1]

# Match each song with a random other song from that batch. 
# NOTE: Training/validation sets will be made with different batches which won't have songs in common
def select_random_song(row, df):
    # Exclude the current row to avoid selecting the same song
    available_indices = df.index[df.index != row.name]
    random_index = np.random.choice(available_indices)
    #return df.loc[random_index, 'processed_audio']
    return df.loc[random_index, ['processed_audio', 'artist', 'name']]


# Do augmenting for each batched file
def apply_in_batches(base_dir, output_dir, batch_prefix="batch_"):
    # Make a list of batch pickle files and get the number of batches
    batch_files = [f for f in os.listdir(base_dir) if f.startswith(batch_prefix) and f.endswith('.pkl')]
    num_batches = len(batch_files)
    for i in tqdm(range(1, num_batches + 1), desc="Processing batches"):
        batch_file = os.path.join(base_dir, f"{batch_prefix}{i}.pkl")
        df = pd.read_pickle(batch_file)
        
        # Apply augmentation to each row
        df['augmented_audio'] = df.apply(lambda row: augment_audio(row, 'processed_audio'), axis=1)
        
        random_df = df.progress_apply(lambda row: select_random_song(row, df), axis=1)

        # rename the columns 
        random_df.columns = ['diff_processed_audio', 'diff_artist', 'diff_name']

        # Combine current row with random rows
        #tqdm.pandas(desc=f"Concatenating random songs with original df...")
        sim_and_diff_df = pd.concat([df, random_df], axis=1)

        # Save the augmented dataframe
        augmented_batch_file = os.path.join(output_dir, f"{batch_prefix}{i}_augmented.pkl")
        sim_and_diff_df.to_pickle(augmented_batch_file)

In [8]:
# Create augmented audio in batches. Specify folders
processed_dir = '/Users/reggiebain/erdos/song-similarity-erdos-old/data/audio_download'
augmented_dir = '/Users/reggiebain/erdos/song-similarity-erdos-old/data/augmented_audio'
apply_in_batches(processed_dir, augmented_dir)

Processing batch 11/11: 100%|██████████| 1000/1000 [00:02<00:00, 339.14it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:01<00:00, 700.93it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:01<00:00, 954.32it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:00<00:00, 1106.98it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:00<00:00, 1190.22it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:02<00:00, 341.76it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:00<00:00, 1030.09it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:01<00:00, 643.39it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:02<00:00, 376.80it/s]
Processing batch 11/11: 100%|██████████| 1000/1000 [00:02<00:00, 380.28it/s]
Processing batches:  91%|█████████ | 10/11 [23:00<02:18, 138.09s/it]


ValueError: Cannot set a DataFrame with multiple columns to the single column augmented_audio

In [9]:
# Apply augmentations
#tqdm.pandas(desc=f"Applying augmentations to dataframe...")
#df['augmented_audio'] = df.progress_apply(lambda row: augment_audio(row, 'processed_audio'), axis=1)
#df.head(2)

### Script for Augmentation/Matching - Different
- Pipeline for pairing anchor/similar with different songs

In [None]:
# Match each song with a random other song
def select_random_song(row, df):
    # Exclude the current row to avoid selecting the same song
    available_indices = df.index[df.index != row.name]
    random_index = np.random.choice(available_indices)
    #return df.loc[random_index, 'processed_audio']
    return df.loc[random_index, ['processed_audio', 'artist', 'name']]

In [None]:
# Get temp df of the info we want for random song
tqdm.pandas(desc=f"Getting random different song...")
random_df = df.progress_apply(lambda row: select_random_song(row, df), axis=1)

# rename the columns 
random_df.columns = ['diff_processed_audio', 'diff_artist', 'diff_name']

# Combine current row with random rows
tqdm.pandas(desc=f"Concatenating random songs with original df...")
sim_and_diff_df = pd.concat([df, random_df], axis=1)

sim_and_diff_df.head(2)

Getting random different song...: 100%|██████████| 100/100 [00:00<00:00, 373.53it/s]


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,instrumentalness,liveness,valence,tempo,time_signature,processed_audio,augmented_audio,diff_processed_audio,diff_artist,diff_name
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,0.0,0.0971,0.24,148.114,4,"([-0.2620229, -0.35404703, -0.3487168, -0.3190...","([-0.03797362, -0.21291901, -0.15998803, -0.26...","([-0.060790915, 0.09364325, 0.2367863, 0.21237...",Radiohead,Paranoid Android
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,0.0,0.207,0.651,174.426,4,"([-0.061054245, -0.17240259, -0.19763069, -0.1...","([-0.3070413, -0.20995288, 0.08281514, -0.2914...","([0.11163922, 0.21386491, 0.2824207, 0.1797000...",Katy Perry,I Kissed a Girl


In [None]:
memory_usage = sim_and_diff_df.memory_usage(deep=True).sum()
memory_usage_mb = memory_usage / (1024 ** 2)
print(f"Memory usage of DataFrame: {memory_usage_mb:.2f} MB")

Memory usage of DataFrame: 0.10 MB


#### Output Data Needed for Deep Learning with Just Audio
- Save BATCH_NUM h5 file for each batch. Will need to import later to deep learning model.
- Keep subset of columns needed for DL purposes. Way to id the songs and the spectrograms
1. processed_audio
2. augmented_audio
3. diff_processed_audio
4. diff_artist
5. diff_name
6. artist
7. name

In [None]:
# Function to estimate the size of dataframe
def estimate_df_size(df, sample_size=1000, sample_file_name='sample.h5'):
    # Take sample of df, save to file, get the size of the sample
    df_sample = df.iloc[:sample_size]
    df_sample.to_hdf(sample_file_name, key='df', mode='w')
    sample_file_size = os.path.getsize(sample_file_name)

    # Extrapolate the size for the entire DataFrame then delete save file
    total_size_estimate = (sample_file_size / sample_size) * len(df)
    os.remove(sample_file_name)

    return total_size_estimate

In [None]:
# Create working dataframe selecting only audio columns with titles/artists
working_df = sim_and_diff_df[['name', 'artist', 'processed_audio', 'augmented_audio', 'diff_name', 'diff_artist', 'diff_processed_audio']]
#working_df.to_pickle('../data/working_df.pkl')
working_df.to_hdf(f'../working_df_{BATCH_NUM}.h5', key='df', mode='w', complevel=9, complib='blosc')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['name', 'artist', 'processed_audio', 'augmented_audio', 'diff_name',
       'diff_artist', 'diff_processed_audio'],
      dtype='object')]

  working_df.to_hdf('../working_df.h5', key='df', mode='w', complevel=9, complib='blosc')


In [20]:
# Loading the DataFrame from HDF5
df_loaded = pd.read_hdf('../working_df.h5', key='df')
df_loaded.head(2)

Unnamed: 0,name,artist,processed_audio,augmented_audio,diff_name,diff_artist,diff_processed_audio
0,Mr. Brightside,The Killers,"([-0.24091958, -0.30352473, -0.3382524, -0.442...","([-0.20250607, -0.4502711, -0.6215848, -0.4549...",Can't Stop,Red Hot Chili Peppers,"([0.076196045, 0.12404013, 0.12600788, 0.04493..."
1,Wonderwall,Oasis,"([-0.03920526, -0.105055355, -0.19810554, -0.2...","([0.027309434, 0.029346019, -0.22353631, -0.11...",I Will Follow You Into The Dark,Death Cab for Cutie,"([-0.0058964454, -0.010358613, -0.010039683, -..."
