# Data Augmentation - Similar and Different Tracks
- Get dataset of songs including links
- Create dataset of augmented versions of every song (multiple if needed) to create similar songs.
- Identify different songs for each song in dataset

In [1]:
!pip install -q audiomentations

In [41]:
import numpy as np
import pandas as pd
import requests
import io
from IPython.display import Audio, display
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import re
import json
import pickle as pkl
import base64
import time
import urllib.parse
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout
from tqdm import tqdm
import sys
from tenacity import retry, wait_exponential, stop_after_attempt, RetryError, retry_if_exception_type
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddGaussianSNR, ClippingDistortion, Gain

### Augmenting Audio Demo
- Load song from dataset, download and demo augmentations

In [38]:
# Get dataframe from kaggle dataset.
# The location can be overridden through the MUSIC_INFO_CSV environment variable so the
# notebook also runs on other machines; the default is the original path.
MUSIC_INFO_CSV = os.environ.get(
    'MUSIC_INFO_CSV',
    '/Users/reggiebain/erdos/song-similarity-erdos-old/data/music_info.csv',
)
music_info_df = pd.read_csv(MUSIC_INFO_CSV)
music_info_df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4


In [49]:
# Sanity check that the first track's preview URL is reachable.
# A timeout keeps the cell from hanging forever if the server never answers.
requests.get(music_info_df.iloc[0,:]['spotify_preview_url'], timeout=30)

<Response [200]>

In [39]:
# Function to get spotify audio from row of DF
def get_raw_audio(row, which, timeout=30):
    """Download the file referenced by ``row[which]`` into an in-memory buffer.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record that holds the download link.
    which : str
        Key/column of ``row`` containing the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    io.BytesIO or None
        Buffer with the response body, or ``None`` when the request fails
        (the error is printed instead of raised).
    """
    # Look the URL up before the ``try`` so it is always bound when the
    # error message below is formatted.
    url = row[which]
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of handing an error page
        # to the audio decoder as if it were audio.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
    return io.BytesIO(response.content)

# Play the audio in jupyter
def play_audio(row, which='spotify_preview_url'):
    """Download the audio referenced by ``row[which]`` and play it inline.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record that holds the download link.
    which : str, optional
        Key/column of ``row`` containing the URL. Defaults to
        ``'spotify_preview_url'`` so ``play_audio(row)`` keeps working.
    """
    # get_raw_audio requires the column name; the original call omitted it.
    raw_audio = get_raw_audio(row, which)
    if raw_audio is None:
        # The download helper returns None when the request fails.
        print("No audio available to play.")
        return
    display(Audio(raw_audio.read(), autoplay=True))

def get_audio_data(row, which):
    """Fetch the audio referenced by ``row[which]`` and decode it.

    The downloaded buffer is passed to ``librosa.load`` with its default
    arguments; the resulting ``(y, sr)`` pair is returned.
    """
    raw_audio = get_raw_audio(row, which)
    waveform, rate = librosa.load(raw_audio)
    return waveform, rate

# Function to get the mel spectrogram for each song in each row of a dataframe
def get_spectrogram(row, which):
    """Return the pickled dB-scaled mel spectrogram of the audio at ``row[which]``.

    The audio is loaded with ``sr=None``, turned into a 128-band mel
    spectrogram and converted to dB relative to its maximum. The result is
    the ``pickle`` serialization of ``(S_DB, sr)``.
    """
    waveform, rate = librosa.load(get_raw_audio(row, which), sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=rate, n_mels=128)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return pkl.dumps((mel_db, rate))

# Function to get chromagram. Return serialized data and sampling rate
def get_chromagram(row, which):
    """Return the pickled chromagram of the audio at ``row[which]``.

    The audio is loaded with ``librosa.load`` defaults and passed to
    ``librosa.feature.chroma_stft``. The result is the ``pickle``
    serialization of ``(chroma, sr)``.
    """
    waveform, rate = librosa.load(get_raw_audio(row, which))
    chroma_features = librosa.feature.chroma_stft(y=waveform, sr=rate)
    return pkl.dumps((chroma_features, rate))

In [6]:
# Play the preview clip of the first row of music_info_df inline (play_audio enables autoplay)
play_audio(music_info_df.iloc[0,:])

In [50]:
# Define exceptions to handle with retry
class RateLimitError(Exception):
    """Signals an HTTP 429 response; used as the retry trigger for downloads."""

# Function to download and process a single audio file
@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=60), retry=retry_if_exception_type(RateLimitError))
def download_and_process_audio(url, sr=22050, timeout=30):
    """Download one audio file and decode it with librosa.

    Parameters
    ----------
    url : str
        Location of the audio file.
    sr : int, optional
        Sampling rate passed to ``librosa.load`` (default 22050).
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30). Without it a stalled connection would block forever.

    Returns
    -------
    tuple or None
        ``(y, sr)`` as returned by ``librosa.load``, or ``None`` when the
        request fails with a ``requests`` error (the error is printed).

    Notes
    -----
    An HTTP 429 answer raises ``RateLimitError``, which the ``@retry``
    decorator uses to re-run the call with exponential backoff for up to
    10 attempts.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 429:
            # Not a RequestException, so it escapes the handler below and
            # reaches the retry decorator.
            raise RateLimitError("Rate limit exceeded")
        response.raise_for_status()  # Check if the request was successful
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

    audio_data = io.BytesIO(response.content)
    y, sr = librosa.load(audio_data, sr=sr)
    return y, sr

# Function to apply download_and_process_audio to each row
def process_row(row, sr=22050, url_column='spotify_preview_url'):
    """Download and decode the audio linked from one DataFrame row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record that holds the download link.
    sr : int, optional
        Sampling rate forwarded to ``download_and_process_audio``.
    url_column : str, optional
        Key/column of ``row`` containing the URL
        (default ``'spotify_preview_url'``).

    Returns
    -------
    Whatever ``download_and_process_audio`` returns, or ``None`` when the
    row has no usable URL.
    """
    url = row[url_column]
    # Missing values read from CSV arrive as NaN, which is truthy; treat them
    # like None/'' so no request is attempted for them.
    if (pd.api.types.is_scalar(url) and pd.isna(url)) or not url:
        return None
    return download_and_process_audio(url, sr)


def _restore_cached_batch(df, batch_file_path, start_idx, end_idx):
    """Copy ``processed_audio`` from a previously pickled batch back into ``df``."""
    # NOTE: unpickling executes code stored in the file; only use an
    # output_dir whose contents were written by this function.
    cached = pd.read_pickle(batch_file_path)
    if 'processed_audio' not in cached.columns or len(cached) != end_idx - start_idx:
        print(f"Batch file {batch_file_path} does not match rows {start_idx}:{end_idx}; leaving them unfilled.")
        return
    audio_col = df.columns.get_loc('processed_audio')
    df.iloc[start_idx:end_idx, [audio_col]] = cached[['processed_audio']]


# Function to download and process all audio files in the DataFrame
def download_and_process_all_audio(df, url_column, output_dir, batch_size=100, start_batch=0, sr=22050):
    """Download and decode the audio for every row of ``df`` in batches.

    Each batch is pickled to ``output_dir/batch_<n>.pkl`` and, after every
    newly processed batch, the whole frame is pickled to
    ``output_dir/full_dataframe.pkl``. Batches whose pickle already exists
    are not downloaded again; their stored ``processed_audio`` values are
    copied back into ``df`` so a resumed run ends with a complete frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified in place: a ``processed_audio``
        column is (re)created and filled.
    url_column : str
        Column of ``df`` that holds the download links.
    output_dir : str
        Directory for the pickles; created if missing.
    batch_size : int, optional
        Number of rows per batch (default 100).
    start_batch : int, optional
        Zero-based index of the first batch to handle (default 0).
    sr : int, optional
        Sampling rate forwarded to the decoder (default 22050).
    """
    os.makedirs(output_dir, exist_ok=True)
    # Column that receives the decoded audio
    df['processed_audio'] = None

    # Ceiling division: no extra empty batch when len(df) is an exact
    # multiple of batch_size (or when df is empty).
    num_batches = (len(df) + batch_size - 1) // batch_size
    for batch_num in range(start_batch, num_batches):
        batch_file_path = os.path.join(output_dir, f"batch_{batch_num + 1}.pkl")
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(df))

        # Check if the batch has already been processed
        if os.path.exists(batch_file_path):
            print(f"Batch {batch_num + 1} already processed. Skipping...")
            _restore_cached_batch(df, batch_file_path, start_idx, end_idx)
            continue

        # Explicit copy so the assignment below does not write into a view of df
        batch_df = df.iloc[start_idx:end_idx].copy()

        # Pass each row to other functions with progress bar. Output to pickle
        tqdm.pandas(desc=f"Processing batch {batch_num + 1}/{num_batches}")
        batch_df['processed_audio'] = batch_df.progress_apply(
            lambda row: process_row(row, sr, url_column), axis=1
        )
        batch_df.to_pickle(batch_file_path)

        # Combine progressively each batch with overall df. Pickle.
        df.iloc[start_idx:end_idx] = batch_df
        df.to_pickle(os.path.join(output_dir, "full_dataframe.pkl"))

In [52]:
# Work on a copy of the rows returned by head() (column 'spotify_preview_url' holds the audio URLs),
# so music_info_df itself is not modified by the download step
df = music_info_df.head().copy()

# Specify the output directory for downloaded files (batch pickles and full_dataframe.pkl are written here)
output_dir = "../data/audio_download/"

# Download and process all audio files and update the DataFrame with the processed data.
# The call fills df['processed_audio'] in place; batch_size=5000 is larger than df, so there is a single batch.
download_and_process_all_audio(df, 'spotify_preview_url', output_dir, batch_size=5000)

# Save the updated DataFrame
df.to_pickle("../data/music_info_w_Audio.pkl")

Processing batch 1/1: 100%|██████████| 5/5 [00:02<00:00,  2.48it/s]


#### ---------- SANDBOX FOR AUDIO AUGMENTATIONS ------

In [76]:
# Peek at the first row's stored audio: element [0] of its processed_audio entry,
# i.e. the first item of the (y, sr) pair returned by download_and_process_audio
df.iloc[0,:]['processed_audio'][0]

array([-0.24091958, -0.30352473, -0.3382524 , ...,  0.26228857,
        0.19583827,  0.28442264], dtype=float32)

In [34]:
# Try a few different augmentations from Emelie's code.
# Each transform is created on its own so it can be applied in isolation in the sandbox cell;
# p is the probability that the transform is applied on a call.
gauss_noise_augment = AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015,p=1.0)
gauss_snr_augment = AddGaussianSNR(min_snr_db=5.0, max_snr_db=10.0,p=1.0)
# NOTE(review): p=0.2 here versus p=1.0 on the other transforms, so most calls would leave
# the audio without a pitch shift - confirm this is intended for a listening demo.
pitch_shift_augment = PitchShift(min_semitones=-4, max_semitones=4, p=0.2)
clip_augment = ClippingDistortion(min_percentile_threshold=10, max_percentile_threshold=60, p=1.0)
gain_augment = Gain(min_gain_in_db=-5, max_gain_in_db=5, p=1.0)

In [58]:
# Sandbox for listening to different augmentations based on Emelie's code.
# Leave exactly one `augmented_audio = ...` line active to audition that augmentation.
test_audio_row = music_info_df.iloc[0,:]
y, sr = get_audio_data(test_audio_row, 'spotify_preview_url')
#augmented_audio = gauss_noise_augment(y, sample_rate=sr)
#augmented_audio = gauss_snr_augment(y, sample_rate=sr)
#augmented_audio = pitch_shift_augment(y, sample_rate=sr)
#augmented_audio = clip_augment(y, sample_rate=sr)
# Assign to the same name that is played below (the original stored the result in
# `augmented_sound`, leaving `augmented_audio` undefined or stale).
augmented_audio = gain_augment(y, sample_rate=sr)
Audio(augmented_audio, rate=sr)

### Script to Augment - Similar
- Write function for creating similar song augmentations
- Apply to dataframe with efficient pipeline
- Pair each song with similar/augmented song

In [87]:
# Reload data from scratch.
# The location can be overridden through the FULL_AUDIO_DF_PKL environment variable so the
# notebook also runs on other machines; the default is the original path.
# Only load pickles you created yourself: unpickling executes code stored in the file.
FULL_AUDIO_DF_PKL = os.environ.get(
    'FULL_AUDIO_DF_PKL',
    '/Users/reggiebain/erdos/song-similarity-erdos-old/data/audio_download/full_dataframe.pkl',
)
df = pd.read_pickle(FULL_AUDIO_DF_PKL)
df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,processed_audio
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4,"([-0.24091958, -0.30352473, -0.3382524, -0.442..."
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4,"([-0.03920526, -0.105055355, -0.19810554, -0.2..."


In [88]:
# Function to read audio from dataframe and add augmented column
def augment_audio(row, which):
    """Apply a randomized chain of augmentations to the audio stored in ``row[which]``.

    ``row[which]`` is indexed as ``[0]`` for the samples and ``[1]`` for the
    sampling rate. A fresh ``Compose`` pipeline (noise, time stretch, pitch
    shift, gain, SNR-based noise) is built on every call and its output is
    returned.
    """
    # Transforms in the order they are applied
    transforms = [
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Gain(min_gain_in_db=-5, max_gain_in_db=5, p=1.0),
        AddGaussianSNR(min_snr_db=5.0, max_snr_db=10.0, p=1.0),
    ]
    pipeline = Compose(transforms)

    audio_entry = row[which]
    return pipeline(samples=audio_entry[0], sample_rate=audio_entry[1])

In [89]:
# Build one augmented ("similar") version per row from its processed_audio entry and store it in a
# new augmented_audio column. The transforms are random, so values differ between runs unless a seed is set.
df['augmented_audio'] = df.apply(lambda row: augment_audio(row, 'processed_audio'), axis=1)
df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,processed_audio,augmented_audio
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4,"([-0.24091958, -0.30352473, -0.3382524, -0.442...","[-0.463119, -0.9101088, -0.87173533, -0.678944..."
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4,"([-0.03920526, -0.105055355, -0.19810554, -0.2...","[-0.010030596, -0.053237088, -0.1502307, -0.29..."


In [90]:
# Report the DataFrame's total memory footprint (deep=True) in megabytes
BYTES_PER_MB = 1024 ** 2
memory_usage = df.memory_usage(deep=True).sum()
memory_usage_mb = memory_usage / BYTES_PER_MB
print(f"Memory usage of DataFrame: {memory_usage_mb:.2f} MB")

Memory usage of DataFrame: 12.62 MB


### Script for Augmentation/Matching - Different
- Pipeline for pairing anchor/similar with different songs

In [93]:
# Match each song with a random other song
def select_random_song(row, df, rng=None):
    """Pick a random row of ``df`` other than ``row`` and return its song info.

    Parameters
    ----------
    row : pandas.Series
        Anchor row; its ``name`` (index label) is excluded from the draw.
    df : pandas.DataFrame
        Frame to draw from. Must contain the columns ``processed_audio``,
        ``artist`` and ``name``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source used for the draw. When omitted, the global
        ``np.random`` state is used, as before; pass a seeded generator to
        make the pairing reproducible.

    Returns
    -------
    pandas.Series
        The ``processed_audio``, ``artist`` and ``name`` values of the
        selected row.

    Raises
    ------
    ValueError
        If ``df`` has no row other than ``row`` to choose from.
    """
    # Exclude the current row to avoid selecting the same song
    available_indices = df.index[df.index != row.name]
    if len(available_indices) == 0:
        raise ValueError("No other song available to pair with this row")
    chooser = np.random if rng is None else rng
    random_index = chooser.choice(available_indices)
    return df.loc[random_index, ['processed_audio', 'artist', 'name']]

In [94]:
# Get temp df of the info we want for random song: one randomly chosen other row per row of df
# (select_random_song returns its processed_audio, artist and name)
random_df = df.apply(lambda row: select_random_song(row, df), axis=1)

# rename the columns (positional, so the order must match the columns returned by select_random_song)
random_df.columns = ['diff_processed_audio', 'diff_artist', 'diff_name']

# Combine current row with random rows: side-by-side concat of df and random_df (axis=1)
sim_and_diff_df = pd.concat([df, random_df], axis=1)

sim_and_diff_df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,instrumentalness,liveness,valence,tempo,time_signature,processed_audio,augmented_audio,diff_processed_audio,diff_artist,diff_name
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,0.0,0.0971,0.24,148.114,4,"([-0.24091958, -0.30352473, -0.3382524, -0.442...","[-0.463119, -0.9101088, -0.87173533, -0.678944...","([0.11006924, 0.14516525, 0.051283143, -0.0514...",Radiohead,Creep
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,0.0,0.207,0.651,174.426,4,"([-0.03920526, -0.105055355, -0.19810554, -0.2...","[-0.010030596, -0.053237088, -0.1502307, -0.29...","([0.11006924, 0.14516525, 0.051283143, -0.0514...",Radiohead,Creep
