# Get Song Data
- Get audio files from web URLs
- Process with librosa to get the audio time series `y` and its sampling rate `sr`
- Add to dataframe
- Save to file

In [7]:
!pip install -q pydub

In [2]:
import numpy as np
import pandas as pd
import requests, io, os, re, json, time, sys, time, base64
from IPython.display import Audio, display
import librosa, librosa.display
import matplotlib.pyplot as plt
import pickle as pkl
import urllib.parse
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout
from tqdm import tqdm
from tenacity import retry, wait_exponential, stop_after_attempt, RetryError, retry_if_exception_type
from pydub import AudioSegment



In [3]:
# Read the Kaggle music-info table (to be replaced by a custom dataset eventually) and preview it
music_info_csv = '/Users/reggiebain/erdos/song-similarity-erdos-old/data/music_info.csv'
music_info_df = pd.read_csv(music_info_csv)
music_info_df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4


In [2]:
# Get data from spotify links (from most updated DF)
def get_raw_audio(row, which, timeout=30):
    """Download the audio behind ``row[which]`` into an in-memory buffer.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record that holds the URL; indexed as ``row[which]``.
    which : hashable
        Key / column name under which the URL is stored.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        notebook forever. Defaults to 30 seconds.

    Returns
    -------
    io.BytesIO or None
        Buffer with the response body, or ``None`` when the request failed
        (connection error, timeout, or a 4xx/5xx HTTP status).
    """
    url = row[which]
    try:
        response = requests.get(url, timeout=timeout)
        # Without this an HTTP error page would be returned as if it were audio.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
    return io.BytesIO(response.content)

In [5]:
# Smoke test: fetch the preview clip referenced by the first row of the table
first_track = music_info_df.iloc[0]
test = get_raw_audio(first_track, 'spotify_preview_url')

In [6]:
# Show the type of the object get_raw_audio returned for the first row
type(test)

_io.BytesIO

In [None]:
# Get dataframe from kaggle dataset.
# An earlier cell performs this exact read, so reuse the frame that is already
# in memory and only hit the disk when this cell runs without it (fresh kernel).
if 'music_info_df' not in globals():
    music_info_df = pd.read_csv('/Users/reggiebain/erdos/song-similarity-erdos-old/data/music_info.csv')
music_info_df.head(2)

In [4]:
# Exception type the retry decorator watches for
class RateLimitError(Exception):
    """Raised when a download is rejected with HTTP 429 (rate limit exceeded)."""

# Function to download a single audio file
@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=60), retry=retry_if_exception_type(RateLimitError))
def download_audio(url, output_dir, file_name, timeout=30):
    """Stream one audio file to ``<output_dir>/<file_name>.mp3``.

    Parameters
    ----------
    url : str
        Address of the audio file.
    output_dir : str
        Existing directory that receives the file.
    file_name : str
        File name without extension; ``.mp3`` is appended.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        download loop. Defaults to 30 seconds.

    Returns
    -------
    str or None
        Path of the saved file, or ``None`` when the request failed.

    Raises
    ------
    RateLimitError
        On HTTP 429. The ``@retry`` decorator catches it and retries with
        exponential backoff (up to 10 attempts); once the attempts are used
        up, tenacity raises ``RetryError`` to the caller.
    """
    file_path = os.path.join(output_dir, file_name + '.mp3')
    # Stream into a side file first so an interrupted transfer never leaves a
    # truncated .mp3 behind (or overwrites a good file from an earlier run).
    partial_path = file_path + '.part'
    try:
        # The context manager releases the streamed connection on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code == 429:
                raise RateLimitError("Rate limit exceeded")
            response.raise_for_status()  # Check if the request was successful
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        os.replace(partial_path, file_path)
        return file_path
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None

# Function to download all audio files in the DataFrame
def download_all_audio(df, url_column, output_dir, batch_size=100, start_batch=0):
    """Download the audio for every row of ``df`` in batches, checkpointing after each batch.

    The column ``'downloaded_file_path'`` of ``df`` is filled in place with the
    path returned by ``download_audio`` (``None`` when a download failed).
    After every batch two pickles are written to ``output_dir``:
    ``batch_<n>.pkl`` with that batch's rows and ``full_dataframe.pkl`` with
    the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the URLs; modified in place.
    url_column : str
        Name of the column that contains the audio URLs.
    output_dir : str
        Directory for audio files and pickles; created if missing.
    batch_size : int, optional
        Number of rows per batch (must be >= 1).
    start_batch : int, optional
        Zero-based batch to start from, for resuming an interrupted run.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(output_dir, exist_ok=True)
    # Only create the column when it is missing, so resuming with
    # start_batch > 0 keeps the paths recorded by the earlier run.
    if 'downloaded_file_path' not in df.columns:
        df['downloaded_file_path'] = None
    path_col = df.columns.get_loc('downloaded_file_path')

    # Ceiling division: no trailing empty batch when len(df) is an exact
    # multiple of batch_size.
    num_batches = (len(df) + batch_size - 1) // batch_size
    for batch_num in range(start_batch, num_batches):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(df))
        batch_urls = df[url_column].iloc[start_idx:end_idx]

        progress = tqdm(enumerate(batch_urls), total=len(batch_urls), desc=f"Downloading batch {batch_num + 1}/{num_batches}")
        for offset, url in progress:
            # Missing URLs show up as None/NaN; NaN is truthy, so test the type.
            if not (isinstance(url, str) and url):
                continue
            # Number files by position in the batch so the names do not depend
            # on the DataFrame's index labels.
            file_name = f"audio_files_batch_{batch_num + 1}_{offset + 1}"
            try:
                file_path = download_audio(url, output_dir, file_name)
            except RetryError:
                # Still rate limited after every retry: record the failure and
                # carry on instead of aborting the whole run.
                print(f"Giving up on {url}: still rate limited after all retries")
                file_path = None
            df.iat[start_idx + offset, path_col] = file_path

        # Save the batch DataFrame to a pickle file. Slice now, after the
        # updates, so the pickle contains the paths just recorded.
        batch_file_path = os.path.join(output_dir, f"batch_{batch_num + 1}.pkl")
        df.iloc[start_idx:end_idx].to_pickle(batch_file_path)

        # Save the full DataFrame state after each batch
        df.to_pickle(os.path.join(output_dir, "full_dataframe.pkl"))

In [5]:
# Work on a copy of the first 50 tracks; 'spotify_preview_url' holds the audio URLs
df = music_info_df.iloc[:50].copy()

# Directory that receives the downloaded files and the pickled DataFrames
output_dir = "../data/audio_files/"

# Fetch every preview clip and record each resulting file path on the DataFrame
download_all_audio(df, url_column='spotify_preview_url', output_dir=output_dir, batch_size=100)

# Persist the updated DataFrame next to the audio files
updated_df_path = os.path.join(output_dir, "path_to_updated_dataframe.pkl")
df.to_pickle(updated_df_path)

Downloading batch 1/1:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading batch 1/1: 100%|██████████| 50/50 [00:16<00:00,  3.10it/s]


In [None]:
# Split DataFrame into batches
batch_size = 5000  # Adjust this as needed
num_batches = int(np.ceil(len(df) / batch_size))
# Split the row positions instead of the frame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes. The chunk
# boundaries are the same as before. An empty df yields no batches rather
# than an error from asking array_split for 0 sections.
position_chunks = np.array_split(np.arange(len(df)), num_batches) if num_batches else []
batches = [df.iloc[positions] for positions in position_chunks]
batch_files = [f"batch_{i}.csv" for i in range(num_batches)]

# Process each batch and save the state
# NOTE(review): process_batch is not defined in the cells shown here; it is
# presumably expected to write batch_{i}.csv for the batch it is given - confirm.
for i, (batch_df, batch_file) in enumerate(zip(batches, batch_files)):
    if not os.path.exists(batch_file):  # Check if this batch has already been processed
        process_batch(batch_df, i)
    else:
        print(f"Batch {i} already processed. Skipping...")

# Combine all batches into a single DataFrame (if needed)
if batch_files:
    final_df = pd.concat([pd.read_csv(batch_file) for batch_file in batch_files])
    final_df.to_csv("final_results.csv", index=False)
else:
    # pd.concat raises on an empty list, so there is nothing to combine or save.
    print("No batches to combine.")