Notebook for creating the song_metadata.json file containing more metadata for songs in spotify_songs.csv. 

Uses the spotify API to retrieve song metadata

In [137]:
import pandas as pd
import json
import csv
from dotenv import load_dotenv
import os
import time
import requests

In [138]:
load_dotenv() # Load env variables from .env file

# Get Spotify api credentials from environment variables
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# Get the access token
auth_url = 'https://accounts.spotify.com/api/token'
auth_data = {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret
}
auth_response = requests.post(auth_url, data=auth_data)
access_token = auth_response.json()['access_token']

# Set up the API endpoint and headers
api_url = 'https://api.spotify.com/v1/tracks'
headers = {
    'Authorization': f'Bearer {access_token}'
}

In [139]:
csv_file = "../dataset/spotify_songs.csv"
df = pd.read_csv(csv_file)

## Using the Spotify API
Take the songs from csv file and calls the tracks api to get song metadata.

Saved in song_metadata.json

In [141]:
%time 
# Load existing song metadata from the JSON file if it exists
if os.path.exists('temp_data/song_metadata_2.json'):
    with open('temp_data/song_metadata_2.json', 'r') as file:
        song_metadata = json.load(file)
else:
    song_metadata = []

id_start = 0 # update this with the index of the song you want to start with
batch_size = 30
# Create batches of 10 track IDs
track_id_batches = [df['track_id'].tolist()[i:i+batch_size] for i in range(id_start, len(df), batch_size)]

# Iterate over each batch of track IDs
for i, batch in enumerate(track_id_batches):
    print(f"Processing batch {i+1}/{len(track_id_batches)}")
    
    # Create the API request parameters
    params = {
        'ids': ','.join(batch)
    }
    
    # Retry the API request until a successful response is received
    while True:
        # Make the API request
        response = requests.get(api_url, headers=headers, params=params)
        print(response)
        
        if response.status_code == 200:
            track_results = response.json()
            
            # Iterate over each track in the batch
            for track in track_results['tracks']:
                if track:
                    track_id = track['id']
                    track_url = track['external_urls']['spotify']
                    preview_url = track['preview_url']
                    image_url = track['album']['images'][0]['url'] if track['album']['images'] else ""
                    artists = [artist["name"] for artist in track["album"]["artists"]]
                    
                    # Find the corresponding index in the DataFrame
                    index = df.index[df['track_id'] == track_id].tolist()[0]
                    
                    # Create a dictionary with the song metadata
                    song_info = {
                        'id': index,
                        'name': df.loc[index, 'track_name'],
                        'track_id': track_id,
                        'artists': artists,
                        'image_url': image_url,
                        'track_url': track_url,
                        'preview_url': preview_url
                    }
                    
                    song_metadata.append(song_info)
            
            # Save the updated song metadata to the JSON file after processing each batch
            with open('temp_data/song_metadata_2.json', 'w') as file:
                json.dump(song_metadata, file, indent=4)
            
            # Break out of the retry loop since a successful response was received
            break
        
        else:
            print(f"API request failed with status code {response.status_code}")
            print("Retrying in 30 seconds...")
            time.sleep(30)  # Sleep for 5 seconds before retrying

print("Data retrieval completed")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
Processing batch 1/1095
<Response [200]>
Processing batch 2/1095
<Response [200]>
Processing batch 3/1095
<Response [200]>
Processing batch 4/1095
<Response [200]>
Processing batch 5/1095
<Response [200]>
Processing batch 6/1095
<Response [200]>
Processing batch 7/1095
<Response [200]>
Processing batch 8/1095
<Response [200]>
Processing batch 9/1095
<Response [200]>
Processing batch 10/1095
<Response [200]>
Processing batch 11/1095
<Response [200]>
Processing batch 12/1095
<Response [200]>
Processing batch 13/1095
<Response [200]>
Processing batch 14/1095
<Response [200]>
Processing batch 15/1095
<Response [200]>
Processing batch 16/1095
<Response [200]>
Processing batch 17/1095
<Response [200]>
Processing batch 18/1095
<Response [200]>
Processing batch 19/1095
<Response [200]>
Processing batch 20/1095
<Response [200]>
Processing batch 21/1095
<Response [200]>
Processing batch 22/1095
<Response [200]>
Processing batch 23/1

## Ignoring the API
Create the song_metadata.json file without the API. 

Don't need this part if API does not rate limit you

In [7]:
# Create an empty list to store the song metadata
song_metadata = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Create a dictionary with the song metadata
    song_info = {
        'id': index,
        'name': row['track_name'],
        'track_id': row['track_id'],
        'artists': [row['track_artist']],
        'image_url': "",
        'track_url': "",
        'preview_url': ""
    }
    song_metadata.append(song_info)

# Save the song metadata to a JSON file
with open('temp_data/song_metadata.json', 'w') as file:
    json.dump(song_metadata, file, indent=4)

print("Song metadata saved to 'song_metadata.json'")

Song metadata saved to 'song_metadata.json'


### Save track IDs in a csv

Save 50 track IDs as a line in csv

In [18]:
# Specify the path to the song metadata JSON file
json_file_path = 'temp_data/song_metadata.json'

# Specify the path to the output CSV file
csv_file_path = 'track_id_batches.csv'

# Specify the batch size (number of track IDs per batch)
batch_size = 50

# Read the song metadata from the JSON file
with open(json_file_path, 'r') as file:
    song_metadata = json.load(file)

# Extract the track IDs from the song metadata
track_ids = [song['track_id'] for song in song_metadata]

# Create batches of track IDs
track_id_batches = [track_ids[i:i+batch_size] for i in range(0, len(track_ids), batch_size)]

# Save the track ID batches to a CSV file
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    for batch in track_id_batches:
        writer.writerow([','.join(batch)])

print(f"Track ID batches saved to '{csv_file_path}'")

Track ID batches saved to 'track_id_batches.csv'


### Parse output from spotify api

In [98]:
# Specify the path to the song metadata JSON file
json_file_path = 'temp_data/spotify_res.json'

song_metadata = []

# Read the song metadata from the JSON file
with open(json_file_path, 'r') as file:
    track_data = json.load(file)

for track in track_data["tracks"]:
    track_id = track['id']
    track_name = track['name']
    track_url = track['external_urls']['spotify']
    preview_url = track['preview_url']
    image_url = track['album']['images'][0]['url'] if track['album']['images'] else ""
    artists = [artist["name"] for artist in track["album"]["artists"]]
    
    # Find the corresponding index in the DataFrame
    index = df.index[df['track_id'] == track_id].tolist()[0]
    
    # Create a dictionary with the song metadata
    song_info = {
        'id': index,
        'name': df.loc[index, 'track_name'],
        'track_id': track_id,
        'artists': artists,
        'image_url': image_url,
        'track_url': track_url,
        'preview_url': preview_url
    }
    song_metadata.append(song_info)

# Save the song metadata to a JSON file
with open('temp_data/spotify_res_cleaned.json', 'w') as file:
    json.dump(song_metadata, file, indent=4)

print("Song metadata saved to 'spotify_res_cleaned.json'")

Song metadata saved to 'spotify_res_cleaned.json'


### Use cleaned output to update song_metadata.json

In [99]:
# Specify the path to the song metadata JSON file
cleaned_json_path = 'temp_data/spotify_res_cleaned.json'

song_metadata_path = 'temp_data/song_metadata.json'

# Read the batched cleaned metadata
with open(cleaned_json_path, 'r') as file:
    cleaned_data = json.load(file)

# open the song metadata json file
with open(song_metadata_path, 'r') as file:
    song_metadata = json.load(file)

for cleaned_track in cleaned_data:
    id = cleaned_track["id"]
    metadata = song_metadata[id]
    metadata["artists"] = cleaned_track["artists"]
    metadata["image_url"] = cleaned_track["image_url"]
    metadata["track_url"] = cleaned_track["track_url"]
    metadata["preview_url"] = cleaned_track["preview_url"]

# Save the updated song metadata to a JSON file
with open('temp_data/song_metadata.json', 'w') as file:
    json.dump(song_metadata, file, indent=4)

print("Updated song metadata saved to 'spotify_res_cleaned.json'")

Updated song metadata saved to 'spotify_res_cleaned.json'


## Validate Metadata

In [143]:
# Specify the path to the song metadata JSON file
json_file_path = 'song_metadata.json'

# Read the song metadata from the JSON file
with open(json_file_path, 'r') as file:
    song_metadata = json.load(file)

# Get the unique track IDs and names from the DataFrame
df_track_ids = set(df['track_id'])
df_names = dict(zip(df['track_id'], df['track_name']))

# Check if all the track IDs from the DataFrame exist in the JSON objects
json_track_ids = set(song['track_id'] for song in song_metadata)
missing_track_ids = df_track_ids - json_track_ids

if missing_track_ids:
    print(f"Missing track IDs: {missing_track_ids}")
    print("Please ensure all track IDs from the DataFrame exist in the JSON objects.")
else:
    print("All track IDs from the DataFrame exist in the JSON objects.\n")

    # Validate if the names in the JSON objects match the corresponding names in the DataFrame
    mismatched_names = []
    for song in song_metadata:
        if song['track_id'] in df_names and song['name'] != df_names[song['track_id']]:
            mismatched_names.append(song['track_id'])

    if mismatched_names:
        print(f"Mismatched names for track IDs: {mismatched_names}")
        print("Please ensure the names in the JSON objects match the corresponding names in the DataFrame.")
    else:
        print("All names in the JSON objects match the corresponding names in the DataFrame.")

        # Sort the song metadata based on the "id" field in ascending order
        sorted_song_metadata = sorted(song_metadata, key=lambda x: x['id'])

        # Save the sorted song metadata back to the JSON file
        with open(json_file_path, 'w') as file:
            json.dump(sorted_song_metadata, file, indent=4)

        print(f"Song metadata sorted and saved to '{json_file_path}'")

All track IDs from the DataFrame exist in the JSON objects.

Mismatched names for track IDs: ['69gRFGOWY9OMpFJgFol1u0', '5cjecvX0CmC9gK0Laf5EMQ', '5TTzhRSWQS4Yu8xTgAuq6D', '3VKFip3OdAvv4OfNTgFWeQ', '69gRFGOWY9OMpFJgFol1u0']
Please ensure the names in the JSON objects match the corresponding names in the DataFrame.
