In [2]:
from spotify_functions import *
import pandas as pd

In [3]:
# Load the artist table from the working directory; later cells read its
# 'artist' and 'artist_id' columns row by row.
artists_ids_df = pd.read_csv("artist_ids.csv")

In [1]:
# First-pass collection: for every row of artist_ids.csv, gather the follower
# count, album names / release dates and track names into `artists_data`.
# NOTE(review): `access_token` and the get_* helpers are expected to come from the
# star import below; their return shapes are not visible in this notebook.
from spotify_functions import *
import pandas as pd

artists_ids_df = pd.read_csv("artist_ids.csv")


def _as_item_list(resp):
    """Normalize a helper response to a list.

    Accepts a dict carrying an 'items' list or a bare list; anything else
    (including None) yields an empty list, preserving the best-effort
    behavior of this cell.
    """
    if isinstance(resp, dict) and isinstance(resp.get('items'), list):
        return resp['items']
    if isinstance(resp, list):
        return resp
    return []


artists_data = []
for index, row in artists_ids_df.iterrows():
    artist_name = row['artist']
    artist_id = row['artist_id']
    if pd.isna(artist_id):
        print(f"Skipping {artist_name} due to missing artist ID.")
        continue
    print(f"Processing Artist: {artist_name}, Spotify ID: {artist_id}")
    try:
        artist_info = get_artist_info(artist_id, access_token)

        # Normalize artist_info (a list, or a dict wrapping an 'artists' list)
        if isinstance(artist_info, list) and artist_info:
            artist_info = artist_info[0]
        elif isinstance(artist_info, dict) and isinstance(artist_info.get('artists'), list) and artist_info['artists']:
            artist_info = artist_info['artists'][0]

        followers_count = None
        if isinstance(artist_info, dict):
            followers = artist_info.get('followers', {})
            if isinstance(followers, dict):
                followers_count = followers.get('total')

        # Keep only dict entries: previously a single non-dict album made the
        # `.get('name')` comprehension raise and the whole artist was dropped.
        albums_list = [
            album
            for album in _as_item_list(get_artist_albums(artist_id, access_token))
            if isinstance(album, dict)
        ]

        album_names = [album.get('name') for album in albums_list]
        album_release_dates = [album.get('release_date') for album in albums_list]

        tracks = []
        for album in albums_list:
            album_id = album.get('id')
            if not album_id:
                continue  # no id means there is nothing to request tracks for

            album_tracks = _as_item_list(get_album_tracks(album_id, access_token))
            tracks.extend(
                track['name']
                for track in album_tracks
                if isinstance(track, dict) and 'name' in track
            )

        artists_data.append({
            'artist': artist_name,
            'artist_id': artist_id,
            'followers_count': followers_count,
            'albums': album_names,
            'album_release_dates': album_release_dates,
            'tracks': tracks
        })
        print(f"Collected data for {artist_name}: {len(album_names)} albums, {len(tracks)} tracks, {followers_count} followers.")
    except Exception as e:
        # Best effort: report the failure and carry on with the next artist.
        print(f"Error processing {artist_name}: {e}")

Processing Artist: Jeff Tweedy, Spotify ID: 2rDxtYUzTAYJJE3Bl3Z5IN
Collected data for Jeff Tweedy: 21 albums, 155 tracks, None followers.
Processing Artist: Geese, Spotify ID: 0WCo84qtCKfbyIf1lqQWB4
Collected data for Geese: 9 albums, 49 tracks, None followers.
Processing Artist: Piotr Kurek, Spotify ID: 0mneo6UHjcOtZBm1Tw8t67


KeyboardInterrupt: 

In [None]:
# Save the collected artist data to a CSV file.
# One row per entry of `artists_data`; index=False keeps the DataFrame's row
# index out of the file.
# NOTE(review): the 'albums', 'album_release_dates' and 'tracks' values are Python
# lists, so they will presumably be written in their string form — confirm that
# is acceptable for whatever reads this CSV back.
artists_data_df = pd.DataFrame(artists_data)
artists_data_df.to_csv('spotify_artists_data.csv', index=False)

In [5]:
import json

# Snapshot everything collected so far as pretty-printed JSON, one directory up.
with open('../spotify_artists_backup.json', mode='w') as f:
    f.write(json.dumps(artists_data, indent=2))

print(f"Saved {len(artists_data)} artists to spotify_artists_backup.json")

Saved 156 artists to spotify_artists_backup.json


In [6]:
# Read the JSON backup back into memory and report how many records it holds.
with open('../spotify_artists_backup.json') as f:
    artists_backup = json.loads(f.read())

print(f"Loaded {len(artists_backup)} artists from spotify_artists_backup.json")

Loaded 156 artists from spotify_artists_backup.json


## New Code to solve backup and rate limit issues

In [8]:
# Resumable Spotify collection run: reloads the JSON backup, skips artists that
# are already in it, and saves progress periodically and on exit.
# NOTE(review): `access_token` and the get_* helpers are expected to come from the
# star import below; their return shapes are not visible in this notebook, so the
# failure checks further down are deliberately conservative.
from spotify_functions import *
import pandas as pd
import json
import os
import time
from datetime import datetime

# Rate limiting / persistence settings
REQUEST_DELAY = 0.5    # Seconds to wait before every API request
SAVE_INTERVAL = 10     # Save a backup every 10 collected artists
RATE_LIMIT_WAIT = 60   # Seconds to wait after a rate-limit error
MAX_ATTEMPTS = 3       # Tries per artist when the failure looks like a rate limit

backup_file = '../spotify_artists_backup.json'


def _save_backup(records, path):
    """Write `records` to `path` as JSON via a temp file and os.replace.

    An interrupt or serialization error during the write therefore cannot
    leave a truncated backup that would break the next resume.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(records, f, indent=2)
    os.replace(tmp_path, path)


def _items_or_raise(resp, what):
    """Return the item list of a helper response, or raise RuntimeError.

    Accepts a dict carrying an 'items' list or a bare list. Any other shape
    (None, an error payload, ...) is treated as a failed request.
    """
    if isinstance(resp, dict) and isinstance(resp.get('items'), list):
        return resp['items']
    if isinstance(resp, list):
        return resp
    raise RuntimeError(f"unexpected {what} response: {repr(resp)[:200]}")


def _is_rate_limited(error):
    """Heuristic: does the error text mention HTTP 429 or a rate limit?"""
    text = str(error)
    return "429" in text or "rate limit" in text.lower()


def _collect_artist(artist_name, artist_id):
    """Fetch one artist's followers, albums and track names.

    Returns the record dict stored in `artists_data`. Raises RuntimeError when a
    response is an error payload or has an unrecognized shape, so the caller
    never records a failed fetch as an artist with no albums.
    """
    time.sleep(REQUEST_DELAY)
    artist_info = get_artist_info(artist_id, access_token)

    # Normalize artist_info (a list, or a dict wrapping an 'artists' list)
    if isinstance(artist_info, list) and artist_info:
        artist_info = artist_info[0]
    elif isinstance(artist_info, dict) and isinstance(artist_info.get('artists'), list) and artist_info['artists']:
        artist_info = artist_info['artists'][0]

    followers_count = None
    if isinstance(artist_info, dict):
        # NOTE(review): assumes a failed request surfaces as a dict with an
        # 'error' key (the Spotify Web API error shape) — confirm against
        # get_artist_info.
        if 'error' in artist_info:
            raise RuntimeError(f"artist info error: {repr(artist_info['error'])[:200]}")
        followers = artist_info.get('followers', {})
        if isinstance(followers, dict):
            followers_count = followers.get('total')

    time.sleep(REQUEST_DELAY)
    # Only dict entries can supply name / release date / id.
    albums_list = [
        album
        for album in _items_or_raise(get_artist_albums(artist_id, access_token), 'albums')
        if isinstance(album, dict)
    ]

    tracks = []
    for album in albums_list:
        album_id = album.get('id')
        if not album_id:
            continue

        time.sleep(REQUEST_DELAY)
        album_tracks = _items_or_raise(get_album_tracks(album_id, access_token), 'album tracks')
        tracks.extend(
            track['name']
            for track in album_tracks
            if isinstance(track, dict) and 'name' in track
        )

    return {
        'artist': artist_name,
        'artist_id': artist_id,
        'followers_count': followers_count,
        'albums': [album.get('name') for album in albums_list],
        'album_release_dates': [album.get('release_date') for album in albums_list],
        'tracks': tracks
    }


# Load the CSV with artist IDs
artists_ids_df = pd.read_csv("artist_ids.csv")

# Load existing data if file exists (resume functionality)
if os.path.exists(backup_file):
    with open(backup_file, 'r') as f:
        artists_data = json.load(f)
    print(f"Resuming with {len(artists_data)} existing artists")
    # IDs already present in the backup are skipped below
    processed_ids = {artist.get('artist_id') for artist in artists_data}
else:
    artists_data = []
    processed_ids = set()

# Counter for tracking progress
total_artists = len(artists_ids_df)
processed_count = len(artists_data)

print(f"Processing {total_artists - processed_count} remaining artists out of {total_artists} total")

try:
    for index, row in artists_ids_df.iterrows():
        artist_name = row['artist']
        artist_id = row['artist_id']

        # Skip if already processed (in the backup or earlier in this run)
        if artist_id in processed_ids:
            print(f"Skipping {artist_name} (already processed)")
            continue

        if pd.isna(artist_id):
            print(f"Skipping {artist_name} due to missing artist ID.")
            continue

        print(f"[{processed_count + 1}/{total_artists}] Processing Artist: {artist_name}, Spotify ID: {artist_id}")

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                record = _collect_artist(artist_name, artist_id)
            except Exception as e:
                print(f"Error processing {artist_name}: {e}")
                # Save backup on error
                _save_backup(artists_data, backup_file)
                print(f"✓ Backup saved after error")

                if not _is_rate_limited(e):
                    break  # not retryable; the artist stays unprocessed for the next run

                print(f"Rate limit detected. Waiting {RATE_LIMIT_WAIT} seconds...")
                time.sleep(RATE_LIMIT_WAIT)
                continue  # retry the same artist while attempts remain

            artists_data.append(record)
            # Track the ID so a duplicate CSV row is not fetched twice
            processed_ids.add(artist_id)
            processed_count += 1
            print(f"Collected data for {artist_name}: {len(record['albums'])} albums, {len(record['tracks'])} tracks, {record['followers_count']} followers.")

            # Save backup periodically
            if processed_count % SAVE_INTERVAL == 0:
                _save_backup(artists_data, backup_file)
                print(f"✓ Backup saved ({processed_count} artists)")
            break
finally:
    # Runs on normal completion and on KeyboardInterrupt alike, so artists
    # collected since the last periodic save are not lost.
    _save_backup(artists_data, backup_file)
    print(f"\n✓ Final backup saved: {len(artists_data)} artists")

# Save to CSV
artists_data_df = pd.DataFrame(artists_data)
artists_data_df.to_csv('spotify_artists_data.csv', index=False)
print(f"✓ Saved to CSV: spotify_artists_data.csv")

Resuming with 5 existing artists
Processing 5179 remaining artists out of 5184 total
Skipping Jeff Tweedy (already processed)
Skipping Geese (already processed)
Skipping Piotr Kurek (already processed)
Skipping Paul St. Hilaire (already processed)
Skipping Ø (already processed)
[6/5184] Processing Artist: Frost Children, Spotify ID: 6R1kfr0GIWnwxY4zW11Vag
Collected data for Frost Children: 0 albums, 0 tracks, None followers.
[7/5184] Processing Artist: múm, Spotify ID: 4mw86zm4QZIL8SksdyE6OU
Collected data for múm: 0 albums, 0 tracks, None followers.
[8/5184] Processing Artist: Cardi B, Spotify ID: 4kYSro6naA4h99UJvo89HB
Collected data for Cardi B: 0 albums, 0 tracks, None followers.
[9/5184] Processing Artist: Asher White, Spotify ID: 1TQhvHMVoECTNs3Xxo3RMv
Collected data for Asher White: 0 albums, 0 tracks, None followers.
[10/5184] Processing Artist: Orcutt Shelley Miller, Spotify ID: 0m2UovS6OpaxRZ3e30plk1
Collected data for Orcutt Shelley Miller: 0 albums, 0 tracks, None followers

KeyboardInterrupt: 

In [7]:
# Resumable Spotify collection run with a notebook progress bar.
# NOTE(review): `access_token` and the get_* helpers are expected to come from the
# star import below; their return shapes are not visible in this notebook, so the
# failure checks further down are deliberately conservative.
from spotify_functions import *
import pandas as pd
import json
import os
import time
from datetime import datetime
from tqdm.notebook import tqdm

# Rate limiting / persistence settings
REQUEST_DELAY = 2.0    # Seconds to wait before every API request
SAVE_INTERVAL = 5      # Save a backup every 5 collected artists
RATE_LIMIT_WAIT = 60   # Seconds to wait after a rate-limit error
MAX_ATTEMPTS = 3       # Tries per artist when the failure looks like a rate limit

backup_file = '../spotify_artists_backup.json'


def _save_backup(records, path):
    """Write `records` to `path` as JSON via a temp file and os.replace.

    An interrupt or serialization error during the write therefore cannot
    leave a truncated backup that would break the next resume.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(records, f, indent=2)
    os.replace(tmp_path, path)


def _items_or_raise(resp, what):
    """Return the item list of a helper response, or raise RuntimeError.

    Accepts a dict carrying an 'items' list or a bare list. Any other shape
    (None, an error payload, ...) is treated as a failed request.
    """
    if isinstance(resp, dict) and isinstance(resp.get('items'), list):
        return resp['items']
    if isinstance(resp, list):
        return resp
    raise RuntimeError(f"unexpected {what} response: {repr(resp)[:200]}")


def _is_rate_limited(error):
    """Heuristic: does the error text mention HTTP 429 or a rate limit?"""
    text = str(error)
    return "429" in text or "rate limit" in text.lower()


def _collect_artist(artist_name, artist_id):
    """Fetch one artist's followers, albums and track names.

    Returns the record dict stored in `artists_data`. Raises RuntimeError when a
    response is an error payload or has an unrecognized shape, so the caller
    never records a failed fetch as an artist with no albums.
    """
    time.sleep(REQUEST_DELAY)
    artist_info = get_artist_info(artist_id, access_token)

    # Normalize artist_info (a list, or a dict wrapping an 'artists' list)
    if isinstance(artist_info, list) and artist_info:
        artist_info = artist_info[0]
    elif isinstance(artist_info, dict) and isinstance(artist_info.get('artists'), list) and artist_info['artists']:
        artist_info = artist_info['artists'][0]

    followers_count = None
    if isinstance(artist_info, dict):
        # NOTE(review): assumes a failed request surfaces as a dict with an
        # 'error' key (the Spotify Web API error shape) — confirm against
        # get_artist_info.
        if 'error' in artist_info:
            raise RuntimeError(f"artist info error: {repr(artist_info['error'])[:200]}")
        followers = artist_info.get('followers', {})
        if isinstance(followers, dict):
            followers_count = followers.get('total')

    time.sleep(REQUEST_DELAY)
    # Only dict entries can supply name / release date / id.
    albums_list = [
        album
        for album in _items_or_raise(get_artist_albums(artist_id, access_token), 'albums')
        if isinstance(album, dict)
    ]

    tracks = []
    for album in albums_list:
        album_id = album.get('id')
        if not album_id:
            continue

        time.sleep(REQUEST_DELAY)
        album_tracks = _items_or_raise(get_album_tracks(album_id, access_token), 'album tracks')
        tracks.extend(
            track['name']
            for track in album_tracks
            if isinstance(track, dict) and 'name' in track
        )

    return {
        'artist': artist_name,
        'artist_id': artist_id,
        'followers_count': followers_count,
        'albums': [album.get('name') for album in albums_list],
        'album_release_dates': [album.get('release_date') for album in albums_list],
        'tracks': tracks
    }


# Load the CSV with artist IDs
artists_ids_df = pd.read_csv("artist_ids.csv")

# Load existing data if file exists (resume functionality)
if os.path.exists(backup_file):
    with open(backup_file, 'r') as f:
        artists_data = json.load(f)
    print(f"Resuming with {len(artists_data)} existing artists")
    # IDs already present in the backup are skipped below
    processed_ids = {artist.get('artist_id') for artist in artists_data}
else:
    artists_data = []
    processed_ids = set()

# Counter for tracking progress
total_artists = len(artists_ids_df)
processed_count = len(artists_data)

print(f"Processing {total_artists - processed_count} remaining artists out of {total_artists} total")

try:
    # The bar counts CSV rows and starts at zero: every row, skipped or not,
    # advances it exactly once. Seeding it with `initial=processed_count` as well
    # would count resumed artists twice and overshoot `total`.
    with tqdm(total=total_artists, desc='Artists', unit='artist') as pbar:
        for index, row in artists_ids_df.iterrows():
            artist_name = row['artist']
            artist_id = row['artist_id']

            # Skip if already processed (in the backup or earlier in this run)
            if artist_id in processed_ids:
                print(f"Skipping {artist_name} (already processed)")
                pbar.update(1)
                continue

            if pd.isna(artist_id):
                print(f"Skipping {artist_name} due to missing artist ID.")
                pbar.update(1)
                continue

            print(f"[{processed_count + 1}/{total_artists}] Processing Artist: {artist_name}, Spotify ID: {artist_id}")

            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    record = _collect_artist(artist_name, artist_id)
                except Exception as e:
                    print(f"Error processing {artist_name}: {e}")
                    # Save backup on error
                    _save_backup(artists_data, backup_file)
                    print(f"✓ Backup saved after error")

                    if not _is_rate_limited(e):
                        break  # not retryable; the artist stays unprocessed for the next run

                    print(f"Rate limit detected. Waiting {RATE_LIMIT_WAIT} seconds...")
                    time.sleep(RATE_LIMIT_WAIT)
                    continue  # retry the same artist while attempts remain

                artists_data.append(record)
                # Mark as processed
                processed_ids.add(artist_id)
                processed_count += 1
                print(f"Collected data for {artist_name}: {len(record['albums'])} albums, {len(record['tracks'])} tracks, {record['followers_count']} followers.")

                # Save backup periodically
                if processed_count % SAVE_INTERVAL == 0:
                    _save_backup(artists_data, backup_file)
                    print(f"✓ Backup saved ({processed_count} artists)")
                break

            pbar.update(1)
finally:
    # Runs on normal completion and on KeyboardInterrupt alike, so artists
    # collected since the last periodic save are not lost.
    _save_backup(artists_data, backup_file)
    print(f"\n✓ Final backup saved: {len(artists_data)} artists")

# Save to CSV
artists_data_df = pd.DataFrame(artists_data)
artists_data_df.to_csv('spotify_artists_data.csv', index=False)
print(f"✓ Saved to CSV: spotify_artists_data.csv")

Processing 5184 remaining artists out of 5184 total


Artists:   0%|          | 0/5184 [00:00<?, ?artist/s]

[1/5184] Processing Artist: Jeff Tweedy, Spotify ID: 2rDxtYUzTAYJJE3Bl3Z5IN
Collected data for Jeff Tweedy: 0 albums, 0 tracks, None followers.
[2/5184] Processing Artist: Geese, Spotify ID: 0WCo84qtCKfbyIf1lqQWB4
Collected data for Geese: 0 albums, 0 tracks, None followers.
[3/5184] Processing Artist: Piotr Kurek, Spotify ID: 0mneo6UHjcOtZBm1Tw8t67
Collected data for Piotr Kurek: 0 albums, 0 tracks, None followers.
[4/5184] Processing Artist: Paul St. Hilaire, Spotify ID: 2z6qOkQVyn7h3XzUuNlRlM
Collected data for Paul St. Hilaire: 0 albums, 0 tracks, None followers.
[5/5184] Processing Artist: Ø, Spotify ID: 0dgJbQ0bKPyUXco8hEXN7X
Collected data for Ø: 0 albums, 0 tracks, None followers.
✓ Backup saved (5 artists)
[6/5184] Processing Artist: Frost Children, Spotify ID: 6R1kfr0GIWnwxY4zW11Vag


KeyboardInterrupt: 