In [1]:
import requests
from spotify_functions import *
# Test with a single known artist ID
test_response = requests.get(
    "https://api.spotify.com/v1/artists/0OdUWJ0sBjDrqHygGUXeCF",  # Band of Horses
    headers={"Authorization": f"Bearer {access_token}"}
)
print(f"Status: {test_response.status_code}")
print(f"Response: {test_response.text[:200]}")

Status: 200
Response: {"external_urls":{"spotify":"https://open.spotify.com/artist/0OdUWJ0sBjDrqHygGUXeCF"},"followers":{"href":null,"total":1043702},"genres":[],"href":"https://api.spotify.com/v1/artists/0OdUWJ0sBjDrqHygG


In [2]:
from spotify_functions import *

# Iterate over artist IDs in chunks to avoid URL length limits and get followers count from list of artist IDs
def get_artists_followers(artist_ids, access_token):
    followers_dict = {}
    chunk_size = 50  # Spotify API allows up to 50 IDs per request
    for i in range(0, len(artist_ids), chunk_size):
        chunk_ids = artist_ids[i:i + chunk_size]
        ids_param = ",".join(chunk_ids)
        url = f"https://api.spotify.com/v1/artists?ids={ids_param}"
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {access_token}"}
        )
        if response.status_code != 200:
            print(f"Error fetching artists: {response.status_code} - {response.text}")
            continue
        data = response.json()
        for artist in data['artists']:
            followers_dict[artist['id']] = artist['followers']['total']
    return followers_dict

# Actual function call
artists_ids_csv_path = "data/artists_ids.csv"
followers_data = get_artists_followers(artists_ids_csv_path, access_token)
# Save followers data to CSV
followers_df = pd.DataFrame(list(followers_data.items()), columns=['artist_id', 'followers_count'])
followers_df.to_csv("artists_followers.csv", index=False)
print("Followers data saved to artists_followers.csv")


Error fetching artists: 400 - {"error": {"status": 400, "message": "Invalid base62 id" } }
Followers data saved to artists_followers.csv


In [3]:
import pandas as pd

# Load and inspect your CSV
artist_ids_df = pd.read_csv('artist_ids.csv')

print("=== CSV INSPECTION ===")
print(f"Shape: {artist_ids_df.shape}")
print(f"\nColumns: {list(artist_ids_df.columns)}")
print(f"\nFirst 10 rows:")
print(artist_ids_df.head(10))
print(f"\nData types:")
print(artist_ids_df.dtypes)
print(f"\nAny null values?")
print(artist_ids_df.isnull().sum())

# Check for invalid IDs
print("\n=== CHECKING ID VALIDITY ===")
for idx, row in artist_ids_df.head(20).iterrows():
    artist_id = str(row['artist_id'])
    length = len(artist_id)
    is_valid = len(artist_id) == 22 and artist_id.isalnum()
    print(f"Row {idx}: '{artist_id}' - Length: {length}, Valid: {is_valid}")

=== CSV INSPECTION ===
Shape: (5184, 2)

Columns: ['artist', 'artist_id']

First 10 rows:
                  artist               artist_id
0            Jeff Tweedy  2rDxtYUzTAYJJE3Bl3Z5IN
1                  Geese  0WCo84qtCKfbyIf1lqQWB4
2            Piotr Kurek  0mneo6UHjcOtZBm1Tw8t67
3       Paul St. Hilaire  2z6qOkQVyn7h3XzUuNlRlM
4                      Ø  0dgJbQ0bKPyUXco8hEXN7X
5         Frost Children  6R1kfr0GIWnwxY4zW11Vag
6                    múm  4mw86zm4QZIL8SksdyE6OU
7                Cardi B  4kYSro6naA4h99UJvo89HB
8            Asher White  1TQhvHMVoECTNs3Xxo3RMv
9  Orcutt Shelley Miller  0m2UovS6OpaxRZ3e30plk1

Data types:
artist       object
artist_id    object
dtype: object

Any null values?
artist       1
artist_id    5
dtype: int64

=== CHECKING ID VALIDITY ===
Row 0: '2rDxtYUzTAYJJE3Bl3Z5IN' - Length: 22, Valid: True
Row 1: '0WCo84qtCKfbyIf1lqQWB4' - Length: 22, Valid: True
Row 2: '0mneo6UHjcOtZBm1Tw8t67' - Length: 22, Valid: True
Row 3: '2z6qOkQVyn7h3XzUuNlRlM' - Lengt

In [4]:
import pandas as pd
import re

artist_ids_df = pd.read_csv('artist_ids.csv')

# Deep inspection for hidden issues
print("=== DETAILED ID INSPECTION ===")
for idx in range(min(10, len(artist_ids_df))):
    artist_id = artist_ids_df.loc[idx, 'artist_id']
    artist_name = artist_ids_df.loc[idx, 'artist']

    # Get the raw string
    raw_id = repr(artist_id)
    clean_id = str(artist_id).strip()

    print(f"\nRow {idx}: {artist_name}")
    print(f"  Raw: {raw_id}")
    print(f"  Length: {len(clean_id)}")
    print(f"  Valid format: {bool(re.match(r'^[a-zA-Z0-9]{22}$', clean_id))}")

    # Check for hidden characters
    if len(clean_id) != len(str(artist_id)):
        print(f"  ⚠️ WHITESPACE DETECTED")

# Check if any IDs have issues
print("\n=== SCANNING ALL IDS ===")
invalid_count = 0
for idx, row in artist_ids_df.iterrows():
    artist_id = str(row['artist_id']).strip()
    if not re.match(r'^[a-zA-Z0-9]{22}$', artist_id):
        invalid_count += 1
        if invalid_count <= 5:  # Show first 5 invalid
            print(f"Invalid ID at row {idx}: '{artist_id}' (len={len(artist_id)})")

print(f"\nTotal invalid IDs: {invalid_count} out of {len(artist_ids_df)}")

# Test a specific ID with the API
print("\n=== API TEST ===")
import requests
test_id = artist_ids_df.loc[0, 'artist_id'].strip()
print(f"Testing ID: '{test_id}'")

response = requests.get(
    f"https://api.spotify.com/v1/artists/{test_id}",
    headers={"Authorization": f"Bearer {access_token}"}
)
print(f"Status: {response.status_code}")
if response.status_code != 200:
    print(f"Response: {response.text}")
else:
    print(f"Success! Followers: {response.json()['followers']['total']:,}")

=== DETAILED ID INSPECTION ===

Row 0: Jeff Tweedy
  Raw: '2rDxtYUzTAYJJE3Bl3Z5IN'
  Length: 22
  Valid format: True

Row 1: Geese
  Raw: '0WCo84qtCKfbyIf1lqQWB4'
  Length: 22
  Valid format: True

Row 2: Piotr Kurek
  Raw: '0mneo6UHjcOtZBm1Tw8t67'
  Length: 22
  Valid format: True

Row 3: Paul St. Hilaire
  Raw: '2z6qOkQVyn7h3XzUuNlRlM'
  Length: 22
  Valid format: True

Row 4: Ø
  Raw: '0dgJbQ0bKPyUXco8hEXN7X'
  Length: 22
  Valid format: True

Row 5: Frost Children
  Raw: '6R1kfr0GIWnwxY4zW11Vag'
  Length: 22
  Valid format: True

Row 6: múm
  Raw: '4mw86zm4QZIL8SksdyE6OU'
  Length: 22
  Valid format: True

Row 7: Cardi B
  Raw: '4kYSro6naA4h99UJvo89HB'
  Length: 22
  Valid format: True

Row 8: Asher White
  Raw: '1TQhvHMVoECTNs3Xxo3RMv'
  Length: 22
  Valid format: True

Row 9: Orcutt Shelley Miller
  Raw: '0m2UovS6OpaxRZ3e30plk1'
  Length: 22
  Valid format: True

=== SCANNING ALL IDS ===
Invalid ID at row 1024: 'nan' (len=3)
Invalid ID at row 2537: 'nan' (len=3)
Invalid ID at row

In [1]:
# File: spotify/get_artists_followers.ipynb
import os
import time
import random
import pandas as pd
from spotify_functions import get_artist_followers

# Resolve access token
access_token = os.environ.get("SPOTIFY_ACCESS_TOKEN")
if not access_token:
    try:
        import spotify_functions
        access_token = getattr(spotify_functions, "access_token", None)
    except ImportError:
        access_token = None

if not access_token:
    raise RuntimeError(
        "Spotify access token not found. Set environment variable `SPOTIFY_ACCESS_TOKEN` "
        "or define `access_token` in `spotify_functions.py`"
    )

# Load and validate CSV
artist_ids_df = pd.read_csv('artist_ids.csv')
if 'artist_id' not in artist_ids_df.columns:
    raise ValueError("CSV must contain an `artist_id` column")

# **FILTER OUT NaN/NULL VALUES**
original_count = len(artist_ids_df)
artist_ids_df = artist_ids_df.dropna(subset=['artist_id'])
artist_ids_df = artist_ids_df[artist_ids_df['artist_id'].astype(str).str.len() == 22]
cleaned_count = len(artist_ids_df)

print(f"Original artists: {original_count}")
print(f"After removing invalid IDs: {cleaned_count}")
print(f"Removed: {original_count - cleaned_count} invalid entries\n")

# Check if we have a progress file to resume from
progress_file = 'artist_followers_progress.csv'
if os.path.exists(progress_file):
    print(f"Found progress file. Resuming from where we left off...")
    existing_df = pd.read_csv(progress_file)
    processed_ids = set(existing_df['artist_id'].values)
    followers_data = existing_df.to_dict('records')
else:
    processed_ids = set()
    followers_data = []

total_artists = len(artist_ids_df)
skipped = 0
failed = 0

for idx, row in enumerate(artist_ids_df.itertuples(index=False), 1):
    artist_id = str(getattr(row, 'artist_id')).strip()

    # Skip if already processed
    if artist_id in processed_ids:
        skipped += 1
        continue

    followers_count = get_artist_followers(artist_id, access_token)
    followers_data.append({'artist_id': artist_id, 'followers_count': followers_count})

    if followers_count is not None:
        print(f"[{idx}/{total_artists}] {artist_id}: {followers_count:,} followers")
    else:
        print(f"[{idx}/{total_artists}] {artist_id}: FAILED")
        failed += 1

    # Sleep to avoid rate limiting
    time.sleep(random.uniform(0.5, 1.5))

    # Save progress every 100 artists
    if idx % 100 == 0:
        temp_df = pd.DataFrame(followers_data)
        temp_df.to_csv(progress_file, index=False)
        print(f"  → Progress saved ({len(followers_data)} artists, {failed} failed)\n")

# Save final results
followers_df = pd.DataFrame(followers_data)
followers_df.to_csv('artist_followers.csv', index=False)

# Clean up progress file
if os.path.exists(progress_file):
    os.remove(progress_file)

print(f"\n{'='*50}")
print(f"Complete! Processed {len(followers_df)} artists")
print(f"Skipped (already done): {skipped}")
print(f"Failed: {failed}")
print(f"Success rate: {((len(followers_df) - failed) / len(followers_df) * 100):.1f}%")

Original artists: 5184
After removing invalid IDs: 5179
Removed: 5 invalid entries

Found progress file. Resuming from where we left off...
[3165/5179] 2DxlS3lTLFIq70S7ap5H3y: 469,007 followers
[3166/5179] 7rAxGyWTTI799jQ63IMEer: 1,759 followers
[3167/5179] 3sl5LqGRzuwjQW2kQyqY9J: 15,235 followers
[3168/5179] 00NwpiTtYKYzE6yEhSbFIk: 25,892 followers
[3169/5179] 1sDWIdL18InXgES9TwvsL2: 78,469 followers
[3170/5179] 29LOCR81IrdEJjCAeCEOU3: 49,088 followers
[3172/5179] 4cUkGQyhLFqKHBtL58HYVp: 347,291 followers
[3173/5179] 7oJ1EabmX7ejrie3NBzn0p: 37,889 followers
[3174/5179] 3W5JSivrknDg2W86ByUpdk: 211 followers
[3175/5179] 2gHsHeJxm4bTuS94fKQgB5: 12,214 followers
[3176/5179] 6U9PwS3WsiKBVewf7wtwIZ: 24,315 followers
[3177/5179] 2Ui7OT9J7U6GJnh4XRv29A: 1,058 followers
[3178/5179] 4x1nvY2FN8jxqAFA0DA02H: 6,628,295 followers
[3179/5179] 0wQu6RGpwgoD20qNxb4vwj: 8,305 followers
[3180/5179] 0W498bDDNlJIrYMKXdpLHA: 1,114,933 followers
[3181/5179] 5yV1qdnmxyIYiSFB02wpDj: 635,459 followers
[3182/517