In [3]:
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.exceptions import SpotifyException
import datetime
import csv
import time
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the Spotify API credentials from the local JSON file.
with open('spot.json', 'r') as info_file:
    info = json.load(info_file)
client_id = info['client_id'] 
client_secret = info['client_secret']

# Build the client-credentials manager from the client ID and client secret.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# Create the Spotify client object (`sp`) that the later cells call.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
# Load the artist list; the lookup loop further down reads its `name` column.
df = pd.read_csv('artist_genders_more_women.csv')
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 860 entries, 0 to 859
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    860 non-null    object
 1   gender  860 non-null    object
dtypes: object(2)
memory usage: 13.6+ KB


In [9]:
def get_artist_info(artist_name):
    """Look up an artist by name and return a flat metadata record.

    Runs a Spotify artist search limited to a single result through the
    notebook-level `sp` client and keeps only the first hit.

    Args:
        artist_name: Free-text name used as the search query.

    Returns:
        dict with the keys 'artist_name', 'artist_id', 'genres',
        'popularity' and 'followers'. When the search returns no items,
        'artist_name' echoes the query, 'genres' is an empty list and the
        remaining values are None.
    """
    matches = sp.search(q=artist_name, type='artist', limit=1)['artists']['items']

    # Guard clause: nothing found, so hand back a placeholder with the same keys.
    if not matches:
        return {
            'artist_name': artist_name,
            'artist_id': None,
            'genres': [],
            'popularity': None,
            'followers': None
        }

    top_hit = matches[0]
    record = {
        'artist_name': top_hit['name'],
        'artist_id': top_hit['id'],
        'genres': top_hit['genres'],
        'popularity': top_hit['popularity'],
        'followers': top_hit['followers']['total']
    }
    return record

# Query Spotify once per artist name and gather one record per input row.
artist_info_list = []
for name in df['name']:
    try:
        info = get_artist_info(name)
    except Exception as e:
        # A failed lookup still contributes a placeholder row so the output
        # keeps one record per input name.
        print(f"Error with {name}: {e}")
        artist_info_list.append({
            'artist_name': name,
            'artist_id': None,
            'genres': [],
            'popularity': None,
            'followers': None,
        })
    else:
        artist_info_list.append(info)
        time.sleep(0.2)  # Rate limiting

# Build the frame in one go from the accumulated records.
artist_df = pd.DataFrame(artist_info_list)


In [10]:
# Persist the collected artist metadata; the DataFrame index is not written.
artist_df.to_csv('artists_with_info_new.csv', index=False)

In [11]:
# Preview the first rows of the artist metadata.
artist_df.head()

Unnamed: 0,artist_name,artist_id,genres,popularity,followers
0,Taylor Swift,06HL4z0CvFAxyc27GXpf02,[],98,136112787
1,Chris Stapleton,4YLtscXsxbVgi031ovDDdh,[country],84,8339679
2,Juice WRLD,4MCBfE4596Uoi2O4DtmEMz,"[melodic rap, emo rap]",89,39820404
3,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,[rage rap],94,14064414
4,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,[],97,103633316


In [None]:
# Reload artist metadata from disk, replacing the in-memory `artist_df`.
# NOTE(review): this reads 'artists_with_info.csv', whereas the save cell
# writes 'artists_with_info_new.csv' — confirm which file is intended.
artist_df = pd.read_csv('artists_with_info.csv')

In [None]:
def _retry_after_seconds(exc, default):
    """Return the wait (in seconds) requested by a rate-limit error.

    Falls back to `default` when the exception carries no headers, has no
    ``Retry-After`` entry, or the entry is not an integer.
    """
    headers = getattr(exc, "headers", None) or {}
    try:
        return int(headers.get("Retry-After", default))
    except (TypeError, ValueError):
        return default


def _call_with_rate_limit_retry(request, default_wait, max_retries, label):
    """Run `request()` and retry it while Spotify answers with HTTP 429.

    Any other SpotifyException, or a 429 once `max_retries` retries have been
    used, is re-raised to the caller. `label` prefixes the progress message.
    """
    retries_used = 0
    while True:
        try:
            return request()
        except SpotifyException as se:
            if se.http_status != 429 or retries_used >= max_retries:
                raise
            retries_used += 1
            retry_after = _retry_after_seconds(se, default_wait)
            print(f"{label} Sleeping for {retry_after} seconds...")
            # Small safety margin on top of the server-suggested wait.
            time.sleep(retry_after + 2)


def _append_artist_albums(sp, all_albums, position, artist_name, artist_id,
                          genre, limit, max_retries):
    """Fetch one artist's albums and append one record per album to `all_albums`."""
    results = _call_with_rate_limit_retry(
        lambda: sp.artist_albums(artist_id=artist_id, album_type='album', limit=limit),
        default_wait=60,
        max_retries=max_retries,
        label=f"[{position}] RATE LIMIT:",
    )
    time.sleep(0.5)  # rate control

    for album in results['items']:
        album_id = album['id']
        album_name = album['name']
        release_date = album['release_date']
        total_tracks = album['total_tracks']
        image_url = album['images'][0]['url'] if album['images'] else None

        # Popularity is read from the full album object, which costs one
        # extra request per album.
        try:
            album_info = _call_with_rate_limit_retry(
                lambda: sp.album(album_id),
                default_wait=10,
                max_retries=max_retries,
                label="    Rate limited.",
            )
            popularity = album_info.get('popularity', None)
            time.sleep(0.5)
        except SpotifyException as se:
            # Includes a 429 that survived every retry: keep the album and
            # simply leave its popularity empty.
            print(f"    Spotify exception on popularity for {album_name}: {se}")
            popularity = None
        except Exception as e:
            print(f"    Error getting popularity for {album_name}: {e}")
            popularity = None

        all_albums.append({
            'artist_name': artist_name,
            'artist_id': artist_id,
            'genre': genre,
            'album_id': album_id,
            'album_name': album_name,
            'release_date': release_date,
            'total_tracks': total_tracks,
            'popularity': popularity,
            'image_url': image_url
        })


def get_top_albums_by_artist(sp, artist_df, limit=5, max_retries=3):
    """Collect up to `limit` albums (with popularity) for every artist.

    Rate-limited requests (HTTP 429) are retried after the wait suggested by
    the response instead of dropping the artist, and a missing `Retry-After`
    header no longer raises.

    Args:
        sp: Client exposing `artist_albums(artist_id=..., album_type=...,
            limit=...)` and `album(album_id)`.
        artist_df: DataFrame with 'artist_name', 'artist_id' and 'genres'
            columns. Rows whose 'artist_id' is missing are skipped.
        limit: Maximum number of albums requested per artist.
        max_retries: How many times a single rate-limited request is retried
            before it is treated as a failure.

    Returns:
        DataFrame with one row per album and the columns 'artist_name',
        'artist_id', 'genre', 'album_id', 'album_name', 'release_date',
        'total_tracks', 'popularity' and 'image_url'.

    Side effects:
        Prints progress every 25 artists and writes the rows collected so far
        to "partial_albums.csv" every 100 artists.
    """
    all_albums = []

    for i, (_, row) in enumerate(artist_df.iterrows()):
        artist_name = row['artist_name']
        artist_id = row['artist_id']
        genres = row['genres']
        # Only real lists are joined; any other value (for example a string
        # loaded from a CSV file) is passed through unchanged.
        genre = ', '.join(genres) if isinstance(genres, list) else genres

        if not pd.isna(artist_id):
            try:
                _append_artist_albums(
                    sp, all_albums, i, artist_name, artist_id, genre, limit, max_retries
                )
            except SpotifyException as se:
                print(f"[{i}] Spotify error for {artist_name}: {se}")
            except Exception as e:
                print(f"[{i}] General error for {artist_name}: {e}")

        # Progress and checkpointing run for every row, including skipped or
        # failed artists, so a checkpoint is never missed.
        if i % 25 == 0:
            print(f"Processed {i} / {len(artist_df)} artists...")
        if i % 100 == 0:
            pd.DataFrame(all_albums).to_csv("partial_albums.csv", index=False)

    return pd.DataFrame(all_albums)

# Collect albums for every artist in `artist_df` using the `sp` client.
albums_df = get_top_albums_by_artist(sp, artist_df)
# Saving the full result is currently disabled:
# albums_df.to_csv("top_5_albums_per_artist.csv", index=False)




KeyboardInterrupt: 

In [None]:
# Display the collected albums.
# NOTE(review): `albums_df` is only assigned when the collection call returns;
# the recorded output of that cell is a KeyboardInterrupt — confirm it exists.
albums_df

In [None]:
# Load the checkpoint file written by get_top_albums_by_artist and a second
# album file (presumably the artists missing from the checkpoint — TODO confirm).
df1 = pd.read_csv("partial_albums.csv")
df2 = pd.read_csv("few_artist.csv")

# Stack the two frames and renumber the index from 0.
combined_df = pd.concat([df1, df2], ignore_index=True)

# NOTE(review): de-duplication is disabled, so an album present in both files
# appears twice in the output:
# combined_df.drop_duplicates(subset=["album_id"], inplace=True)

# Save the merged result without the index column.
combined_df.to_csv("albums_final.csv", index=False)