In [1]:
import pandas as pd
import numpy as np
import json
from IPython.display import display

## MY SPOTIFY STREAMING HISTORY OVER THE PAST YEAR (01/2023 - 01/2024)

In [2]:
# combine streaming history in past year 01/31/2023 - 01/31/2024 
stream_history = pd.DataFrame() 
for file in ['StreamingHistory_music_0.json', 'StreamingHistory_music_1.json', 'StreamingHistory_music_2.json']:
    df = pd.read_json(file, orient='columns')
    stream_history = pd.concat([stream_history, df], ignore_index=True)

stream_history['minPlayed'] = stream_history['msPlayed']/(60 * 1000)
stream_history['hourPlayed'] = stream_history['msPlayed']/3600000

In [3]:
stream_history

Unnamed: 0,endTime,artistName,trackName,msPlayed,minPlayed,hourPlayed
0,2023-01-31 00:07,Joe Hisaishi,One Summer Day,189800,3.163333,0.052722
1,2023-01-31 00:11,Paintamelody,Above the Treetops (Lith Harbor),241893,4.031550,0.067193
2,2023-01-31 00:13,Paintamelody,Cygnus Garden,149472,2.491200,0.041520
3,2023-01-31 00:15,Nemu ネム,Nausicaä Requiem,125869,2.097817,0.034964
4,2023-01-31 00:18,Arcade Player,"Ellinia Tree Dungeon (From ""MapleStory"")",141409,2.356817,0.039280
...,...,...,...,...,...,...
29952,2024-01-31 23:41,BIGBANG,BAE BAE,169429,2.823817,0.047064
29953,2024-01-31 23:45,BIGBANG,Blue,233098,3.884967,0.064749
29954,2024-01-31 23:49,BIGBANG,Bad Boy,236817,3.946950,0.065782
29955,2024-01-31 23:52,G-DRAGON,WHO YOU?,201428,3.357133,0.055952


In [4]:
plays = pd.DataFrame(stream_history[['trackName', 'artistName']].value_counts()).reset_index()
plays

Unnamed: 0,trackName,artistName,count
0,We Might Even Be Falling In Love (Duet) - Spot...,Victoria Monét,225
1,H.S.K.T. (feat. Wonstein),LeeHi,201
2,Hush - Still Woozy Remix,The Marías,176
3,telepatía,Kali Uchis,171
4,Addiction,Doja Cat,167
...,...,...,...
3714,Let Me,ZAYN,1
3715,Let Me Be With You,ROUND TABLE featuring Nino,1
3716,Let Me Calm Down (feat. J. Cole),Nicki Minaj,1
3717,Let Me Explain,Bryson Tiller,1


In [5]:
# aggregate data 
stream_sum = stream_history.groupby(['artistName', 'trackName']).sum().reset_index()
stream_sum = stream_sum.drop(columns=['endTime','msPlayed'])
stream_sum = stream_sum.merge(plays, on = ['trackName', 'artistName'])
stream_sum = stream_sum[(stream_sum['minPlayed'] > 0) & (stream_sum['count'] > 4)]

In [6]:
# remove songs that are not available on Spotify anymore 
stream_sum = stream_sum[~((stream_sum['artistName'] == 'ocean sixteen') | (stream_sum['artistName'] == 'Lisandra') | (stream_sum['artistName'] == 'extremely bad man'))]

# round and sort values 
stream_sum.minPlayed = stream_sum.minPlayed.round(2)
stream_sum.hourPlayed = stream_sum.hourPlayed.round(2)
stream_sum = stream_sum.sort_values(by = ['count', 'minPlayed'], ascending = False).reset_index(drop=True)

stream_sum

Unnamed: 0,artistName,trackName,minPlayed,hourPlayed,count
0,Victoria Monét,We Might Even Be Falling In Love (Duet) - Spot...,261.96,4.37,225
1,LeeHi,H.S.K.T. (feat. Wonstein),476.73,7.95,201
2,The Marías,Hush - Still Woozy Remix,359.78,6.00,176
3,Kali Uchis,telepatía,324.70,5.41,171
4,Doja Cat,Addiction,333.91,5.57,167
...,...,...,...,...,...
967,Syd,Fast Car,0.16,0.00,5
968,IU,Friday (feat.Jang Yi-jeong),0.14,0.00,5
969,Harry Styles,Daylight,0.13,0.00,5
970,NIKI,Every Summertime,0.10,0.00,5


In [7]:
stream_sum = stream_sum.rename(columns = {'minPlayed': 'totalminPlayed', 'hourPlayed': 'totalhourPlayed'})
stream_sum

Unnamed: 0,artistName,trackName,totalminPlayed,totalhourPlayed,count
0,Victoria Monét,We Might Even Be Falling In Love (Duet) - Spot...,261.96,4.37,225
1,LeeHi,H.S.K.T. (feat. Wonstein),476.73,7.95,201
2,The Marías,Hush - Still Woozy Remix,359.78,6.00,176
3,Kali Uchis,telepatía,324.70,5.41,171
4,Doja Cat,Addiction,333.91,5.57,167
...,...,...,...,...,...
967,Syd,Fast Car,0.16,0.00,5
968,IU,Friday (feat.Jang Yi-jeong),0.14,0.00,5
969,Harry Styles,Daylight,0.13,0.00,5
970,NIKI,Every Summertime,0.10,0.00,5


In [8]:
# 
track_and_artist = stream_sum.apply(lambda row: [row['trackName'], row['artistName']], axis=1)
type(track_and_artist)

pandas.core.series.Series

## FIRST ATTEMPT: WEBSCRAPING METRICS ABOUT SONGS IN MY LIBRARY

#### ISSUES: 
- Organize Your Music had only 584 out of 975 songs from my streaming history
- Updates my library based on my playlists, so if I removed a song I used to listen to from these playlists (from 01/2023 - 01/2024), they will not show up 

## SECOND ATTEMPT: SPOTIFY API CALLS

In [9]:
client_id = '2f6d1ff2547045228babc0a1530b615c'
client_secret = '743702a0239a4216b65652452b5d604f'
redirect_uri = 'http://127.0.0.1:5000/redirect'
scope = 'user-library-read'

### AUTHORIZATON + ACCESS TOKEN

In [10]:
import requests
from urllib.parse import urlencode, urlparse, parse_qs
from spotipy.oauth2 import SpotifyOAuth
import spotipy
import json
import time
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
import webbrowser
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# endpoint 
authorization_base_url = 'https://accounts.spotify.com/authorize'

scope = 'user-library-read'

oauth = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope
)

# get the authorization URL and automatically open it in a browser 
authorization_url = oauth.get_authorize_url()
#webbrowser.open(authorization_url)
driver = webdriver.Chrome()
driver.get(authorization_url)

username_input = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "login-username")))
password_input = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "login-password")))

username_input.send_keys("##########")
password_input.send_keys("##########")

login_button = driver.find_element(By.ID, "login-button")
login_button.click()
wait = WebDriverWait(driver, 10)
content_element = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "neterror")))

auth_code_url = driver.current_url
driver.quit()

parsed_url = urlparse(auth_code_url)
query_params = parse_qs(parsed_url.query)
if 'code' in query_params:
    code = query_params['code'][0]
else:
    print("Authorization code not found in the URL")

# exchange authorization code for access token and refresh token
token_info = oauth.get_cached_token()

In [11]:
# extract the access token and refresh token from the token_info
access_token = token_info['access_token']
refresh_token = token_info.get('refresh_token')

sp = spotipy.Spotify(auth=access_token)

# check if the access token has expired
if oauth.is_token_expired(token_info):
    print('Access token is expired.')

    new_token_info = oauth.refresh_access_token(refresh_token)
    new_access_token = new_token_info['access_token']

    sp = spotipy.Spotify(auth=new_access_token)

    # update the access token with the new access token
    access_token = new_access_token
    print('Access token is refreshed!')

In [12]:
# cannot find Songs for Women by Frank Ocean in API :( 
query = f"track:{'Songs for Women'} artist:{'Frank Ocean'}"
results = sp.search(q=query, type='track', limit=1)
results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3ASongs+for+Women+artist%3AFrank+Ocean&type=track&offset=0&limit=1',
  'items': [],
  'limit': 1,
  'next': None,
  'offset': 0,
  'previous': None,
  'total': 0}}

### ARTIST ID + TRACK ID COLLECTION

In [13]:
all_ids = []

def get_track_and_artist_ids(track_names, artist_names):
    """Gets track id, track name, artist id, artist name

    Parameters
    ----------
    track_names : list of str
        A list of track names 
    artist_names : list of str
        A list of artist names 
    
    Returns
    -------
    nested list of data for each track in track_names 
        [[track id, track name, artist id, artist name], [track id, track name, artist id, artist name], ...]
    """
    ids = []
    for i in range(len(track_names)):
        orig_track_name = track_names[i]
        orig_artist_name = artist_names[i]
        
        # BUG: spotify search api does not recognize apostrophes - need to delete them before searching for track ids
        track_name = track_names[i].replace("'", "")
        artist_name = artist_names[i].replace("'", "")
        
        query = f"track:{track_name} artist:{artist_name}"
        results = sp.search(q=query, type='track', limit=1)
        
        track_id = results['tracks']['items'][0]['id']
        artist_id = results['tracks']['items'][0]['artists'][0]['id']
        track_and_artist_id = [track_id, orig_track_name, artist_id, orig_artist_name]
        ids.append(track_and_artist_id)
        
    return ids

# delay between each batch (in seconds)
batch_delay = 2

# how many requests are being sent in each batch 
batch_size = 30

# split track_and_artist list into batches
for i in range(0, len(track_and_artist), batch_size):
    batch = track_and_artist[i:i+batch_size]
    track_names = [pair[0] for pair in batch]
    artist_names = [pair[1] for pair in batch]
    
    batch_track_ids = get_track_and_artist_ids(track_names, artist_names)
    all_ids.extend(batch_track_ids)

    # apply delay between batches
    time.sleep(batch_delay)  

In [14]:
all_ids[0:10]

[['0wOtc2nY3NOohp4xSwOyTN',
  'We Might Even Be Falling In Love (Duet) - Spotify Singles',
  '63XBtGSEZINSyXylZxEUbv',
  'Victoria Monét'],
 ['39382sUtIOwIXftX0i76do',
  'H.S.K.T. (feat. Wonstein)',
  '7cVZApDoQZpS447nHTsNqu',
  'LeeHi'],
 ['4dGuRldChjvboZktprNJFM',
  'Hush - Still Woozy Remix',
  '2sSGPbdZJkaSE2AbcGOACx',
  'The Marías'],
 ['6tDDoYIxWvMLTdKpjFkc1B',
  'telepatía',
  '1U1el3k54VvEUzo3ybLPlM',
  'Kali Uchis'],
 ['2OAcH9SD8ehxuG0tWNe0cU', 'Addiction', '5cj0lLjcoR7YOSnhnX0Po5', 'Doja Cat'],
 ['0wzCQjc8JRa39ej1TFkAFt',
  'Endlessly',
  '1U1el3k54VvEUzo3ybLPlM',
  'Kali Uchis'],
 ['0O3TAouZE4vL9dM5SyxgvH',
  'Fashion Killa',
  '13ubrt8QOOCPljQ2FL1Kca',
  'A$AP Rocky'],
 ['15EPc80XuFrb2LmOzGjuRg',
  'Crew (feat. Brent Faiyaz & Shy Glizzy)',
  '5XenQ7XfcvQdfIbpLEFaKQ',
  'GoldLink'],
 ['41SwdQIX8Hy2u6fuEDgvWr', '10%', '6qgnBH6iDM91ipVXv28OMu', 'KAYTRANADA'],
 ['2p37Mfy2PWajgOS3i2aaep',
  'U Say (feat. Tyler, The Creator & Jay Prince)',
  '5XenQ7XfcvQdfIbpLEFaKQ',
  'GoldLink'

In [15]:
track_ids = [track_id[0] for track_id in all_ids]

### TRACK FEATURES COLLECTION

In [16]:
# set batch size and delay
batch_size = 20
delay = 2

all_features = []

# get track features for a list of track IDs
def get_track_features(track_ids):
    """Gets audio features for each track ID 

    Parameters
    ----------
    track_ids : list of str
        A list of track ids (each ID corresponds to one unique song)  
    
    Returns
    -------
    list of dictionaries (each dictionary corresponds to one track) 
    where the key is the track feature and the value is the value for that feature 
        [{'danceability': #,
          'energy': #,
          'key': #,
          'loudness': #,
          'mode': #,
          'speechiness': #,
          'acousticness': #,
          'instrumentalness': #,
          'liveness': #,
          'valence': #,
          'tempo': #,
          'type': 'audio_features',
          'id': '...',
          'uri': '...',
          'track_href': '...',
          'analysis_url': '...',
          'duration_ms': #,
          'time_signature': #}, 
          ...]
    """
    for i in range(0, len(track_ids), batch_size):
        batch_track_ids = track_ids[i:i+batch_size]
        
        # track features for the current batch
        track_features = sp.audio_features(batch_track_ids)
        all_features.extend(track_features)

        # delay between batches
        if i + batch_size < len(track_ids):
            time.sleep(delay)

    return all_features

In [17]:
x = get_track_features(track_ids[0:10])

In [18]:
x[0]

{'danceability': 0.731,
 'energy': 0.423,
 'key': 6,
 'loudness': -10.147,
 'mode': 1,
 'speechiness': 0.0784,
 'acousticness': 0.473,
 'instrumentalness': 0.000413,
 'liveness': 0.129,
 'valence': 0.78,
 'tempo': 76.964,
 'type': 'audio_features',
 'id': '0wOtc2nY3NOohp4xSwOyTN',
 'uri': 'spotify:track:0wOtc2nY3NOohp4xSwOyTN',
 'track_href': 'https://api.spotify.com/v1/tracks/0wOtc2nY3NOohp4xSwOyTN',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0wOtc2nY3NOohp4xSwOyTN',
 'duration_ms': 90111,
 'time_signature': 4}

In [19]:
features = pd.DataFrame(get_track_features(track_ids)) 

In [20]:
features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.731,0.423,6,-10.147,1,0.0784,0.47300,0.000413,0.1290,0.780,76.964,audio_features,0wOtc2nY3NOohp4xSwOyTN,spotify:track:0wOtc2nY3NOohp4xSwOyTN,https://api.spotify.com/v1/tracks/0wOtc2nY3NOo...,https://api.spotify.com/v1/audio-analysis/0wOt...,90111,4
1,0.756,0.672,9,-5.711,0,0.0373,0.25200,0.000341,0.1170,0.327,113.975,audio_features,39382sUtIOwIXftX0i76do,spotify:track:39382sUtIOwIXftX0i76do,https://api.spotify.com/v1/tracks/39382sUtIOwI...,https://api.spotify.com/v1/audio-analysis/3938...,204453,4
2,0.708,0.554,10,-6.830,0,0.0311,0.40300,0.042800,0.1260,0.564,103.999,audio_features,4dGuRldChjvboZktprNJFM,spotify:track:4dGuRldChjvboZktprNJFM,https://api.spotify.com/v1/tracks/4dGuRldChjvb...,https://api.spotify.com/v1/audio-analysis/4dGu...,182347,4
3,0.653,0.524,11,-9.016,0,0.0502,0.11200,0.000000,0.2030,0.553,83.970,audio_features,6tDDoYIxWvMLTdKpjFkc1B,spotify:track:6tDDoYIxWvMLTdKpjFkc1B,https://api.spotify.com/v1/tracks/6tDDoYIxWvML...,https://api.spotify.com/v1/audio-analysis/6tDD...,160191,4
4,0.775,0.708,7,-6.073,1,0.1660,0.00993,0.000537,0.1280,0.548,90.005,audio_features,2OAcH9SD8ehxuG0tWNe0cU,spotify:track:2OAcH9SD8ehxuG0tWNe0cU,https://api.spotify.com/v1/tracks/2OAcH9SD8ehx...,https://api.spotify.com/v1/audio-analysis/2OAc...,208480,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,0.822,0.480,11,-7.175,0,0.0475,0.06900,0.004860,0.3220,0.577,111.959,audio_features,0NBTjVJKPWkfsJ7PkUU66K,spotify:track:0NBTjVJKPWkfsJ7PkUU66K,https://api.spotify.com/v1/tracks/0NBTjVJKPWkf...,https://api.spotify.com/v1/audio-analysis/0NBT...,210000,4
978,0.681,0.525,9,-4.823,1,0.0512,0.61900,0.000000,0.1030,0.551,80.052,audio_features,0GsRx0gPft6RmijIwMsKmG,spotify:track:0GsRx0gPft6RmijIwMsKmG,https://api.spotify.com/v1/tracks/0GsRx0gPft6R...,https://api.spotify.com/v1/audio-analysis/0GsR...,217130,4
979,0.686,0.445,0,-7.189,1,0.0398,0.48400,0.001440,0.1750,0.626,145.500,audio_features,51Zw1cKDgkad0CXv23HCMU,spotify:track:51Zw1cKDgkad0CXv23HCMU,https://api.spotify.com/v1/tracks/51Zw1cKDgkad...,https://api.spotify.com/v1/audio-analysis/51Zw...,164533,3
980,0.628,0.676,6,-4.227,1,0.0424,0.39300,0.000069,0.0978,0.723,78.996,audio_features,68HocO7fx9z0MgDU0ZPHro,spotify:track:68HocO7fx9z0MgDU0ZPHro,https://api.spotify.com/v1/tracks/68HocO7fx9z0...,https://api.spotify.com/v1/audio-analysis/68Ho...,215687,4


In [21]:
id_df = pd.DataFrame(all_ids, columns = ['id', 'trackName', 'artistid', 'artistName'])

In [22]:
id_df

Unnamed: 0,id,trackName,artistid,artistName
0,0wOtc2nY3NOohp4xSwOyTN,We Might Even Be Falling In Love (Duet) - Spot...,63XBtGSEZINSyXylZxEUbv,Victoria Monét
1,39382sUtIOwIXftX0i76do,H.S.K.T. (feat. Wonstein),7cVZApDoQZpS447nHTsNqu,LeeHi
2,4dGuRldChjvboZktprNJFM,Hush - Still Woozy Remix,2sSGPbdZJkaSE2AbcGOACx,The Marías
3,6tDDoYIxWvMLTdKpjFkc1B,telepatía,1U1el3k54VvEUzo3ybLPlM,Kali Uchis
4,2OAcH9SD8ehxuG0tWNe0cU,Addiction,5cj0lLjcoR7YOSnhnX0Po5,Doja Cat
...,...,...,...,...
967,0NBTjVJKPWkfsJ7PkUU66K,Fast Car,3jk39CGeaaSO3FPKNx1RUx,Syd
968,0GsRx0gPft6RmijIwMsKmG,Friday (feat.Jang Yi-jeong),3HqSLMAZ3g3d5poNaI7GOU,IU
969,51Zw1cKDgkad0CXv23HCMU,Daylight,6KImCVD70vtIoJWnq6nGn3,Harry Styles
970,68HocO7fx9z0MgDU0ZPHro,Every Summertime,2kxP07DLgs4xlWz8YHlvfh,NIKI


In [23]:
features_and_track = features.merge(id_df, on = ['id']).drop_duplicates().merge(stream_sum, on = ['artistName', 'trackName'])
features_and_track

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,track_href,analysis_url,duration_ms,time_signature,trackName,artistid,artistName,totalminPlayed,totalhourPlayed,count
0,0.731,0.423,6,-10.147,1,0.0784,0.47300,0.000413,0.1290,0.780,...,https://api.spotify.com/v1/tracks/0wOtc2nY3NOo...,https://api.spotify.com/v1/audio-analysis/0wOt...,90111,4,We Might Even Be Falling In Love (Duet) - Spot...,63XBtGSEZINSyXylZxEUbv,Victoria Monét,261.96,4.37,225
1,0.756,0.672,9,-5.711,0,0.0373,0.25200,0.000341,0.1170,0.327,...,https://api.spotify.com/v1/tracks/39382sUtIOwI...,https://api.spotify.com/v1/audio-analysis/3938...,204453,4,H.S.K.T. (feat. Wonstein),7cVZApDoQZpS447nHTsNqu,LeeHi,476.73,7.95,201
2,0.708,0.554,10,-6.830,0,0.0311,0.40300,0.042800,0.1260,0.564,...,https://api.spotify.com/v1/tracks/4dGuRldChjvb...,https://api.spotify.com/v1/audio-analysis/4dGu...,182347,4,Hush - Still Woozy Remix,2sSGPbdZJkaSE2AbcGOACx,The Marías,359.78,6.00,176
3,0.653,0.524,11,-9.016,0,0.0502,0.11200,0.000000,0.2030,0.553,...,https://api.spotify.com/v1/tracks/6tDDoYIxWvML...,https://api.spotify.com/v1/audio-analysis/6tDD...,160191,4,telepatía,1U1el3k54VvEUzo3ybLPlM,Kali Uchis,324.70,5.41,171
4,0.775,0.708,7,-6.073,1,0.1660,0.00993,0.000537,0.1280,0.548,...,https://api.spotify.com/v1/tracks/2OAcH9SD8ehx...,https://api.spotify.com/v1/audio-analysis/2OAc...,208480,4,Addiction,5cj0lLjcoR7YOSnhnX0Po5,Doja Cat,333.91,5.57,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,0.822,0.480,11,-7.175,0,0.0475,0.06900,0.004860,0.3220,0.577,...,https://api.spotify.com/v1/tracks/0NBTjVJKPWkf...,https://api.spotify.com/v1/audio-analysis/0NBT...,210000,4,Fast Car,3jk39CGeaaSO3FPKNx1RUx,Syd,0.16,0.00,5
968,0.681,0.525,9,-4.823,1,0.0512,0.61900,0.000000,0.1030,0.551,...,https://api.spotify.com/v1/tracks/0GsRx0gPft6R...,https://api.spotify.com/v1/audio-analysis/0GsR...,217130,4,Friday (feat.Jang Yi-jeong),3HqSLMAZ3g3d5poNaI7GOU,IU,0.14,0.00,5
969,0.686,0.445,0,-7.189,1,0.0398,0.48400,0.001440,0.1750,0.626,...,https://api.spotify.com/v1/tracks/51Zw1cKDgkad...,https://api.spotify.com/v1/audio-analysis/51Zw...,164533,3,Daylight,6KImCVD70vtIoJWnq6nGn3,Harry Styles,0.13,0.00,5
970,0.628,0.676,6,-4.227,1,0.0424,0.39300,0.000069,0.0978,0.723,...,https://api.spotify.com/v1/tracks/68HocO7fx9z0...,https://api.spotify.com/v1/audio-analysis/68Ho...,215687,4,Every Summertime,2kxP07DLgs4xlWz8YHlvfh,NIKI,0.10,0.00,5


### ARTIST GENRE COLLECTION

In [24]:
# test get artist genre 
artist_ids = [artist_id[2] for artist_id in all_ids]
artist_ids[0:10]

['63XBtGSEZINSyXylZxEUbv',
 '7cVZApDoQZpS447nHTsNqu',
 '2sSGPbdZJkaSE2AbcGOACx',
 '1U1el3k54VvEUzo3ybLPlM',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '1U1el3k54VvEUzo3ybLPlM',
 '13ubrt8QOOCPljQ2FL1Kca',
 '5XenQ7XfcvQdfIbpLEFaKQ',
 '6qgnBH6iDM91ipVXv28OMu',
 '5XenQ7XfcvQdfIbpLEFaKQ']

In [25]:
all_artist_genres = []

# nested list of [artist ID (str), artist name (str), [artist genres (str)]]
def get_artist_genres(artist_id_list):
    """Gets genres associated with each artist 

    Parameters
    ----------
    artist_id_list : list of str
        A list of artist ids (each ID corresponds to one artist)  
    
    Returns
    -------
    nested list of data for each artist in artist_id_list 
        [[artist ID, artist name, [list of artist genres]], ...]
    """
    artist_genres = [] 
    for artist in artist_id_list: 
        # track features for the current ID in batch 
        artist_info = sp.artist(artist)
        name = artist_info['name']
        genres = artist_info['genres']
        artist_genres.append([artist, name, genres])
    return artist_genres 

batch_size = 20
delay = 2 

for i in range(0, len(artist_ids), batch_size):
    # list of batch_size artist IDs
    batch = artist_ids[i:i+batch_size]
    
    batch_artist_genres = get_artist_genres(batch)
    all_artist_genres.extend(batch_artist_genres)
    
    # delay between batches
    if i + batch_size < len(track_ids):
        time.sleep(delay)

In [26]:
all_artist_genres[0:10]

[['63XBtGSEZINSyXylZxEUbv', 'Victoria Monét', ['alternative r&b', 'r&b']],
 ['7cVZApDoQZpS447nHTsNqu', 'LeeHi', ['k-pop']],
 ['2sSGPbdZJkaSE2AbcGOACx', 'The Marías', ['bedroom pop', 'la indie']],
 ['1U1el3k54VvEUzo3ybLPlM', 'Kali Uchis', ['colombian pop']],
 ['5cj0lLjcoR7YOSnhnX0Po5', 'Doja Cat', ['dance pop', 'pop']],
 ['1U1el3k54VvEUzo3ybLPlM', 'Kali Uchis', ['colombian pop']],
 ['13ubrt8QOOCPljQ2FL1Kca',
  'A$AP Rocky',
  ['east coast hip hop', 'hip hop', 'rap', 'trap']],
 ['5XenQ7XfcvQdfIbpLEFaKQ', 'GoldLink', ['alternative r&b']],
 ['6qgnBH6iDM91ipVXv28OMu',
  'KAYTRANADA',
  ['alternative r&b', 'escape room', 'indie soul', 'lgbtq+ hip hop']],
 ['5XenQ7XfcvQdfIbpLEFaKQ', 'GoldLink', ['alternative r&b']]]

In [27]:
genres = pd.DataFrame(all_artist_genres, columns = ['artistid', 'artistName', 'artistGenres'])

In [28]:
unique_genres = genres.drop_duplicates(subset = ['artistid', 'artistName'])

In [37]:
unique_genres

Unnamed: 0,artistid,artistName,artistGenres
0,63XBtGSEZINSyXylZxEUbv,Victoria Monét,"[alternative r&b, r&b]"
1,7cVZApDoQZpS447nHTsNqu,LeeHi,[k-pop]
2,2sSGPbdZJkaSE2AbcGOACx,The Marías,"[bedroom pop, la indie]"
3,1U1el3k54VvEUzo3ybLPlM,Kali Uchis,[colombian pop]
4,5cj0lLjcoR7YOSnhnX0Po5,Doja Cat,"[dance pop, pop]"
...,...,...,...
934,5Q0U6ogBrMX2oxmxy5OTzU,SISTAR19,"[k-pop, k-pop girl group]"
937,5T0MSzX9RC5NA6gAI6irSn,Estelle,"[neo soul, r&b]"
953,25b5QFnCedG5cvOrX3dOiN,CyYu,[otacore]
956,0TImkz4nPqjegtVSMZnMRq,TLC,"[atl hip hop, contemporary r&b, dance pop, gir..."


In [30]:
def general_artist_genres(df_column):
    """Gets artist genres and simplifies them into major genres

    Parameters
    ----------
    df_column : Series containing list of artist genres / dataframe column
        Each index is an of artist genres
    
    Returns
    -------
    nested list of general genres for each artist  
        [['genre1 for artist1', 'genre2 for artist1'], ['genre1 for artist2', 'genre2 for artist2'], ...]
    """
    general_genres = ['pop', 'jazz', 'rock', 'hip hop', 'house', 'r&b', 'edm', 'rap', 'indie', 'soul', 'trap', 'k-pop', 'korean']
    
    new_list = []
    for artist_genre in df_column:
        new_genres = []
        if artist_genre == []:
            new_genres = ['not available']
        else:
            for general in general_genres:
                if general in artist_genre and general != artist_genre:
                    new_genres.append(general)
        if new_genres:
            new_list.append(new_genres)
        else:
            new_genres = ['other']
            new_list.append(new_genres)

    return new_list

In [38]:
# add general genres to unique_genres dataframe as a new column
unique_genres['artistgeneralGenres'] = general_artist_genres(unique_genres['artistGenres'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_genres['artistgeneralGenres'] = general_artist_genres(unique_genres['artistGenres'])


In [39]:
feat_track_art = features_and_track.merge(unique_genres[['artistid', 'artistGenres', 'artistgeneralGenres']], on= 'artistid')
feat_track_art

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,duration_ms,time_signature,trackName,artistid,artistName,totalminPlayed,totalhourPlayed,count,artistGenres,artistgeneralGenres
0,0.731,0.423,6,-10.147,1,0.0784,0.47300,0.000413,0.1290,0.780,...,90111,4,We Might Even Be Falling In Love (Duet) - Spot...,63XBtGSEZINSyXylZxEUbv,Victoria Monét,261.96,4.37,225,"[alternative r&b, r&b]",[r&b]
1,0.803,0.385,2,-5.837,0,0.0664,0.22400,0.000001,0.1810,0.509,...,187059,4,Smoke (feat. Lucky Daye),63XBtGSEZINSyXylZxEUbv,Victoria Monét,90.87,1.51,43,"[alternative r&b, r&b]",[r&b]
2,0.723,0.628,6,-7.410,1,0.0553,0.00902,0.001310,0.0812,0.364,...,211115,4,Jaguar,63XBtGSEZINSyXylZxEUbv,Victoria Monét,81.84,1.36,34,"[alternative r&b, r&b]",[r&b]
3,0.768,0.506,6,-8.913,1,0.1650,0.41500,0.037600,0.1330,0.862,...,51914,4,We Might Even Be Falling In Love (Interlude),63XBtGSEZINSyXylZxEUbv,Victoria Monét,13.54,0.23,25,"[alternative r&b, r&b]",[r&b]
4,0.756,0.672,9,-5.711,0,0.0373,0.25200,0.000341,0.1170,0.327,...,204453,4,H.S.K.T. (feat. Wonstein),7cVZApDoQZpS447nHTsNqu,LeeHi,476.73,7.95,201,[k-pop],[k-pop]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,0.751,0.767,7,-4.752,0,0.0577,0.35300,0.000003,0.2890,0.768,...,196702,4,Ma Boy,5Q0U6ogBrMX2oxmxy5OTzU,SISTAR19,3.67,0.06,5,"[k-pop, k-pop girl group]",[k-pop]
968,0.693,0.729,0,-2.990,1,0.3780,0.17100,0.000000,0.0700,0.521,...,284733,4,American Boy,5T0MSzX9RC5NA6gAI6irSn,Estelle,3.05,0.05,5,"[neo soul, r&b]",[r&b]
969,0.530,0.949,6,-7.932,0,0.0482,0.00244,0.008870,0.1560,0.521,...,252152,4,"Sprint - Shissou (From ""Ouran High School Host...",25b5QFnCedG5cvOrX3dOiN,CyYu,0.60,0.01,5,[otacore],[other]
970,0.679,0.597,11,-8.601,0,0.0439,0.09130,0.000673,0.0487,0.900,...,315040,4,Baby-Baby-Baby,0TImkz4nPqjegtVSMZnMRq,TLC,0.47,0.01,5,"[atl hip hop, contemporary r&b, dance pop, gir...",[r&b]


### ADD INDIVIDUAL STREAMS AS ROWS 

In [40]:
end_times = stream_history[['artistName', 'trackName', 'endTime', 'minPlayed', 'hourPlayed']]
end_times 

Unnamed: 0,artistName,trackName,endTime,minPlayed,hourPlayed
0,Joe Hisaishi,One Summer Day,2023-01-31 00:07,3.163333,0.052722
1,Paintamelody,Above the Treetops (Lith Harbor),2023-01-31 00:11,4.031550,0.067193
2,Paintamelody,Cygnus Garden,2023-01-31 00:13,2.491200,0.041520
3,Nemu ネム,Nausicaä Requiem,2023-01-31 00:15,2.097817,0.034964
4,Arcade Player,"Ellinia Tree Dungeon (From ""MapleStory"")",2023-01-31 00:18,2.356817,0.039280
...,...,...,...,...,...
29952,BIGBANG,BAE BAE,2024-01-31 23:41,2.823817,0.047064
29953,BIGBANG,Blue,2024-01-31 23:45,3.884967,0.064749
29954,BIGBANG,Bad Boy,2024-01-31 23:49,3.946950,0.065782
29955,G-DRAGON,WHO YOU?,2024-01-31 23:52,3.357133,0.055952


In [41]:
final = feat_track_art.merge(end_times, on = ['artistName', 'trackName'])

In [42]:
final['endTime'] = pd.to_datetime(final['endTime'])
final['year'] = final['endTime'].dt.year
final['monthNum'] = final['endTime'].dt.month
final['month'] = final['endTime'].dt.strftime('%B')
final['day'] = final['endTime'].dt.day
final['time'] = final['endTime'].dt.strftime('%H:%M')

In [43]:
final

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artistGenres,artistgeneralGenres,endTime,minPlayed,hourPlayed,year,monthNum,month,day,time
0,0.731,0.423,6,-10.147,1,0.0784,0.473,0.000413,0.1290,0.780,...,"[alternative r&b, r&b]",[r&b],2023-02-14 20:24:00,1.501833,0.025031,2023,2,February,14,20:24
1,0.731,0.423,6,-10.147,1,0.0784,0.473,0.000413,0.1290,0.780,...,"[alternative r&b, r&b]",[r&b],2023-02-14 20:30:00,1.501833,0.025031,2023,2,February,14,20:30
2,0.731,0.423,6,-10.147,1,0.0784,0.473,0.000413,0.1290,0.780,...,"[alternative r&b, r&b]",[r&b],2023-02-14 22:00:00,0.003167,0.000053,2023,2,February,14,22:00
3,0.731,0.423,6,-10.147,1,0.0784,0.473,0.000413,0.1290,0.780,...,"[alternative r&b, r&b]",[r&b],2023-02-14 22:01:00,1.501833,0.025031,2023,2,February,14,22:01
4,0.731,0.423,6,-10.147,1,0.0784,0.473,0.000413,0.1290,0.780,...,"[alternative r&b, r&b]",[r&b],2023-02-14 22:24:00,0.436333,0.007272,2023,2,February,14,22:24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25897,0.628,0.676,6,-4.227,1,0.0424,0.393,0.000069,0.0978,0.723,...,[indonesian r&b],[other],2023-03-08 22:53:00,0.006000,0.000100,2023,3,March,8,22:53
25898,0.628,0.676,6,-4.227,1,0.0424,0.393,0.000069,0.0978,0.723,...,[indonesian r&b],[other],2023-04-18 22:32:00,0.008833,0.000147,2023,4,April,18,22:32
25899,0.628,0.676,6,-4.227,1,0.0424,0.393,0.000069,0.0978,0.723,...,[indonesian r&b],[other],2023-04-28 02:42:00,0.053250,0.000888,2023,4,April,28,02:42
25900,0.628,0.676,6,-4.227,1,0.0424,0.393,0.000069,0.0978,0.723,...,[indonesian r&b],[other],2023-05-05 21:28:00,0.015833,0.000264,2023,5,May,5,21:28


In [44]:
final.to_csv('spotify_streaming_history.csv')