In [3]:
import spotipy
import base64
import requests
from spotipy.oauth2 import SpotifyClientCredentials
from creds import CLIENT_ID, CLIENT_SECRET
import pandas as pd
import time


In [4]:
# token
auth_manager = SpotifyClientCredentials(CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)


In [5]:
kaggle_df = pd.read_csv('dataset.csv')

kaggle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [6]:
#Get spotify playlist ids and names
def get_all_playlists(user_id='spotify'): #api call
    playlist_ids = []
    playlist_names = []
    playlists = sp.user_playlists(user_id) #api call
    #my_playlists = sp.user_playlist('z8ctw1rqti6nguka70zk1ishf')

    while playlists: # example provided in spotipy documentation
        for i, playlist in enumerate(playlists['items']):
            print("%4d %s %s" % (i + 1 + playlists['offset'], playlist['uri'],  playlist['name']))
            playlist_ids.append(playlist['uri'][-22:])
            playlist_names.append(playlist['name'])
        if playlists['next']:
            playlists = sp.next(playlists)
            
        else:
            playlists = None
    return playlist_ids, playlist_names

todays_top_hits = '37i9dQZF1DXcBWIGoYBM5M'



In [7]:
# Get track info from playlists into a dataframe - separated out API calls

def get_playlist_tracks(playlist_id):#api call
    try: 
        tracks = sp.playlist_tracks(playlist_id, limit = 100, fields='items(track(id, name, artists, album(id, name)))')
        return tracks
    
    except Exception as e:
        print(f"get_playlist_tracks error fetching {e}")
        return []
    
def get_track_details(track_id):
    try:
        track_info = sp.track(track_id) if track_id else None
        return track_info['popularity']
    except Exception as e:
        print(f"Error fetching track {e}")
        return None
    
def get_audio_features(track_id): #api call
    try:
        audio_stuffs = sp.audio_features(track_id)[0] if track_id else None 
        return audio_stuffs
    except Exception as e:
        print(f"get_audio_features error fetching {e}")
        return None
    
def get_tracks(playlist_id, playlist_name):
    music_data = []
    tracks = get_playlist_tracks(playlist_id)  

    for track_info in tracks['items']:
        track = track_info['track']

        if track:
            track_name = track['name']
            artists = ', '.join([artist['name'] for artist in track['artists']])
            album_name = track['album']['name']
            album_id = track['album']['id']
            track_id = track['id']
        
        audio_stuffs = get_audio_features(track_id) #api call
        popularity = get_track_details(track_id) #api call


        track_data = {            
            'track_id': track_id,
            'track_name': track_name,
            'artists': artists,
            'album_name': album_name,
            'popularity': popularity,
            'duration_ms': audio_stuffs['duration_ms'] if audio_stuffs else None,
            # 'explicit': audio_stuffs['explicit'] if audio_stuffs else None,
            'acousticness': audio_stuffs['acousticness'] if audio_stuffs else None,
            'danceability': audio_stuffs['danceability'] if audio_stuffs else None,
            'energy': audio_stuffs['energy'] if audio_stuffs else None,
            'key': audio_stuffs['key'] if audio_stuffs else None,                 
            'instrumentalness': audio_stuffs['instrumentalness'] if audio_stuffs else None,
            'loudness': audio_stuffs['loudness'] if audio_stuffs else None,
            'liveness': audio_stuffs['liveness'] if audio_stuffs else None,
            'speechiness': audio_stuffs['speechiness'] if audio_stuffs else None,
            'tempo': audio_stuffs['tempo'] if audio_stuffs else None,
            'mode': audio_stuffs['mode'] if audio_stuffs else None,
            'valence': audio_stuffs['valence'] if audio_stuffs else None,
            'time_signature': audio_stuffs['time_signature'] if audio_stuffs else None,
            # 'track_genre': audio_stuffs['track_genre'] if audio_stuffs else None
        }
        
        music_data.append(track_data)
    return music_data

all_tracks = []  
# playlist_ids, playlist_names = get_all_playlists()

# for playlist_id, playlist_name in zip(playlist_ids[0], playlist_names[0]): #Test with first 10 IDs
#     print(f"Getting tracks from playlist ID: {id}")
tracks = get_tracks(todays_top_hits,"Today's Top Hits") #Two API calls per function call
all_tracks.extend(tracks)

spotify_track_df = pd.DataFrame(all_tracks)

spotify_track_df.head()


Unnamed: 0,track_id,track_name,artists,album_name,popularity,duration_ms,acousticness,danceability,energy,key,instrumentalness,loudness,liveness,speechiness,tempo,mode,valence,time_signature
0,4xdBrk0nFZaP54vvZj0yx7,HOT TO GO!,Chappell Roan,The Rise and Fall of a Midwest Princess,90,184841,0.0187,0.742,0.757,6,0.0,-4.981,0.305,0.0421,139.982,1,0.957,4
1,6dOtVTDdiauQNBQEDOtlAB,BIRDS OF A FEATHER,Billie Eilish,HIT ME HARD AND SOFT,99,210373,0.2,0.747,0.507,2,0.0608,-10.171,0.117,0.0358,104.978,1,0.438,4
2,5N3hjp1WNayUPZrA8kJmJP,Please Please Please,Sabrina Carpenter,Please Please Please,98,186365,0.274,0.669,0.586,9,0.0,-6.073,0.104,0.054,107.071,1,0.579,4
3,19RybK6XDbAVpcdxSbZL1o,Apple,Charli xcx,BRAT,85,151740,0.125,0.804,0.957,0,0.0174,-5.452,0.114,0.034,126.035,1,0.962,4
4,51eSHglvG1RJXtL3qI5trr,Slow It Down,Benson Boone,Fireworks & Rollerblades,90,161831,0.174,0.432,0.583,8,0.0,-4.682,0.0933,0.0687,181.489,1,0.544,4


In [8]:
# concat two dataframes together

music_data = kaggle_df.merge(spotify_track_df,how='outer')

music_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,39463.0,0000vdREvCVMxbQTkS888c,Rill,Lolly,Lolly,44,160725,True,0.91,0.374,...,-9.844,0,0.199,0.0757,0.00301,0.154,0.432,104.042,4,german
1,17586.0,000CC8EParg64OmTxVnZ0p,Glee Cast,Glee Love Songs,It's All Coming Back To Me Now (Glee Cast Vers...,47,322933,False,0.269,0.516,...,-7.361,1,0.0366,0.406,0.0,0.117,0.341,178.174,4,club
2,73441.0,000Iz0K615UepwSJ5z2RE5,Paul Kalkbrenner;Pig&Dan,X,Böxig Leise - Pig & Dan Remix,22,515360,False,0.686,0.56,...,-13.264,0,0.0462,0.00114,0.181,0.111,0.108,119.997,4,minimal-techno
3,51365.0,000RDCYioLteXcutOjeweY,Jordan Sandhu,Teeje Week,Teeje Week,62,190203,False,0.679,0.77,...,-3.537,1,0.19,0.0583,0.0,0.0825,0.839,161.721,4,hip-hop
4,73649.0,000qpdoc97IMTBvF8gwcpy,Paul Kalkbrenner,Zeit,Tief,19,331240,False,0.519,0.431,...,-13.606,0,0.0291,0.000964,0.72,0.0916,0.234,129.971,4,minimal-techno


In [10]:

music_data.info()
music_data.to_excel('dataset2.xlsx')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114050 entries, 0 to 114049
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  float64
 1   track_id          114050 non-null  object 
 2   artists           114049 non-null  object 
 3   album_name        114049 non-null  object 
 4   track_name        114049 non-null  object 
 5   popularity        114050 non-null  int64  
 6   duration_ms       114050 non-null  int64  
 7   explicit          114000 non-null  object 
 8   danceability      114050 non-null  float64
 9   energy            114050 non-null  float64
 10  key               114050 non-null  int64  
 11  loudness          114050 non-null  float64
 12  mode              114050 non-null  int64  
 13  speechiness       114050 non-null  float64
 14  acousticness      114050 non-null  float64
 15  instrumentalness  114050 non-null  float64
 16  liveness          11

In [None]:
# Actual model

