# Spotify Data Analysis

### Auth

First, we need to save our credentials (client_id and client_secret) into a credentials.py in the same folder

In [1]:
import spotipy
import credentials

In [2]:
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id=credentials.client_id,client_secret=credentials.client_secret)

In [4]:
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Gathering the data

Next, using the spotify API we need to gather the data about the artist we want to analyze

The following function returns all of the tracks of a list of artists

In [5]:
def get_artist_tracks(artists):
    tracks = []
    
    for artist in artists:
        
        # Get the artist URI (a unique ID)
        artist_uri = sp.search(artist)['tracks']['items'][0]['artists'][0]['uri']

        # Spotify has a lot of duplicate albums, but we'll cross-reference them with this list to avoid extra loops
        album_checker = []
        
        # The starting point of our loop of albums for those artists with more than 50
        n = 0
        
        # Note the album_type = 'album'. This discounts singles, compilations and collaborations
        while len(sp.artist_albums(artist_uri, album_type = 'album', limit=50, offset = n)['items']) > 0:
            
            # Avoid overloading Spotify with requests by assigning the list of album dictionaries to a variable
            dict_list = sp.artist_albums(artist_uri, album_type = 'album', limit=50, offset = n)['items']
            
            for i, album in list(enumerate(dict_list)):

                # Add the featured artists for the album in question to the checklist
                check_this_album = [j['name'] for j in dict_list[i]['artists']]
                # And the album name
                check_this_album.append(dict_list[i]['name'])
                # And its date
                check_this_album.append(dict_list[i]['release_date'])

                # Only continue looping if that album isn't in the checklist
                if check_this_album not in album_checker:
                    
                    # Add this album to the checker
                    album_checker.append(check_this_album)
                    # For every song on the album, get its descriptors and features in a list and add to the tracklist
                    tracks.extend([[artist, album['name'], album['uri'], song['name'],

                      album['release_date']] + list(sp.audio_features(song['uri'])[0].values()) 
                                   for song in sp.album_tracks(album['uri'])['items']])
            
            # Go through the next 50 albums (otherwise we'll get an infinite while loop)
            n += 50

    return tracks
    
        

So now we need to set our list of artists

We are going to use the Top Global Charts Playlist from spotify to extract the top 50 artists

In [6]:
top_50_global_chart = sp.playlist_tracks('37i9dQZEVXbMDoHDwVN2tF',market='AR')

In [7]:
list_top50 = list(top_50_global_chart['items'])

In [8]:
list_top50[0]['track']['artists'][0]['name']

'Roddy Ricch'

In [9]:
top_artists = []
for artist in list_top50:
    print(artist['track']['artists'][0]['name'])
    top_artists.append(artist['track']['artists'][0]['name'])   

Roddy Ricch
Tones and I
The Weeknd
Dua Lipa
KAROL G
Trevor Daniel
Justin Bieber
Future
Arizona Zervas
Justin Bieber
Billie Eilish
Eminem
Post Malone
Maroon 5
Lewis Capaldi
The Black Eyed Peas
Billie Eilish
Shawn Mendes
Harry Styles
Halsey
Lewis Capaldi
Camila Cabello
Dua Lipa
Doja Cat
Dimelo Flow
Selena Gomez
Roddy Ricch
MEDUZA
YNW Melly
SAINt JHN
Jonas Brothers
Regard
Bad Bunny
Khalid
Anuel AA
Dalex
Ali Gatie
Selena Gomez
J Balvin
DaBaby
BROCKHAMPTON
Ed Sheeran
Rauw Alejandro
Post Malone
Lil Mosey
Joji
blackbear
Nicky Jam
Billie Eilish
Mustard


Now that we got our artist we are going to use the function prevously defined

In [10]:
tracks = get_artist_tracks(top_artists)

In [11]:
tracks

[['Roddy Ricch',
  'Please Excuse Me For Being Antisocial',
  'spotify:album:52u4anZbHd6UInnmHRFzba',
  'Intro',
  '2019-12-06',
  0.738,
  0.657,
  3,
  -8.737,
  0,
  0.109,
  0.164,
  0,
  0.163,
  0.716,
  159.01,
  'audio_features',
  '7AOtINPzQ4N4JVXk7ygZZz',
  'spotify:track:7AOtINPzQ4N4JVXk7ygZZz',
  'https://api.spotify.com/v1/tracks/7AOtINPzQ4N4JVXk7ygZZz',
  'https://api.spotify.com/v1/audio-analysis/7AOtINPzQ4N4JVXk7ygZZz',
  135849,
  4],
 ['Roddy Ricch',
  'Please Excuse Me For Being Antisocial',
  'spotify:album:52u4anZbHd6UInnmHRFzba',
  'The Box',
  '2019-12-06',
  0.896,
  0.586,
  10,
  -6.687,
  0,
  0.0559,
  0.104,
  0,
  0.79,
  0.642,
  116.971,
  'audio_features',
  '0nbXyq5TXYPCO7pr3N8S4I',
  'spotify:track:0nbXyq5TXYPCO7pr3N8S4I',
  'https://api.spotify.com/v1/tracks/0nbXyq5TXYPCO7pr3N8S4I',
  'https://api.spotify.com/v1/audio-analysis/0nbXyq5TXYPCO7pr3N8S4I',
  196653,
  4],
 ['Roddy Ricch',
  'Please Excuse Me For Being Antisocial',
  'spotify:album:52u4anZ

In [12]:
len(tracks)

5105

In [18]:
audio_ft = list(sp.audio_features('7tr2za8SQg2CI8EDgrdtNl')[0].keys())

In [19]:
df_columns = ['artist','album_name','album_uri','track','release_date'] + audio_ft

df_columns

['artist',
 'album_name',
 'album_uri',
 'track',
 'release_date',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature']

In [20]:
import pandas as pd

df = pd.DataFrame(tracks, columns=df_columns)

In [21]:
df

Unnamed: 0,artist,album_name,album_uri,track,release_date,danceability,energy,key,loudness,mode,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Roddy Ricch,Please Excuse Me For Being Antisocial,spotify:album:52u4anZbHd6UInnmHRFzba,Intro,2019-12-06,0.738,0.657,3,-8.737,0,...,0.1630,0.716,159.010,audio_features,7AOtINPzQ4N4JVXk7ygZZz,spotify:track:7AOtINPzQ4N4JVXk7ygZZz,https://api.spotify.com/v1/tracks/7AOtINPzQ4N4...,https://api.spotify.com/v1/audio-analysis/7AOt...,135849,4
1,Roddy Ricch,Please Excuse Me For Being Antisocial,spotify:album:52u4anZbHd6UInnmHRFzba,The Box,2019-12-06,0.896,0.586,10,-6.687,0,...,0.7900,0.642,116.971,audio_features,0nbXyq5TXYPCO7pr3N8S4I,spotify:track:0nbXyq5TXYPCO7pr3N8S4I,https://api.spotify.com/v1/tracks/0nbXyq5TXYPC...,https://api.spotify.com/v1/audio-analysis/0nbX...,196653,4
2,Roddy Ricch,Please Excuse Me For Being Antisocial,spotify:album:52u4anZbHd6UInnmHRFzba,Start Wit Me (feat. Gunna),2019-12-06,0.886,0.479,4,-8.054,0,...,0.1350,0.408,130.029,audio_features,7A20WnLkjtZU2d7hIB6Ziu,spotify:track:7A20WnLkjtZU2d7hIB6Ziu,https://api.spotify.com/v1/tracks/7A20WnLkjtZU...,https://api.spotify.com/v1/audio-analysis/7A20...,158692,4
3,Roddy Ricch,Please Excuse Me For Being Antisocial,spotify:album:52u4anZbHd6UInnmHRFzba,Perfect Time,2019-12-06,0.577,0.734,8,-8.484,0,...,0.1200,0.254,94.122,audio_features,6kiIVIbmwEw6JvIZc7UG0E,spotify:track:6kiIVIbmwEw6JvIZc7UG0E,https://api.spotify.com/v1/tracks/6kiIVIbmwEw6...,https://api.spotify.com/v1/audio-analysis/6kiI...,142710,4
4,Roddy Ricch,Please Excuse Me For Being Antisocial,spotify:album:52u4anZbHd6UInnmHRFzba,Moonwalkin (feat. Lil Durk),2019-12-06,0.779,0.580,1,-7.594,0,...,0.1500,0.299,160.243,audio_features,33NoCir8q4WA4HNmyrD3fu,spotify:track:33NoCir8q4WA4HNmyrD3fu,https://api.spotify.com/v1/tracks/33NoCir8q4WA...,https://api.spotify.com/v1/audio-analysis/33No...,167851,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,Mustard,10 Summers,spotify:album:5RnrEPrcCnCbicoDYUaqgA,Can’t Tell Me,2014-08-11,0.796,0.566,0,-8.755,0,...,0.1790,0.758,95.489,audio_features,5UHwKUzLGsve19xOrmPhLT,spotify:track:5UHwKUzLGsve19xOrmPhLT,https://api.spotify.com/v1/tracks/5UHwKUzLGsve...,https://api.spotify.com/v1/audio-analysis/5UHw...,191147,4
5101,Mustard,10 Summers,spotify:album:5RnrEPrcCnCbicoDYUaqgA,Tinashe Checks In,2014-08-11,0.445,0.459,11,-13.081,0,...,0.4630,0.620,185.477,audio_features,5okemX1yAFd7ScrQ4EJAOL,spotify:track:5okemX1yAFd7ScrQ4EJAOL,https://api.spotify.com/v1/tracks/5okemX1yAFd7...,https://api.spotify.com/v1/audio-analysis/5oke...,119160,4
5102,Mustard,10 Summers,spotify:album:5RnrEPrcCnCbicoDYUaqgA,4 Digits,2014-08-11,0.647,0.345,10,-10.994,1,...,0.0963,0.259,95.561,audio_features,0WtHRqsj9cIjU0vNcX02W6,spotify:track:0WtHRqsj9cIjU0vNcX02W6,https://api.spotify.com/v1/tracks/0WtHRqsj9cIj...,https://api.spotify.com/v1/audio-analysis/0WtH...,191413,4
5103,Mustard,10 Summers,spotify:album:5RnrEPrcCnCbicoDYUaqgA,Ty Dolla $ign Checks In,2014-08-11,0.567,0.569,5,-13.167,1,...,0.5350,0.908,130.868,audio_features,7bkQnIgUZ2q1MoGnCem1X0,spotify:track:7bkQnIgUZ2q1MoGnCem1X0,https://api.spotify.com/v1/tracks/7bkQnIgUZ2q1...,https://api.spotify.com/v1/audio-analysis/7bkQ...,68733,3


In [22]:
df['artist'].value_counts()

Maroon 5               761
Future                 515
Eminem                 480
Ed Sheeran             480
Justin Bieber          384
The Black Eyed Peas    229
Dua Lipa               208
Selena Gomez           206
The Weeknd             178
Jonas Brothers         163
Nicky Jam              161
Post Malone            134
Shawn Mendes           127
J Balvin               117
Roddy Ricch             90
BROCKHAMPTON            88
DaBaby                  80
Halsey                  75
Khalid                  62
blackbear               62
Doja Cat                57
Lewis Capaldi           54
Mustard                 46
YNW Melly               45
Joji                    44
Billie Eilish           42
Dalex                   37
KAROL G                 29
SAINt JHN               27
Lil Mosey               26
Camila Cabello          25
Bad Bunny               23
Harry Styles            22
Anuel AA                12
Ali Gatie                9
Rauw Alejandro           7
Name: artist, dtype: int64

In [31]:
df_artist = df.groupby('artist').mean()

In [32]:
df_artist

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Ali Gatie,0.618778,0.507889,7.555556,-7.497333,0.666667,0.0368,0.485556,7.044444e-06,0.1636,0.343111,125.908667,173448.0,3.666667
Anuel AA,0.73875,0.72825,5.416667,-4.4615,0.833333,0.11555,0.410583,0.001101467,0.146683,0.601083,139.951333,216206.833333,4.0
BROCKHAMPTON,0.620477,0.567523,5.136364,-9.186011,0.670455,0.177314,0.372821,0.03264731,0.257175,0.469416,113.28392,188858.511364,3.920455
Bad Bunny,0.72487,0.638565,4.521739,-6.389739,0.478261,0.125652,0.213026,0.0003192174,0.166952,0.441087,129.237261,221873.73913,4.0
Billie Eilish,0.583786,0.281721,5.0,-15.085071,0.714286,0.193086,0.656857,0.1206927,0.239307,0.288007,105.494286,183947.642857,3.571429
Camila Cabello,0.61932,0.53628,4.96,-6.12232,0.56,0.085424,0.329892,0.0001693564,0.145332,0.420684,112.84728,201105.52,3.96
DaBaby,0.789137,0.67755,4.9125,-5.688563,0.55,0.228078,0.090121,1.383825e-05,0.186372,0.524,131.556588,155104.4625,3.9625
Dalex,0.638892,0.677514,4.594595,-6.110919,0.486486,0.192846,0.472203,2.836054e-05,0.299389,0.616027,135.314243,235816.081081,4.0
Doja Cat,0.721684,0.640158,4.561404,-5.728298,0.701754,0.088316,0.159661,0.001901324,0.159854,0.398649,126.991421,209294.017544,4.017544
Dua Lipa,0.64499,0.670913,5.826923,-5.411942,0.442308,0.070889,0.097657,6.935192e-06,0.168674,0.44751,108.685798,214407.711538,4.038462


In [33]:
df_artist.drop(columns=['time_signature'],inplace=True)

In [34]:
df_artist

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Ali Gatie,0.618778,0.507889,7.555556,-7.497333,0.666667,0.0368,0.485556,7.044444e-06,0.1636,0.343111,125.908667,173448.0
Anuel AA,0.73875,0.72825,5.416667,-4.4615,0.833333,0.11555,0.410583,0.001101467,0.146683,0.601083,139.951333,216206.833333
BROCKHAMPTON,0.620477,0.567523,5.136364,-9.186011,0.670455,0.177314,0.372821,0.03264731,0.257175,0.469416,113.28392,188858.511364
Bad Bunny,0.72487,0.638565,4.521739,-6.389739,0.478261,0.125652,0.213026,0.0003192174,0.166952,0.441087,129.237261,221873.73913
Billie Eilish,0.583786,0.281721,5.0,-15.085071,0.714286,0.193086,0.656857,0.1206927,0.239307,0.288007,105.494286,183947.642857
Camila Cabello,0.61932,0.53628,4.96,-6.12232,0.56,0.085424,0.329892,0.0001693564,0.145332,0.420684,112.84728,201105.52
DaBaby,0.789137,0.67755,4.9125,-5.688563,0.55,0.228078,0.090121,1.383825e-05,0.186372,0.524,131.556588,155104.4625
Dalex,0.638892,0.677514,4.594595,-6.110919,0.486486,0.192846,0.472203,2.836054e-05,0.299389,0.616027,135.314243,235816.081081
Doja Cat,0.721684,0.640158,4.561404,-5.728298,0.701754,0.088316,0.159661,0.001901324,0.159854,0.398649,126.991421,209294.017544
Dua Lipa,0.64499,0.670913,5.826923,-5.411942,0.442308,0.070889,0.097657,6.935192e-06,0.168674,0.44751,108.685798,214407.711538


In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [37]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_artist)
y_kmeans = kmeans.predict(df_artist)

In [39]:
df_artist.to_csv('top50data')