In [0]:
# Install Spotipy, which is a python 3rd party library helps connect to Spotify web API
!pip3 install spotipy

Collecting spotipy
  Downloading https://files.pythonhosted.org/packages/59/46/3c957255c96910a8a0e2d9c25db1de51a8676ebba01d7966bedc6e748822/spotipy-2.4.4.tar.gz
Building wheels for collected packages: spotipy
  Building wheel for spotipy (setup.py) ... [?25l[?25hdone
  Created wheel for spotipy: filename=spotipy-2.4.4-cp36-none-any.whl size=12336 sha256=91c093da6dc5563c5ef31a5ba8ccdcb4ab7538542d13fe89ddea4226f9785d4c
  Stored in directory: /root/.cache/pip/wheels/76/28/19/a86ca9bb0e32dbd4a4f580870250f5aeef852870578e0427e6
Successfully built spotipy
Installing collected packages: spotipy
Successfully installed spotipy-2.4.4


In [0]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials

#### Authorization
Connecting to the Spotify Web API requires authorization (a client ID and a client secret).
Please check the README for how to obtain them.

In [0]:
import os
from getpass import getpass

username = '12977977'
scope = 'user-read-private'

# API credentials must not be stored in the notebook: anything committed here
# is readable by everyone who can see the file. They are read from the
# environment, and the notebook falls back to an interactive prompt when the
# variables are not set.
# NOTE: the client id/secret that used to be hard-coded in this cell have been
# published with the notebook and should be revoked in the Spotify developer
# dashboard.
client_id = os.environ.get('SPOTIPY_CLIENT_ID') or getpass('Spotify client id: ')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET') or getpass('Spotify client secret: ')

In [0]:
# Build the credentials manager from the client id/secret defined above and
# hand it to the API client; the rest of the notebook talks to Spotify
# through the module-level `spotify` object.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
spotify = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

#### Get Country List

In [0]:
# Install BeautifulSoup
!pip3 install bs4



In [0]:
from bs4 import BeautifulSoup
import requests
import re
from pprint import pprint

In [0]:
def get_available_countries():
    """
    Crawl the names of the countries where Spotify is available.

    The table cells (<td>) of Spotify's support page are scanned in document
    order. A cell in which no country name is found is taken as a continent
    heading; the next cell that does contain names provides the countries of
    that continent.

    Input: None
    Output: A Dictionary of Continent:Country names
            {continent : [country, country, ...]}
    Raises: requests.HTTPError when the page cannot be fetched successfully
    """
    url = "https://support.spotify.com/us/using_spotify/the_basics/full-list-of-territories-where-spotify-is-available/"
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and an HTTP error page must not be parsed as if it were the territory list.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # available_countries[continent] = [country, country, country]
    available_countries = {}
    # No continent heading has been seen yet.
    continent = None
    for tag in soup.find_all(['td']):
        # A country name is a run of word characters/whitespace that is
        # directly followed by a comma or a period.
        countries = re.findall(r'[\w\s]+(?=[.,])', tag.text)
        if not countries:
            continent = tag.text
            continue

        if continent is None:
            # A country cell before any continent heading cannot be filed
            # under a key; skip it instead of failing on an unbound name.
            continue

        # Strip both sides: the names are later used to build the exact
        # playlist title, so stray whitespace would prevent a match.
        names = [country.strip() for country in countries]
        available_countries[continent] = [name for name in names if name]

    return available_countries

In [0]:
def collect_chart_ids(available_countries):
    """
    Collect the ids of the official "<country> Top 50" playlists through the
    Spotify API (uses the module-level `spotify` client).

    Input: A Dictionary of {continent: [country, country...], ....}
    Output: A list of tuples [(continent, country, playlist_id), ...]
    """
    chart_ids = []
    for continent, countries in available_countries.items():

        for country in countries:
            playlist_name = country + ' Top 50'
            playlists = spotify.search(q=playlist_name, type='playlist')
            for playlist in playlists['playlists']['items']:

                # Guard against empty entries in the result page.
                # NOTE(review): presumably the API can return null placeholders
                # for unavailable playlists - confirm against the Search
                # endpoint reference.
                if not playlist:
                    continue
                # If not Spotify Official playlist
                if playlist['owner']['display_name'] != 'spotifycharts':
                    continue
                # If not Top 50 ex) Viral 50
                if playlist['name'] != playlist_name:
                    continue

                chart_ids.append((continent, country, playlist['id']))

    return chart_ids

In [0]:
# Formats tried in order when parsing an album release date: Spotify reports
# the date with day, month or year precision.
_RELEASE_DATE_FORMATS = ('%Y-%m-%d', '%Y-%m', '%Y')


def _parse_release_date(raw_date):
    """Parse a release date string; return None when no known format fits."""
    for date_format in _RELEASE_DATE_FORMATS:
        try:
            return datetime.strptime(raw_date, date_format)
        except (TypeError, ValueError):
            continue
    return None


def collect_chart(continent, country, tracks):
    """
    Pick out only the required information from a playlist's track listing.

    Input: continent(String), country(String), tracks(json)
           tracks['items'] is the ordered list of playlist entries; each entry
           carries the track description under the 'track' key.
    Output: A list of dictionaries that contain track information
            [{'continent':, 'country': , 'rank': , 'song': , 'artist': , ...}, ]
            'rank' is the 1-based position of the entry in the playlist,
            'release_date' is a datetime (None when it cannot be parsed) and
            'date' is the day the chart was collected.
            Entries without a track description are skipped; the remaining
            entries keep their playlist position as rank.
    """
    collected_on = date.today()

    track_infos = []
    for rank, item in enumerate(tracks['items'], 1):
        track = item['track']
        if track is None:
            # Nothing to describe for this playlist slot.
            continue

        album = track['album']
        # Only the first listed artist is recorded.
        lead_artist = track['artists'][0]

        track_infos.append({
            'continent': continent,
            'country': country,
            'rank': rank,
            'song': track['name'],
            'artist': lead_artist['name'],
            'album': album['name'],
            'song_id': track['id'],
            'artist_id': lead_artist['id'],
            'album_id': album['id'],
            'release_date': _parse_release_date(album['release_date']),
            'song_popularity': track['popularity'],
            'date': collected_on,
        })

    return track_infos

In [0]:
def playlist_track(self, playlist_id):
    """
    Request the track listing of a single playlist.

    Written as a method: `self` is expected to be a client object that
    provides `_get(url)`.

    Input: playlist_id(String), id of the playlist
    Output: whatever `self._get` returns for 'playlists/<playlist_id>/tracks'
            (callers in this notebook read it as json with an 'items' list)
    """
    endpoint = ''.join(['playlists/', playlist_id, '/tracks'])
    return self._get(endpoint)

In [0]:
import types
# Attach `playlist_track` to this client instance as a bound method, so it can
# be called as `spotify.playlist_track(playlist_id)`.
setattr(spotify, 'playlist_track', types.MethodType(playlist_track, spotify))

#### Collect Country names and playlist ids

In [0]:
import pandas as pd
from datetime import date, datetime

In [0]:
# Scrape the {continent: [country, ...]} mapping from Spotify's support page.
available_countries = get_available_countries()

# Look up the Top 50 playlist of every country:
# chart_ids = [(continent, country, playlist_id), ...]
chart_ids = collect_chart_ids(available_countries)

In [0]:
_columns = ['continent', 'country', 'rank', 'song', 'artist', 'album', 'release_date', 'song_popularity', 'song_id', 'artist_id', 'album_id', 'date']

# COLLECT the track information of each country chart and MERGE it into the
# global_chart dataframe. The rows are gathered in a plain list and the
# dataframe is built once at the end: concatenating inside the loop copies all
# previously collected rows again on every iteration.
chart_rows = []

for (continent, country, playlist_id) in chart_ids:

    tracks = spotify.playlist_track(playlist_id)
    chart_rows.extend(collect_chart(continent, country, tracks))

global_chart = pd.DataFrame(chart_rows, columns=_columns)

In [0]:
# Preview the first rows of the United States chart.
global_chart.loc[global_chart['country'] == 'United States'].head()

Unnamed: 0,continent,country,rank,song,artist,album,release_date,song_popularity,song_id,artist_id,album_id,date
2500,North America,United States,1,Heartless,The Weeknd,Heartless,2019-11-27,91,57vxBYXtHMk6H1aD29V7PU,1Xyo4u8uXC1ZmMpatF05PJ,7vRcickwa6GCfY1qKKe4lK,2019-12-03
2501,North America,United States,2,ROXANNE,Arizona Zervas,ROXANNE,2019-10-10,95,696DnlkuDOXcMAnKlTgXXK,0vRvGUQVUjytro0xpb26bs,6HJDrXs0hpebaRFKA1sF90,2019-12-03
2502,North America,United States,3,Blinding Lights,The Weeknd,Blinding Lights,2019-11-29,83,0sf12qNH5qcw8qpgymFOqD,1Xyo4u8uXC1ZmMpatF05PJ,2ZfHkwHuoAZrlz7RMj0PDz,2019-12-03
2503,North America,United States,4,Circles,Post Malone,Hollywood's Bleeding,2019-09-06,99,21jGcNKet2qwijlDFuPiPb,246dkjvS1zLTtiykXe5h60,4g1ZRSobMefqF6nelkgibi,2019-12-03
2504,North America,United States,5,everything i wanted,Billie Eilish,everything i wanted,2019-11-13,97,3ZCTVFBt2Brf31RLEnCkWJ,6qqNVTkY8uBg9cP3Jd7DAH,4i3rAwPw7Ln2YrKDusaWyT,2019-12-03


In [0]:
# Preview the first five rows of the combined chart.
global_chart.head()

Unnamed: 0,continent,country,rank,song,artist,album,release_date,song_popularity,song_id,artist_id,album_id,date
0,Africa,South Africa,1,Dance Monkey,Tones and I,Dance Monkey,2019-05-10,81,1rgnBhdG2JDFTbYkYRZAku,2NjfBq1NflQcKSeiDooVjY,31IDBea3eEs57a0joX6TjN,2019-12-03
1,Africa,South Africa,2,Circles,Post Malone,Hollywood's Bleeding,2019-09-06,99,21jGcNKet2qwijlDFuPiPb,246dkjvS1zLTtiykXe5h60,4g1ZRSobMefqF6nelkgibi,2019-12-03
2,Africa,South Africa,3,Memories,Maroon 5,Memories,2019-09-20,100,2b8fOow8UzyDFAE27YhOZM,04gDigrS5kc9YWfZHwBETP,3nR9B40hYLKLcR0Eph3Goc,2019-12-03
3,Africa,South Africa,4,ROXANNE,Arizona Zervas,ROXANNE,2019-10-10,95,696DnlkuDOXcMAnKlTgXXK,0vRvGUQVUjytro0xpb26bs,6HJDrXs0hpebaRFKA1sF90,2019-12-03
4,Africa,South Africa,5,Heartless,The Weeknd,Heartless,2019-11-27,91,57vxBYXtHMk6H1aD29V7PU,1Xyo4u8uXC1ZmMpatF05PJ,7vRcickwa6GCfY1qKKe4lK,2019-12-03


In [0]:
def return_ids(requested_type, limits, chart=None):
    """
    A generator that returns the requested ids in batches (the Spotify API
    limits the number of ids per request).

    Every batch is a new list, so a batch stays valid after the generator has
    moved on (e.g. when all batches are collected with list(...)).

    Input: requested_type(string), name of the column  ex) 'song_id', 'artist_id', 'album_id'
           limits(int), maximum number of ids returned each time  ex) 50, 20
           chart(DataFrame, optional), frame to read the ids from;
                 defaults to the module-level global_chart
    Output: Lists of collected ids ['id', 'id', 'id'], in column order;
            the last list may be shorter than `limits`
    Raises: ValueError when `limits` is not a positive number
    """
    if limits <= 0:
        raise ValueError('limits must be a positive number')
    if chart is None:
        chart = global_chart

    all_ids = chart[requested_type].tolist()
    for start in range(0, len(all_ids), limits):
        # Slicing creates an independent list for each batch.
        yield all_ids[start:start + limits]

In [0]:
#### Collect song information

In [0]:
_columns = ['song_id', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence']
feature_names = _columns[1:]

# COLLECT the audio features of every song and MERGE them into the songs_df
# dataframe. One row is produced per requested id, in request order, and the
# dataframe is built once after the loop instead of being re-concatenated for
# every batch.
songs_features = []

for x in return_ids('song_id', 50):

    song_infos = spotify.audio_features(x)

    for requested_id, song_info in zip(x, song_infos):
        if song_info is None:
            # Some songs do not have analysis data yet (checked: no feature
            # analysis data from the Spotify API), so leave the features
            # empty but keep the song_id.
            song_features = dict.fromkeys(feature_names)
            song_features['song_id'] = requested_id
        else:
            # A feature missing from the response stays None.
            song_features = {name: song_info.get(name) for name in feature_names}
            song_features['song_id'] = song_info.get('id', requested_id)

        songs_features.append(song_features)

songs_df = pd.DataFrame(songs_features, columns=_columns)
songs_df.head(5)

retrying ...1secs
retrying ...1secs


Unnamed: 0,song_id,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
0,1rgnBhdG2JDFTbYkYRZAku,0.688,0.825,0.593,0.000161,0.17,-6.401,0.0988,0.54
1,21jGcNKet2qwijlDFuPiPb,0.192,0.695,0.762,0.00244,0.0863,-3.497,0.0395,0.553
2,2b8fOow8UzyDFAE27YhOZM,0.837,0.764,0.32,0.0,0.0822,-7.209,0.0546,0.575
3,696DnlkuDOXcMAnKlTgXXK,0.0522,0.621,0.601,0.0,0.46,-5.616,0.148,0.457
4,57vxBYXtHMk6H1aD29V7PU,0.00632,0.531,0.75,7.6e-05,0.117,-5.831,0.111,0.198


#### Merge songs_df into global_chart

In [0]:
# The same song appears in the charts of several countries, so song_id repeats
# in global_chart. Instead of gluing the frames together by row position, join
# on song_id: songs_df is reduced to one row per song and every chart row
# looks up its features by key.
# Dropping the feature columns first keeps this cell re-runnable.
song_feature_cols = [col for col in songs_df.columns if col != 'song_id']
global_chart = (
    global_chart
    .drop(columns=song_feature_cols, errors='ignore')
    .merge(songs_df.drop_duplicates(subset='song_id'), on='song_id', how='left')
)

In [0]:
# Sanity check after adding the song features: (rows, columns) of the chart.
global_chart.shape

(3100, 20)

#### Collect artist information

In [0]:
_columns = ['artist_id', 'genre', 'artist_popularity', 'followers']

# Collect one row per requested artist id, in request order, and build the
# dataframe once after the loop instead of re-concatenating it per batch.
artists_features = []

for x in return_ids('artist_id', 50):

    artist_infos = spotify.artists(x)

    for artist_info in artist_infos['artists']:
        genres = artist_info.get('genres')
        artists_features.append({
            'artist_id': artist_info['id'],
            # Only the first word of the first listed genre is kept; artists
            # without any genre get None.
            # TTD : Searching Genre from google?
            'genre': genres[0].split(' ')[0] if genres else None,
            'artist_popularity': artist_info['popularity'],
            'followers': artist_info['followers']['total'],
        })

artists_df = pd.DataFrame(artists_features, columns=_columns)

artists_df.head(5)

Unnamed: 0,artist_id,genre,artist_popularity,followers
0,2NjfBq1NflQcKSeiDooVjY,australian,90,280508
1,246dkjvS1zLTtiykXe5h60,dfw,100,19797863
2,04gDigrS5kc9YWfZHwBETP,pop,93,21014566
3,0vRvGUQVUjytro0xpb26bs,pop,86,113407
4,1Xyo4u8uXC1ZmMpatF05PJ,canadian,93,17814954


#### Merge artists_df into global_chart

In [0]:
# Join the artist information on artist_id instead of by row position: the
# same artist occurs in many chart rows, so artists_df is reduced to one row
# per artist and looked up by key.
# Dropping the artist columns first keeps this cell re-runnable.
artist_feature_cols = [col for col in artists_df.columns if col != 'artist_id']
global_chart = (
    global_chart
    .drop(columns=artist_feature_cols, errors='ignore')
    .merge(artists_df.drop_duplicates(subset='artist_id'), on='artist_id', how='left')
)

In [0]:
# Preview the chart with the song and artist columns attached.
global_chart.head()

Unnamed: 0,continent,country,rank,song,artist,album,release_date,song_popularity,song_id,artist_id,album_id,date,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,genre,artist_popularity,followers
0,Africa,South Africa,1,Dance Monkey,Tones and I,Dance Monkey,2019-05-10,81,1rgnBhdG2JDFTbYkYRZAku,2NjfBq1NflQcKSeiDooVjY,31IDBea3eEs57a0joX6TjN,2019-12-03,0.688,0.825,0.593,0.000161,0.17,-6.401,0.0988,0.54,australian,90,280508
1,Africa,South Africa,2,Circles,Post Malone,Hollywood's Bleeding,2019-09-06,99,21jGcNKet2qwijlDFuPiPb,246dkjvS1zLTtiykXe5h60,4g1ZRSobMefqF6nelkgibi,2019-12-03,0.192,0.695,0.762,0.00244,0.0863,-3.497,0.0395,0.553,dfw,100,19797863
2,Africa,South Africa,3,Memories,Maroon 5,Memories,2019-09-20,100,2b8fOow8UzyDFAE27YhOZM,04gDigrS5kc9YWfZHwBETP,3nR9B40hYLKLcR0Eph3Goc,2019-12-03,0.837,0.764,0.32,0.0,0.0822,-7.209,0.0546,0.575,pop,93,21014566
3,Africa,South Africa,4,ROXANNE,Arizona Zervas,ROXANNE,2019-10-10,95,696DnlkuDOXcMAnKlTgXXK,0vRvGUQVUjytro0xpb26bs,6HJDrXs0hpebaRFKA1sF90,2019-12-03,0.0522,0.621,0.601,0.0,0.46,-5.616,0.148,0.457,pop,86,113407
4,Africa,South Africa,5,Heartless,The Weeknd,Heartless,2019-11-27,91,57vxBYXtHMk6H1aD29V7PU,1Xyo4u8uXC1ZmMpatF05PJ,7vRcickwa6GCfY1qKKe4lK,2019-12-03,0.00632,0.531,0.75,7.6e-05,0.117,-5.831,0.111,0.198,canadian,93,17814954


#### Collect album information

In [0]:
_columns = ['album_id', 'album_popularity']

# Albums are requested 20 ids at a time.
# NOTE(review): presumably 20 is the per-request limit of the albums endpoint
# - confirm against the API reference.
# One row is collected per returned album and the dataframe is built once
# after the loop instead of being re-concatenated for every batch.
albums_features = []

for x in return_ids('album_id', 20):

    albums_infos = spotify.albums(x)

    for album_info in albums_infos['albums']:
        albums_features.append({
            'album_id': album_info['id'],
            'album_popularity': album_info['popularity'],
        })

albums_df = pd.DataFrame(albums_features, columns=_columns)

retrying ...1secs
retrying ...1secs
retrying ...2secs
retrying ...1secs


In [0]:
# Preview the first five collected albums.
albums_df.head()

Unnamed: 0,album_id,album_popularity
0,31IDBea3eEs57a0joX6TjN,72
1,4g1ZRSobMefqF6nelkgibi,100
2,3nR9B40hYLKLcR0Eph3Goc,90
3,6HJDrXs0hpebaRFKA1sF90,86
4,7vRcickwa6GCfY1qKKe4lK,81


#### Merge albums_df into global_chart

In [0]:
# Join the album popularity on album_id instead of by row position: albums_df
# is reduced to one row per album and every chart row looks it up by key.
# Dropping the album columns first keeps this cell re-runnable.
album_feature_cols = [col for col in albums_df.columns if col != 'album_id']
global_chart = (
    global_chart
    .drop(columns=album_feature_cols, errors='ignore')
    .merge(albums_df.drop_duplicates(subset='album_id'), on='album_id', how='left')
)

In [0]:
# Preview the United States chart with all collected columns.
global_chart.loc[global_chart['country'] == 'United States'].head()

Unnamed: 0,continent,country,rank,song,artist,album,release_date,song_popularity,song_id,artist_id,album_id,date,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,genre,artist_popularity,followers,album_popularity
2500,North America,United States,1,Heartless,The Weeknd,Heartless,2019-11-27,91,57vxBYXtHMk6H1aD29V7PU,1Xyo4u8uXC1ZmMpatF05PJ,7vRcickwa6GCfY1qKKe4lK,2019-12-03,0.00632,0.531,0.75,7.6e-05,0.117,-5.831,0.111,0.198,canadian,93,17814954,81
2501,North America,United States,2,ROXANNE,Arizona Zervas,ROXANNE,2019-10-10,95,696DnlkuDOXcMAnKlTgXXK,0vRvGUQVUjytro0xpb26bs,6HJDrXs0hpebaRFKA1sF90,2019-12-03,0.0522,0.621,0.601,0.0,0.46,-5.616,0.148,0.457,pop,86,113407,86
2502,North America,United States,3,Blinding Lights,The Weeknd,Blinding Lights,2019-11-29,83,0sf12qNH5qcw8qpgymFOqD,1Xyo4u8uXC1ZmMpatF05PJ,2ZfHkwHuoAZrlz7RMj0PDz,2019-12-03,0.00148,0.504,0.795,0.000155,0.0902,-4.118,0.0703,0.379,canadian,93,17814954,74
2503,North America,United States,4,Circles,Post Malone,Hollywood's Bleeding,2019-09-06,99,21jGcNKet2qwijlDFuPiPb,246dkjvS1zLTtiykXe5h60,4g1ZRSobMefqF6nelkgibi,2019-12-03,0.192,0.695,0.762,0.00244,0.0863,-3.497,0.0395,0.553,dfw,100,19797863,100
2504,North America,United States,5,everything i wanted,Billie Eilish,everything i wanted,2019-11-13,97,3ZCTVFBt2Brf31RLEnCkWJ,6qqNVTkY8uBg9cP3Jd7DAH,4i3rAwPw7Ln2YrKDusaWyT,2019-12-03,0.902,0.704,0.225,0.657,0.106,-14.454,0.0994,0.243,electropop,97,16445261,88


#### Reorder Columns

In [0]:
# Final column layout: chart context, descriptive fields, popularity metrics,
# release date, audio features and finally the Spotify ids.
cols = [
    'date', 'continent', 'country', 'rank',
    'song', 'artist', 'album', 'genre',
    'followers', 'song_popularity', 'artist_popularity', 'album_popularity',
    'release_date',
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'valence',
    'song_id', 'artist_id', 'album_id',
]

global_chart = global_chart.loc[:, cols]

In [0]:
#### Saving dataframe as csv file

In [0]:
# In case of working on Google Colab: uncomment the lines below to write the
# chart to a csv file and download it through the browser.
# (Kept as real comments; a string literal used to disable code is echoed as
# the output of the cell.)
#
# from google.colab import files
#
# global_chart.to_csv('spotify_top50_chart.csv')
# files.download('spotify_top50_chart.csv')

"\nfrom google.colab import files\n\nglobal_chart.to_csv('spotify_top50_chart.csv')\nfiles.download('spotify_top50_chart.csv')\n"

In [0]:
# In case of working on a local Jupyter notebook or a .py file:
# write the combined chart to 'spotify_top50_chart.csv' in the working directory.
# NOTE(review): no `index` argument is passed, so the row index is presumably
# written as a first, unnamed column - confirm that readers of the file expect it.
global_chart.to_csv('spotify_top50_chart.csv')