In [1]:
import pandas as pd
import os
import numpy
import spotipy
import spotipy.util as util
import json
import time
import tqdm
import re

In [2]:
data_folder='../'
data_main = 'MSD/main/'
data_librosa = 'librosa_features/'
data_meta = 'meta/'

# 1. Getting Genres and followers of each artists

## 1.1. Loading and preparing 'tracks_features' dataset

In [15]:
# Loading 'tracks_features.csv'
songs_features = pd.read_csv('{}{}songs_features_final.csv'.format(data_folder,data_main),
                             index_col=None, usecols=['trackID', 'artistID', 'albumID', 'URL'],
                             encoding='utf-8', low_memory=False)

                              
print(songs_features.shape)
songs_features.head()

(64502, 4)


Unnamed: 0,trackID,artistID,albumID,URL
0,1oPJpjMvYEQJhqEDsYgHSh,3uOCouLFR4bVx0XeiQJSbl,3jq7b66l8MswqDmi0mxzjq,https://p.scdn.co/mp3-preview/77b2e4f2d3df4851...
1,5TqXuugXB5ov1LJiXb5IAl,65Gh3BfK84aTIugiRCgLBA,0upTl2RUS4gmStWBlXjt9l,https://p.scdn.co/mp3-preview/541e7bb62eecf5c4...
2,5ClMLCk8IFxkvlr5sS8PhM,6rSpV5hdCNJ4v1i602nj22,7uvLu2RAki7aKiASSxJPzi,https://p.scdn.co/mp3-preview/795802acaaa84ca7...
3,6YIBHGP1MaxJsZV8ykzc07,15Rn3v2LgUxR0donJY8e1E,4apgpVOan95H6xvIDAfX44,https://p.scdn.co/mp3-preview/6eef762849d6bdbc...
4,2tBoHZ9UtMMRRljiwubtQc,0X0TajZO1RmY3D8u1JymYq,0iHug1S696AsitJpyUr4T6,https://p.scdn.co/mp3-preview/35811ad304778701...


## 1.2. Gettting the infor using Spotify API

In [16]:
# Defining Authentication credentials
scope = 'user-library-read' # You can change the scope based on your usage
username = "" # Put the user's username
client_id = '' #Provide the client id from your own Spotify developer 
client_secret = '' #Provide the client secret from your own Spotify developer

# Getting the tocken for authentication
token = util.prompt_for_user_token(username, scope, client_id=client_id , 
                                   client_secret=client_secret, redirect_uri='http://google.com/')

try: # Test whether the authentication credentials are valid, if yes create Spotipy object 
    sp = spotipy.Spotify(auth=token)
except:
    raise ValueError('There is problem with Token!')

In [17]:
# Initializing global variables
genres_set = set()
step_size = 49

In [18]:
def genresSpotify(features_df):
    '''
    Gets genre, artist's followers for each track id.
    
        Return:
            Data frame containing genre and artist's followers information
        Parameters:
            - features_df: dataframe containing ('trackId','artistID') pairs
    '''
    global genres_set
    global sp
    # Making a copy of 'features_df'
    features_df = features_df.reset_index(drop=True).copy()
    
    # Initializing the genres dataframe lists
    artistID = []
    followers = []
    genres_l = []
    artist_name= []
    
    # Initializing tqdm
    total = features_df.shape[0]
    pbar = tqdm.tqdm(total=total)
    
    while not features_df.empty:
        try:
            artist_info = sp.artists(artists=features_df.loc[:step_size,'artistID'])
        except:
            print('Error in getting Artists info!!   Countdown has started...')
            time.sleep(15)
            # Getting the tocken for authentication
            token = util.prompt_for_user_token(username, scope, client_id=client_id , 
                                               client_secret=client_secret, redirect_uri='http://google.com/')

            try: # Test whether the authentication credentials are valid, if yes create Spotipy object 
                sp = spotipy.Spotify(auth=token)
            except:
                print('There is problem with Token!')
            try:
                artist_info = sp.artists(artists=features_df.loc[:step_size,'artistID'])
            except:
                print('There is problem with on of these ArtistIDs: \n{}\n**********************\n'
                      .format(features_df.loc[:step_size,'artistID']))
                artist_info = {'artists': [None]}
        
        # Getting list of artists info
        artist_info = artist_info['artists']
        
        for i, item in enumerate(artist_info):
            if item:
                # Adding new genres to 'genres_Set'
                genres_set = genres_set | set(item['genres'])
                # Adding to lists
                artistID.append(item['id'])
                followers.append(item['followers']['total'])
                genres_l.append(str(item['genres']))
                artist_name.append(item['name'])
            else:
                print('Problem in item: \n{}'.format(json.dumps(item, indent=4)))
        # Updating
        pbar.update(min(step_size, features_df.shape[0]))
        features_df = features_df.iloc[step_size:,:].reset_index(drop=True).copy()

        
    # Defining the 'genres_df'
    genres_df = pd.DataFrame(dict(artistID=artistID,
                                  artistName=artist_name,
                                  artistFollowers=followers,
                                  genres=genres_l))
    # Closing tqdm
    pbar.close()
    return genres_df

In [19]:
#######################################################
##### Getting genres information from Spotify API #####
#######################################################

# Getting info and saving to 'genres_df' dataframe
genres_df = genresSpotify(songs_features)

# Saving the resulting dataset
genres_df.to_csv('{}{}{}artistGenres_data.csv'.format(data_folder, data_main, data_meta),
                 encoding='utf-8', index=True)

print('genres_df size without dropna: ', genres_df.shape)
print('genres_df size with dropna: ',genres_df.dropna(how='any').shape)
genres_df.head()

 14%|█▍        | 9212/64502 [00:13<01:17, 709.91it/s]

retrying ...3secs
retrying ...1secs


 22%|██▏       | 14112/64502 [00:24<01:05, 769.65it/s]

retrying ...2secs
retrying ...1secs


 32%|███▏      | 20776/64502 [00:36<00:56, 779.54it/s]

retrying ...1secs


 40%|███▉      | 25676/64502 [00:43<00:49, 791.08it/s]

retrying ...2secs
retrying ...1secs


 47%|████▋     | 30576/64502 [00:53<00:41, 822.71it/s]

retrying ...3secs
retrying ...1secs


 55%|█████▍    | 35476/64502 [01:03<00:34, 848.71it/s]

retrying ...3secs
retrying ...1secs


 63%|██████▎   | 40376/64502 [01:13<00:27, 883.22it/s]

retrying ...3secs
retrying ...1secs


 74%|███████▎  | 47432/64502 [01:25<00:18, 905.62it/s]

retrying ...1secs
retrying ...1secs


 95%|█████████▌| 61593/64502 [01:41<00:02, 1090.72it/s]

retrying ...5secs
retrying ...1secs


100%|██████████| 64502/64502 [01:50<00:00, 975.16it/s] 


genres_df size without dropna:  (65818, 4)
genres_df size with dropna:  (65818, 4)


Unnamed: 0,artistID,artistName,artistFollowers,genres
0,3uOCouLFR4bVx0XeiQJSbl,Harold Budd,30414,"['ambient', 'american contemporary classical',..."
1,65Gh3BfK84aTIugiRCgLBA,Dave Edmunds,28604,"['blues-rock', 'british blues', 'power pop', '..."
2,6rSpV5hdCNJ4v1i602nj22,LANDA DANIEL,794,[]
3,15Rn3v2LgUxR0donJY8e1E,I Love You But I've Chosen Darkness,6603,['austindie']
4,0X0TajZO1RmY3D8u1JymYq,Au Revoir Simone,61008,"['brooklyn indie', 'indie pop']"


In [26]:
######################################################
##### Finding all distinct genres in the dataset #####
######################################################

# defining regex pattern
pattern = r"'([^']*)'"

genre_set = set()
for genre in genres_df.genres:
    genre_set |= set(re.findall(pattern, genre))


## 1.3. Looking into the 10 most popular artists

In [30]:
genres_df.sort_values(by=['artistFollowers'], ascending=False).drop_duplicates(subset=['artistName']).head(20)

Unnamed: 0,artistID,artistName,artistFollowers,genres
54869,5pKCCKE2ajJHZ9KAiaK11H,Rihanna,28963934,"['dance pop', 'pop', 'post-teen pop', 'r&b', '..."
65361,1Cs0zKBU1kc0i8ypK3B9ai,David Guetta,17164993,"['dance pop', 'edm', 'pop']"
65760,4gzpq5DPGxSnKTe4SA8HAU,Coldplay,16941734,"['permanent wave', 'pop']"
63155,6vWDO969PvNqNYHIOW5v0m,Beyoncé,16925994,"['dance pop', 'pop', 'post-teen pop', 'r&b']"
61162,0EmeFodog0BfCgMzAIvKQp,Shakira,14852100,"['colombian pop', 'dance pop', 'latin', 'latin..."
63947,6XyY86QOPPrYVGvF9ch6wz,Linkin Park,11124042,"['alternative metal', 'nu metal', 'post-grunge..."
203,4VMYDCV2IEDYJArk749S6m,Daddy Yankee,9866720,"['latin', 'latin hip hop', 'pop', 'reggaeton',..."
39018,5WUlDfRSoLAfcVSX1WnrxN,Sia,9782699,"['australian dance', 'australian pop', 'dance ..."
55627,0L8ExT028jH3ddEcZwqJJ5,Red Hot Chili Peppers,9330591,"['alternative rock', 'funk metal', 'permanent ..."
56981,1SupJlEpv7RS2tPNRaHViT,Nicky Jam,8455777,"['latin', 'latin hip hop', 'pop', 'reggaeton',..."


In [31]:
#####################################################################
##### All genres that has appeared in the 'genres_df' dataframe #####
#####################################################################

# Saving genres set
genres_series = pd.Series(list(genres_set), name='genres')
genres_series.to_csv('{}{}{}genres_list.csv'.format(data_folder, data_main, data_meta),
                     encoding='utf-8', header=True, index=False)

In [32]:
genres_series.head()

0        viking metal
1           candy pop
2           deep funk
3    classic arab pop
4              jungle
Name: genres, dtype: object

# 2. Getting meta data

## 2.1. Loading dataset with albumID and dropping duplicates

In [3]:
data_folder = '../../data/'
data_main = 'MSD/main/'
data_date = 'MSD/main/dates_data/'
data_meta = 'MSD/main/meta/'

In [7]:
# Loading 'tracks' dataset
tracks = pd.read_csv(data_folder+data_main+'songs_features_final.csv', 
                        index_col=0, encoding='utf-8')

# Dropping duplicates in 'albumID' column
tracks = tracks[['albumID', 'artistID']].drop_duplicates(subset=['albumID'])\
                                        .dropna(subset=['albumID']).reset_index(drop=True)

print(tracks.shape)
tracks.head()

(30435, 2)


Unnamed: 0,albumID,artistID
0,3jq7b66l8MswqDmi0mxzjq,3uOCouLFR4bVx0XeiQJSbl
1,0upTl2RUS4gmStWBlXjt9l,65Gh3BfK84aTIugiRCgLBA
2,7uvLu2RAki7aKiASSxJPzi,6rSpV5hdCNJ4v1i602nj22
3,4apgpVOan95H6xvIDAfX44,15Rn3v2LgUxR0donJY8e1E
4,0iHug1S696AsitJpyUr4T6,0X0TajZO1RmY3D8u1JymYq


In [62]:
# Loading 'genres' dataset
genres = pd.read_csv(data_folder+data_meta+'artistGenres_data.csv', 
                        index_col=0, encoding='utf-8', nrows=1000)

print(genres.shape)
genres.head()

(1000, 4)


Unnamed: 0,artistID,artistName,artistFollowers,genres
0,3uOCouLFR4bVx0XeiQJSbl,Harold Budd,30414,"['ambient', 'american contemporary classical',..."
1,65Gh3BfK84aTIugiRCgLBA,Dave Edmunds,28604,"['blues-rock', 'british blues', 'power pop', '..."
2,6rSpV5hdCNJ4v1i602nj22,LANDA DANIEL,794,[]
3,15Rn3v2LgUxR0donJY8e1E,I Love You But I've Chosen Darkness,6603,['austindie']
4,0X0TajZO1RmY3D8u1JymYq,Au Revoir Simone,61008,"['brooklyn indie', 'indie pop']"


## 2.2. Getting Dates

In [8]:
def dateSpotify(features_df):
    '''
    Gets genre, artist's followers for each track id.
    
        Return:
            Data frame containing genre and artist's followers information
        Parameters:
            - features_df: dataframe containing ('trackId','artistID') pairs
    '''
    global sp
    global step_size
    
    # Making a copy of 'features_df'
    features_df = features_df.reset_index(drop=True).copy()
    
    # Initializing the genres dataframe lists
    albumID = []
    date_r = []
    
    # Initializing tqdm
    total = features_df.shape[0]
    pbar = tqdm.tqdm(total=total)
    
    # pattern for regex
    pattern = r'(\d{3}[^-])'
    
    while not features_df.empty:
        try:
            album_info = sp.albums(albums=features_df.loc[:step_size,'albumID'])
        except:
            print('Error in getting Artists info!!   Countdown has started...')
            time.sleep(15)
            # Getting the tocken for authentication
            token = util.prompt_for_user_token(username, scope, client_id=client_id , 
                                               client_secret=client_secret, redirect_uri='http://google.com/')

            try: # Test whether the authentication credentials are valid, if yes create Spotipy object 
                sp = spotipy.Spotify(auth=token)
            except:
                print('There is problem with Token!')
            try:
                album_info = sp.albums(albums=features_df.loc[:step_size,'albumID'])
            except:
                print('There is problem with on of these ArtistIDs: \n{}\n**********************\n'
                      .format(features_df.loc[:step_size,'artistID']))
                album_info = {'albums': [None]}
        
        # Getting list of artists info
        album_info = album_info['albums']
        
        for i, item in enumerate(album_info):
            if item:
                if item['release_date']:
                    # Adding to lists
                    albumID.append(item['id'])
                    date_r.append(re.match(pattern, item['release_date'])[1])
            else:
                print('Problem in item: \n{}'.format(json.dumps(item, indent=4)))
        # Updating
        pbar.update(min(step_size, features_df.shape[0]))
        features_df = features_df.iloc[step_size:,:].reset_index(drop=True).copy()
    
    # Closing tqdm
    pbar.close()
        
    # Defining the 'genres_df'
    dates_df = pd.DataFrame(dict(albumID=albumID,
                                  releaseDate=date_r))

    return dates_df

In [9]:
###########################################
######  Initializing Spotify Object  ######
###########################################

# Defining Authentication credentials
scope = 'user-library-read' # You can change the scope based on your usage
username = "" # Put the user's username
client_id = '' #Provide the client id from your own Spotify developer 
client_secret = '' #Provide the client secret from your own Spotify developer

# Getting the authentication tocken
token = util.prompt_for_user_token(username, scope, client_id=client_id,
                                   client_secret=client_secret, redirect_uri='http://google.com/')
# Initializing Spotify Object
sp = spotipy.Spotify(auth=token)

# Step size: maximum is 20 for albums request
step_size = 19

# Getting Audio features
dates_df = dateSpotify(tracks)

# Saving audio features dataframe
dates_df.to_csv(data_folder+data_meta+'album_dates.csv', encoding='utf-8', index=True)

dates_df.head()

 12%|█▏        | 3800/30435 [00:17<01:57, 225.75it/s]

retrying ...1secs
retrying ...1secs


 19%|█▊        | 5700/30435 [00:28<01:56, 211.79it/s]

retrying ...1secs


 25%|██▍       | 7600/30435 [00:38<01:42, 222.61it/s]

retrying ...1secs


 31%|███       | 9500/30435 [00:47<01:31, 228.59it/s]

retrying ...1secs


 52%|█████▏    | 15846/30435 [01:16<01:02, 235.16it/s]

retrying ...1secs
retrying ...1secs


 58%|█████▊    | 17746/30435 [01:27<00:56, 224.21it/s]

retrying ...1secs
retrying ...1secs


 64%|██████▍   | 19627/30435 [01:37<00:49, 220.04it/s]

retrying ...1secs


 78%|███████▊  | 23693/30435 [01:56<00:28, 238.87it/s]

retrying ...1secs
retrying ...1secs


 92%|█████████▏| 27911/30435 [02:16<00:10, 234.50it/s]

retrying ...1secs
retrying ...1secs


 98%|█████████▊| 29830/30435 [02:26<00:02, 250.86it/s]

retrying ...1secs
retrying ...1secs


100%|██████████| 30435/30435 [02:31<00:00, 240.00it/s]


Unnamed: 0,albumID,releaseDate
0,3jq7b66l8MswqDmi0mxzjq,2000
1,0upTl2RUS4gmStWBlXjt9l,1981
2,7uvLu2RAki7aKiASSxJPzi,2003
3,4apgpVOan95H6xvIDAfX44,2003
4,0iHug1S696AsitJpyUr4T6,2009
