In [4]:
#################
## PREPARATION ##
#################

# Import modules
import json
import sys
import time

import pandas as pd
from tqdm import tqdm

# If your authentification script is not in the project directory
# append its folder to sys.path
# sys.path.append("../spotify_api_web_app")
import authorization

In [5]:
# Create the authorized API client; it is referred to as "sp" from here on
sp = authorization.authorize()

# Genre seeds to crawl, as returned by the client
genres = sp.recommendation_genre_seeds()

# How many recommended tracks to request for each genre
n_recs = 25

# Column-oriented store for the crawl: every column starts as its own
# empty list and receives one value per crawled track.
data_dict = {
    column: []
    for column in (
        "id", "genre", "track_name", "artist_name", "popularity",
        "valence", "energy", "danceability", "acousticness",
        "tempo", "speechiness", "mode", "instrumentalness",
    )
}


In [6]:
################
## CRAWL DATA ##
################

# Get recs for every genre
for g in tqdm(genres):

    # Request n_recs recommendations seeded with this single genre
    recs = sp.recommendations(genres=[g], limit=n_recs)
    # Parse the response text with a real JSON parser. The previous
    # eval() + str.replace() approach executed remote text as Python code
    # and also rewrote "null"/"true"/"false" wherever they occurred inside
    # string values (e.g. track names). JSON null now becomes None instead
    # of -999; only the "id" field is read below.
    recs = json.loads(recs.json())["tracks"]

    # Crawl data from each track
    for track in recs:
        track_id = track["id"]

        # Do both API calls before touching data_dict, so a failing call
        # cannot leave the columns with different lengths (which would
        # make pd.DataFrame(data_dict) fail later).
        track_meta = sp.track(track_id)
        track_features = sp.track_audio_features(track_id)

        row = {
            # ID and genre seed that produced this recommendation
            "id": track_id,
            "genre": g,
            # Metadata
            "track_name": track_meta.name,
            # NOTE(review): this reads the *album's* first artist, which can
            # differ from the track's artist (compilations) - confirm intent.
            "artist_name": track_meta.album.artists[0].name,
            "popularity": track_meta.popularity,
            # Audio features
            "valence": track_features.valence,
            "energy": track_features.energy,
            "danceability": track_features.danceability,
            "acousticness": track_features.acousticness,
            "tempo": track_features.tempo,
            "speechiness": track_features.speechiness,
            "mode": track_features.mode,
            "instrumentalness": track_features.instrumentalness,
        }

        # Append the complete row in one go
        for column, value in row.items():
            data_dict[column].append(value)

        # Pause 0.2 seconds per track to throttle requests to the API
        time.sleep(0.2)

100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [26:21<00:00, 12.55s/it]


In [7]:
##################
## PROCESS DATA ##
##################

# Turn the crawled columns into a table and keep only the first row
# encountered for each track id (later repeats of the same id are dropped).
df = pd.DataFrame(data_dict).drop_duplicates(subset="id", keep="first")

# Display the resulting table
df

Unnamed: 0,id,genre,track_name,artist_name,popularity,valence,energy,danceability,acousticness,tempo,speechiness,mode,instrumentalness
0,7GILkDXz9y9x5KPHBKdrHA,acoustic,Spring för livet,Sara Varga,0,0.5980,0.572,0.786,0.2930,119.943,0.0590,1,0.000000
1,2xwsf9FuFINP1X4FTsqZ7Q,acoustic,Peace,O.A.R.,62,0.4930,0.721,0.449,0.1660,178.141,0.0392,1,0.000000
2,4VJgrWjrkodaGiq3xKz62z,acoustic,Sometimes (I Wish),City and Colour,40,0.1350,0.467,0.523,0.3110,129.953,0.0281,0,0.000020
3,46XMysg4VurmyAQ28tshqz,acoustic,The Mess I Made,Parachute,47,0.1730,0.591,0.405,0.0153,165.377,0.0339,1,0.000000
4,25J4d78ESH2MgAJoIakB8O,acoustic,Lost In The Light,Bahamas,50,0.2460,0.267,0.597,0.7930,75.535,0.0309,1,0.001400
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,3CCGsOj2efotChiScMmCWg,world-music,A Minha Menina,Os Mutantes,0,0.4290,0.930,0.455,0.0568,91.410,0.1260,1,0.000000
3146,6bYpEZmbTldBmQtnnfDRDk,world-music,Raoui,Various Artists,0,0.6250,0.058,0.560,0.9260,107.658,0.0500,0,0.000003
3147,1xJtdt697VaDWFJ1eEwlPO,world-music,Cat Bed Music,Pet Music World,5,0.0394,0.033,0.304,0.9920,71.282,0.0655,1,0.963000
3148,1Nov1m4HA8PBs2vyWjp2Cv,world-music,Mas Que Nada,Jorge Ben Jor,0,0.5980,0.403,0.514,0.6720,89.672,0.0325,0,0.000366


In [8]:
# Write the de-duplicated table to disk; the index is not written as a column
df.to_csv(path_or_buf="music_characteristics_dataset3.csv", index=False)