In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import json
import multiprocessing
from collections import defaultdict
import concurrent.futures

In [None]:
# code used on kaggle to build the set of all unique songs in the dataset
# each song is tracked as a (song name, uri, artist name) tuple
# directory that is scanned for the 'mpd*.json' playlist files
path = '/kaggle/input/spotify-millions-playlist/spotify/data/'
# pickle file the combined song set is written to
output_file = '/kaggle/working/song_set.pickle'

def process_file(file):
    """Collect the unique songs referenced by one playlist file.

    Only files whose name starts with 'mpd' and ends with '.json' are read;
    they are looked up inside the module-level ``path`` directory and must
    contain a top-level 'playlists' list whose entries each have a 'tracks'
    list.

    Args:
        file: bare file name (not a full path) inside ``path``.

    Returns:
        A set of ``(track_name, track_uri, artist_name)`` tuples. Files that
        do not match the naming pattern yield an empty set.

    Raises:
        KeyError: if the JSON lacks 'playlists', 'tracks' or one of the
            track fields read below.
    """
    if not file.startswith('mpd') or not file.endswith('.json'):
        # Return an empty *set* (not a list): the caller merges the results
        # with set.union, which rejects a list as its first argument.
        return set()
    with open(os.path.join(path, file), 'rb') as f:
        playlists = json.load(f)['playlists']
    # the set comprehension de-duplicates songs shared by several playlists
    return {
        (track['track_name'], track['track_uri'], track['artist_name'])
        for playlist in playlists
        for track in playlist['tracks']
    }

# utilize multiprocessing to speed up the process: one task per directory entry
if __name__ == '__main__':
    # the context manager tears the worker pool down even if a worker raises
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        song_sets = pool.map(process_file, os.listdir(path))
    # Start from an empty set: set.union(*song_sets) raises TypeError when the
    # directory is empty or when the first result is the (empty) list returned
    # for a skipped file; set().union(...) accepts any iterables.
    song_set = set().union(*song_sets)
    # persist the combined set of (song name, uri, artist name) tuples
    with open(output_file, 'wb') as f:
        pickle.dump(song_set, f)


In [None]:
# load the pickled set of unique songs in the dataset (used kaggle to execute)
with open('/kaggle/input/songinfo/song_set.pickle', 'rb') as handle:
    song_set = pickle.load(handle)

# Walk the set once and build all three structures together:
#   song_uri_to_name - maps a song uri to its name and artist
#   song_uris        - target list of song uris
#   song_names       - target list of song names, in the same order as song_uris
song_uri_to_name = {}
song_uris = []
song_names = []
for song in song_set:
    # each entry is indexed as (name, uri, artist)
    song_uri_to_name[song[1]] = {'name': song[0], 'artist': song[2]}
    song_uris.append(song[1])
    song_names.append(song[0])

# Spotify API credentials must be supplied through the environment (for example
# a secrets store or a shell export) and never be written into the notebook.
# The id/secret pair that used to be hardcoded here has been exposed and should
# be rotated.
_missing_credentials = [
    name for name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET')
    if not os.environ.get(name)
]
if _missing_credentials:
    # fail fast with an actionable message instead of an opaque auth error later
    raise RuntimeError(
        'Missing Spotify credentials; set the environment variable(s): '
        + ', '.join(_missing_credentials)
    )

# initialize spotipy; called without arguments, the credentials manager takes
# the client id and secret from the environment variables checked above
client_credentials_manager = SpotifyClientCredentials()
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# accumulator filled by get_audio_features: track uri -> dict of audio features
audio_features = defaultdict(dict)

# extract audio features for each list of uris
def get_audio_features(uris):
    """Fetch audio features for ``uris`` and record them in ``audio_features``.

    Calls ``spotify.audio_features`` once for the whole list and stores one
    dict of selected features per returned entry in the module-level
    ``audio_features`` mapping, keyed by the entry's 'uri'. Falsy entries in
    the response are skipped. Nothing is returned.
    """
    # response fields copied under their own name, in output order
    passthrough_keys = (
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
    )
    for track_features in spotify.audio_features(uris):
        if track_features:
            # NOTE(review): the output key 'accousticness' is misspelled, but
            # the saved pickle and the DataFrame columns already use this
            # spelling, so it is kept as-is.
            selected = {'accousticness': track_features['acousticness']}
            for key in passthrough_keys:
                selected[key] = track_features[key]
            audio_features[track_features['uri']] = selected

# split uris into groups of 100; each group becomes one audio-features request
# (presumably 100 is the per-request id limit of the endpoint - confirm)
URIS_PER_REQUEST = 100
uri_groups = [
    song_uris[i:i + URIS_PER_REQUEST]
    for i in range(0, len(song_uris), URIS_PER_REQUEST)
]

# use ThreadPoolExecutor to parallelize requests
# An exception raised in a worker only surfaces when the result is retrieved,
# so every future is checked here; otherwise a failed request would silently
# drop a whole group of songs from the output.
failed_groups = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_group = {
        executor.submit(get_audio_features, group): group
        for group in uri_groups
    }
    for future in concurrent.futures.as_completed(future_to_group):
        try:
            future.result()
        except Exception as exc:
            # keep going so the groups that did succeed are still saved
            failed_groups.append(future_to_group[future])
            print(f'audio feature request failed for a group of '
                  f'{len(future_to_group[future])} uris: {exc!r}')

if failed_groups:
    print(f'WARNING: {len(failed_groups)} of {len(uri_groups)} uri groups failed; '
          f'their songs are missing from audio_features (see failed_groups)')

# convert defaultdict to dict
audio_features = dict(audio_features)

# save audio features to pickle file
with open('audio_features.pickle', 'wb') as handle:
    pickle.dump(audio_features, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [3]:
# open the generated audio features dataset
# (the pickle holds a dict mapping each track uri to a dict of feature values)
with open ('audio_features.pickle', 'rb') as handle:
    audio_features = pickle.load(handle)

# orient='index' turns each track uri into a row and each feature into a column
audio_features_df = pd.DataFrame.from_dict(audio_features, orient='index')

# preview the first rows
audio_features_df.head()

Unnamed: 0,accousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
spotify:track:2fnKjWfQy65KOyr7d4tKPd,0.152,0.633,0.984,0.0,0.387,-4.81,0.248,163.783,0.358
spotify:track:4gmhhvoNX9qoxwHloDbmfC,0.902,0.427,0.0939,0.0,0.105,-16.886,0.0989,74.004,0.444
spotify:track:48ngiwoeIG9dTDp6pc4HZV,0.462,0.208,0.334,0.0325,0.141,-12.387,0.0266,74.703,0.214
spotify:track:50yXsheD1xQJzKWksVhlag,0.927,0.234,0.17,0.0,0.0995,-13.923,0.0355,138.598,0.0816
spotify:track:1TA1aqzgONd2XW11DyZ2xJ,0.795,0.309,0.35,0.85,0.149,-14.351,0.0442,149.469,0.282


In [4]:
# total number of songs = number of rows in the audio features table
num_songs = audio_features_df.shape[0]
print(num_songs)

2261591


In [3]:
# load the pickled song/playlist mapping
# NOTE(review): the structure of this mapping is not visible here - presumably
# a dict keyed by song; confirm against the code that wrote the pickle
with open ('song_playlist_mapping.pickle', 'rb') as handle:
    song_playlist_mapping = pickle.load(handle)

# orient='index' makes each top-level key of the mapping a row of the DataFrame
song_playlist_mapping_df = pd.DataFrame.from_dict(song_playlist_mapping, orient='index')