# IACS Capstone Project Team Spotify 1: Playlist Prediction
### Spotify API dataset creation

In [2]:
import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import sys
from sklearn.metrics import mean_squared_error
from math import sqrt

### Authentication

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id='df846cfd28e745178054587b3484f91c',                                                client_secret='e3d39fc92a954e028ff1490288f3fe5c')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Playlist DataFrame Generation
The below code generates a DataFrame with all the playlists that were generated by the user ID "spotify" - in other words, those created by the company for wide distribution and recommendation. Alongside each playlist ID, we have extracted the mean popularity of the playlist's tracks, the number of followers of the playlist, as well as the playlist's name.

In [4]:
#sp.user_playlists('spotify', limit=50, offset=0)
#playlists

In [5]:
#sp.audio_features(tracks=item['track']['id'])

In [6]:
import time

class Timer(object):
    def __init__(self, verbose=True):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs
        if self.verbose:
            print('elapsed time: %f ms %f projected finish' % (self.msecs, self.msecs*1726/(4*1000*3600)))

In [7]:
with Timer() as t:
    for i in range(1000):
        a = i*34

elapsed time: 0.186920 ms 0.000022 projected finish


In [14]:
def getPlaylistData(offset, numLoops, queue, limit=50):
    names = []
    total_tracks = []
    followers = []
    popularity_means = []
    mean_num_markets = []
    added_date = []
    audio_attributes = []
    #audio_attributes = np.empty((numLoops*limit,12), dtype=np.float64)
    playlist_ids = []

    for loopIndex in range(numLoops):
        playlists = sp.user_playlists('spotify', limit=limit, offset=(offset + limit*loopIndex))
        for i, playlist in enumerate(playlists['items']):
            try:
                with Timer() as t:
                    metadata = sp.user_playlist('spotify', playlist_id=playlist['id'],
                        fields='followers.total,tracks.items(added_at,track.available_markets, track.popularity, track.name, track.id)')
                    popularities = np.empty(len(metadata['tracks']['items']))
                    added_at = []
                    num_markets = np.empty(len(metadata['tracks']['items']))
                    aud_features = np.empty((len(metadata['tracks']['items']), 12))
                    all_ids = []
                    for index, item in enumerate(metadata['tracks']['items']):
                        all_ids.append(item['track']['id'])
                        popularities[index] = item['track']['popularity']
                        added_at.append(item['added_at'])
                        num_markets[index] = len(item['track']['available_markets'])
                        #print("%s" % (item['track']['name']))
                    features = sp.audio_features(tracks=all_ids)
                    for index, songData in enumerate(features):
                        aud_features[index][0] = songData['acousticness']
                        aud_features[index][1] = songData['danceability']
                        aud_features[index][2] = songData['energy']
                        aud_features[index][3] = songData['instrumentalness']
                        aud_features[index][4] = songData['key']
                        aud_features[index][5] = songData['liveness']
                        aud_features[index][6] = songData['loudness']
                        aud_features[index][7] = songData['mode']
                        aud_features[index][8] = songData['tempo']
                        aud_features[index][9] = songData['time_signature']
                        aud_features[index][10] = songData['valence']
                        aud_features[index][11] = songData['duration_ms']
                    audio_attributes.append(aud_features.mean(axis=0))
                    added_date.append(added_at[0])
                    mean_num_markets.append(num_markets.mean())
                    popularity_means.append(popularities.mean())
                    playlist_ids.append(playlist['id'])
                    followers.append(metadata['followers']['total'])
                    names.append(playlist['name'])
                    total_tracks.append(playlist['tracks']['total'])
                    print("%4d %s" % (i + 1 + playlists['offset'], playlist['name']))
            except:
            #    a=0
                print("NO METADATA")
        if playlists['next']==None:
            break
    audio_attributes = np.array(audio_attributes)
    playlist_data = pd.DataFrame({
    'names': names,
    'playlist_id': playlist_ids,
    'total_tracks': total_tracks,
    'followers': followers,
    'mean_popularity': popularity_means,
    'added_date': added_date, 
    'num_markets': mean_num_markets,
    'acousticness': audio_attributes[:,0],
    'danceability': audio_attributes[:,1],
    'energy': audio_attributes[:,2],
    'instrumentalness': audio_attributes[:,3],
    'key': audio_attributes[:,4],
    'liveness': audio_attributes[:,5],
    'loudness': audio_attributes[:,6],
    'mode': audio_attributes[:,7],
    'tempo': audio_attributes[:,8],
    'time_signature': audio_attributes[:,9],
    'valence': audio_attributes[:,10],
    'duration': audio_attributes[:,11]
    })
    q.put(playlist_data)
    name = multiprocessing.current_process().name
    print(name, 'Exiting')

In [19]:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    q = Queue()
    for offset in [0,400,800,1200,1600]:
        Process(target=getPlaylistData, args=(offset,8,q)).start()
    #p.start()
    #print(q.get())    # prints "[42, None, 'hello']"
    #p.join()
    

   1 Today's Top Hits
1201 Good Vibes
elapsed time: 1290.265083 ms 0.154653 projected finish
elapsed time: 1365.007877 ms 0.163611 projected finish
1601 Pure Rock & Roll
elapsed time: 1364.413977 ms 0.163540 projected finish
 801 Smash The Clock
 401 Trad Folk – Jigs & Reels
elapsed time: 1419.708014 ms 0.170168 projected finish
elapsed time: 1424.412966 ms 0.170732 projected finish
   2 Rap Caviar
elapsed time: 1180.284977 ms 0.141470 projected finish
1602 Running to Rock 170-190 BPM
elapsed time: 1157.624960 ms 0.138754 projected finish
1202 Throwback Workout
elapsed time: 1331.318140 ms 0.159573 projected finish
 802 Pop Punk Alarm Clock
elapsed time: 1296.093941 ms 0.155351 projected finish
 402 My Folksy Love
elapsed time: 1309.532881 ms 0.156962 projected finish
1603 Six string peacefulness
elapsed time: 1203.233004 ms 0.144221 projected finish
   3 electroNOW
elapsed time: 1318.842888 ms 0.158078 projected finish
 803 Crossfit and Rock
elapsed time: 1166.653872 ms 0.139836 proje

In [20]:
all_dfs = []
for _ in range(5):
    all_dfs.append(q.get())

In [21]:
playlist_data = all_dfs[0]
for each in all_dfs[1:]:
    playlist_data = playlist_data.append(each,ignore_index=True)

In [22]:
playlist_data

Unnamed: 0,acousticness,added_date,danceability,duration,energy,followers,instrumentalness,key,liveness,loudness,mean_popularity,mode,names,num_markets,playlist_id,tempo,time_signature,total_tracks,valence
0,0.053329,2016-12-09T14:58:03Z,0.455600,255623.166667,0.836450,425688,0.040480,5.483333,0.205875,-6.542017,49.733333,0.683333,Pure Rock & Roll,46.350000,2YbfuxUyuOhtb5z7J1H3ak,128.363750,3.950000,60,0.547017
1,0.032500,2016-12-12T08:37:01Z,0.471828,210527.862069,0.909431,193874,0.040732,4.965517,0.194610,-4.557552,38.982759,0.948276,Running to Rock 170-190 BPM,31.534483,1FDRmXIvHqQnTCqN1gK35k,151.842948,4.000000,58,0.608293
2,0.893902,2017-02-16T06:07:09Z,0.607927,187029.439024,0.162644,14474,0.821707,5.560976,0.123239,-17.061927,44.731707,0.731707,Six string peacefulness,59.658537,6zNDAR7uYeIKV7JwDjjJTk,115.061439,3.804878,41,0.237963
3,0.593164,2017-02-09T13:45:42Z,0.547466,247844.551724,0.323248,204511,0.370789,5.741379,0.117093,-12.642672,41.810345,0.689655,Smooth Morning,59.155172,3i3jnTWkJKqDTJHkUSZsRl,118.284414,3.810345,58,0.228612
4,0.181797,2016-04-08T11:46:08Z,0.681925,253717.725000,0.691755,156,0.014551,6.175000,0.160555,-9.973500,27.000000,0.650000,80s Sophisti Pop,42.250000,5I4dHisbOJVlOTSamhJ6Td,115.170550,3.925000,40,0.765180
5,0.818533,2017-02-28T06:33:13Z,0.419869,237438.137500,0.143100,465068,0.882675,4.525000,0.145010,-20.926600,48.337500,0.675000,Sleep Tight,59.987500,1YRQAGw7qVJCLxWFGDsS3l,110.628625,3.850000,80,0.123534
6,0.171186,2017-02-13T14:29:29Z,0.636157,230357.294118,0.746314,1619219,0.005300,5.568627,0.146749,-6.746490,62.901961,0.725490,Songs to Sing in the Shower,49.921569,4TNBeyX7awz89qwtTmh9D4,118.062706,3.980392,51,0.670667
7,0.832494,2017-01-27T13:16:13Z,0.408792,259872.822785,0.126122,197787,0.900937,4.582278,0.152210,-21.796013,47.303797,0.582278,SPA Treatment,56.962025,4TWssDPl7RXSOTRgvzmLcc,103.420342,3.746835,79,0.117995
8,0.458478,2017-01-27T14:49:11Z,0.560267,245517.633333,0.452133,200127,0.011025,4.800000,0.153470,-8.854617,54.366667,0.666667,Spooning,49.316667,3D5kEDz3Kb5rETQbKQpwPH,117.169650,3.933333,60,0.329400
9,0.416103,2017-02-18T21:29:11Z,0.575482,225908.614458,0.512193,179610,0.035546,5.361446,0.144810,-8.030434,55.325301,0.686747,Sunday Stroll,51.542169,0sMRTIZrf9NdwDboJUztZn,116.810060,3.939759,83,0.383825


In [24]:
playlist_data.to_csv("playlist_data_with_audio_attributes_2.28.17.csv")

In [3]:
#sp.user_playlist('spotify','5FJXhjdILmRA2z5bvz4nzf')['tracks']['items'][0]

In [79]:
playlist_sample = sp.user_playlist('spotify','5FJXhjdILmRA2z5bvz4nzf')['tracks']['items']
list_of_tracks = []
for a in playlist_sample:
    list_of_tracks.append(a['track'])
sample = pd.DataFrame(list_of_tracks)[['id','name','external_ids','artists',
                                       'duration_ms','explicit','track_number','popularity']]
features = sp.audio_features(tracks=sample['id'])
features_df = pd.DataFrame(features)
sample['acousticness'] = features_df['acousticness']
sample['danceability'] = features_df['danceability']
sample['energy'] = features_df['energy']
sample['instrumentalness'] = features_df['instrumentalness']
sample['key'] = features_df['key']
sample['liveness'] = features_df['liveness']
sample['loudness'] = features_df['loudness']
sample['mode'] = features_df['speechiness']
sample['tempo'] = features_df['tempo']
sample['time_signature'] = features_df['time_signature']
sample['valence'] = features_df['valence']
sample['sequence'] = sample.index + 1
sample.head()

Unnamed: 0,id,name,external_ids,artists,duration_ms,explicit,track_number,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,tempo,time_signature,valence,sequence
0,1dNIEtp7AY3oDAKCGg2XkH,Something Just Like This,{u'isrc': u'USQX91700278'},"[{u'name': u'The Chainsmokers', u'external_url...",247626,False,1,0,0.0306,0.607,0.649,2.5e-05,11,0.174,-6.695,0.0362,102.996,4,0.47,1
1,12GEpg2XOPyqk03JZEZnJs,It Ain’t Me (with Selena Gomez),{u'isrc': u'SEBGA1700015'},"[{u'name': u'Kygo', u'external_urls': {u'spoti...",220780,False,1,76,0.0905,0.648,0.532,0.0,0,0.0831,-6.597,0.0746,99.983,4,0.497,2
2,6AeQlMyRzvSl1nkFztZyKl,Issues,{u'isrc': u'USUM71615691'},"[{u'name': u'Julia Michaels', u'external_urls'...",176346,False,1,82,0.416,0.704,0.423,0.0,8,0.0607,-6.792,0.0862,113.962,4,0.45,3
3,0FE9t6xYkqWXU2ahLh6D8X,Shape of You,{u'isrc': u'GBAHS1600463'},"[{u'name': u'Ed Sheeran', u'external_urls': {u...",233712,False,1,100,0.581,0.825,0.652,0.0,1,0.0931,-3.183,0.0802,95.977,4,0.933,4
4,3ebXMykcMXOcLeJ9xZ17XH,Scared To Be Lonely,{u'isrc': u'NLM5S1600025'},"[{u'name': u'Martin Garrix', u'external_urls':...",220883,False,1,91,0.0895,0.584,0.54,0.0,1,0.261,-7.786,0.0576,137.972,4,0.19,5
