# IACS Capstone Project Team Spotify 1: Playlist Prediction
### Spotify API dataset creation

In [124]:
import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import sys
from sklearn.metrics import mean_squared_error
from math import sqrt

### Authentication

In [125]:
client_credentials_manager = SpotifyClientCredentials(client_id='df846cfd28e745178054587b3484f91c',                                                client_secret='e3d39fc92a954e028ff1490288f3fe5c')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Playlist DataFrame Generation
The below code generates a DataFrame with all the playlists that were generated by the user ID "spotify" - in other words, those created by the company for wide distribution and recommendation. Alongside each playlist ID, we have extracted the mean popularity of the playlist's tracks, the number of followers of the playlist, as well as the playlist's name.

In [3]:
#sp.user_playlists('spotify', limit=50, offset=0)
#playlists

In [126]:
#sp.audio_features(tracks=item['track']['id'])
featured_names = []
featured_playlists = sp.featured_playlists(locale=None, country=None, timestamp=None, limit=50, offset=0)
for i, playlist in enumerate(featured_playlists['playlists']['items']):
    print(playlist['name'])
    featured_names.append(playlist['name'])

Are & Be
Power Workout
Digging Now
Electro Workout
RapCaviar
Chill Vibes
Deep Focus
Evening Acoustic
Today's Top Hits
Weekly Buzz
Peaceful Piano
Coffee Table Jazz


In [107]:
featured_names

[u'Viral Hits',
 u'Mellow Bars',
 u'Peaceful Piano',
 u"Today's Top Hits",
 u'Rock Hard',
 u'Latin Pop Hits',
 u'Mood Booster',
 u'Hot Rhythmic',
 u'Mega Hit Mix',
 u'Hot Country',
 u'Evening Commute',
 u'Are & Be']

In [6]:
'White Noise' in featured_names

False

In [119]:
import time

class Timer(object):
    def __init__(self, verbose=True):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs
        if self.verbose:
            print('elapsed time: %f ms %f projected finish' % (self.msecs, self.msecs*1726/(4*1000*3600)))

In [8]:
with Timer() as t:
    for i in range(1000):
        a = i*34

elapsed time: 0.298977 ms 0.000036 projected finish


78


In [49]:
playlists = sp.user_playlists('spotify', limit=50, offset=(0 + 50*1))
for i, playlist in enumerate(playlists['items']):
    a = sp.user_playlist('spotify', playlist_id=playlist['id'],
                            fields='followers.total,tracks.items(added_at,track.available_markets, track.popularity, track.name, track.id)')
    names = []
    parts = chunks(a['tracks']['items'], 4)
    p1 = parts[0]
    p2 = parts[1]
    p3 = parts[2]
    p4 = parts[3]

    for item in p1:
        names.append(item['track']['name'])
    for item in p2:
        names.append(item['track']['name'])
    for item in p3:
        names.append(item['track']['name'])
    for item in p4:
        names.append(item['track']['name'])

### Sequencing
The following cells generate a dataset of playlist features, creating summary statistics for each quartile of songs in the dataset based on sequence. For example, summary statistics will be presented for songs in the first 25% of the playlist, then recalculated for the second 25%, and so on.

In [38]:
def chunks(seq, num):
    avg = len(seq) / float(num)
    out = []
    last = 0.0
    
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
        
    return out

In [163]:
def getPlaylistData(offset, numLoops, queue, limit=50):
    print('Starting...')
    names = []
    total_tracks = []
    followers = []
    
    q1_popularity_means = []
    q2_popularity_means = []
    q3_popularity_means = []
    q4_popularity_means = []
    
    q1_audio_attributes_median = []
    q2_audio_attributes_median = []
    q3_audio_attributes_median = []
    q4_audio_attributes_median = []
    
    q1_mean_num_markets = []
    q2_mean_num_markets = []
    q3_mean_num_markets = []
    q4_mean_num_markets = []

    featured = []
    #audio_attributes = np.empty((numLoops*limit,12), dtype=np.float64)
    playlist_ids = []

    for loopIndex in range(numLoops):
        playlists = sp.user_playlists('spotify', limit=limit, offset=(offset + limit*loopIndex))
        for i, playlist in enumerate(playlists['items']):
            try:
                with Timer() as t:

                    metadata = sp.user_playlist('spotify', playlist_id=playlist['id'],
                        fields='followers.total,tracks.items(added_at,track.available_markets, track.popularity, track.name, track.id)')

                    parts = chunks(metadata['tracks']['items'], 4)

                    # PART 1

                    p1 = parts[0]

                    q1_popularities = np.empty(len(p1))
                    q1_num_markets = np.empty(len(p1))
                    q1_ids = []

                    for ix, item in enumerate(p1):

                        q1_ids.append(item['track']['id'])
                        q1_popularities[ix] = item['track']['popularity']
                        q1_num_markets[ix] = len(item['track']['available_markets'])
                    
                    if q1_popularities.size == 0:
                        q1_popularities = np.array(0)
                    if q1_num_markets.size == 0:
                        q1_num_markets = np.array(0)
                    
                    q1_popularity_means.append(q1_popularities.mean())
                    q1_mean_num_markets.append(q1_num_markets.mean())

                    q1_features = sp.audio_features(tracks=q1_ids)

                    q1_aud_features = np.empty((len(p1), 12))
                    for index, songData in enumerate(q1_features):
                        q1_aud_features[index][0] = songData['acousticness']
                        q1_aud_features[index][1] = songData['danceability']
                        q1_aud_features[index][2] = songData['energy']
                        q1_aud_features[index][3] = songData['instrumentalness']
                        q1_aud_features[index][4] = songData['key']
                        q1_aud_features[index][5] = songData['liveness']
                        q1_aud_features[index][6] = songData['loudness']
                        q1_aud_features[index][7] = songData['mode']
                        q1_aud_features[index][8] = songData['tempo']
                        q1_aud_features[index][9] = songData['time_signature']
                        q1_aud_features[index][10] = songData['valence']
                        q1_aud_features[index][11] = songData['duration_ms']

                    q1_audio_attributes_median.append(np.percentile(q1_aud_features,50, axis=0))

                    # PART 2

                    p2 = parts[1]

                    q2_popularities = np.empty(len(p2))
                    q2_num_markets = np.empty(len(p2))
                    q2_ids = []

                    for ix, item in enumerate(p2):

                        q2_ids.append(item['track']['id'])
                        q2_popularities[ix] = item['track']['popularity']
                        q2_num_markets[ix] = len(item['track']['available_markets'])
                        
                    if q2_popularities.size == 0:
                        q2_popularities = np.array(0)
                    if q2_num_markets.size == 0:
                        q2_num_markets = np.array(0)
                    
                    q2_popularity_means.append(q2_popularities.mean())
                    q2_mean_num_markets.append(q2_num_markets.mean())

                    q2_features = sp.audio_features(tracks=q2_ids)

                    q2_aud_features = np.empty((len(p2), 12))
                    for index, songData in enumerate(q2_features):
                        q2_aud_features[index][0] = songData['acousticness']
                        q2_aud_features[index][1] = songData['danceability']
                        q2_aud_features[index][2] = songData['energy']
                        q2_aud_features[index][3] = songData['instrumentalness']
                        q2_aud_features[index][4] = songData['key']
                        q2_aud_features[index][5] = songData['liveness']
                        q2_aud_features[index][6] = songData['loudness']
                        q2_aud_features[index][7] = songData['mode']
                        q2_aud_features[index][8] = songData['tempo']
                        q2_aud_features[index][9] = songData['time_signature']
                        q2_aud_features[index][10] = songData['valence']
                        q2_aud_features[index][11] = songData['duration_ms']

                    q2_audio_attributes_median.append(np.percentile(q2_aud_features,50, axis=0))

                    # PART 3

                    p3 = parts[2]

                    q3_popularities = np.empty(len(p3))
                    q3_num_markets = np.empty(len(p3))
                    q3_ids = []

                    for ix, item in enumerate(p3):

                        q3_ids.append(item['track']['id'])
                        q3_popularities[ix] = item['track']['popularity']
                        q3_num_markets[ix] = len(item['track']['available_markets'])
                    
                    if q3_popularities.size == 0:
                        q3_popularities = np.array(0)
                    if q3_num_markets.size == 0:
                        q3_num_markets = np.array(0)
                    
                    q3_popularity_means.append(q3_popularities.mean())
                    q3_mean_num_markets.append(q3_num_markets.mean())

                    q3_features = sp.audio_features(tracks=q3_ids)

                    q3_aud_features = np.empty((len(p3), 12))
                    for index, songData in enumerate(q3_features):
                        q3_aud_features[index][0] = songData['acousticness']
                        q3_aud_features[index][1] = songData['danceability']
                        q3_aud_features[index][2] = songData['energy']
                        q3_aud_features[index][3] = songData['instrumentalness']
                        q3_aud_features[index][4] = songData['key']
                        q3_aud_features[index][5] = songData['liveness']
                        q3_aud_features[index][6] = songData['loudness']
                        q3_aud_features[index][7] = songData['mode']
                        q3_aud_features[index][8] = songData['tempo']
                        q3_aud_features[index][9] = songData['time_signature']
                        q3_aud_features[index][10] = songData['valence']
                        q3_aud_features[index][11] = songData['duration_ms']

                    q3_audio_attributes_median.append(np.percentile(q3_aud_features,50, axis=0))
                    
                    # PART 4

                    p4 = parts[3]

                    q4_popularities = np.empty(len(p4))
                    q4_num_markets = np.empty(len(p4))
                    q4_ids = []

                    for ix, item in enumerate(p4):

                        q4_ids.append(item['track']['id'])
                        q4_popularities[ix] = item['track']['popularity']
                        q4_num_markets[ix] = len(item['track']['available_markets'])
                    
                    if q4_popularities.size == 0:
                        q4_popularities = np.array(0)
                    if q4_num_markets.size == 0:
                        q4_num_markets = np.array(0)
                    
                    q4_popularity_means.append(q4_popularities.mean())
                    q4_mean_num_markets.append(q4_num_markets.mean())

                    q4_features = sp.audio_features(tracks=q4_ids)

                    q4_aud_features = np.empty((len(p4), 12))
                    for index, songData in enumerate(q4_features):
                        q4_aud_features[index][0] = songData['acousticness']
                        q4_aud_features[index][1] = songData['danceability']
                        q4_aud_features[index][2] = songData['energy']
                        q4_aud_features[index][3] = songData['instrumentalness']
                        q4_aud_features[index][4] = songData['key']
                        q4_aud_features[index][5] = songData['liveness']
                        q4_aud_features[index][6] = songData['loudness']
                        q4_aud_features[index][7] = songData['mode']
                        q4_aud_features[index][8] = songData['tempo']
                        q4_aud_features[index][9] = songData['time_signature']
                        q4_aud_features[index][10] = songData['valence']
                        q4_aud_features[index][11] = songData['duration_ms']

                    q4_audio_attributes_median.append(np.percentile(q4_aud_features,50, axis=0))

                    # SUMMARY

                    playlist_ids.append(playlist['id'])
                    followers.append(metadata['followers']['total'])
                    names.append(playlist['name'])
                    featured.append(playlist['name'] in featured_names)
                    total_tracks.append(playlist['tracks']['total'])

                    print("%4d %s" % (i + 1 + playlists['offset'], playlist['name']))
            except:
                print("API ERROR")

        if playlists['next']==None:
            break
    
    q1_audio_attributes_median = np.array(q1_audio_attributes_median)
    q2_audio_attributes_median = np.array(q2_audio_attributes_median)
    q3_audio_attributes_median = np.array(q3_audio_attributes_median)
    q4_audio_attributes_median = np.array(q4_audio_attributes_median)
    
    print len(q1_popularity_means)
    print len(q1_mean_num_markets)
    print len(q1_audio_attributes_median)
    print len(q2_audio_attributes_median)
    print len(q2_mean_num_markets)
    print len(q2_popularity_means)
    print len(q3_audio_attributes_median)
    print len(q3_mean_num_markets)
    print len(q3_popularity_means)
    print len(q4_popularity_means)
    print len(q4_mean_num_markets)
    print len(q4_audio_attributes_median)

    
    playlist_data = pd.DataFrame({
    'followers': followers,
    'names': names,
    'playlist_id': playlist_ids,
    'total_tracks': total_tracks,
    'featured': featured,
    #'added_date': added_date, 
            
    # Q1 
            
    'q1_num_markets': q1_mean_num_markets,
    'q1_mean_popularity': q1_popularity_means,
    'q1_acousticness': q1_audio_attributes_median[:,0],
    'q1_danceability': q1_audio_attributes_median[:,1],
    'q1_energy': q1_audio_attributes_median[:,2],
    'q1_instrumentalness': q1_audio_attributes_median[:,3],
    'q1_key': q1_audio_attributes_median[:,4],
    'q1_liveness': q1_audio_attributes_median[:,5],
    'q1_loudness': q1_audio_attributes_median[:,6],
    'q1_mode': q1_audio_attributes_median[:,7],
    'q1_tempo': q1_audio_attributes_median[:,8],
    'q1_time_signature': q1_audio_attributes_median[:,9],
    'q1_valence': q1_audio_attributes_median[:,10],
    'q1_duration': q1_audio_attributes_median[:,11],
            
    # Q2
            
    'q2_num_markets': q2_mean_num_markets,     
    'q2_mean_popularity': q2_popularity_means,
    'q2_acousticness': q2_audio_attributes_median[:,0],
    'q2_danceability': q2_audio_attributes_median[:,1],
    'q2_energy': q2_audio_attributes_median[:,2],
    'q2_instrumentalness': q2_audio_attributes_median[:,3],
    'q2_key': q2_audio_attributes_median[:,4],
    'q2_liveness': q2_audio_attributes_median[:,5],
    'q2_loudness': q2_audio_attributes_median[:,6],
    'q2_mode': q2_audio_attributes_median[:,7],
    'q2_tempo': q2_audio_attributes_median[:,8],
    'q2_time_signature': q2_audio_attributes_median[:,9],
    'q2_valence': q2_audio_attributes_median[:,10],
    'q2_duration': q2_audio_attributes_median[:,11],
         
    # Q3

    'q3_num_markets': q3_mean_num_markets,            
    'q3_mean_popularity': q3_popularity_means,
    'q3_acousticness': q3_audio_attributes_median[:,0],
    'q3_danceability': q3_audio_attributes_median[:,1],
    'q3_energy': q3_audio_attributes_median[:,2],
    'q3_instrumentalness': q3_audio_attributes_median[:,3],
    'q3_key': q3_audio_attributes_median[:,4],
    'q3_liveness': q3_audio_attributes_median[:,5],
    'q3_loudness': q3_audio_attributes_median[:,6],
    'q3_mode': q3_audio_attributes_median[:,7],
    'q3_tempo': q3_audio_attributes_median[:,8],
    'q3_time_signature': q3_audio_attributes_median[:,9],
    'q3_valence': q3_audio_attributes_median[:,10],
    'q3_duration': q3_audio_attributes_median[:,11],
            
    # Q4 

    'q4_num_markets': q4_mean_num_markets,
    'q4_mean_popularity': q4_popularity_means,
    'q4_acousticness': q4_audio_attributes_median[:,0],
    'q4_danceability': q4_audio_attributes_median[:,1],
    'q4_energy': q4_audio_attributes_median[:,2],
    'q4_instrumentalness': q4_audio_attributes_median[:,3],
    'q4_key': q4_audio_attributes_median[:,4],
    'q4_liveness': q4_audio_attributes_median[:,5],
    'q4_loudness': q4_audio_attributes_median[:,6],
    'q4_mode': q4_audio_attributes_median[:,7],
    'q4_tempo': q4_audio_attributes_median[:,8],
    'q4_time_signature': q4_audio_attributes_median[:,9],
    'q4_valence': q4_audio_attributes_median[:,10],
    'q4_duration': q4_audio_attributes_median[:,11],

    })
    
    q.put(playlist_data)
    name = multiprocessing.current_process().name
    print(name, 'Exiting')


In [160]:
b = np.array(0)
b.mean()

0.0

### Final DataFrame Head

In [145]:
a

Unnamed: 0,featured,followers,names,playlist_id,q1_acousticness,q1_danceability,q1_duration,q1_energy,q1_instrumentalness,q1_key,...,q4_key,q4_liveness_q1,q4_loudness,q4_mean_popularity,q4_mode,q4_num_markets,q4_tempo,q4_time_signature,q4_valence,total_tracks
0,True,14092309,Today's Top Hits,5FJXhjdILmRA2z5bvz4nzf,0.10625,0.698,225587.0,0.638,0.0,1.0,...,6.0,0.179,-4.757,83.615385,1.0,25.0,122.381,4.0,0.39,50
1,False,75189,Rap Caviar,5yolys8XG4q7YfjYGl5Lff,0.184,0.7405,238770.5,0.474,2.155e-05,7.5,...,7.0,0.15,-6.471,70.538462,1.0,46.769231,105.045,4.0,0.275,50
2,False,4014312,electroNOW,1GQLlzxBxKTb6tJsD4RxHI,0.0733,0.5855,187051.5,0.895,0.001434,6.0,...,5.0,0.125,-5.514,59.538462,0.0,50.923077,125.995,4.0,0.338,50
3,True,2907618,Are & Be,06CemleTteSalaVGVMbgFy,0.11625,0.7215,223984.5,0.5685,6.2e-07,3.5,...,5.0,0.299,-6.778,53.461538,0.0,19.384615,94.058,4.0,0.393,50
4,False,2566942,Rock This,0lbtgFu3JNKX77J5YOpW7n,0.010315,0.4745,212901.5,0.9,3.95e-05,6.0,...,5.0,0.117,-5.761,51.0,1.0,28.384615,100.043,4.0,0.352,50
5,False,3541661,Hot Country,4ecQaDJHF55Ls9m2lKIXbI,0.1685,0.551,213013.5,0.7965,0.0,3.0,...,7.0,0.145,-4.903,64.461538,1.0,33.230769,125.976,4.0,0.558,51
6,False,2458099,Afternoon Acoustic,16BpjqQV1Ey0HeDueNDSYz,0.882,0.554,216173.0,0.307,0.00424,3.0,...,5.0,0.114,-9.917,49.473684,1.0,32.684211,115.405,4.0,0.337,76
7,True,7623,Peaceful Piano,63dDpdoVHvx5RkK87g4LKk,0.992,0.361,165558.0,0.059,0.908,3.0,...,6.0,0.112,-24.262,61.04,0.0,53.0,99.8,4.0,0.157,150
8,False,7599,SXSW 2017: Daily Guide,1LYK2ahDnbLnS4fddCFY7z,0.177,0.588,205333.0,0.634,6.36e-05,5.0,...,5.5,0.1075,-6.1835,40.142857,1.0,37.142857,131.8685,4.0,0.265,55
9,False,2899,Lost In Austin: Country Music from SXSW,5JdIoXkjfuyWmDdDnznmq1,0.571,0.5355,224380.0,0.5755,8.255e-06,2.0,...,6.0,0.11,-8.943,28.380952,1.0,33.285714,121.728,4.0,0.47,82


In [161]:
q.get()

KeyboardInterrupt: 

In [164]:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    q = Queue()
    for offset in [0,400,800,1200,1600]:
    #for offset in [0]:

        Process(target=getPlaylistData, args=(offset,8,q)).start()
        
    #p.start()
    #print(q.get())    # prints "[42, None, 'hello']"
    #p.join()
    

Starting...
Starting...
Starting...
Starting...
Starting...
retrying ...4secs
retrying ...4secs
retrying ...4secs
retrying ...3secs
retrying ...3secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
1601 Svensk jazz
elapsed time: 5908.314943 ms 0.708177 projected finish
   1 Today's Top Hits
elapsed time: 6235.561132 ms 0.747401 projected finish
1201 Guest List: MC Lars
elapsed time: 6296.535015 ms 0.754710 projected finish
 401 Cocktails & Dreams
elapsed time: 6543.522120 ms 0.784314 projected finish
 801 Family Road Trip
elapsed time: 6585.234880 ms 0.789314 projected finish
1602 Keep Calm
elapsed time: 5536.901951 ms 0.663659 projected finish
1202 Guest List: Kill Screen
elapsed time: 5831.002951 ms 0.698910 projected finish
   2 Rap Caviar
elapsed time: 5957.973003 ms 0.714129 projected finish
 402 Femme Fatale
elapsed time: 6248.695850 ms 0.748976 projected finish
 802 

In [79]:
playlist_sample = sp.user_playlist('spotify','5FJXhjdILmRA2z5bvz4nzf')['tracks']['items']
list_of_tracks = []
for a in playlist_sample:
    list_of_tracks.append(a['track'])
sample = pd.DataFrame(list_of_tracks)[['id','name','external_ids','artists',
                                       'duration_ms','explicit','track_number','popularity']]
features = sp.audio_features(tracks=sample['id'])
features_df = pd.DataFrame(features)
sample['acousticness'] = features_df['acousticness']
sample['danceability'] = features_df['danceability']
sample['energy'] = features_df['energy']
sample['instrumentalness'] = features_df['instrumentalness']
sample['key'] = features_df['key']
sample['liveness'] = features_df['liveness']
sample['loudness'] = features_df['loudness']
sample['mode'] = features_df['speechiness']
sample['tempo'] = features_df['tempo']
sample['time_signature'] = features_df['time_signature']
sample['valence'] = features_df['valence']
sample['sequence'] = sample.index + 1
sample.head()

Unnamed: 0,id,name,external_ids,artists,duration_ms,explicit,track_number,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,tempo,time_signature,valence,sequence
0,1dNIEtp7AY3oDAKCGg2XkH,Something Just Like This,{u'isrc': u'USQX91700278'},"[{u'name': u'The Chainsmokers', u'external_url...",247626,False,1,0,0.0306,0.607,0.649,2.5e-05,11,0.174,-6.695,0.0362,102.996,4,0.47,1
1,12GEpg2XOPyqk03JZEZnJs,It Ain’t Me (with Selena Gomez),{u'isrc': u'SEBGA1700015'},"[{u'name': u'Kygo', u'external_urls': {u'spoti...",220780,False,1,76,0.0905,0.648,0.532,0.0,0,0.0831,-6.597,0.0746,99.983,4,0.497,2
2,6AeQlMyRzvSl1nkFztZyKl,Issues,{u'isrc': u'USUM71615691'},"[{u'name': u'Julia Michaels', u'external_urls'...",176346,False,1,82,0.416,0.704,0.423,0.0,8,0.0607,-6.792,0.0862,113.962,4,0.45,3
3,0FE9t6xYkqWXU2ahLh6D8X,Shape of You,{u'isrc': u'GBAHS1600463'},"[{u'name': u'Ed Sheeran', u'external_urls': {u...",233712,False,1,100,0.581,0.825,0.652,0.0,1,0.0931,-3.183,0.0802,95.977,4,0.933,4
4,3ebXMykcMXOcLeJ9xZ17XH,Scared To Be Lonely,{u'isrc': u'NLM5S1600025'},"[{u'name': u'Martin Garrix', u'external_urls':...",220883,False,1,91,0.0895,0.584,0.54,0.0,1,0.261,-7.786,0.0576,137.972,4,0.19,5
