# IACS Capstone Project Team Spotify 1: Playlist Prediction
### Spotify API dataset creation

### Goal: create dataset of Spotify-created playlists and API features for those playlists
### Updated to extract the mean, median, and 25th and 75th percentiles of the attributes within each playlist

In [1]:
import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import sys
from sklearn.metrics import mean_squared_error
from math import sqrt

### Authentication

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id='df846cfd28e745178054587b3484f91c',                                                client_secret='e3d39fc92a954e028ff1490288f3fe5c')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Playlist DataFrame Generation
The below code generates a DataFrame with all the playlists that were generated by the user ID "spotify" - in other words, those created by the company for wide distribution and recommendation. Alongside each playlist ID, we have extracted the mean popularity of the playlist's tracks, the number of followers of the playlist, as well as the playlist's name.

In [3]:
#sp.user_playlists('spotify', limit=50, offset=0)
#playlists

In [4]:
#sp.audio_features(tracks=item['track']['id'])
featured_names = []
featured_playlists = sp.featured_playlists(locale=None, country=None, timestamp=None, limit=50, offset=0)
for i, playlist in enumerate(featured_playlists['playlists']['items']):
    print(playlist['name'])
    featured_names.append(playlist['name'])

Evening Acoustic
electroNOW
Songs For Sleeping
Indie Chillout.
Piano in the Background
Late Night Love
Hit Rewind
9 Hour Sleep
Today's Top Hits
Rage Radio
Jazz For Sleep
Peaceful Piano
Night Rider
Sleep
4am Comedown


In [5]:
featured_names

['Evening Acoustic',
 'electroNOW',
 'Songs For Sleeping',
 'Indie Chillout.',
 'Piano in the Background',
 'Late Night Love',
 'Hit Rewind',
 '9 Hour Sleep',
 "Today's Top Hits",
 'Rage Radio',
 'Jazz For Sleep',
 'Peaceful Piano',
 'Night Rider',
 'Sleep',
 '4am Comedown']

In [6]:
'White Noise' in featured_names

False

In [7]:
import time

class Timer(object):
    def __init__(self, verbose=True):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs
        if self.verbose:
            print('elapsed time: %f ms %f projected finish' % (self.msecs, self.msecs*1726/(4*1000*3600)))

In [8]:
with Timer() as t:
    for i in range(1000):
        a = i*34

elapsed time: 0.298977 ms 0.000036 projected finish


In [9]:
def getPlaylistData(offset, numLoops, queue, limit=50):
    names = []
    total_tracks = []
    followers = []
    popularity_means = []
    mean_num_markets = []
    added_date = []
    audio_attributes_median = []
    audio_attributes_75 = []
    audio_attributes_25 = []
    featured = []
    #audio_attributes = np.empty((numLoops*limit,12), dtype=np.float64)
    playlist_ids = []

    for loopIndex in range(numLoops):
        playlists = sp.user_playlists('spotify', limit=limit, offset=(offset + limit*loopIndex))
        for i, playlist in enumerate(playlists['items']):
            try:
                with Timer() as t:
                    metadata = sp.user_playlist('spotify', playlist_id=playlist['id'],
                        fields='followers.total,tracks.items(added_at,track.available_markets, track.popularity, track.name, track.id)')
                    popularities = np.empty(len(metadata['tracks']['items']))
                    added_at = []
                    num_markets = np.empty(len(metadata['tracks']['items']))
                    aud_features = np.empty((len(metadata['tracks']['items']), 12))
                    all_ids = []
                    for index, item in enumerate(metadata['tracks']['items']):
                        all_ids.append(item['track']['id'])
                        popularities[index] = item['track']['popularity']
                        added_at.append(item['added_at'])
                        num_markets[index] = len(item['track']['available_markets'])
                        #print("%s" % (item['track']['name']))
                    features = sp.audio_features(tracks=all_ids)
                    for index, songData in enumerate(features):
                        aud_features[index][0] = songData['acousticness']
                        aud_features[index][1] = songData['danceability']
                        aud_features[index][2] = songData['energy']
                        aud_features[index][3] = songData['instrumentalness']
                        aud_features[index][4] = songData['key']
                        aud_features[index][5] = songData['liveness']
                        aud_features[index][6] = songData['loudness']
                        aud_features[index][7] = songData['mode']
                        aud_features[index][8] = songData['tempo']
                        aud_features[index][9] = songData['time_signature']
                        aud_features[index][10] = songData['valence']
                        aud_features[index][11] = songData['duration_ms']
                    audio_attributes_median.append(np.percentile(aud_features,50, axis=0))
                    audio_attributes_75.append(np.percentile(aud_features,75, axis=0))
                    audio_attributes_25.append(np.percentile(aud_features,25, axis=0))
                    added_date.append(added_at[0])
                    mean_num_markets.append(num_markets.mean())
                    popularity_means.append(popularities.mean())
                    playlist_ids.append(playlist['id'])
                    followers.append(metadata['followers']['total'])
                    names.append(playlist['name'])
                    featured.append(playlist['name'] in featured_names)
                    total_tracks.append(playlist['tracks']['total'])
                    print("%4d %s" % (i + 1 + playlists['offset'], playlist['name']))
            except:
            #    a=0
                print("NO METADATA")
        if playlists['next']==None:
            break
    audio_attributes_median = np.array(audio_attributes_median)
    audio_attributes_75 = np.array(audio_attributes_75)
    audio_attributes_25 = np.array(audio_attributes_25)
    playlist_data = pd.DataFrame({
    'followers': followers,
    'names': names,
    'playlist_id': playlist_ids,
    'total_tracks': total_tracks,
    'featured': featured,
    'mean_popularity': popularity_means,
    'added_date': added_date, 
    'num_markets': mean_num_markets,
    'acousticness': audio_attributes_median[:,0],
    'danceability': audio_attributes_median[:,1],
    'energy': audio_attributes_median[:,2],
    'instrumentalness': audio_attributes_median[:,3],
    'key': audio_attributes_median[:,4],
    'liveness': audio_attributes_median[:,5],
    'loudness': audio_attributes_median[:,6],
    'mode': audio_attributes_median[:,7],
    'tempo': audio_attributes_median[:,8],
    'time_signature': audio_attributes_median[:,9],
    'valence': audio_attributes_median[:,10],
    'duration': audio_attributes_median[:,11],
    'acousticness_75': audio_attributes_75[:,0],
    'danceability_75': audio_attributes_75[:,1],
    'energy_75': audio_attributes_75[:,2],
    'instrumentalness_75': audio_attributes_75[:,3],
    'key_75': audio_attributes_75[:,4],
    'liveness_75': audio_attributes_75[:,5],
    'loudness_75': audio_attributes_75[:,6],
    'mode_75': audio_attributes_75[:,7],
    'tempo_75': audio_attributes_75[:,8],
    'time_signature_75': audio_attributes_75[:,9],
    'valence_75': audio_attributes_75[:,10],
    'duration_75': audio_attributes_75[:,11],
    'acousticness_25': audio_attributes_25[:,0],
    'danceability_25': audio_attributes_25[:,1],
    'energy_25': audio_attributes_25[:,2],
    'instrumentalness_25': audio_attributes_25[:,3],
    'key_25': audio_attributes_25[:,4],
    'liveness_25': audio_attributes_25[:,5],
    'loudness_25': audio_attributes_25[:,6],
    'mode_25': audio_attributes_25[:,7],
    'tempo_25': audio_attributes_25[:,8],
    'time_signature_25': audio_attributes_25[:,9],
    'valence_25': audio_attributes_25[:,10],
    'duration_25': audio_attributes_25[:,11]
    })
    q.put(playlist_data)
    name = multiprocessing.current_process().name
    print(name, 'Exiting')

In [10]:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    q = Queue()
    for offset in [0,400,800,1200,1600]:
    #for offset in [0]:

        Process(target=getPlaylistData, args=(offset,8,q)).start()
    #p.start()
    #print(q.get())    # prints "[42, None, 'hello']"
    #p.join()
    

 801 Kids Workout
 401 More Banjo!
elapsed time: 473.773003 ms 0.056787 projected finish
elapsed time: 477.575064 ms 0.057243 projected finish
   1 Today's Top Hits
elapsed time: 486.849070 ms 0.058354 projected finish
1601 Life Sucks
elapsed time: 503.340960 ms 0.060331 projected finish
1201 Retro Gaming
elapsed time: 603.934050 ms 0.072388 projected finish
1602 Love In Paris
elapsed time: 561.575890 ms 0.067311 projected finish
   2 Rap Caviar
 802 Once Upon A Time
elapsed time: 582.211018 ms 0.069784 projected finish
elapsed time: 601.558924 ms 0.072104 projected finish
 402 Stomp & Holler
elapsed time: 610.105991 ms 0.073128 projected finish
1202 Power Gaming
elapsed time: 578.016996 ms 0.069282 projected finish




 803 Trick or Treat
elapsed time: 900.769949 ms 0.107967 projected finish
1603 Lounge - Soft House
elapsed time: 929.445982 ms 0.111404 projected finish
 403 Torch & Twang
1203 Indie Gaming
elapsed time: 835.350037 ms 0.100126 projected finish
elapsed time: 937.969923 ms 0.112426 projected finish




   3 electroNOW
elapsed time: 1053.959131 ms 0.126329 projected finish
 804 We Love Elmo!
elapsed time: 445.460081 ms 0.053393 projected finish
 404 Trad Folk
elapsed time: 490.661144 ms 0.058811 projected finish
1204 Hip Hop Gaming
elapsed time: 508.363008 ms 0.060933 projected finish
elapsed time: 470.386028 ms 0.056381 projected finish
NO METADATA
1604 Låtarna i reklamen
elapsed time: 645.650148 ms 0.077388 projected finish
 805 Father's Day Love
elapsed time: 507.739067 ms 0.060858 projected finish
 405 Bluegrass Covers
elapsed time: 491.198063 ms 0.058876 projected finish
1605 Magic Summer Hits
elapsed time: 444.875956 ms 0.053323 projected finish
   5 Rock This
elapsed time: 484.554052 ms 0.058079 projected finish
1205 Pop Gaming
elapsed time: 559.102058 ms 0.067015 projected finish
 406 Bluegrass Gals
elapsed time: 480.567932 ms 0.057601 projected finish
1606 Meditate to Sounds of Nature
elapsed time: 474.906921 ms 0.056923 projected finish
1206 Bread & Dub
elapsed time: 494.156



elapsed time: 551.623106 ms 0.066118 projected finish
 414 My Folksy Love
elapsed time: 580.163002 ms 0.069539 projected finish
 813 Rock & Relax
elapsed time: 485.582113 ms 0.058202 projected finish
1615 Power Walk
elapsed time: 468.449116 ms 0.056149 projected finish
1216 Day Party!
elapsed time: 449.745178 ms 0.053907 projected finish
  15 SXSW UK Rising
elapsed time: 458.514214 ms 0.054958 projected finish
 415 SongCraft
elapsed time: 449.142218 ms 0.053835 projected finish
 814 Smash The Clock
elapsed time: 504.069090 ms 0.060418 projected finish
1616 Psychedelic Rock
elapsed time: 489.371061 ms 0.058657 projected finish
1217 Energizing Classics
elapsed time: 444.108963 ms 0.053231 projected finish
  16 ExperiWOMENtell
elapsed time: 480.571985 ms 0.057602 projected finish
 416 Sweetgrass
elapsed time: 490.345001 ms 0.058773 projected finish
 815 Pop Punk Alarm Clock
elapsed time: 502.561092 ms 0.060238 projected finish
1617 Pure Rock & Roll
elapsed time: 497.033119 ms 0.059575 pro



1656 Inside Spotify
elapsed time: 570.225954 ms 0.068348 projected finish
1255 undercurrents
elapsed time: 623.838902 ms 0.074774 projected finish
  57 This is: The Rolling Stones
elapsed time: 641.065121 ms 0.076839 projected finish
 453 Rock & Roots
elapsed time: 670.888186 ms 0.080413 projected finish
1657 One Hit Wonders: 90's
elapsed time: 552.060843 ms 0.066171 projected finish
1256 Morning Commute
elapsed time: 517.503977 ms 0.062029 projected finish
 857 Study Break
elapsed time: 656.949997 ms 0.078743 projected finish
  58 This Is: The Weeknd
elapsed time: 446.864843 ms 0.053562 projected finish
1658 38 Great Female Duets
elapsed time: 395.637035 ms 0.047421 projected finish
 454 Interstate Drive Songs
elapsed time: 437.970877 ms 0.052496 projected finish
1257 Evening Commute
elapsed time: 413.468122 ms 0.049559 projected finish
 858 HIIT-Pop
elapsed time: 445.030928 ms 0.053342 projected finish
  59 This Is:  Thomas Rhett
elapsed time: 465.063095 ms 0.055743 projected finish




1259 Stress Relief
elapsed time: 450.128078 ms 0.053953 projected finish
elapsed time: 515.843153 ms 0.061830 projected finish
  61 This Is: Bob Dylan
elapsed time: 468.593121 ms 0.056166 projected finish
1661 Pride-pepp
elapsed time: 487.365007 ms 0.058416 projected finish
 457 Top Male Artists of 2016 - USA
 861 Austin Pops
elapsed time: 450.124979 ms 0.053952 projected finish
elapsed time: 469.415903 ms 0.056265 projected finish
1260 Summer Party
elapsed time: 495.558023 ms 0.059398 projected finish
  62 This Is: Bonobo
elapsed time: 447.169065 ms 0.053598 projected finish
1662 Eurovision Party
elapsed time: 398.782969 ms 0.047799 projected finish
 458 Top Female Artists of 2016 - USA
elapsed time: 432.069063 ms 0.051788 projected finish
 862 This Is: Frank Sinatra
elapsed time: 508.005142 ms 0.060890 projected finish
1261 Totally Stress Free
elapsed time: 499.009848 ms 0.059812 projected finish
  63 This Is: Britney Spears
elapsed time: 438.822031 ms 0.052598 projected finish
1663 

Process Process-5:
Traceback (most recent call last):
  File "//anaconda/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "//anaconda/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-8c7d8ef183a5>", line 114, in getPlaylistData
    name = multiprocessing.current_process().name
NameError: name 'multiprocessing' is not defined


 955 Political Punks
elapsed time: 414.011955 ms 0.049624 projected finish
 551 Piano Ballads
elapsed time: 486.701965 ms 0.058337 projected finish
 154 Country Nights
elapsed time: 442.061901 ms 0.052986 projected finish
1353 Dark & Stormy
elapsed time: 447.484970 ms 0.053636 projected finish
 956 Hot Rods & Horror Shows
elapsed time: 465.712070 ms 0.055821 projected finish
 552 Indie Folk: Western Vistas
 155 Chillin' on a Dirt Road
elapsed time: 537.393093 ms 0.064413 projected finish
elapsed time: 500.292778 ms 0.059966 projected finish
retrying ...5secs
retrying ...5secs
retrying ...4secs
retrying ...4secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
retrying ...0secs
1354 Mac 'N Cheese
elapsed time: 5641.383171 ms 0.676182 projected finish
 553 J-Track Makunouchi
elapsed time: 5424.253941 ms 0.650157 projected finish
 156 This Is: Merle Haggard
elapsed time: 5452.6

In [11]:
all_dfs = []
for _ in range(5):
    all_dfs.append(q.get())

In [12]:
playlist_data = all_dfs[0]
for each in all_dfs[1:]:
    playlist_data = playlist_data.append(each,ignore_index=True)

In [3]:
playlist_data

NameError: name 'playlist_data' is not defined

In [14]:
playlist_data.to_csv("playlist_data_with_audio_attributes_percentiles_3.11.17.csv")

In [16]:
sp.user_playlist('spotify','5FJXhjdILmRA2z5bvz4nzf')['tracks']['items'][0]

{'added_at': '2017-03-10T17:25:58Z',
 'added_by': {'external_urls': {'spotify': 'http://open.spotify.com/user/spotify'},
  'href': 'https://api.spotify.com/v1/users/spotify',
  'id': 'spotify',
  'type': 'user',
  'uri': 'spotify:user:spotify'},
 'is_local': False,
 'track': {'album': {'album_type': 'single',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2qxJFvFYMEDqd7ui6kSAcq'},
     'href': 'https://api.spotify.com/v1/artists/2qxJFvFYMEDqd7ui6kSAcq',
     'id': '2qxJFvFYMEDqd7ui6kSAcq',
     'name': 'Zedd',
     'type': 'artist',
     'uri': 'spotify:artist:2qxJFvFYMEDqd7ui6kSAcq'},
    {'external_urls': {'spotify': 'https://open.spotify.com/artist/2wUjUUtkb5lvLKcGKsKqsR'},
     'href': 'https://api.spotify.com/v1/artists/2wUjUUtkb5lvLKcGKsKqsR',
     'id': '2wUjUUtkb5lvLKcGKsKqsR',
     'name': 'Alessia Cara',
     'type': 'artist',
     'uri': 'spotify:artist:2wUjUUtkb5lvLKcGKsKqsR'}],
   'available_markets': ['CA', 'MX', 'US'],
   'external_urls': {

In [79]:
playlist_sample = sp.user_playlist('spotify','5FJXhjdILmRA2z5bvz4nzf')['tracks']['items']
list_of_tracks = []
for a in playlist_sample:
    list_of_tracks.append(a['track'])
sample = pd.DataFrame(list_of_tracks)[['id','name','external_ids','artists',
                                       'duration_ms','explicit','track_number','popularity']]
features = sp.audio_features(tracks=sample['id'])
features_df = pd.DataFrame(features)
sample['acousticness'] = features_df['acousticness']
sample['danceability'] = features_df['danceability']
sample['energy'] = features_df['energy']
sample['instrumentalness'] = features_df['instrumentalness']
sample['key'] = features_df['key']
sample['liveness'] = features_df['liveness']
sample['loudness'] = features_df['loudness']
sample['mode'] = features_df['speechiness']
sample['tempo'] = features_df['tempo']
sample['time_signature'] = features_df['time_signature']
sample['valence'] = features_df['valence']
sample['sequence'] = sample.index + 1
sample.head()

Unnamed: 0,id,name,external_ids,artists,duration_ms,explicit,track_number,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,tempo,time_signature,valence,sequence
0,1dNIEtp7AY3oDAKCGg2XkH,Something Just Like This,{u'isrc': u'USQX91700278'},"[{u'name': u'The Chainsmokers', u'external_url...",247626,False,1,0,0.0306,0.607,0.649,2.5e-05,11,0.174,-6.695,0.0362,102.996,4,0.47,1
1,12GEpg2XOPyqk03JZEZnJs,It Ain’t Me (with Selena Gomez),{u'isrc': u'SEBGA1700015'},"[{u'name': u'Kygo', u'external_urls': {u'spoti...",220780,False,1,76,0.0905,0.648,0.532,0.0,0,0.0831,-6.597,0.0746,99.983,4,0.497,2
2,6AeQlMyRzvSl1nkFztZyKl,Issues,{u'isrc': u'USUM71615691'},"[{u'name': u'Julia Michaels', u'external_urls'...",176346,False,1,82,0.416,0.704,0.423,0.0,8,0.0607,-6.792,0.0862,113.962,4,0.45,3
3,0FE9t6xYkqWXU2ahLh6D8X,Shape of You,{u'isrc': u'GBAHS1600463'},"[{u'name': u'Ed Sheeran', u'external_urls': {u...",233712,False,1,100,0.581,0.825,0.652,0.0,1,0.0931,-3.183,0.0802,95.977,4,0.933,4
4,3ebXMykcMXOcLeJ9xZ17XH,Scared To Be Lonely,{u'isrc': u'NLM5S1600025'},"[{u'name': u'Martin Garrix', u'external_urls':...",220883,False,1,91,0.0895,0.584,0.54,0.0,1,0.261,-7.786,0.0576,137.972,4,0.19,5
