### Importing the necessary libraries

In [1]:
import pandas as pd
from tqdm import tqdm
import re

import datetime, time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials 

In [48]:
#connect to Spotify Web API
client_id = os.environ.get('CLIENT_ID')
client_secret = os.environ.get('CLIENT_SECRET')
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                        client_secret=client_secret)
sp = spotipy.client.Spotify(client_credentials_manager=client_credentials_manager)

In [2]:
#Load the raw_data from the repo
dataPath = 'playlist_raw_data.csv'
df = pd.read_csv(dataPath)
df.head()

Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,name
0,0,0,AronChupa,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,Little Swing,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,Little Swing,Party
1,1,1,AronChupa,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,I'm an Albatraoz,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,I'm an Albatraoz,Party
2,2,2,Lorde,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Yellow Flicker Beat - From The Hunger Games: M...,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,Yellow Flicker Beat,Party
3,3,3,Lorde,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,White Teeth Teens,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,Pure Heroine,Party
4,4,4,Lorde,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Team,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,Pure Heroine,Party


In [3]:
#Edit the track-uris to a more usable format
df["track_uri"] = df["track_uri"].apply(lambda x: re.findall(r'\w+$', x)[0])
df["track_uri"]

0         66U0ASk1VHZsqIkpMjKX3B
1         5MhsZlmKJG6X5kTHkdwC4B
2         0GZoB8h0kqXn7XFm4Sj06k
3         35kahykNu00FPysz3C2euR
4         3G6hD9B2ZHOsgf4WfNu7X1
                   ...          
280995    38griAVM808crjbFp9gcPD
280996    1JClFT74TYSXlzpagbmj0S
280997    4InLm5a9Qtkru6YxEjM4Qc
280998    4hdog9vyyqG9pcppG2Izek
280999    0NiXXAI876aGImAd6rTj8w
Name: track_uri, Length: 281000, dtype: object

In [6]:
first_half = df["track_uri"].unique()[:500]

In [7]:
first_half

array(['66U0ASk1VHZsqIkpMjKX3B', '5MhsZlmKJG6X5kTHkdwC4B',
       '0GZoB8h0kqXn7XFm4Sj06k', '35kahykNu00FPysz3C2euR',
       '3G6hD9B2ZHOsgf4WfNu7X1', '6WQLkih8nE0JdUCEyLaGnQ',
       '37sINbJZcFdHFAsVNsPq1i', '0yhPEz5KxlDwckGJaMlZqM',
       '5j9iuo3tMmQIfnEEQOOjxh', '4eLSCSELtKxZwXnFbNLXT5',
       '4PvD06Pmbm2rHG2JjSlElF', '57nNNkgk768QVXq3uHxu5e',
       '3y8AEUef1AVfr0npU5UOa9', '5jJ69cMDMC0aeWPjZo6VP2',
       '02b5L9jExmkRTdUTqXFzmR', '6fKEplI9iN0JMHsRGQESaT',
       '3ebXMykcMXOcLeJ9xZ17XH', '1J9KJgXKFRqKGIzmJ7GjS3',
       '6aBYg6Npa47VAqtEsLsUdz', '0dA2Mk56wEzDgegdC6R17g',
       '3ik1fFvzdgmVuiob8t8NlG', '05BJQUM4bATKGFheqrC6CW',
       '0rfGRC3ng0dxu0xZ3f1bWc', '1azvKxDb5Vkph2KR7aq1Cx',
       '1Gsv8f8KmowkF5BnfMIGKy', '3KYiA4vq6RPO1dE2XROXd8',
       '252pAlbltOjOcUux4dRtjp', '1ynmMEK1fkyiZ6Z6F3ThEt',
       '7vIyudiO2Tr9hZ2yzfwQmx', '12yHvSYFXI7PGzNecUvIDu',
       '6koWevx9MqN6efQ6qreIbm', '2F46y5zSlBKOqco0k8xQAM',
       '00YnfZzpODxlE6nNwfPCbv', '3PiyMOuCTB4Rc624hRr2rC

In [49]:
# storing track uri and artist uri for further use
track_uri=df["track_uri"]
artist_uri=df["artist_uri"]

## Data preparation & feature extraction
Using the Spotify API for Feature Extraction and Saving Results to a CSV File

Initially, we were using SP.track(SPOTIFY WEB API), but we soon realized that it would take a lot of time and require me to work around numerous Api rate constraints, so we switched to SP.tracks and SP.artists. It took far less time because they allow lists with a limit of 50 URIs and handle them in a single request.
https://spotipy.readthedocs.io/en/latest/

In [50]:

f = open('audio_features.csv','a')

for i in tqdm(range(0, len(track_uri), 100)):
    try:
        track_feature = sp.audio_features(track_uri[i:i+100])
        track_df = pd.DataFrame(track_feature)
        track_csv_data = track_df.to_csv(header=False,index=False)
        f.write(track_csv_data)
    except Exception as error:
        time.sleep(3)
        continue

f.close()

100%|███████████████████████████████████████| 2810/2810 [08:39<00:00,  5.41it/s]


In [51]:
audio_features=pd.read_csv("audio_features.csv")
audio_features

Unnamed: 0,0.89,0.672,7,-5.532,1,0.0621,0.707,0.000604,0.177,0.598,126.036,audio_features,66U0ASk1VHZsqIkpMjKX3B,spotify:track:66U0ASk1VHZsqIkpMjKX3B,https://api.spotify.com/v1/tracks/66U0ASk1VHZsqIkpMjKX3B,https://api.spotify.com/v1/audio-analysis/66U0ASk1VHZsqIkpMjKX3B,163810,4
0,0.883,0.817,0,-5.414,1,0.2350,0.6010,0.004490,0.1100,0.5950,128.078,audio_features,5MhsZlmKJG6X5kTHkdwC4B,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,https://api.spotify.com/v1/tracks/5MhsZlmKJG6X...,https://api.spotify.com/v1/audio-analysis/5Mhs...,166849,4
1,0.586,0.626,1,-8.448,1,0.0356,0.0874,0.000380,0.1130,0.0519,94.965,audio_features,0GZoB8h0kqXn7XFm4Sj06k,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,https://api.spotify.com/v1/tracks/0GZoB8h0kqXn...,https://api.spotify.com/v1/audio-analysis/0GZo...,232507,4
2,0.654,0.304,6,-9.067,0,0.0358,0.6470,0.000072,0.1470,0.1020,113.992,audio_features,35kahykNu00FPysz3C2euR,spotify:track:35kahykNu00FPysz3C2euR,https://api.spotify.com/v1/tracks/35kahykNu00F...,https://api.spotify.com/v1/audio-analysis/35ka...,216600,4
3,0.690,0.578,6,-7.436,1,0.0929,0.1670,0.000000,0.3050,0.4160,99.961,audio_features,3G6hD9B2ZHOsgf4WfNu7X1,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,https://api.spotify.com/v1/tracks/3G6hD9B2ZHOs...,https://api.spotify.com/v1/audio-analysis/3G6h...,193059,4
4,0.500,0.726,5,-4.124,1,0.0643,0.0362,0.000000,0.4410,0.3970,125.904,audio_features,6WQLkih8nE0JdUCEyLaGnQ,spotify:track:6WQLkih8nE0JdUCEyLaGnQ,https://api.spotify.com/v1/tracks/6WQLkih8nE0J...,https://api.spotify.com/v1/audio-analysis/6WQL...,210013,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347037,0.741,0.841,2,-6.398,0,0.2160,0.0702,0.000000,0.1460,0.7300,94.846,audio_features,38griAVM808crjbFp9gcPD,spotify:track:38griAVM808crjbFp9gcPD,https://api.spotify.com/v1/tracks/38griAVM808c...,https://api.spotify.com/v1/audio-analysis/38gr...,339573,4
347038,0.767,0.856,7,-2.715,1,0.0410,0.0303,0.000002,0.3600,0.7470,91.535,audio_features,1JClFT74TYSXlzpagbmj0S,spotify:track:1JClFT74TYSXlzpagbmj0S,https://api.spotify.com/v1/tracks/1JClFT74TYSX...,https://api.spotify.com/v1/audio-analysis/1JCl...,285027,4
347039,0.496,0.372,4,-5.841,1,0.0662,0.6820,0.000000,0.1550,0.5580,87.011,audio_features,4InLm5a9Qtkru6YxEjM4Qc,spotify:track:4InLm5a9Qtkru6YxEjM4Qc,https://api.spotify.com/v1/tracks/4InLm5a9Qtkr...,https://api.spotify.com/v1/audio-analysis/4InL...,279322,4
347040,0.707,0.707,11,-6.161,0,0.0375,0.0306,0.000000,0.1270,0.2310,104.039,audio_features,4hdog9vyyqG9pcppG2Izek,spotify:track:4hdog9vyyqG9pcppG2Izek,https://api.spotify.com/v1/tracks/4hdog9vyyqG9...,https://api.spotify.com/v1/audio-analysis/4hdo...,223296,4


In [54]:
f = open('artist_features.csv','a')

for i in tqdm(range(0, len(artist_uri), 50)):
    try:
        artist_features = sp.artists(artist_uri[i:i+50])
        #track_pop = sp.tracks(t_uri[i:i+50])
        
        for x in range(50):
            artist_df = pd.DataFrame([artist_uri[i+x]])
            artist_popularity = artist_features['artists'][x]["popularity"]
            artist_genres = artist_features['artists'][x]["genres"]
            artist_df["artist_pop"] = artist_popularity
            if artist_genres: 
                artist_df["genres"] = " ".join([re.sub(' ', '_', i) for i in artist_genres])
            else:
                artist_df["genres"] = "unknown"
            
            #artist_df["track_pop"] =  track_pop["popularity"]
            #print(artist_df["track_pop"] )
            csv_data = artist_df.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as error:
        time.sleep(3)
        continue
        
f.close()

100%|███████████████████████████████████████| 5620/5620 [14:12<00:00,  6.59it/s]


In [55]:
artist_features = pd.read_csv("artist_features.csv")
artist_features

Unnamed: 0,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,60,swedish_pop
0,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,75,art_pop metropopolis nz_pop pop
1,spotify:artist:4AVFqumd2ogHFlRbKIjp1t,75,dance_pop edm electro_house pop pop_dance trop...
2,spotify:artist:3AQRLZ9PuTAozP28Skbq8V,73,celtic_rock neo_mellow pop pop_rock
3,spotify:artist:4UXqAaa6dQYAk18Lv7PEgX,77,emo modern_rock
4,spotify:artist:0MlOPi3zIDMVrfA9R04Fe3,65,indie_poptimism modern_rock pop_rock
...,...,...,...
295092,spotify:artist:6nnspeopmJAG07xOxHmqTu,61,salsa salsa_puertorriquena tropical
295093,spotify:artist:1ZwdS5xdxEREPySFridCfh,79,g_funk gangster_rap hip_hop rap west_coast_rap
295094,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,22,minneapolis_indie
295095,spotify:artist:2cFrymmkijnjDg9SS92EPM,78,pop pop_rap


In [56]:
artist_uri = df["artist_uri"]

In [57]:
artist_uri

0         spotify:artist:5vCOdeiQt9LyzdI87kt5Sh
1         spotify:artist:5vCOdeiQt9LyzdI87kt5Sh
2         spotify:artist:163tK9Wjr9P9DmM0AVK7lm
3         spotify:artist:163tK9Wjr9P9DmM0AVK7lm
4         spotify:artist:163tK9Wjr9P9DmM0AVK7lm
                          ...                  
280995    spotify:artist:6nnspeopmJAG07xOxHmqTu
280996    spotify:artist:1ZwdS5xdxEREPySFridCfh
280997    spotify:artist:2Y9lO01ABSO8OkBU8FI1mp
280998    spotify:artist:2cFrymmkijnjDg9SS92EPM
280999    spotify:artist:1HBjj22wzbscIZ9sEb5dyf
Name: artist_uri, Length: 281000, dtype: object

In [58]:
df["track_uri"]

0         66U0ASk1VHZsqIkpMjKX3B
1         5MhsZlmKJG6X5kTHkdwC4B
2         0GZoB8h0kqXn7XFm4Sj06k
3         35kahykNu00FPysz3C2euR
4         3G6hD9B2ZHOsgf4WfNu7X1
                   ...          
280995    38griAVM808crjbFp9gcPD
280996    1JClFT74TYSXlzpagbmj0S
280997    4InLm5a9Qtkru6YxEjM4Qc
280998    4hdog9vyyqG9pcppG2Izek
280999    0NiXXAI876aGImAd6rTj8w
Name: track_uri, Length: 281000, dtype: object

In [67]:
f = open('track_features.csv','a')

for i in tqdm(range(0, len(track_uri), 50)):
    try:
        track_features = sp.tracks(track_uri[i:i+50])
        for x in range(50):
            track_popularity = pd.DataFrame([track_uri[i+x]])
            track_popularity['release_date']=track_features['tracks'][x]['album']['release_date']
            track_popularity['pop'] = track_features['tracks'][x]["popularity"]
            csv_data = track_popularity.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as error:
        time.sleep(3)
        continue
        
f.close()

100%|███████████████████████████████████████| 5620/5620 [30:02<00:00,  3.12it/s]


In [68]:
track_features = pd.read_csv("track_features.csv")
track_features

Unnamed: 0,66U0ASk1VHZsqIkpMjKX3B,2016-02-26,59
0,5MhsZlmKJG6X5kTHkdwC4B,2014-08-08,72
1,0GZoB8h0kqXn7XFm4Sj06k,2014-09-30,0
2,35kahykNu00FPysz3C2euR,2013-09-27,58
3,3G6hD9B2ZHOsgf4WfNu7X1,2013-09-27,76
4,6WQLkih8nE0JdUCEyLaGnQ,2015-05-22,0
...,...,...,...
280717,38griAVM808crjbFp9gcPD,2003,14
280718,1JClFT74TYSXlzpagbmj0S,1998-01-01,64
280719,4InLm5a9Qtkru6YxEjM4Qc,2015-03-10,32
280720,4hdog9vyyqG9pcppG2Izek,2015-02-14,53
