In [11]:
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import spotipy
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from yellowbrick.cluster import KElbowVisualizer
pd.set_option('mode.chained_assignment', None)

In [4]:
# Spotify application credentials, requested interactively so they are never
# hardcoded in the notebook.
CLIENT_ID = input("What is your client ID?")
# getpass() hides the typed value, so the secret is not echoed into the saved
# cell output the way input() would echo it.
SECRET_KEY = getpass("What is your secret key?")

In [13]:
# Build the API client used by the read-only lookups below (categories,
# playlists, tracks), authenticated with the credentials entered above.
client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=SECRET_KEY,
)
spotify = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [15]:
# Map every browse category offered in the Canadian market from its display
# name to its category ID, walking the result pages 50 entries at a time.
all_categories = {}
offset = 0
while True:
    canada_categories = spotify.categories(country='CA', limit=50, offset=offset)
    page = canada_categories['categories']
    all_categories.update({entry['name']: entry['id'] for entry in page['items']})
    # 'next' is falsy once the last page has been fetched.
    if not page['next']:
        break
    offset += 50

print(f"Total: {len(all_categories)}")

Total: 55


In [5]:
# List the category names the user can choose from in the prompt further down.
for category_name in all_categories:
    print(category_name)

Top Lists
Hip-Hop
Pop
Summer
Country
Rock
Mood
Workout
Chill
R&B
Decades
Francophone
Netflix
Caribbean
Dance/Electronic
Indie
Fresh Finds
Latin
RADAR
EQUAL
Frequency
Pride
Party
Focus
Alternative
theLINER
Sleep
Instrumental
Ambient
Wellness
Folk & Acoustic
Romance
Tastemakers
Commute
Classical
Jazz
Gaming
Christian & Gospel
Cooking & Dining
K-Pop
Desi
Afro
API Heritage Month
Metal
Punk
Soul
Funk
Blues
Arab
Student
Kids & Family
Pop culture
Anime
Comedy
Spotify Singles


In [6]:
def get_category_playlists(category_key):
    """Collect the IDs of all playlists listed under one browse category.

    Looks the category up in the module-level ``all_categories`` mapping
    (display name -> category ID) and pages through
    ``spotify.category_playlists`` for the Canadian market until the API
    reports no further page. The total is printed before returning.

    Args:
        category_key: Category display name; must be a key of ``all_categories``.

    Returns:
        list: Playlist IDs in the order the API returned them.

    Raises:
        KeyError: If ``category_key`` is not present in ``all_categories``.
    """
    category_id = all_categories[category_key]
    page_size = 50

    all_playlists = []
    offset = 0
    while True:
        response = spotify.category_playlists(
            category_id=category_id, limit=page_size, offset=offset, country='CA'
        )
        page = response['playlists']
        # NOTE(review): the API is reported to include null placeholders in
        # 'items' for unavailable playlists; skipping falsy entries keeps one
        # of those from crashing the whole listing. Confirm against the
        # Web API reference.
        all_playlists.extend(entry['id'] for entry in page['items'] if entry)
        if not page['next']:
            break
        offset += page_size

    print(f"Total: {len(all_playlists)}")
    return all_playlists

In [7]:
# Ask which category to mine for playlists. Surrounding whitespace is stripped
# and the name is validated here so a typo produces a readable message instead
# of a bare KeyError raised from inside get_category_playlists.
category_key = input('Which category?').strip()
if category_key not in all_categories:
    raise KeyError(
        f"Unknown category {category_key!r}; choose one of: {', '.join(all_categories)}"
    )
category_playlists = get_category_playlists(category_key)

Total: 88


In [8]:
# Follower count for each playlist in the chosen category (one API call per
# playlist), keyed by playlist ID.
playlists_followers = {
    playlist_id: spotify.playlist(playlist_id, market='CA')['followers']['total']
    for playlist_id in category_playlists
}

In [9]:
# Rank playlist IDs from most to fewest followers (ties keep their original order).
ranked_playlist_ids = sorted(playlists_followers, key=playlists_followers.get, reverse=True)
sorted_playlists = {
    playlist_id: playlists_followers[playlist_id] for playlist_id in ranked_playlist_ids
}
# If you'd like more than 10 playlists to be included, increase the number here [:n].
top_playlists = ranked_playlist_ids[:10]
top_playlists

['37i9dQZF1DX4dyzvuaRJ0n',
 '37i9dQZF1DWWY64wDtewQt',
 '37i9dQZF1DXaXB8fQg7xif',
 '37i9dQZF1DX0BcQWzuB7ZO',
 '37i9dQZF1DX6VdMW310YC7',
 '37i9dQZF1DXcZDD7cfEKhW',
 '37i9dQZF1DX2TRYkJECvfC',
 '37i9dQZF1DX4eRPd9frC1m',
 '37i9dQZF1DX32NsLKyzScr',
 '37i9dQZF1DWSf2RDTDayIx']

In [10]:
def get_track_search(playlist, index):
    """Build the "<track name> - <first artist>" label for one playlist entry.

    Args:
        playlist: Sequence of playlist items, each holding a ``'track'`` dict
            with ``'name'`` and ``'artists'`` entries.
        index: Position of the item to describe.

    Returns:
        str: The track name and the first listed artist's name joined by ``' - '``.
    """
    track = playlist[index]['track']
    title = track['name']
    lead_artist = track['artists'][0]['name']
    return ' - '.join(('' + title, lead_artist))

In [13]:
def get_playlist_tracklist(playlists):
    """Fetch every track of the given playlists together with its audio data.

    For each playlist, all pages of ``spotify.playlist_tracks`` are read, the
    number of usable tracks is printed, and each track is turned into a dict
    with the keys ``search`` (label from ``get_track_search``), ``id``,
    ``href``, ``popularity``, the audio-analysis fields (``duration``,
    ``loudness``, ``tempo``, ``time_signature``, ``key``, ``mode``) and the
    audio-feature fields (``danceability``, ``energy``, ``speechiness``,
    ``acousticness``, ``instrumentalness``, ``liveness``, ``valence``).

    If either audio lookup fails for a track, all thirteen audio fields are
    set to ``pd.NA`` and the track is still included; a per-playlist summary
    of such failures is printed. Playlist entries without a track object are
    skipped, because there is nothing to describe or look up for them.

    Args:
        playlists: Iterable of playlist IDs.

    Returns:
        list[dict]: One dict per track, in playlist order; tracks that appear
        in several playlists appear several times.
    """
    # Retained from the original implementation: the ID of the track currently
    # being processed is exposed as a module-level name.
    global track_id

    analysis_fields = ('duration', 'loudness', 'tempo', 'time_signature', 'key', 'mode')
    feature_fields = ('danceability', 'energy', 'speechiness', 'acousticness',
                      'instrumentalness', 'liveness', 'valence')
    page_size = 100

    tracklist = []
    for playlist in playlists:
        # Read every page of the playlist, keeping only entries that carry a track.
        playlist_tracks = []
        offset = 0
        while True:
            page = spotify.playlist_tracks(playlist, limit=page_size, offset=offset)
            playlist_tracks.extend(
                item for item in page['items'] if item and item.get('track')
            )
            if not page['next']:
                break
            offset += page_size

        print(len(playlist_tracks))

        failed_lookups = 0
        for track_index, item in enumerate(playlist_tracks):
            track = item['track']
            track_id = track['id']
            tracks_dict = {
                'search': get_track_search(playlist_tracks, track_index),
                'id': track_id,
                'href': track['href'],
                'popularity': track['popularity'],
            }
            try:
                analysis = spotify.audio_analysis(track_id)['track']
                features = spotify.audio_features(track_id)[0]
                audio_values = {field: analysis[field] for field in analysis_fields}
                audio_values.update({field: features[field] for field in feature_fields})
            except Exception:
                # Best effort: keep the track but mark its audio data missing.
                # Catching Exception (not a bare except) leaves KeyboardInterrupt
                # free to stop this long-running loop.
                failed_lookups += 1
                audio_values = {field: pd.NA for field in analysis_fields + feature_fields}
            tracks_dict.update(audio_values)
            tracklist.append(tracks_dict)

        if failed_lookups:
            print(f"Audio data unavailable for {failed_lookups} track(s) in playlist {playlist}")
    return tracklist

If you would rather input your own/custom playlists, uncomment the cell below and input the links to the playlists.

In [5]:
# user_playlists = []
# statement = True
# while statement == True:
#     user_playlists.append(input('What is the playlist link?')[34:-20])
#     statement = input("More playlists to add? (Y/N)") == 'Y'
# user_playlists

In [None]:
# Download the tracks (and their audio data) of the selected playlists.
# This makes API calls for every track, so it is the slowest cell.
playlist_tracklist = get_playlist_tracklist(playlists=top_playlists)

In [26]:
# Most popular tracks first; a track found in several playlists is kept once
# (its first, i.e. highest-ranked, row) and the index is renumbered from 0.
playlist_tracklist_df = (
    pd.DataFrame(playlist_tracklist)
    .sort_values('popularity', ascending=False)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
playlist_tracklist_df

Unnamed: 0,search,id,href,popularity,duration,loudness,tempo,time_signature,key,mode,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence
0,I'm Good (Blue) - David Guetta,4uUG5RXrOk84mYEfFvj3cK,https://api.spotify.com/v1/tracks/4uUG5RXrOk84...,94,175.23810,-3.673,128.040,4.0,7.0,0.0,0.561,0.965,0.0343,0.003830,0.000007,0.3710,0.304
1,Miracle (with Ellie Goulding) - Calvin Harris,5eTaQYBE1yrActixMAeLcZ,https://api.spotify.com/v1/tracks/5eTaQYBE1yrA...,91,186.49615,-5.289,143.011,4.0,9.0,1.0,0.636,0.869,0.0412,0.037800,0.044600,0.0808,0.306
2,El Merengue - Marshmello,51FvjPEGKq2zByeeEQ43V9,https://api.spotify.com/v1/tracks/51FvjPEGKq2z...,91,189.66853,-3.079,123.982,4.0,8.0,0.0,0.775,0.677,0.0442,0.031300,0.005170,0.1120,0.698
3,Baby Don't Hurt Me - David Guetta,3BKD1PwArikchz2Zrlp1qi,https://api.spotify.com/v1/tracks/3BKD1PwArikc...,90,140.01768,-3.404,127.944,4.0,7.0,1.0,0.602,0.910,0.0308,0.001260,0.000174,0.1200,0.228
4,METAMORPHOSIS - INTERWORLD,2ksyzVfU0WJoBpu8otr4pz,https://api.spotify.com/v1/tracks/2ksyzVfU0WJo...,89,142.83926,-12.727,175.014,4.0,7.0,0.0,0.593,0.641,0.0992,0.426000,0.901000,0.1220,0.147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921,Keramas - Moon Boots,6QYthxincEvqQzGkfAOAiK,https://api.spotify.com/v1/tracks/6QYthxincEvq...,40,224.44872,-7.327,122.018,4.0,7.0,1.0,0.658,0.894,0.0551,0.001050,0.079400,0.1280,0.158
922,Onda - Theo Kottis,2vHPrrFhhiSH5GZZcGRSFm,https://api.spotify.com/v1/tracks/2vHPrrFhhiSH...,40,251.99202,-7.631,121.977,4.0,10.0,0.0,0.700,0.917,0.0447,0.159000,0.850000,0.0768,0.272
923,Blissda - DJ Koze,6cOie6oyMHzoVHLmpLfHN4,https://api.spotify.com/v1/tracks/6cOie6oyMHzo...,40,343.33237,-11.509,116.984,4.0,8.0,1.0,0.786,0.633,0.0389,0.005610,0.899000,0.0845,0.186
924,Karabali - Kiko Navarro,48qDlvkTp9Qwi9upy4HbTw,https://api.spotify.com/v1/tracks/48qDlvkTp9Qw...,40,425.22067,-9.920,121.989,4.0,2.0,1.0,0.834,0.686,0.0519,0.023600,0.011100,0.0737,0.740


In [19]:
# Keep the most popular tracks whose combined running time matches the
# requested playlist length.
playlist_duration = round(float(input('Playlist duration (hours)?')), 2)
DURATION_TOLERANCE_HOURS = 0.1  # accepted deviation from the requested length

if playlist_tracklist_df.empty:
    raise ValueError('No tracks available to build a playlist from.')

# Running total (in hours) after each track; tracks with a missing duration
# count as zero, matching how a plain sum() skips them.
track_seconds = pd.to_numeric(playlist_tracklist_df['duration'], errors='coerce').fillna(0)
cumulative_hours = (track_seconds.cumsum() / 3600).round(2)

within_tolerance = cumulative_hours.between(
    playlist_duration - DURATION_TOLERANCE_HOURS,
    playlist_duration + DURATION_TOLERANCE_HOURS,
)
if within_tolerance.any():
    # Longest prefix that still falls inside the tolerance window.
    n_tracks = int(np.flatnonzero(within_tolerance.to_numpy())[-1]) + 1
else:
    # No prefix lands inside the window (e.g. the request is longer than all
    # tracks combined): use the prefix closest to the request rather than
    # leaving truncated_tracklist undefined or stale from an earlier run.
    n_tracks = int((cumulative_hours - playlist_duration).abs().to_numpy().argmin()) + 1
    print(f"No selection within {DURATION_TOLERANCE_HOURS} hours of the request; using the closest match.")

truncated_tracklist = playlist_tracklist_df.iloc[:n_tracks]
print(f"Length: {len(truncated_tracklist)} tracks")
print(f"Duration: {round(truncated_tracklist['duration'].sum()/3600, 2)} hours")
truncated_tracklist.describe()

Length: 679 tracks
Duration: 36.06 hours


Unnamed: 0,popularity,duration,loudness,tempo,time_signature,key,mode,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence
count,679.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0,677.0
mean,65.911635,191.767517,-6.045161,125.622576,3.983752,5.156573,0.516987,0.688096,0.786997,0.082994,0.113448,0.247822,0.195597,0.436634
std,8.279831,58.351411,2.626339,14.175669,0.183737,3.557932,0.500081,0.108053,0.141652,0.079172,0.176822,0.34057,0.154561,0.24551
min,53.0,67.03007,-21.167,75.373,1.0,0.0,0.0,0.279,0.264,0.024,2.5e-05,0.0,0.0265,0.026
25%,59.0,154.96059,-7.324,120.959,4.0,2.0,0.0,0.623,0.712,0.04,0.00803,7.7e-05,0.0966,0.242
50%,64.0,181.90475,-5.801,124.914,4.0,5.0,1.0,0.69,0.821,0.0529,0.034,0.0143,0.13,0.441
75%,71.5,217.70892,-4.353,127.98,4.0,8.0,1.0,0.758,0.896,0.0879,0.143,0.557,0.252,0.63
max,94.0,627.0968,1.26,194.992,5.0,11.0,1.0,0.958,0.996,0.613,0.888,0.957,0.924,0.967


In [20]:
def build_neighbor_list(start_index, data, labels, n_neighbors, k):
    """Order points into a greedy nearest-neighbour chain that favours staying in a cluster.

    Starting at ``start_index``, the chain repeatedly moves from its last
    point to the closest point not yet in the chain. Points carrying the
    current cluster label are preferred; only when none of those remain does
    the chain jump to the closest unvisited point of another cluster, whose
    label then becomes the current one.

    Args:
        start_index: Row index in ``data`` where the chain starts.
        data: 2-D array-like of point coordinates, one row per point.
        labels: Sequence of cluster labels, aligned with the rows of ``data``.
        n_neighbors: Desired chain length.
        k: Kept for backward compatibility; it no longer influences the
            result. (The original only passed it to the ``NearestNeighbors``
            constructor, where the per-query ``n_neighbors`` overrode it.)

    Returns:
        list: Row indices in visiting order, beginning with ``start_index``.
        Shorter than ``n_neighbors`` only when ``data`` has fewer rows.
    """
    neighbor_list = [start_index]
    if len(neighbor_list) >= n_neighbors:
        return neighbor_list

    visited_indices = {start_index}
    current_label = labels[start_index]

    # A query can never return more points than were fitted.
    n_query = min(n_neighbors, len(data))
    # The data never changes, so the index is fitted once rather than once per step.
    nbrs = NearestNeighbors(n_neighbors=n_query).fit(data)

    while len(neighbor_list) < n_neighbors:
        current_coords = data[neighbor_list[-1]]
        _, indices = nbrs.kneighbors([current_coords], n_neighbors=n_query)

        # Filter by the visited set instead of assuming rank 0 is the query
        # point itself: with duplicate coordinates rank 0 can be a different,
        # still unvisited point, which must not be skipped.
        candidates = [int(idx) for idx in indices[0] if idx not in visited_indices]
        if not candidates:
            break

        same_cluster = [idx for idx in candidates if labels[idx] == current_label]
        chosen = same_cluster[0] if same_cluster else candidates[0]

        neighbor_list.append(chosen)
        visited_indices.add(chosen)
        current_label = labels[chosen]

    return neighbor_list

In [21]:
def _impute_audio_features(tracks):
    """Return a copy of ``tracks`` whose audio columns are numeric with gaps filled by the column mean.

    Each fill value is the mean of the column's known values, rounded to the
    number of decimals given below (``None`` = unrounded). Tempo values of
    240 or more are replaced by the mean as well.
    """
    fill_decimals = {
        'duration': 3, 'loudness': 3, 'tempo': 3,
        'time_signature': 0, 'key': 0, 'mode': 0,
        'danceability': 3, 'energy': 3, 'speechiness': None,
        'acousticness': 5, 'instrumentalness': 3, 'liveness': 3, 'valence': 3,
    }
    imputed = tracks.copy()
    for column, decimals in fill_decimals.items():
        # Missing entries arrive as pd.NA; coerce so the column is plain float.
        values = pd.to_numeric(imputed[column], errors='coerce')
        fill_value = values.mean()
        if decimals is not None:
            fill_value = round(fill_value, decimals)
        if column == 'tempo':
            # The mean above still includes these high readings, as before.
            values = values.mask(values >= 240)
        # Whole-column assignment avoids the chained `df[col].iloc[...] = x`
        # pattern, which is unreliable under pandas copy-on-write.
        imputed[column] = values.fillna(fill_value)
    return imputed


def sort_tracklist(tracklist):
    """Order tracks so that consecutive tracks are close in audio-feature space.

    Steps:
      1. De-duplicate by ``id`` and by ``search``, sort by ``search`` and fill
         missing audio features with column means.
      2. Scale the numeric columns (everything except ``search``, ``id``,
         ``href`` and ``mode``) to the range 0-10, reduce them to two PCA
         components and append the scaled tempo as a third coordinate.
      3. Take the cluster count from a K-Means elbow search and cluster the
         3-D points with agglomerative clustering.
      4. Show an interactive 3-D scatter plot of the clustered tracks.
      5. Start from a randomly chosen track among the (up to) 20 slowest and
         chain tracks with ``build_neighbor_list``.

    The cleaned frame and the 3-D coordinates are also published as the
    module-level names ``tracklist_df`` and ``numeric_scaled_pca``, as in the
    original implementation.

    Args:
        tracklist: DataFrame (or list of dicts) with the columns ``search``,
            ``id``, ``href``, ``popularity`` and the thirteen audio columns.

    Returns:
        tuple[list, list]: ``(track labels, track IDs)`` in playing order.
    """
    global tracklist_df, numeric_scaled_pca

    # Sort tracks by search name
    tracklist_df = (
        pd.DataFrame(tracklist)
        .drop_duplicates(subset=['id'])
        .drop_duplicates(subset=['search'])
        .sort_values('search', ascending=True)
        .reset_index(drop=True)
    )
    tracklist_df = _impute_audio_features(tracklist_df)
    len_tracklist = len(tracklist_df)

    # Numeric values
    numeric_df = tracklist_df.drop(columns=['search', 'id', 'href', 'mode'])

    # Scale values
    scaler = MinMaxScaler(feature_range=(0, 10))
    numeric_scaled = pd.DataFrame(scaler.fit_transform(numeric_df), columns=numeric_df.columns)

    # Reduce dimensions: two principal components plus the scaled tempo
    pca = PCA(n_components=2)
    pca.fit(numeric_scaled)
    pca_coords = pca.transform(numeric_scaled)
    tempo_array = np.array(numeric_scaled['tempo']).reshape(-1, 1)
    numeric_scaled_pca = np.concatenate((pca_coords, tempo_array), axis=1)
    X_PCA = pd.DataFrame(numeric_scaled_pca, columns=['col1', 'col2', 'col3'])

    # Name each principal component after the feature with the largest absolute loading
    component_names = [
        numeric_scaled.columns[np.argmax(np.abs(component))]
        for component in pca.components_
    ]
    print(f'The three main components are: {component_names[0]}, {component_names[1]}, and tempo.')

    # Option to pick the columns
    # X_PCA = pd.DataFrame(np.asarray(numeric_scaled[['tempo', 'key','valence']]), columns=(['col1', 'col2', 'col3']))

    # Get the suggested number of clusters (k) from the elbow method
    elbow_m = KElbowVisualizer(KMeans(n_init='auto'), k=15, show=False)
    elbow_m.fit(X_PCA)
    k = elbow_m.elbow_value_
    plt.close()
    if k is None:
        # No elbow detected: a single cluster reduces the ordering to a plain
        # nearest-neighbour chain instead of failing on n_clusters=None.
        print('No elbow found; treating all tracks as one cluster.')
        k = 1

    # Get cluster labels
    clusterer = AgglomerativeClustering(n_clusters=k)
    cluster_labels = clusterer.fit_predict(X_PCA)

    # Add clusters to dataframes
    X_PCA['clusters'] = cluster_labels
    numeric_scaled['clusters'] = cluster_labels

    # Show data points
    PCA_show = X_PCA.copy()
    PCA_show['name'] = tracklist_df['search']
    fig = px.scatter_3d(PCA_show, x='col1', y='col2', z='col3', color='clusters',
                        hover_name='name', hover_data=['clusters'])
    fig.show()

    # Start from a random track among the slowest ones; the pool shrinks for
    # short tracklists so the pick can never fall outside the available rows.
    slow_pool_size = min(20, len_tracklist)
    random_pick = np.random.randint(0, slow_pool_size)
    start_index = numeric_scaled.sort_values('tempo').index[:slow_pool_size][random_pick]

    # Chain every track from the start point through its nearest neighbours.
    # to_numpy() keeps the coordinates 2-D even when only one track is left.
    data = X_PCA.drop(columns=['clusters']).to_numpy()
    labels = X_PCA['clusters'].to_numpy()
    neighbor_list = build_neighbor_list(start_index, data, labels, len_tracklist, k)

    sorted_tracklist = tracklist_df.iloc[neighbor_list]
    sorted_tracklist_names = list(sorted_tracklist['search'].values)
    tracklist_ids = list(sorted_tracklist['id'].values)

    return sorted_tracklist_names, tracklist_ids

In [24]:
# Order the selected tracks for playback and preview the first ten labels.
tracklist, tracklist_ids = sort_tracklist(tracklist=truncated_tracklist)
tracklist[0:10]

The three main components are: instrumentalness, key, and tempo.


['go - goddard. Remix - Cat Burns',
 'Beat das Trevas - DJ MENOR DA ZN',
 'SUCCUMB - Dxrk ダーク',
 'Override - KSLV Noh',
 'Kamili - Francis Mercier',
 'The Best Part - gardenstate',
 'New Gold (feat. Tame Impala and Bootie Brown) - Dom Dolla Remix - Gorillaz',
 'Howl - Elderbrook',
 'Dance With Somebody - Sam Feldt',
 'Strong - Romy']

In [25]:
# Writing a playlist needs a user-authorised session (OAuth) carrying the
# private-playlist scope.
redirect_uri = 'https://example.com/callback'
scope = 'playlist-modify-private'

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=CLIENT_ID,
                                                client_secret=SECRET_KEY,
                                                redirect_uri=redirect_uri,
                                                scope=scope))

# Create a playlist
playlist_name = input("Playlist name?")
playlist_description = input("Playlist description?")
user_id = sp.current_user()['id']
playlist = sp.user_playlist_create(user_id, playlist_name, public=False, description=playlist_description)
playlist_id = playlist['id']

# Add tracks to the playlist.
# Entries without an ID are dropped first: a single missing ID in a batch
# would make the API reject the request.
playable_track_ids = [track_id for track_id in tracklist_ids if pd.notna(track_id)]
BATCH_SIZE = 100  # the loop sends at most this many tracks per request
for batch_start in range(0, len(playable_track_ids), BATCH_SIZE):
    # playlist_add_items replaces the deprecated user_playlist_add_tracks.
    sp.playlist_add_items(playlist_id, playable_track_ids[batch_start:batch_start + BATCH_SIZE])

print('Tracks added successfully!')

Tracks added successfully!
