# Imports

In [75]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")

In [76]:
# Load the liked-tracks export and add an integer-coded companion column
# ("<name>_encoded") for each categorical column. The fitted encoders are kept
# in `label_encoders`, keyed by the original column name.
categorical_columns = ['Artist', 'Broad Genre']
df = pd.read_csv('liked_tracks.csv')
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in label_encoders.items():
    df[f'{col}_encoded'] = encoder.fit_transform(df[col])
df

Unnamed: 0,Track ID,Track Name,Artist,Album,Genre,Date Added,Popularity,Duration,Album Release Date,Tempo,Energy,Danceability,Year Added,Month Added,Day Added,Hour Added,Broad Genre,Artist_encoded,Broad Genre_encoded
0,5rAxhWcgFng3s570sGO2F8,A Place for My Head,Linkin Park,Hybrid Theory (Bonus Edition),alternative metal,2024-06-12 12:45:21-05:00,65,3.077333,2000,133.063,0.908,0.603,2024,6,12,12,Metal,296,13
1,0rNqDh9zWWJVTLS4VfceHP,Brickell,Feid,MANIFESTING 20-05,colombian pop,2024-06-11 17:57:50-05:00,79,3.006000,2024-04-10,93.037,0.890,0.805,2024,6,11,17,Pop,181,15
2,4kjI1gwQZRKNDkw1nI475M,MY EYES,Travis Scott,UTOPIA,rap,2024-06-10 20:04:56-05:00,79,4.187483,2023-07-28,119.043,0.621,0.455,2024,6,10,20,Hip Hop,526,9
3,6F9yAYUaNbUhdlQyt5uZ3b,La Incondicional,Luis Miguel,Busca Una Mujer,bolero,2024-06-07 21:34:11-05:00,73,4.283767,1988-11-25,155.127,0.727,0.374,2024,6,7,21,Latin,314,12
4,7alVaT3Dl9jsT1vzcUz6rj,El Día De Mi Suerte,Willie Colón,Greatest Hits,salsa,2024-06-06 09:02:55-05:00,38,5.479767,2008-02-26,97.097,0.855,0.560,2024,6,6,9,Latin,543,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,7Jzsc04YpkRwB1zeyM39wE,R U Mine?,Arctic Monkeys,AM,garage rock,2017-02-13 10:22:27-05:00,0,3.362217,2013-09-10,97.076,0.763,0.509,2017,2,13,10,Rock,31,18
1497,4kTd0TND65MUY4BlcmJ2cM,Why'd You Only Call Me When You're High?,Arctic Monkeys,AM,garage rock,2017-02-13 10:20:57-05:00,0,2.685550,2013-09-10,91.989,0.627,0.698,2017,2,13,10,Rock,31,18
1498,2x8evxqUlF0eRabbW2JBJd,Fluorescent Adolescent,Arctic Monkeys,Favourite Worst Nightmare,garage rock,2017-02-13 10:16:12-05:00,72,3.064883,2007-04-22,112.056,0.828,0.654,2017,2,13,10,Rock,31,18
1499,6nFvbLWccsEydO36fyBBlm,Ruby Tuesday - Stereo Version,The Rolling Stones,Between The Buttons (Remastered),album rock,2017-02-13 10:15:40-05:00,0,3.274883,1967-01-20,104.528,0.543,0.518,2017,2,13,10,Rock,504,18


In [77]:
from sklearn.impute import KNNImputer

# Numeric columns used both for imputation and, later, for clustering.
# .copy() makes `features` an independent frame: the original assigned into a
# column selection of `df`, which is chained assignment (the resulting
# SettingWithCopyWarning was hidden by the global warnings filter).
features = df[['Tempo', 'Energy', 'Danceability', 'Year Added', 'Month Added', 'Day Added', 'Hour Added']].copy()

# Fill missing values from the 3 nearest rows, measured across all selected columns.
imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(features)
# Reuse the source index so the write-back below aligns row-for-row even if
# `df` does not carry a default 0..n-1 index.
features_imputed_df = pd.DataFrame(imputed_values, columns=features.columns, index=features.index)
# Only the audio columns are written back; the "added" date parts keep their
# original values.
features[['Tempo', 'Energy', 'Danceability']] = features_imputed_df[['Tempo', 'Energy', 'Danceability']]

# Is Clustering a better way to make Playlists?

In [78]:
def KMeans_clustering(features, k_values=range(5, 30), random_state=None):
    """Cluster ``features`` with K-Means, picking k by silhouette score.

    Every candidate k is fitted once; the labels of the best-scoring fit are
    returned directly instead of refitting the winning k a second time.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Numeric feature matrix. It is used as given; no scaling is applied
        here.
    k_values : iterable of int, optional
        Candidate cluster counts. Defaults to ``range(5, 30)``. Values outside
        ``2 .. n_samples - 1`` are skipped because the silhouette score is not
        defined for them.
    random_state : int or None, optional
        Seed shared by every K-Means fit. ``None`` (the default) draws a
        random seed in ``[0, 100)`` per call, so results vary between runs;
        pass an int for reproducible clusters.

    Returns
    -------
    numpy.ndarray
        Cluster label for each row of ``features``.

    Raises
    ------
    ValueError
        If no candidate k is usable for the given number of samples.
    """
    n_samples = features.shape[0]
    candidate_ks = [k for k in k_values if 2 <= k <= n_samples - 1]
    if not candidate_ks:
        raise ValueError(
            f'No usable number of clusters for {n_samples} samples; '
            'each k must satisfy 2 <= k <= n_samples - 1'
        )

    if random_state is None:
        random_state = np.random.randint(100)

    best_score = -1
    best_k = -1
    best_labels = None
    for k in candidate_ks:
        cluster = KMeans(n_clusters=k, random_state=random_state)
        cluster_labels = cluster.fit_predict(features)
        score = silhouette_score(features, cluster_labels)
        # `best_labels is None` guarantees the first candidate is recorded
        # even if its score does not beat the -1 sentinel.
        if best_labels is None or score > best_score:
            best_score = score
            best_k = k
            best_labels = cluster_labels
    print(f'The best k is {best_k} with a silhouette score of {best_score}')

    return best_labels

In [79]:
# Cluster the tracks, store each track's cluster id on the main frame,
# write the labelled frame to disk, and display it.
cluster_labels = KMeans_clustering(features)
df['Cluster Labels'] = cluster_labels
df.to_csv('Clusters_knn.csv')
df

The best k is 7 with a silhouette score of 0.2675773617683579


Unnamed: 0,Track ID,Track Name,Artist,Album,Genre,Date Added,Popularity,Duration,Album Release Date,Tempo,Energy,Danceability,Year Added,Month Added,Day Added,Hour Added,Broad Genre,Artist_encoded,Broad Genre_encoded,Cluster Labels
0,5rAxhWcgFng3s570sGO2F8,A Place for My Head,Linkin Park,Hybrid Theory (Bonus Edition),alternative metal,2024-06-12 12:45:21-05:00,65,3.077333,2000,133.063,0.908,0.603,2024,6,12,12,Metal,296,13,4
1,0rNqDh9zWWJVTLS4VfceHP,Brickell,Feid,MANIFESTING 20-05,colombian pop,2024-06-11 17:57:50-05:00,79,3.006000,2024-04-10,93.037,0.890,0.805,2024,6,11,17,Pop,181,15,1
2,4kjI1gwQZRKNDkw1nI475M,MY EYES,Travis Scott,UTOPIA,rap,2024-06-10 20:04:56-05:00,79,4.187483,2023-07-28,119.043,0.621,0.455,2024,6,10,20,Hip Hop,526,9,4
3,6F9yAYUaNbUhdlQyt5uZ3b,La Incondicional,Luis Miguel,Busca Una Mujer,bolero,2024-06-07 21:34:11-05:00,73,4.283767,1988-11-25,155.127,0.727,0.374,2024,6,7,21,Latin,314,12,0
4,7alVaT3Dl9jsT1vzcUz6rj,El Día De Mi Suerte,Willie Colón,Greatest Hits,salsa,2024-06-06 09:02:55-05:00,38,5.479767,2008-02-26,97.097,0.855,0.560,2024,6,6,9,Latin,543,12,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,7Jzsc04YpkRwB1zeyM39wE,R U Mine?,Arctic Monkeys,AM,garage rock,2017-02-13 10:22:27-05:00,0,3.362217,2013-09-10,97.076,0.763,0.509,2017,2,13,10,Rock,31,18,6
1497,4kTd0TND65MUY4BlcmJ2cM,Why'd You Only Call Me When You're High?,Arctic Monkeys,AM,garage rock,2017-02-13 10:20:57-05:00,0,2.685550,2013-09-10,91.989,0.627,0.698,2017,2,13,10,Rock,31,18,1
1498,2x8evxqUlF0eRabbW2JBJd,Fluorescent Adolescent,Arctic Monkeys,Favourite Worst Nightmare,garage rock,2017-02-13 10:16:12-05:00,72,3.064883,2007-04-22,112.056,0.828,0.654,2017,2,13,10,Rock,31,18,6
1499,6nFvbLWccsEydO36fyBBlm,Ruby Tuesday - Stereo Version,The Rolling Stones,Between The Buttons (Remastered),album rock,2017-02-13 10:15:40-05:00,0,3.274883,1967-01-20,104.528,0.543,0.518,2017,2,13,10,Rock,504,18,6


In [86]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Spotify app credentials. They are read from the environment first so real
# secrets never have to be typed into (and saved with) the notebook. The
# placeholder strings are only fallbacks: if the variables are not set,
# replace them with the values from your Spotify developer settings.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'look for this on the spotify developer settings')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'look for this on the spotify developer settings')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost')

# leave the scope and cache the same
scope = 'user-library-read playlist-modify-public playlist-modify-private user-top-read'
cache_path = '.cache-spotify'

# Authenticated client used by the playlist-creation code below; the OAuth
# token is cached at `cache_path`.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri=redirect_uri,
                                               scope=scope,
                                               cache_path=cache_path,
                                               requests_timeout=30))
# Creates one Spotify playlist per cluster label and fills it with that cluster's tracks.
def create_playlist(sp, clustered_tracks, label_column='Cluster Labels',
                    public=True, description='Automatic Clusters'):
    """Create one playlist per group of ``clustered_tracks`` and add its tracks.

    Parameters
    ----------
    sp : object
        Client exposing ``current_user()``, ``user_playlist_create(...)`` and
        ``playlist_add_items(...)`` (the notebook passes a ``spotipy.Spotify``).
    clustered_tracks : pandas.DataFrame
        Must contain a ``'Track ID'`` column and the ``label_column`` column.
    label_column : str, optional
        Column to group by; each distinct value becomes a playlist named
        ``str(value)``. Defaults to ``'Cluster Labels'``.
    public : bool, optional
        Visibility passed to ``user_playlist_create``. Defaults to ``True``.
    description : str, optional
        Description passed to ``user_playlist_create``.

    Notes
    -----
    Rows with a missing ``'Track ID'`` are skipped, and no playlist is created
    for a group that has no valid track IDs.
    """
    user_id = sp.current_user()['id']
    for label, tracks in clustered_tracks.groupby(label_column):
        # A missing ID (NaN/None) is not a valid track reference, so drop it
        # before talking to the API.
        track_ids = tracks['Track ID'].dropna().tolist()
        if not track_ids:
            continue
        playlist = sp.user_playlist_create(user_id, str(label), public=public, description=description)
        playlist_id = playlist['id']
        # Tracks are sent in batches of at most 100 per request.
        for start in range(0, len(track_ids), 100):
            sp.playlist_add_items(playlist_id, track_ids[start:start + 100])

In [87]:
# Push the clustered tracks to Spotify: calls create_playlist with the
# authenticated client `sp` and the labelled frame `df`. This makes live API
# calls that create playlists on the authenticated user's account.
create_playlist(sp, df)