In [115]:
!pip install spotipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [116]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns
import spotipy
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from spotipy.oauth2 import SpotifyClientCredentials
warnings.filterwarnings("ignore", category=FutureWarning)

In [117]:
# Set up Spotify API credentials
# The client id/secret are read from the environment so that no secret is stored
# in the notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running;
# a missing variable fails fast with a KeyError naming it.
# NOTE(review): the key pair that used to be hardcoded here has been exposed and
# should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Define function to get audio features for a song
def get_audio_features(track_id):
    """Fetch the audio-features entry for a single track.

    Sends a one-element request through the module-level ``sp`` client and
    returns the first item of the list that ``sp.audio_features`` gives back.

    Args:
        track_id: Identifier of the track, passed through to the client as-is.

    Returns:
        The first element of the client's response list.
    """
    return sp.audio_features(tracks=[track_id])[0]

# Define function to get all tracks from a playlist
def get_playlist_tracks(username, playlist_id):
    """Collect every item of a playlist together with the playlist's name.

    Uses the module-level ``sp`` client: one call for the playlist details
    (to read its ``'name'``), then the paged track listing, following the
    ``'next'`` link until it is falsy.

    Args:
        username: Owner of the playlist, forwarded to the client.
        playlist_id: Identifier of the playlist, forwarded to the client.

    Returns:
        A ``(tracks, playlist_name)`` tuple, where ``tracks`` is the list of
        ``'items'`` entries accumulated over all pages.
    """
    # The playlist details are only needed for the display name
    playlist_name = sp.user_playlist(username, playlist_id)['name']

    # Start from the first page and keep appending until no further page is advertised
    page = sp.user_playlist_tracks(username, playlist_id)
    collected = page['items']
    while page['next']:
        page = sp.next(page)
        collected.extend(page['items'])

    return collected, playlist_name


In [118]:
# Get all tracks from the playlist
username = 'spotify'
playlist_id = '37i9dQZF1DX4o1oenSJRJd'
#playlist_id = '1yG9WLgaK7JgRKCzd1Llvw'
tracks, playlist_name = get_playlist_tracks(username, playlist_id)

# Build one record per track in a single pass so that the audio features, the
# track name and the artist list always stay on the same row.
# Playlist items can carry no usable track (e.g. removed or local files have a
# null 'track' or a null id) and the audio-features lookup can come back as
# None; such entries are skipped instead of crashing or misaligning the columns.
records = []
for item in tracks:
    track = item.get('track')
    if not track or not track.get('id'):
        continue
    track_features = get_audio_features(track['id'])
    if track_features is None:
        continue
    record = dict(track_features)
    # Add track name and artists after the audio features (same column order as before)
    record['name'] = track['name']
    record['artists'] = [artist['name'] for artist in track['artists']]
    records.append(record)
df = pd.DataFrame(records)

In [119]:
#df.sample(10)

In [139]:
# Columns fed to both the clustering and the PCA steps
features = [
    'danceability', 'energy', 'speechiness', 'acousticness',
    'liveness', 'valence', 'tempo', 'loudness',
]
#features = ['energy', 'acousticness', 'valence']

# Standardise the selected columns with StandardScaler.
# NOTE(review): dfn is bound to the very same object as df (no copy is made),
# so the assignment below also replaces the raw feature values in df.
dfn = df
scaler = StandardScaler()
dfn[features] = scaler.fit_transform(df[features])

In [158]:
# Determine the optimal number of clusters using the silhouette score

# One seed for every KMeans fit in this cell, so the clustering that is finally
# assigned is the same one that was scored, and re-runs give the same result.
RANDOM_STATE = 1

# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# candidate range (2..24) is capped for short playlists instead of raising.
candidate_ks = list(range(2, min(25, len(dfn))))
if not candidate_ks:
    raise ValueError("Need at least 3 tracks to choose a cluster count by silhouette score")

silhouette_scores = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
    preds = kmeans.fit_predict(dfn[features])
    score = silhouette_score(dfn[features], preds)
    silhouette_scores.append(score)

# Look the winner up in the candidate list rather than relying on a "+ 2" offset;
# argmax returns the first maximum, i.e. the smallest k among ties.
optimal_k = candidate_ks[int(np.argmax(silhouette_scores))]

# Fit the data into KMeans with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=RANDOM_STATE)
df['cluster'] = kmeans.fit_predict(dfn[features])

In [159]:
# Reduce the selected features into three dimensions using PCA
pca = PCA(n_components=3)
df_pca = pca.fit_transform(dfn[features])
# Carry over dfn's index: concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index here would produce NaN-padded rows whenever dfn's index differs.
df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2', 'PC3'], index=dfn.index)

# Combine the cluster labels and PCA results into one dataframe
df_plot = pd.concat([dfn, df_pca], axis=1)

In [160]:
# 3D scatter of three of the clustering features, coloured by cluster label
fig = px.scatter_3d(
    df_plot,
    x='energy',
    y='acousticness',
    z='valence',
    color='cluster',
    hover_data=['name', 'artists'] + features,
)

# Fix the canvas size and place a centred title near the top of the figure
fig.update_layout(
    width=1200,
    height=1000,
    title=dict(
        text=f"3D Scatter Plot for Playlist: {playlist_name}",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
    ),
)

# Render the figure
fig.show()

In [161]:


# Save the current figure as a standalone interactive HTML page and open it
fig.write_html(file='spotify_clusters.html', auto_open=True)


In [162]:


# Plot the tracks on the first three principal components, coloured by cluster
fig = px.scatter_3d(
    df_plot, x='PC1', y='PC2', z='PC3', color='cluster',
    hover_data=['name', 'artists'] + features,
)

# adjust the size of the figure and add a centred title so the figure (and the
# HTML exported from it) identifies the playlist and the projection on its own
fig.update_layout(
    width=1200,
    height=1000,
    title={
        'text': f"3D PCA Projection for Playlist: {playlist_name}",
        'x': 0.5,
        'y': 0.95,
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

# show the plot
fig.show()

In [163]:
# Build the hover text from the same column list the figure was created with
# (hover_data=['name', 'artists'] + features), so every label is paired with the
# customdata slot that actually holds that column. A hand-written list drifts as
# soon as `features` changes (it previously labelled liveness as
# "Instrumentalness", valence as "Liveness" and tempo as "Valence").
hover_lines = [
    "Name: %{customdata[0]}",
    "Artists: %{customdata[1]}",
]
# Feature columns start at customdata[2], right after name and artists
hover_lines += [
    f"{column.capitalize()}: %{{customdata[{position}]:.3f}}"
    for position, column in enumerate(features, start=2)
]
fig.update_traces(hovertemplate="<br>".join(hover_lines))

In [164]:

# Save the current figure as a standalone interactive HTML page and open it
fig.write_html(file='spotify_PCA.html', auto_open=True)


In [165]:
# List the tracks of each cluster; cluster labels are shown 1-based
grouped_df = df.groupby('cluster')
for cluster_label, members in grouped_df:
    print(f"Cluster {cluster_label+1}:")
    # Walk the two columns in parallel instead of materialising each row
    for title, performers in zip(members['name'], members['artists']):
        print(f"- {title} by {', '.join(performers)}")
    print()

Cluster 1:
- Seven Nation Army by The White Stripes
- Viva La Vida by Coldplay
- The Scientist by Coldplay
- Fix You by Coldplay
- Pon de Replay by Rihanna
- Clocks by Coldplay
- Chasing Cars by Snow Patrol
- If I Ain't Got You by Alicia Keys
- Back To Black by Amy Winehouse
- She Will Be Loved - Radio Mix by Maroon 5
- Take Me Out by Franz Ferdinand
- Hey There Delilah by Plain White T's
- Alors on danse - Radio Edit by Stromae
- Dilemma by Nelly, Kelly Rowland
- My Boo by Usher, Alicia Keys
- Grenade by Bruno Mars
- So Sick by Ne-Yo
- I Knew You Were Trouble. by Taylor Swift
- Put Your Records On by Corinne Bailey Rae
- Miss Independent by Ne-Yo
- Don't Know Why by Norah Jones
- Here Without You by 3 Doors Down
- Hung Up by Madonna
- You're Beautiful by James Blunt
- I Miss You by blink-182
- Halo by Beyoncé
- Bleeding Love by Leona Lewis
- Valerie - Live At BBC Radio 1 Live Lounge, London / 2007 by Amy Winehouse

Cluster 2:
- Umbrella by Rihanna, JAY-Z
- Lose Yourself - From "8 Mile

In [166]:


# Use elbow method to determine the optimal number of clusters
#wcss = []
#for i in range(1, 26):
#    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
#    kmeans.fit(df[features])
#    wcss.append(kmeans.inertia_)
#plt.plot(range(1, 26), wcss)
#plt.title('Elbow Method')
#plt.xlabel('Number of clusters')
#plt.ylabel('WCSS')
#plt.show()
