In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Preparing a dataframe

In [None]:
# Load the pickled track table and filter out the playlists that are not
# analysed here, then renumber the rows from 0.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
excluded_playlists = [
    'No Playlist',
    'Songs to Sing in the Shower',
    'New Music Friday',  #Remove for the Mood playlist set
]
df = pd.read_pickle("dataframe_w_labels.pkl")
df = df[~df['PlaylistTitle'].isin(excluded_playlists)].reset_index(drop=True)

In [None]:
# Check which playlists are still included after filtering
df['PlaylistTitle'].unique()

Selecting musical features for the clustering

In [None]:
# Columns of `df` used as inputs for the clustering.
feature_columns = [
    'danceability', 'energy', 'loudness', 'acousticness',
    'instrumentalness', 'valence', 'tempo', 'duration_ms', 'NoFeaturing',
]
features = df[feature_columns]

Scaling the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max on the selected features, then apply the
# scaling; `x` holds the scaled feature values.
scaler = MinMaxScaler()
scaler.fit(features)
x = scaler.transform(features)

# PCA

In [None]:
#Choosing the optimal number of components

from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, only to inspect how the explained
# variance accumulates. The scaled array itself is passed to fit();
# `x.data` would hand over the array's raw memory buffer instead.
pca_t = PCA().fit(x)
cumulative_variance = np.cumsum(pca_t.explained_variance_ratio_)

# Start the x axis at 1 so that each point lines up with the number of
# components kept (plt.plot(y) alone would number them from 0).
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of PCA components')
plt.ylabel('Cumulative explained variance');
plt.savefig("cross_val_pca.pdf")

In [None]:
# Project the scaled features onto four principal components and wrap the
# scores in a DataFrame with one named column per component.
component_names = ['pc1', 'pc2', 'pc3', 'pc4']
pca = PCA(n_components=len(component_names))
principalComponents = pca.fit_transform(x)
xp = pd.DataFrame(principalComponents, columns=component_names)

## K-Means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
#Choosing the optimal number of clusters
from numpy import absolute

# scores[k] is the absolute value of KMeans.score() on the PCA scores for
# the cluster count stored in clusters[k].
scores = []
clusters=[]

for cl in range(1, 10):
    # A fixed seed makes the centroid initialisation, and therefore the
    # elbow curve, identical on every run.
    kmeans = KMeans(n_clusters=cl, random_state=42)
    kmeans.fit(xp)

    scores.append(absolute(kmeans.score(xp)))
    clusters.append(cl)
    

In [None]:
# Elbow curve: absolute KMeans score against the number of clusters,
# drawn on the current axes and saved as a PDF.
ax = plt.gca()
ax.plot(clusters, scores)
ax.set_xlabel('Number of KMeans clusters')
ax.set_ylabel('Score (absolute)')
plt.gcf().savefig("cross_val_km.pdf")

In [None]:
# Final model with 4 clusters. The fixed seed keeps the cluster numbering
# stable between runs; the per-cluster colour mapping and the
# playlist/cluster counts in this notebook are keyed on those label ids.
kmeans = KMeans(n_clusters=4, random_state=42)

In [None]:
# Fit the model on the PCA scores; the fitted labels give the cluster id
# assigned to each row of `xp`.
kmeans.fit(xp)
identified_clusters = kmeans.labels_

In [None]:
# New frame: every column of `df` plus the cluster id of each row.
df_clusters = df.assign(Clusters=identified_clusters)

In [None]:
# Number of rows for every (playlist, cluster) combination.
playlist_cluster_groups = df_clusters.groupby(['PlaylistTitle', 'Clusters'])
playlist_cluster_groups.size()

In [None]:
# 3D plot for playlists on Kmeans clusters
# One scatter series per cluster id, coloured via `cdict`, saved as a PDF.

u_labels = np.unique(identified_clusters)
fig= plt.figure(figsize=[5,5])
ax= fig.add_subplot(111, projection= '3d')
cdict = {0: 'orange', 1: 'green', 2: 'blue', 3: 'red'}

for i in u_labels:
    # Build the row mask once and reuse it for all three coordinates.
    in_cluster = identified_clusters == i
    ax.scatter(principalComponents[in_cluster, 0],
               principalComponents[in_cluster, 2],
               principalComponents[in_cluster, 1],
               label = i, c = cdict[i])

# The components are plotted in the order pc1, pc3, pc2; label the axes so
# the swapped order is visible in the figure.
ax.set_xlabel('pc1')
ax.set_ylabel('pc3')
ax.set_zlabel('pc2')

# Build the legend once, after every series has been added.
ax.legend()

fig.tight_layout()
fig.savefig("clusters_mood_km.pdf")

In [None]:
# PCA scores with the playlist title of each row attached
# (the column is aligned with `xp` on the row index).
xpf = xp.assign(PlaylistTitle=df['PlaylistTitle'])


In [None]:
# 3D plot for playlists on PCA
# One scatter series per playlist title, coloured via `cdict`, saved as a PDF.
fig= plt.figure(figsize=[5,6])
ax= fig.add_subplot(111, projection= '3d')
u_labels = np.unique(xpf['PlaylistTitle'])
column = xpf['PlaylistTitle']
cdict = {'Beast Mode': 'red',
         'Comfort Zone': 'blue',
         'Deep Focus': 'green',
         'Mood Booster': 'orange'}

for i in u_labels:
    # Plain numpy boolean mask, built once and reused for all three coordinates.
    in_playlist = (column == i).to_numpy()
    # Playlists missing from `cdict` fall back to matplotlib's default colour
    # (c=None) instead of raising KeyError.
    ax.scatter(principalComponents[in_playlist, 0],
               principalComponents[in_playlist, 2],
               principalComponents[in_playlist, 1],
               label = i,  c = cdict.get(i))

# The components are plotted in the order pc1, pc3, pc2; label the axes so
# the swapped order is visible in the figure.
ax.set_xlabel('pc1')
ax.set_ylabel('pc3')
ax.set_zlabel('pc2')

# Build the legend once, after every series has been added.
ax.legend()
fig.tight_layout()
fig.savefig("clusters_mood_title.pdf")