In [None]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns

from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator

# Apply seaborn's default theme to the matplotlib figures drawn in this notebook.
sns.set()

In [None]:
# Data downloaded from Kaggle: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

## Principal Component Analysis (PCA)

In [None]:
# Columns of the raw track table that are not used as clustering features.
non_features = ['year', 'artists', 'duration_ms', 'explicit','id','key','mode','name','release_date','popularity']

# Read the CSV and discard those columns in a single chained expression.
df_X = pd.read_csv("data.csv").drop(columns=non_features)
df_X.head()

In [None]:
# Pin the nine audio features (and their column order) used for PCA and clustering.
# .copy() makes df_X an independent frame: a 'Cluster' column is added to it further
# down, and a plain column-subset can trigger SettingWithCopyWarning on such writes.
df_X = df_X[['acousticness','danceability','energy','instrumentalness','liveness','loudness','speechiness','tempo','valence']].copy()
# Speed up testing: uncomment to work on a reproducible 5% random sample.
# (The previous hint sampled from `df`, a name that is never defined in this notebook.)
#df_X = df_X.sample(frac=0.05, random_state=42)
df_X.shape

In [None]:
# Standardise each feature column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(df_X)
X_std = scaler.transform(df_X)

In [None]:
# Fit a PCA with no component limit so the explained variance of every
# component can be inspected below (fit() returns the estimator itself).
pca = PCA().fit(X_std)

In [None]:
# explained_variance_ratio_ holds the fraction of the total variance captured by each
# principal component (one entry per fitted component), not by each original feature.
evr = pca.explained_variance_ratio_
evr

In [None]:
# Plot cumulative explained variance against the number of principal components kept.
fig = plt.figure(figsize=(10,8))
# Size the x-axis from `evr` itself rather than from df_X.columns: df_X gains a
# 'Cluster' column further down, so re-running this cell afterwards would otherwise
# fail with an x/y length mismatch.
plt.plot(range(1, len(evr)+1), evr.cumsum(), marker='o', linestyle='--')
plt.xlabel('Number of Components', fontsize=18)
plt.ylabel('Cumulative Explained Variance',fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Save through the Figure handle. plt.savefig() returns None, so the previous
# `fig = plt.savefig(...)` threw away the figure reference.
# Assumes the ./visualizations directory already exists.
fig.savefig("./visualizations/cumulative_variance_plot.png")
plt.show()

In [None]:
# Smallest number of leading components whose cumulative explained variance
# reaches the 80% threshold (components are counted from 1).
n_comps = next(
    count
    for count, cumulative in enumerate(evr.cumsum(), start=1)
    if cumulative >= 0.8
)
print("Number of components:", n_comps)

# Refit PCA restricted to that many components and project the standardised data.
pca = PCA(n_components=n_comps)
scores_pca = pca.fit(X_std).transform(X_std)

## K-Means Clustering

Finding the elbow point of the WCSS (within-cluster sum of squares) curve using the Yellowbrick `KElbowVisualizer`.

In [None]:
#Optimal n_cluster: version 1
#visualizer = KElbowVisualizer(KMeans(init='k-means++', random_state=42), k=(1,21), timings=False)
#visualizer.fit(scores_pca)
#visualizer.show()
#n_clusters = visualizer.elbow_value_
#print("Optimal number of clusters:", n_clusters)

Finding the elbow point of the WCSS (within-cluster sum of squares) curve using the kneed `KneeLocator`.

In [None]:
#Optimal n_cluster: version 2
#wcss = []
#max_clusters = 21
#for i in range(1, max_clusters):
#    kmeans_pca = KMeans(i, init='k-means++', random_state=42)
#    kmeans_pca.fit(scores_pca)
#    wcss.append(kmeans_pca.inertia_)
#n_clusters = KneeLocator([i for i in range(1, max_clusters)], wcss, curve='convex', direction='decreasing').knee
#print("Optimal number of clusters", n_clusters)

#fig = plt.figure(figsize=(10,8))
#plt.plot(range(1, 21), wcss, marker='o', linestyle='--')
#plt.vlines(KneeLocator([i for i in range(1, max_clusters)], wcss, curve='convex', direction='decreasing').knee, ymin=min(wcss), ymax=max(wcss), linestyles='dashed')
#plt.xlabel('Number of Clusters', fontsize=18)
#plt.ylabel('Within Cluster Sum of Squares (WCSS)', fontsize=18)
#plt.xticks(fontsize=16)
#plt.yticks(fontsize=16)
#fig.savefig("./visualizations/num_clusters.png")
#plt.show()


In [None]:
# Number of K-Means clusters, fixed by hand; both elbow-search cells that would
# compute this value are commented out.
# NOTE(review): presumably 6 came from an earlier run of that search -- confirm.
n_clusters = 6

In [None]:
# Cluster the PCA scores; random_state is fixed so the labels are reproducible.
# fit() returns the estimator, so construction and fitting are chained.
kmeans_pca = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    random_state=42,
).fit(scores_pca)

## Analysis and Visualization

In [None]:
# Combine the original features, the PCA scores and the K-Means label into one frame.
# The component columns are named when their frame is built. Writing into
# `columns.values[...]` mutates the Index's backing array in place, which pandas does
# not support and which can leave label lookups such as df['Component 1'] stale.
component_names = ["Component " + str(i+1) for i in range(n_comps)]
df_scores = pd.DataFrame(scores_pca, columns=component_names)
# reset_index(drop=True) lines df_X up row-by-row with the freshly built scores frame.
df_seg_pca_kmeans = pd.concat([df_X.reset_index(drop=True), df_scores], axis=1)
df_seg_pca_kmeans['Cluster'] = kmeans_pca.labels_
df_seg_pca_kmeans.head()

In [None]:
# Same five-row preview as the previous cell (head() already defaults to 5 rows).
df_seg_pca_kmeans.head(5)

In [None]:
# Scatter the tracks in the plane of the first two principal components,
# coloured by their K-Means cluster.
x = df_seg_pca_kmeans['Component 2']
y = df_seg_pca_kmeans['Component 1']
fig = plt.figure(figsize=(10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data arguments,
# while the keyword form works on older releases as well.
# The palette lists one colour per cluster, so it has to be extended if n_clusters changes.
sns.scatterplot(
    x=x,
    y=y,
    hue=df_seg_pca_kmeans['Cluster'],
    palette=['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'goldenrod', 'tab:cyan'],
)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Component 2", fontsize=18)
plt.ylabel("Component 1", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()
#fig.savefig("./visualizations/clusters-2d.png")

In [None]:
# Attach the cluster label to the feature frame by position, not by index label.
# df_seg_pca_kmeans was built from df_X with its index reset, so a label-aligned
# assignment would produce NaN / misassigned labels whenever df_X does not carry
# a default 0..n-1 index (for example after sampling rows).
df_X['Cluster'] = df_seg_pca_kmeans['Cluster'].to_numpy()
df_X

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# One radar (polar) chart per cluster on a 2x4 grid, i.e. room for at most 8 clusters.
fig = make_subplots(rows=2, cols=4, specs=[[{'type': 'polar'}]*4]*2)

# The nine audio features are the first nine columns of df_X ('Cluster' follows them).
feature_cols = list(df_X.columns[0:9])
angles = feature_cols + [feature_cols[0]]
# Short axis labels in feature order; the first label is repeated to close the polygon.
abbrev = ["A", "D", "E", "I", "Li", "Lo", "S", "T", "V", "A"]

# Rescale every feature to [0, 1] before averaging so that all of them fit the fixed
# radial range used below.
# NOTE(review): presumably loudness and tempo are not on a 0-1 scale in the raw data -- confirm.
radar_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(df_X[feature_cols]),
    columns=feature_cols,
    index=df_X.index,
)
# Mean scaled feature vector per cluster (rows: cluster id, columns: feature).
cluster_profiles = radar_scaled.groupby(df_X['Cluster']).mean()

layoutdict = dict(
            radialaxis=dict(
            visible=True,
            range=[0, 1]
            ))

for i in range(n_clusters):
    # Use exactly the nine feature columns. The previous `subset.columns[4:]` slice
    # skipped the first four features and pulled in the 'Cluster' label itself,
    # giving 7 radii for the 10 theta labels.
    data = cluster_profiles.loc[i].tolist()
    data.append(data[0])  # repeat the first value to close the polygon
    fig.add_trace(
        go.Scatterpolar(
            r=data,
            theta=abbrev,
            fill='toself',
            name="Cluster " + str(i)),
        row=i // 4 + 1,
        col=i % 4 + 1)

# Apply the same radial axis settings to all eight polar subplots
# (plotly names them polar, polar2, ..., polar8).
polar_axes = {("polar" if k == 1 else "polar" + str(k)): layoutdict for k in range(1, 9)}
fig.update_layout(showlegend=True, **polar_axes)
fig.show()
#     fig.write_image('./visualizations/cluster' + str(i) + '.png')

In [None]:
# Single radar (polar) chart overlaying the mean feature profile of every cluster.
fig = go.Figure()

# The nine audio features are the first nine columns of df_X ('Cluster' follows them).
feature_cols = list(df_X.columns[0:9])
# Axis labels: the feature names, with the first one repeated to close the polygon.
angles = feature_cols + [feature_cols[0]]

# Rescale every feature to [0, 1] before averaging so that all of them fit the fixed
# radial range used below.
# NOTE(review): presumably loudness and tempo are not on a 0-1 scale in the raw data -- confirm.
radar_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(df_X[feature_cols]),
    columns=feature_cols,
    index=df_X.index,
)
# Mean scaled feature vector per cluster (rows: cluster id, columns: feature).
cluster_profiles = radar_scaled.groupby(df_X['Cluster']).mean()

layoutdict = dict(
            radialaxis=dict(
            visible=True,
            range=[0, 1]
            ))

for i in range(n_clusters):
    # Use exactly the nine feature columns. The previous `subset.columns[4:]` slice
    # skipped the first four features and pulled in the 'Cluster' label itself,
    # giving 7 radii for the 10 theta labels.
    data = cluster_profiles.loc[i].tolist()
    data.append(data[0])  # repeat the first value to close the polygon
    fig.add_trace(go.Scatterpolar(
        r=data,
        theta=angles,
        fill='toself',
        name="Cluster " + str(i)))

fig.update_layout(
        polar=layoutdict,
        showlegend=True
        )
fig.show()
#     fig.write_image('./visualizations/cluster' + str(i) + '.png')

In [None]:
def inspect(df, param, cluster):
    """Bar-plot one feature for every track of one cluster.

    Parameters
    ----------
    df : DataFrame with a 'Cluster' column and a column named `param`.
    param : name of the column to plot.
    cluster : cluster label whose rows are plotted.

    A horizontal line marks the mean of `param` over all rows of `df`
    (not just the selected cluster), for comparison.
    """
    values = df.loc[df['Cluster'] == cluster, param]
    n_tracks = len(values)
    overall_mean = df[param].mean()

    plt.figure(figsize=(10,8))
    plt.bar(np.arange(n_tracks), values)
    plt.xlabel('Track #', fontsize=18)
    plt.ylabel(param.title(), fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.hlines(overall_mean, 0, n_tracks)
    plt.show()

inspect(df_X, "liveness", 4)

In [None]:
# Grid of bar charts: one row per selected cluster, one column per selected feature.
clusters = [2, 4, 5]
features = ["acousticness", "speechiness", "instrumentalness"]
colors = ['tab:green', 'tab:olive', 'tab:cyan']
dim = len(clusters)

fig, axes = plt.subplots(dim, dim, figsize=(24, 12))
# Spacing applies to the whole figure, so it is set once instead of per panel.
plt.subplots_adjust(wspace=.5, hspace=.5)

# Rows of df_X belonging to each selected cluster, filtered once per cluster.
members = {label: df_X.loc[df_X['Cluster'] == label] for label in clusters}

for panel, ax in enumerate(axes.flatten()):
    # Panels are visited row by row: row index -> cluster, column index -> feature.
    row, column = divmod(panel, dim)
    label = clusters[row]
    col = features[column]
    heights = members[label][col]
    ax.bar(np.arange(len(heights)), heights, color=colors[row])
    ax.set_ylabel(col, fontsize=14)
    ax.set_title("Cluster " + str(label), fontsize=16)
    # Reference line: mean of the feature over all tracks, not just this cluster.
    ax.hlines(df_X[col].mean(), 0, len(heights))
    
#fig.savefig("./visualizations/cluster_feature_bar_charts_dim_3.png")

In [None]:
# Size and first 50 rows of cluster 5.
in_cluster_5 = df_X['Cluster'] == 5
preview = df_X[in_cluster_5]
print(preview.shape[0])
preview.head(50)