In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# read data
# Load the comma-separated Spotify CSV into `df`, tag the frame with its
# source file name, and display (rows, columns) as the cell output.
csv_path = '/kaggle/input/spotify-top-2000s-mega-dataset/Spotify-2000.csv'
df = pd.read_csv(csv_path, sep=',')
df.dataframeName = 'Spotify-2000.csv'
df.shape

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# convert relevant categorical variables to dummy variables
# One-hot encode the two categorical columns, attach the indicator columns to
# the original frame, then drop the raw categoricals together with the columns
# that are not used as clustering features.
categorical_cols = ['Artist', 'Top Genre']
unused_cols = ['Title', 'Year', 'Index', 'Length (Duration)']

temp_df = pd.get_dummies(df[categorical_cols])
relevant_df = df.join(temp_df, how='left')
relevant_df = relevant_df.drop(columns=categorical_cols + unused_cols)
relevant_df.shape

In [None]:
# perform PCA with 20 components
# Standardise the feature matrix first so that columns on larger numeric
# scales do not dominate the principal components.
song_std = StandardScaler().fit_transform(relevant_df)

# random_state is fixed (same seed as the KMeans cells) because PCA may select
# a randomized SVD solver for an input this wide; without a seed the components
# - and every clustering built on them - could differ from run to run.
pca = PCA(n_components=20, random_state=1000)
principalComponents = pca.fit_transform(song_std)

# Wrap the projected data in a DataFrame; this is what the clustering cells use.
pca_df = pd.DataFrame(principalComponents)

In [None]:
# calculate distances for different k values
# For each candidate k, fit k-means on the PCA-reduced data and record the
# inertia of the fitted model.
K = range(2,10)
distortions = [
    KMeans(n_clusters=k, random_state=1000).fit(pca_df).inertia_
    for k in K
]

In [None]:
# plot elbow graph
# Inertia against k, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# use silhouette method to determine optimal k value
# For each candidate k, cluster the PCA-reduced data and record the mean
# silhouette coefficient of the resulting labelling.
sil = []
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=1000)
    labels = kmeanModel.fit_predict(pca_df)
    # Score in the same feature space the clusters were fitted in (pca_df).
    # Scoring against the raw, unscaled feature frame would measure distances
    # the clustering never saw and make the scores meaningless for choosing k.
    sil.append(silhouette_score(pca_df, labels, metric='euclidean'))

In [None]:
# plot silhouette graph
# Silhouette score against k, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(K, sil, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Silhouette Score')
ax.set_title('The Silhouette Method showing the optimal k')
plt.show()

The elbow method was inconclusive, so we tried the silhouette method to determine the optimal k value.
The results were again disappointing, with the maximum score appearing at k=2.

In [None]:
# perform k-means with k=2
# Fit the final two-cluster model on the PCA-reduced data, then rebuild
# `relevant_df` from the original frame (without Index/Year) and attach each
# song's cluster label.
songs_kmeans = KMeans(n_clusters=2, random_state=1000)
songs_kmeans.fit(pca_df)

relevant_df = df.drop(columns=['Index', 'Year'])
relevant_df['kmeans'] = songs_kmeans.labels_

In [None]:
# Per-cluster mean of every numeric feature.
# numeric_only=True is required: relevant_df still contains text columns
# (Title, Artist, Top Genre), and recent pandas versions raise a TypeError
# instead of silently skipping them when averaging.
kmeans_stats = relevant_df.groupby(['kmeans']).mean(numeric_only=True)

# normalize
# Centre each feature on the average of the cluster means and divide by the
# spread between clusters, so all features share one comparable scale.
kmeans_statmeans = kmeans_stats.mean(axis=0)
kmeans_range = kmeans_stats.max(axis=0) - kmeans_stats.min(axis=0)
kmeans_statnorm = (kmeans_stats - kmeans_statmeans) / kmeans_range

# make plot
# A single axes is enough; a 2x1 grid would leave an empty panel above the chart.
fig, axis = plt.subplots(figsize=(14, 7))
kmeans_statnorm.iloc[:, :300].plot.bar(ax=axis).legend(loc='lower left')
axis.set_xlabel('kmeans cluster')
axis.set_ylabel('Normalized cluster mean')
axis.set_title('Normalized feature means per k-means cluster');

In [None]:
# relevant_df['type'] = 'NA'
# relevant_df.loc[(relevant_df['kmeans']==0),'type8'] = 'Regular'
# relevant_df.loc[(relevant_df['kmeans']==1),'type8'] = 'Ballad'

# sort and write the results to relevant file
# Keep the identifying columns, the cluster label and a subset of the audio
# features, order the rows by cluster, save to CSV and display the frame.
kmeans_export_cols = [
    'Title', 'Artist', 'Top Genre', 'kmeans',
    'Beats Per Minute (BPM)', 'Energy', 'Danceability',
    'Loudness (dB)', 'Valence', 'Acousticness',
]
relevant_df_sorted = relevant_df[kmeans_export_cols].sort_values(by=['kmeans'])
relevant_df_sorted.to_csv('songClusterKMeans.csv', index=True)
relevant_df_sorted

In [None]:
# calculate nearest neighbors
# Fit a 2-nearest-neighbour index on the PCA-reduced data and query it with
# the same points to get each point's neighbour distances and indices.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(pca_df)
neighbor_query = nbrs.kneighbors(pca_df)
distances, indices = neighbor_query

In [None]:
# sort distances and plot graph to determine epsilon value
# Take the second neighbour column (the first column is presumably each
# point's zero distance to itself, since the index was queried with its own
# training points) and sort it ascending. The result goes into a new variable
# so `distances` keeps its 2-D shape and this cell can be re-run safely.
sorted_nn_distances = np.sort(distances[:, 1])

fig, ax = plt.subplots()
# The grid flag is passed positionally: the old `b=` keyword was renamed to
# `visible=` and later removed from matplotlib, while the positional form
# works on every version.
ax.grid(True, which='major', color='#666666', linestyle='-')
ax.plot(sorted_nn_distances)
ax.set_xlabel('Points sorted by distance')
ax.set_ylabel('Distance to nearest neighbor')
ax.set_title('Sorted nearest-neighbor distances');

We can see from the graph that 3 is a good value for epsilon (`eps`).
I also decided to set the minimum number of samples (`min_samples`) to 50, based on domain knowledge.

In [None]:
# perform dbscan
# Cluster the PCA-reduced data with the chosen eps / min_samples, then rebuild
# `relevant_df` from the original frame (without Index/Year) and attach each
# song's DBSCAN label.
dbscan = DBSCAN(eps=3, min_samples=50).fit(pca_df)

relevant_df = df.drop(columns=['Index', 'Year'])
relevant_df['dbscan'] = dbscan.labels_

# Per-cluster mean of every numeric feature.
# numeric_only=True is required: relevant_df still contains text columns
# (Title, Artist, Top Genre), and recent pandas versions raise a TypeError
# instead of silently skipping them when averaging.
dbscan_stats = relevant_df.groupby(['dbscan']).mean(numeric_only=True)

# normalize
# Centre each feature on the average of the cluster means and divide by the
# spread between clusters, so all features share one comparable scale.
dbscan_statmeans = dbscan_stats.mean(axis=0)
dbscan_range = dbscan_stats.max(axis=0) - dbscan_stats.min(axis=0)
dbscan_statnorm = (dbscan_stats - dbscan_statmeans) / dbscan_range

# make plot
# Only the first seven feature columns are drawn to keep the chart readable.
# A single axes is enough; a 2x1 grid would leave an empty panel above the chart.
fig, axis = plt.subplots(figsize=(14, 7))
dbscan_statnorm.iloc[:, :7].plot.bar(ax=axis).legend(loc='lower left')
axis.set_xlabel('dbscan cluster')
axis.set_ylabel('Normalized cluster mean')
axis.set_title('Normalized feature means per DBSCAN cluster');

In [None]:
# Count how many songs received each DBSCAN label.
relevant_df['dbscan'].value_counts()

In [None]:
# relevant_df['type'] = 'NA'
# relevant_df.loc[(relevant_df['dbscan']==-1),'type8'] = 'Pop'
# relevant_df.loc[(relevant_df['dbscan']==0),'type8'] = 'Acoustic Ballads'
# relevant_df.loc[(relevant_df['dbscan']==1),'type8'] = 'Fast & Heavy'
# relevant_df.loc[(relevant_df['dbscan']==2),'type8'] = 'Live'
# relevant_df.loc[(relevant_df['dbscan']==3),'type8'] = 'Classical Rock'
# relevant_df.loc[(relevant_df['dbscan']==4),'type8'] = 'Fast'
# relevant_df.loc[(relevant_df['dbscan']==5),'type8'] = 'Metal'

# sort and write to the relevant file
# Keep the identifying columns, the cluster label and a subset of the audio
# features, order the rows by cluster, save to CSV and display the frame.
dbscan_export_cols = [
    'Title', 'Artist', 'Top Genre', 'dbscan',
    'Beats Per Minute (BPM)', 'Energy', 'Danceability',
    'Loudness (dB)', 'Valence', 'Acousticness',
]
relevant_df_sorted = relevant_df[dbscan_export_cols].sort_values(by=['dbscan'])
relevant_df_sorted.to_csv('songClusterDBSCAN.csv', index=True)
relevant_df_sorted