In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Spotify songs table.
# On Kaggle the CSV lives under /kaggle/input/spotify-songs-information/;
# everywhere else it is expected in the current working directory. The path is
# picked automatically so no lines have to be (un)commented by hand.
KAGGLE_CSV = '/kaggle/input/spotify-songs-information/spotifydata.csv'
csv_path = KAGGLE_CSV if os.path.exists(KAGGLE_CSV) else 'spotifydata.csv'

# index_col=[0]: the first CSV column is used as the row index.
sdata = pd.read_csv(csv_path, encoding='ISO-8859-1', index_col=[0])
print(sdata.head())

In [3]:
# Get the column names of the raw table (a pandas Index).
# Note: `col_names` is rebound further down to a plain list of the audio-feature names.
col_names = sdata.columns
print(col_names)

In [4]:
# Summary of the table: column dtypes, non-null counts and memory usage.
sdata.info()

In [5]:
# Number of missing values in each column.
sdata.isna().sum()

In [6]:
# Songs per genre before handling null values.
sdata.groupby('genre').size()

In [7]:
# Songs per artist (still before handling null values).
sdata.groupby('artist').size()

**Now, we need to handle these missing values.**

In [8]:
# Discard every row (axis=0) that has a null in at least one column.
sdata = sdata.dropna(axis=0, how='any')

**Basic Data Analysis**

In [9]:
# Songs per genre after handling null values.
sdata.groupby('genre').size()

In [10]:
# Pearson correlation between the numeric features.
# The string columns (ids, names, genre, ...) are excluded explicitly: recent
# pandas versions no longer drop them silently inside DataFrame.corr and raise instead.
corr = sdata.select_dtypes(include='number').corr(method='pearson')
# Styler.set_precision was removed in pandas 2.0; Styler.format(precision=...) replaces it.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

* Loudness has a strong positive correlation with energy
* Valence also has a considerable positive correlation with energy and loudness
* Acousticness has a strong negative correlation with energy and loudness

In [11]:
# Bar chart of the ten artists with the most songs in the table.
top_artist_counts = sdata['artist'].value_counts().head(10)
artists_10 = pd.DataFrame({'No of songs': top_artist_counts})

# DataFrame.plot.bar opens its own figure and returns the axes it drew on.
ax = artists_10.plot.bar(color='blue')
ax.set_title('Top 10 artists')
ax.set_xlabel('Artists')
ax.set_ylabel('No of songs')
plt.show()

Artists are a mix of many genres. 

In [12]:
# Names of the eleven audio features used for the standardised-mean plot.
# This rebinds `col_names`, which previously held every column of the raw table.
col_names = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

In [18]:
# Standardise the audio features and plot the mean of every standardised column.
# Everything that is not one of the `col_names` features is removed first.
non_feature_cols = [
    "type", "id", "uri", "track_href", "analysis_url",
    "song_name", "artist", "genre", "duration_ms", "time_signature",
]
audio_features = sdata.drop(columns=non_feature_cols)

# StandardScaler returns a NumPy array, so the feature names are re-attached here.
mean_val = pd.DataFrame(
    StandardScaler().fit_transform(audio_features),
    columns=col_names,
)

# NOTE(review): standardised columns have mean ~0 by construction, so these bars
# only show floating-point noise. Plotting the means of the unscaled (or min-max
# scaled) features is presumably what was intended -- confirm with the author.
mean_val.mean().plot.bar()
plt.show()

Let us visualise the features by genre, just to see which features most influence whether a song is grouped into a particular genre.

In [20]:
# For every remaining column except 'genre', draw one row of histograms with a
# panel per genre.
non_feature_cols = [
    "type", "id", "uri", "track_href", "analysis_url",
    "song_name", "artist", "duration_ms", "time_signature",
]
data_ = sdata.drop(columns=non_feature_cols)

for feature in data_.columns:
    if feature != 'genre':
        # 'genre' only selects the panel; it is not itself plotted.
        grid = sns.FacetGrid(data_, col='genre')
        grid.map(plt.hist, feature)

## Clustering using k means
Choose only numerical features for k-means <br>
Remove features like type, id, uri, track_href, analysis_url, song_name, artist, genre <br>


In [26]:
# Keep only the numeric columns for clustering.
# errors="ignore" makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
sdata = sdata.drop(
    columns=["type", "id", "uri", "track_href", "analysis_url", "song_name", "artist", "genre"],
    errors="ignore",
)

In [27]:
# Find a suitable k for k-means with the elbow method.
# Candidate cluster counts are 2 .. max_k - 1 (range() excludes max_k itself).
max_k = 12
k_values = range(2, max_k)

# K-means is distance based, so the features are standardised first; otherwise
# columns on a much larger numeric scale (e.g. duration_ms) dominate the inertia.
# The scaled copy is local to this cell; `sdata` itself is left untouched.
elbow_features = StandardScaler().fit_transform(sdata)

# inertia_ is the within-cluster sum of squared distances of the fitted model.
# random_state makes the curve reproducible; n_init is given explicitly because
# its default differs between scikit-learn versions.
sum_of_distances = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(elbow_features).inertia_
    for k in k_values
]

###### Plot the cost vs number of clusters ######
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(k_values, sum_of_distances, '--x')
ax.set_title("SSE vs # Clusters")
ax.set_xlabel("# Clusters")
ax.set_ylabel('Sum of squared error')
plt.show()

From the elbow graph above, we see that the sum of squared errors decreases monotonically. This does not give us a clear indication of the right number of clusters.<br>
Let us try to select the number of clusters with silhouette analysis.<br>


In [28]:
# Silhouette analysis: for every candidate k, draw the per-sample silhouette
# plot next to a scatter of the first two standardised features.

# Standardise the features. The result replaces `sdata`; wrapping the NumPy
# array in a fresh DataFrame resets the column labels to 0..n-1 (a later cell
# maps them back to the feature names).
sdata = pd.DataFrame(StandardScaler().fit_transform(sdata))

for k in range(2, max_k):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Leave a 10-row gap between the silhouette bands of neighbouring clusters.
    ax1.set_ylim([0, len(sdata) + (k + 1) * 10])

    # Fit exactly once per k (the model used to be fitted twice: .fit() followed
    # by .fit_predict()). random_state makes the scores reproducible.
    kmean = KMeans(n_clusters=k, n_init=10, random_state=42)
    c_labels = kmean.fit_predict(sdata)

    sil_score = silhouette_score(sdata, c_labels)
    print(
        "For n_clusters =", k,
        "The average silhouette_score is :",
        sil_score,
    )

    # Silhouette coefficient of every individual sample.
    sil_score_sample = silhouette_samples(sdata, c_labels)

    y_lower = 10
    for i in range(k):
        # Sorted coefficients of the samples assigned to cluster i.
        ith_cluster_silhouette_values = np.sort(sil_score_sample[c_labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the band with its cluster number, vertically centred.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band, after a 10-row gap.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score over all samples.
    ax1.axvline(x=sil_score, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot: the samples in the plane of the first two features, coloured by cluster.
    colors = cm.nipy_spectral(c_labels.astype(float) / k)
    ax2.scatter(
        sdata.iloc[:, 0].values, sdata.iloc[:, 1].values,
        marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k",
    )

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    fig.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % k,
        fontsize=14,
        fontweight="bold",
    )
    # Show inside the loop so each figure appears right after its printed score
    # instead of all figures being held open until the loop has finished.
    plt.show()

In [31]:
# Reduce the feature table to its first two principal components.
# (`sdata` is expected to hold the standardised features produced by the
# silhouette-analysis cell.)
pca = PCA(n_components=2)
new_df = pca.fit_transform(sdata)

# fit_transform returns a NumPy array; wrap it in a DataFrame for the later cells.
final_ = pd.DataFrame(data=new_df)
print(final_.shape)

In [30]:
# Preview the first five rows of the two-component PCA projection.
final_.head(n=5)

In [32]:
# Cluster the 2-D PCA projection into eight groups.
# Randomly initialised centroids, 10 restarts, at most 100 iterations per run;
# the fixed random_state keeps the result reproducible.
kmeans = KMeans(n_clusters=8, init="random", n_init=10, max_iter=100, random_state=42)
kmeans.fit(final_)

# Cluster index of every sample, and the centroid coordinates in PCA space.
prediction = kmeans.predict(final_)
ccenters = kmeans.cluster_centers_

print(kmeans.labels_)
print("--Cluster centers are-- ", ccenters)

In [33]:
###### Per-cluster histograms of every feature, to help interpret the clusters ######
data_ = sdata.copy()
data_['cluster'] = kmeans.labels_

# Every column except the cluster label itself gets one row of histograms,
# with one panel per cluster.
feature_columns = [column for column in data_.columns if column != 'cluster']
for feature in feature_columns:
    grid = sns.FacetGrid(data_, col='cluster')
    grid.map(plt.hist, feature)

In [34]:
# Replace the positional column labels 0..12 with the feature names again.
print(data_.columns)

feature_names = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]
# enumerate() yields (0, 'danceability'), (1, 'energy'), ... -> {position: name}.
data_ = data_.rename(columns=dict(enumerate(feature_names)))

print(data_.columns)

In [35]:
# Scatter plot of the 2-D PCA projection, one colour per k-means cluster.
# The cluster count is read from the fitted model instead of repeating the
# literal 8, so the plot stays correct if n_clusters is changed.
clusters = kmeans.n_clusters

fig, ax = plt.subplots()
for i in range(clusters):
    in_cluster = prediction == i  # boolean mask of the samples assigned to cluster i
    ax.scatter(
        final_.iloc[in_cluster, 0].values,
        final_.iloc[in_cluster, 1].values,
        label=i,
    )

# The centroids are intentionally not drawn; uncomment to overlay them.
#ax.scatter(ccenters[:,0] , ccenters[:,1] , s = 10, color = 'k')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('K-means clusters in PCA space')
ax.legend()
plt.show()


A detailed analysis of the clusters is given in the report.

In [47]:
# Horizontal bar chart of how many songs fall into each cluster, smallest first.
# (`genres` holds the per-cluster sample counts, not musical genres.)
genres = data_.groupby('cluster').size()

fig, ax = plt.subplots(figsize=(10, 6))
genres.sort_values().plot.barh(ax=ax, color='blue')
ax.set_xlabel('Total Songs')
plt.show()