## Kmeans clustering


In [32]:
# import libraries
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import plotly.graph_objects as go


In [24]:
# load the tracklist.csv dataset
df = pd.read_csv('tracklist.csv')

In [25]:
# Preprocessing for k-means
# Select columns to be used
df_cl = df[['tempo', 'loudness', 'danceability', 'energy', 'key', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence']]

# Fill zeros values with 0.1
df_cl = df_cl.replace(0, 0.1)

# Replace NaN values with mean
df_cl = df_cl.fillna(df_cl.mean())

In [26]:
# Log transformation
df_log = np.log(df_cl)
print(df_log.head(3))

      tempo  loudness  danceability    energy       key      mode   
0  4.983648       NaN     -0.316082 -0.002002  2.079442  0.000000  \
1  4.820112       NaN     -0.186330 -0.373966  1.609438 -2.302585   
2  4.828146       NaN     -0.080126 -0.572701  2.397895 -2.302585   

   speechiness  acousticness  instrumentalness  liveness   valence  
0    -2.790345     -5.867479         -1.210662 -2.407946 -0.426178  
1    -2.650725     -2.779009         -0.345311 -2.120264 -0.314711  
2    -2.095571     -5.286389         -1.807889 -3.162968 -0.309246  


  result = func(self.values, **kwargs)


In [27]:
# Standardization
std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df_cl)

print(df_scaled[:3])

# Min Max Scaling
scaler = MinMaxScaler()
df_scaled_positive = scaler.fit_transform(df_log)

print(df_scaled_positive[:3])

[[ 1.11536377  0.06649875  0.09308744  1.59394349  0.67598454  1.02572903
  -0.34945604 -0.64229032 -0.08398297 -0.47193583  0.28321347]
 [ 0.04034688 -0.38297998  0.90368141 -0.10164404 -0.14172434 -0.97633132
  -0.23843772 -0.37147616  1.06461882 -0.26370351  0.59480546]
 [ 0.08915139 -0.45115201  1.65006992 -0.77987905  1.49369341 -0.97633132
   0.39388404 -0.63210109 -0.45937965 -0.80302522  0.61099206]]
[[0.73182251        nan 0.80216941 0.99962621 0.93225081 1.
  0.29086401 0.53279881 0.91323434 0.44413346 0.89725392]
 [0.60623505        nan 0.89171646 0.93017734 0.83226026 0.
  0.33526342 0.77888797 0.97592989 0.51148251 0.92484333]
 [0.61240441        nan 0.9650118  0.89307193 1.         0.
  0.5118031  0.57910003 0.86996462 0.26737564 0.92619585]]


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [28]:

model = KMeans(n_clusters=10, random_state=42)
model.fit(df_scaled)
df = df.assign(ClusterLabel= model.labels_)



In [38]:
# Rename CLusterLabel to KMeans
df = df.rename(columns={'ClusterLabel': 'KMeans'})
df.groupby("KMeans")[["valence", "acousticness" ]].median()

Unnamed: 0_level_0,valence,acousticness
KMeans,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.361,0.186
1,0.509,0.00561
2,0.395,0.7745
3,0.718,0.0332
4,0.584,0.221
5,0.68,0.0263
6,0.6735,0.02635
7,0.552,0.00596
8,0.501,0.0858
9,0.7355,0.1015


In [30]:
# Cluster Label to categorical
df['KMeans'] = df['KMeans'].astype('category')

In [36]:
# Scatter matrix without upper half
fig = go.Figure(
    data=go.Splom(
        dimensions=[dict(label='valence', values=df['valence']),
                    dict(label='acousticness', values=df['acousticness']),
                    dict(label='danceability', values=df['danceability']),
                    dict(label='energy', values=df['energy']),
                    dict(label='instrumentalness', values=df['instrumentalness']),
                    dict(label='liveness', values=df['liveness']),
                    dict(label='speechiness', values=df['speechiness']),
                    dict(label='tempo', values=df['tempo'])],
        text=df['KMeans'],
        showupperhalf=False,
        diagonal=dict(visible=False),
        marker=dict(
            color=df['KMeans'],
            showscale=True,
            colorscale='Rainbow',
            line_color='white',
            line_width=0.5,
            size=5,
            opacity=0.6
        )
    )
)

fig.update_layout(
    title='Spotify Music Data',
    showlegend=True,
    width=1000,
    height=1000,
    dragmode='select',
    hovermode='closest',

)

fig.show()