In [13]:
import numpy as np
import pandas as pd

from scipy import stats

In [14]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# Uses the auth helper imported from google.colab, so this cell is Colab-specific.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used below to fetch songs.csv.
drive = GoogleDrive(gauth)

In [15]:
# Google Drive ID of the dataset file.
file_id = '1iuDCmgIXw2hzTYYkrGxxqZV4g6MBHclS' # songs.csv
# Wrap the ID in a PyDrive file object obtained from the authenticated client.
downloaded = drive.CreateFile({'id': file_id})
# Write the file's content to a local 'songs.csv' so pd.read_csv can load it from disk.
downloaded.GetContentFile('songs.csv')

In [16]:
# Load the downloaded table (rows are presumably individual songs — TODO confirm).
df = pd.read_csv('songs.csv')
# Column labels, kept so column positions can be mapped back to feature names later.
features = df.columns
# Plain NumPy matrix with the same rows/columns, used for the distance computations.
songs = df.to_numpy()

In [17]:
df.head()

Unnamed: 0,artist_familiarity,artist_ hotttnesss,artist_num_songs,release,duration,energy,pitches,timbre,loudness,danceability
0,0.570278,0.344414,-0.380229,0.033102,0.212251,-0.295284,-0.178672,-0.367908,-0.105688,-0.31591
1,-0.179399,-0.306438,-0.27876,0.102305,0.730888,0.270387,-0.039409,-0.320095,-0.271154,0.029891
2,-0.172195,-0.589682,0.261041,-0.133294,-0.429221,-0.270769,-0.355272,-0.109027,-0.032178,-0.374202
3,0.623788,-0.190461,0.161531,0.07132,0.201448,-0.342355,-0.288166,0.490152,-0.110037,0.224162
4,0.192972,-0.298097,0.088428,0.472626,0.463764,0.342086,0.110808,0.017268,0.454363,-0.302626


Construct the adjacency matrix A

In [18]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    point1, point2 : numpy.ndarray
        Coordinate vectors of equal shape; they are subtracted element-wise.

    Returns
    -------
    float
        The L2 norm of ``point1 - point2``.
    """
    offset = point1 - point2
    return np.linalg.norm(offset)

In [19]:
def get_adjacency_matrix(songs, threshold=1):
    """Build the 0/1 adjacency matrix of the threshold (epsilon-neighbourhood) graph.

    Two rows of ``songs`` are connected when their Euclidean distance is
    strictly smaller than ``threshold``. Every row is at distance 0 from
    itself, so for any positive threshold the diagonal is 1 (self-loops are
    kept).

    Parameters
    ----------
    songs : array-like of shape (N, d)
        One point per row.
    threshold : float, default 1
        Distance below which two points are considered adjacent.

    Returns
    -------
    numpy.ndarray of shape (N, N), dtype float64
        ``A[i, j]`` is 1.0 if ``||songs[i] - songs[j]|| < threshold`` else 0.0.
    """
    songs = np.asarray(songs)
    # Unpacking two values keeps the requirement that the input is 2-D.
    N, _ = songs.shape

    A = np.zeros((N, N))

    for i in range(N):
        # Distances from point i to every point in one vectorized call,
        # instead of N separate Python-level norm computations.
        distances = np.linalg.norm(songs - songs[i], axis=1)
        # The boolean mask is stored as 0.0 / 1.0 in the float matrix.
        A[i] = distances < threshold

    return A

# Build the graph with the default threshold (1): rows closer than that are linked.
A = get_adjacency_matrix(songs)

Compute the degree matrix D

In [20]:
# Degree of each node = sum of its adjacency row; D carries the degrees on its diagonal.
D = np.diag(A.sum(axis=1))

In [21]:
D

array([[69.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 67.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 58., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 63.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 60.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 66.]])

Compute the Laplacian $L = D - A$ and the normalized Laplacian $L_{\text{norm}} = D^{-1/2} L D^{-1/2}$

In [22]:
# Unnormalized graph Laplacian.
L = D - A

# Symmetric normalized Laplacian D^{-1/2} L D^{-1/2}.
# D is diagonal, so D^{-1/2} is computed element-wise from its diagonal instead of
# running a general dense matrix inversion, which costs O(N^3) and adds rounding error.
# Degrees are positive here because every node is adjacent to itself under the default
# threshold, so the reciprocal is well defined.
D_inv_sqrt = np.diag(1.0 / np.sqrt(np.diag(D)))
L_norm = D_inv_sqrt @ L @ D_inv_sqrt

In [23]:
L_norm

array([[ 0.98550725,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.01481847],
       [ 0.        ,  0.98507463,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.98275862, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.98412698,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.98333333,  0.        ],
       [-0.01481847,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.98484848]])

Compute the eigenvector $v$ corresponding to the second smallest eigenvalue of the normalized Laplacian $L_{\text{norm}}$ and compute $x = D^{1/2}v$

In [24]:
# L_norm is symmetric (the adjacency relation is symmetric), so use the symmetric
# eigensolver: it guarantees real eigenvalues and orthonormal eigenvectors, whereas the
# general-purpose `eig` can return complex values with spurious imaginary parts.
# NOTE: an eigenvector's overall sign is arbitrary, so which side of the split ends up
# as C1 vs C2 below may differ between solvers; the partition itself is unaffected.
eigenValues, eigenVectors = np.linalg.eigh(L_norm)

# eigh already returns eigenvalues in ascending order; the explicit sort is kept so the
# ordering does not depend on the solver and `idx` stays defined.
idx = eigenValues.argsort()
eigenValues = eigenValues[idx]
eigenVectors = eigenVectors[:,idx]

In [25]:
# Eigenpairs were sorted by ascending eigenvalue, so column 1 is the eigenvector of the
# second-smallest eigenvalue of L_norm.
v = eigenVectors[:, 1]
# Rescale by D^{1/2} (element-wise sqrt of the diagonal degree matrix).
x = np.sqrt(D).dot(v)
# NOTE(review): the usual derivation maps an eigenvector of the normalized Laplacian back
# with D^{-1/2} (the alternative below). As long as every degree is positive, both scalings
# multiply each entry of v by a positive number, so the sign-based split is the same.
# x = D_inv_sqrt.dot(v)

In [26]:
# Split the nodes by the sign of x: non-negative entries form cluster 1,
# negative entries form cluster 2. Both hold row positions into `songs`.
C1 = np.flatnonzero(x >= 0)
C2 = np.flatnonzero(x < 0)

In [27]:
# First ten members of cluster 1, shifted by one
# (presumably to report 1-based song numbers — TODO confirm).
C1[:10] + 1

array([ 1,  3,  4,  6,  7,  8,  9, 14, 15, 16])

In [28]:
# First ten members of cluster 2, shifted by one
# (presumably to report 1-based song numbers — TODO confirm).
C2[:10] + 1

array([ 2,  5, 10, 11, 12, 13, 19, 20, 22, 23])

Comparing the 2 clusters

In [29]:
features

Index(['artist_familiarity', 'artist_ hotttnesss', 'artist_num_songs',
       'release', 'duration', 'energy', 'pitches', 'timbre', 'loudness',
       'danceability'],
      dtype='object')

In [30]:
# Feature rows of the songs belonging to each cluster.
songs_c1 = songs[C1, :]
songs_c2 = songs[C2, :]

In [31]:
# Per-feature (column-wise) mean inside each cluster.
songs_c1_feature_mean = np.mean(songs_c1, axis=0)
songs_c2_feature_mean = np.mean(songs_c2, axis=0)

In [32]:
# Absolute difference between the two clusters' per-feature means: the larger the
# value, the more the clusters differ on that feature.
feature_means = np.abs(songs_c1_feature_mean - songs_c2_feature_mean)

# Feature positions ordered from largest to smallest difference.
idx = feature_means.argsort()[::-1]

# Report the three most different features. The printed value is the absolute
# difference of the cluster means (not a mean), so the message says so.
for i in idx[:3]:
    print(f"Feature '{features[i]}' has absolute mean difference {feature_means[i]:.4}")

Feature 'energy' has mean 0.6556
Feature 'danceability' has mean 0.07532
Feature 'pitches' has mean 0.03412


Checking for statistical significance, using t-tests

In [33]:
# Two-sample t-test (equal variances assumed) on each of the three features whose
# cluster means differ the most.
# NOTE(review): the clusters were derived from these same features, so the p-values
# are descriptive rather than an independent confirmation of the split.
for feature_idx in idx[:3]:
    c1_values = songs_c1[:, feature_idx]
    c2_values = songs_c2[:, feature_idx]
    t, p = stats.ttest_ind(c1_values, c2_values, equal_var=True)
    print(f"'{features[feature_idx]}' has a p-value of: {p:.5f}")

'energy' has a p-value of: 0.00000
'danceability' has a p-value of: 0.00014
'pitches' has a p-value of: 0.08263
