# Applying Euclidean distance to chord progressions only, with centering and scaling

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
from scipy import stats

def applyLogTransformation(df, features, suffix):
    """Box-Cox transform the given columns of ``df`` and plot the results.

    Despite the function name, the transform applied is
    ``scipy.stats.boxcox`` (with its fitted lambda), not a plain logarithm.

    Parameters
    ----------
    df : DataFrame
        Modified in place: for every entry of ``features`` a new column
        named ``feature + suffix`` is written.
    features : iterable
        Labels of the columns to transform; each is concatenated with
        ``suffix`` to form the new column label.
    suffix : str
        Text appended to each feature label.

    Side effects
    ------------
    Prints the skewness of each column before and after the transform and
    finally passes ``df`` plus the list of new column labels to
    ``createKDeplot``. Returns ``None``.
    """
    transformedNames = []
    for name in features:
        transformedName = name + suffix
        # boxcox returns (transformed values, fitted lambda); only the values are stored.
        boxcoxResult = stats.boxcox(df[name])
        df[transformedName] = boxcoxResult[0]
        transformedNames.append(transformedName)
        print('skewnees before:', df[name].skew(), ' after: ', df[transformedName].skew())

    createKDeplot(df, transformedNames)

In [3]:
# Load the three frames provided by the project's Utils notebook helper.
# Only transitionMatrices is used in the cells of this section.
spotifyFeatures, transitionMatrices, combined = loadDataFrames()


In [4]:
# Number of rows in the transition-matrix frame.
len(transitionMatrices)

5980

In [5]:
# Grand total over all cells (row sums first, then summed). The result is
# approximately the row count, which is consistent with every row summing to 1
# (presumably row-normalized transition probabilities — TODO confirm).
transitionMatrices.sum(axis = 1).sum(axis=0)

5979.999999999996

In [6]:
# Copy the index into an 'id' column so the values survive the later
# reset_index(drop=True).
# NOTE(review): re-running this cell after that reset overwrites 'id' with 0..n-1.
transitionMatrices['id'] = transitionMatrices.index

In [7]:
# Inspect the row for id 1686718 (the same id is used further down as coverSongId).
transitionMatrices[transitionMatrices.id == 1686718]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1286,1287,1288,1290,1291,1292,1293,1294,1295,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1686718,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1686718


In [8]:
# Replace the current index with a positional 0..n-1 index and discard the old
# one (its values were copied into the 'id' column beforehand).
# The frame is mutated in place.
transitionMatrices.reset_index(drop=True, inplace=True)

In [9]:
# Keep the song ids aside and build the feature matrix from every other column.
# 'id' is selected by name rather than by slicing off the last column, so the
# split stays correct even if 'id' is not the final column of the frame.
ids = transitionMatrices['id']
df = transitionMatrices.drop(columns='id')

## Combining Centering and Scaling

In [10]:
# Standardize every column (centre on the mean, scale by the standard deviation).
cols = df.columns
scaler = StandardScaler()
data_normalized = scaler.fit_transform(df)

# Sanity check: per-column mean and standard deviation after scaling.
print('mean: ', data_normalized.mean(axis=0).round(2))
print('std: ', data_normalized.std(axis=0).round(2))

# Wrap the resulting array back into a frame with the original column labels.
df = pd.DataFrame(data_normalized, columns=cols)

mean:  [-0. -0. -0. ...  0. -0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


In [11]:
# Display the standardized feature frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1284,1286,1287,1288,1290,1291,1292,1293,1294,1295
0,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
1,2.638978,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
2,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,1.253998,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
3,0.083497,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
4,-0.171157,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,0.815751,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5975,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5976,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5977,-0.399006,1.751266,-0.041708,4.392104,-0.049629,0.786031,0.774382,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5978,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,2.150145,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,10.211144,-0.027166,-0.012933,-0.099383,26.322022,-0.012933,-0.044327


## Applying Euclidean distance

In [12]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction and ``** 2``
    (e.g. NumPy arrays or pandas Series). The result is the square root of
    the summed squared differences.
    """
    delta = x - y
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [13]:
# Locate the reference song by id and pull out its standardized feature row.
originalSongId = 180849 # The Scientist Original
# First index label whose id matches; df.iloc below uses it as a row position.
originalSongIndex = ids.index[(ids == originalSongId).to_numpy()][0]

originalSongVector = df.iloc[originalSongIndex]

***Locating the cover song (it is looked up here, not dropped)***

In [14]:
# Locate the cover version by id and pull out its standardized feature row.
coverSongId = 1686718 # The Scientist by Boyce Avenue
# First index label whose id matches; df.iloc below uses it as a row position.
coverIndex = ids.index[(ids == coverSongId).to_numpy()][0]
coverSongVector = df.iloc[coverIndex]

In [15]:
# Distance from every song to the original, computed in one vectorized pass
# instead of a Python loop over rows. Subtracting the Series from the frame
# aligns the Series index with the frame's columns, so each row is compared
# with the original feature by feature; summing along axis=1 gives one squared
# distance per row.
distances = np.sqrt(((df - originalSongVector) ** 2).sum(axis=1))

# Same structure as before: a list of [id, distance] pairs in row order.
ranking = [[songId, distance] for songId, distance in zip(ids.to_numpy(), distances.to_numpy())]

# Show only a short preview rather than dumping all pairs into the output.
ranking[:10]

[[23, 14.102754849412888],
 [105, 16.79636075132978],
 [118, 23.42143697384292],
 [120, 38.55002547462992],
 [292, 9.972756824930862],
 [407, 9.837310833847507],
 [418, 16.689419471730584],
 [456, 8.903431098898697],
 [467, 25.23195146715489],
 [469, 17.528960409269132],
 [588, 8.70114928625381],
 [628, 18.88222507334007],
 [635, 11.092638294795735],
 [652, 15.552865788659663],
 [816, 10.747023634363165],
 [830, 31.065877412946048],
 [836, 17.16557120276704],
 [1334, 13.338514666103107],
 [1345, 10.864947658971012],
 [1395, 9.731358325671433],
 [1433, 11.923998016075723],
 [2063, 10.267982853942847],
 [2147, 22.498035745092345],
 [2148, 23.637295780594915],
 [2220, 16.045803115488294],
 [2535, 9.273073971176174],
 [2553, 10.61255097213937],
 [3118, 21.99951689847004],
 [3165, 7.87211825091267],
 [3337, 10.631898745578875],
 [3391, 10.43404441057342],
 [3991, 19.220579089445906],
 [4019, 9.058384213160274],
 [4041, 10.958644297065096],
 [4084, 13.623712391876264],
 [4169, 45.45838342518

In [16]:
# Jaccard similarity (in percent, rounded to 2 decimals) between the *sets of
# values* occurring in the two vectors. Sets discard position and multiplicity,
# so this measures which standardized values the vectors share, not where
# they occur.
originalValues = set(originalSongVector.tolist())
coverValues = set(coverSongVector.tolist())
sharedCount = len(originalValues & coverValues)
unionCount = len(originalValues | coverValues)
res = round(sharedCount / float(unionCount) * 100, 2)
res

95.14

In [17]:
# Turn the list of [id, distance] pairs into a two-column frame (columns 0 and 1).
ranksDF = pd.DataFrame(ranking)

In [18]:
# Name the columns. Note that "rank" holds the Euclidean distance to the
# original song, not an ordinal position.
ranksDF = ranksDF.rename(columns={0: "id", 1: "rank"})

In [19]:
# Display the id/distance frame in its original row order.
ranksDF

Unnamed: 0,id,rank
0,23,14.102755
1,105,16.796361
2,118,23.421437
3,120,38.550025
4,292,9.972757
...,...,...
5975,6002496,11.611831
5976,6002497,20.903976
5977,6002498,102.568810
5978,6002499,79.490761


In [20]:
# Sort ascending by distance so the songs closest to the original come first.
ranksDF = ranksDF.sort_values(by="rank")

In [21]:
# Renumber the rows 0..n-1 in sorted order so the index equals the position in
# the ordering (the 'percentage' column computed later depends on this).
ranksDF.reset_index(drop=True, inplace=True)

In [22]:
# Display the frame sorted by distance; the original song itself is first with distance 0.
ranksDF

Unnamed: 0,id,rank
0,180849,0.000000
1,569357,5.471591
2,1065079,5.739800
3,672021,5.742266
4,2846348,5.849941
...,...,...
5975,6002253,172.401715
5976,6000743,172.967805
5977,6002278,177.186927
5978,6001117,187.664979


In [23]:
# Score each song by its position in the sorted list: 1.0 for the first row,
# decreasing by 1/len(ranksDF) per row. Relies on the index having been reset
# to 0..n-1 after sorting, so that the new Series aligns row for row.
ranksDF['percentage'] = pd.Series(1 - ranksDF.index / len(ranksDF))

In [24]:
# Rows whose id exceeds 5,000,000.
# NOTE(review): presumably these ids mark a separate subset of songs — confirm what the threshold means.
ranksDF[ranksDF['id'] > 5000000]

Unnamed: 0,id,rank,percentage
13,6001580,6.394157,0.997826
33,5000019,6.820573,0.994482
70,6001418,7.251233,0.988294
81,6001308,7.349998,0.986455
93,6000268,7.427033,0.984448
...,...,...,...
5975,6002253,172.401715,0.000836
5976,6000743,172.967805,0.000669
5977,6002278,177.186927,0.000502
5978,6001117,187.664979,0.000334


In [25]:
# Where the cover song lands in the distance ordering (index = position in the sorted list).
ranksDF[ranksDF['id']==coverSongId]

Unnamed: 0,id,rank,percentage
129,1686718,7.638776,0.978428


In [26]:
# The 30 songs closest to the original (the original itself is row 0).
ranksDF.head(30)

Unnamed: 0,id,rank,percentage
0,180849,0.0,1.0
1,569357,5.471591,0.999833
2,1065079,5.7398,0.999666
3,672021,5.742266,0.999498
4,2846348,5.849941,0.999331
5,1148352,6.07113,0.999164
6,263779,6.114807,0.998997
7,1025154,6.145947,0.998829
8,2065851,6.155223,0.998662
9,1048915,6.158998,0.998495


## Rank for original Song 100