# 06-03

- Transition Matrices
- Center and Scaling
- Euclidean distance
- Remove Constant Features


In [32]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyBoxCoxFeatures, scalerMinMax

from ipynb.fs.full.SearchInfo import searchByID, searchBySpotify, getTrackSpotifyInfo
from ipynb.fs.full.Utils import append_df_to_csv,transformToPercentage

In [33]:
# Load every cover group and work with the first group only.
cover = loadCoverGroups()
listCovers = cover[0]
# Within that group, the first id is taken as the original song and the
# second id as the cover that is looked up in the ranking further down.
originalSongId, coverSongId = listCovers[0], listCovers[1]

In [34]:
# Display all loaded cover groups (one list of song ids per group).
cover

[[180849,
  6001027,
  1686718,
  5000013,
  5000001,
  5000007,
  5000008,
  5000023,
  5000017,
  5000019,
  5000020,
  5000018,
  5000011,
  5000012],
 [6001971, 6001984],
 [6002492, 6002491],
 [6000699, 6001165, 6000182],
 [6000104, 6000273],
 [6000768, 6000784],
 [6001931, 6001748],
 [6002043, 6002073],
 [6002244, 6000194],
 [6001209, 6000606],
 [6000352, 6001248],
 [6000549, 6001508, 6001242],
 [6001918, 6001997],
 [6001651, 6001900],
 [6000263, 6001570, 6002374],
 [6001822, 6001653],
 [6000763, 6000668],
 [6000764, 6000667],
 [6001543, 6001541],
 [6000762, 6000773],
 [6000771, 6000670],
 [6001747, 6001952],
 [6000548, 6002203],
 [6000172, 6002133],
 [6001762, 6001675],
 [6002147, 6002146],
 [6000767, 6000775],
 [6000766, 6000673],
 [6001506, 6000529],
 [6000397, 6000409],
 [6001914, 6002001],
 [6002482, 6002454],
 [6000765, 6000783],
 [6000761, 6000782],
 [6000814, 6000813],
 [6001224, 6000140]]

In [35]:
# Id of the song used as the reference ("original") in the distance ranking.
originalSongId

180849

In [36]:
# Load the project data frames. Only `transitionMatrices` is used by the
# cells visible in this notebook section.
spotifyFeatures, transitionMatrices, combined = loadDataFrames()

In [37]:
# Number of rows in the transition-matrix frame (indexed by song id).
len(transitionMatrices)

5980

In [38]:
# Grand total over all entries (row sums, then summed over rows).
# NOTE(review): the total being ~equal to the row count suggests every row
# sums to 1 — confirm. This must run before the `id` column is added to the
# frame, otherwise the ids are included in the total.
transitionMatrices.sum(axis = 1).sum(axis=0)

5979.999999999996

In [39]:
# Copy the index (the song id) into a regular column so the ids are still
# available after the index is later reset with drop=True.
transitionMatrices['id'] = transitionMatrices.index

In [40]:
# Show the raw (unscaled) row that belongs to the original song.
transitionMatrices.loc[transitionMatrices['id'] == originalSongId]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1286,1287,1288,1290,1291,1292,1293,1294,1295,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
180849,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180849


In [41]:
# Remove constant feature columns using the project helper.
# NOTE(review): the frame still carries the `id` column at this point; later
# cells read `transitionMatrices.id`, so the helper presumably keeps it — confirm.
transitionMatrices = removeConstantFeatures(transitionMatrices)

In [42]:
# Replace the id-based index with plain 0..n-1 positions (the ids stay
# available in the `id` column). Rebinding instead of `inplace=True` keeps the
# cell a pure "input -> output" step and avoids mutating a frame that other
# names may still reference.
transitionMatrices = transitionMatrices.reset_index(drop=True)

In [43]:
# Split the frame into the song ids and the feature columns.
ids = transitionMatrices["id"]
# Drop `id` by label instead of slicing off "the last column": the positional
# slice silently selects the wrong columns if `id` is ever not the last one.
df = transitionMatrices.drop(columns="id")

## Combining Centering and Scaling

In [44]:
# Standardise every feature column (centre, then scale) in a single step.
cols = df.columns
scaler = StandardScaler()
data_normalized = scaler.fit_transform(df)

# Sanity check: per-column mean and standard deviation after scaling.
print('mean: ', data_normalized.mean(axis=0).round(2))
print('std: ', data_normalized.std(axis=0).round(2))

# The scaler returns a bare array; wrap it again so the column labels are kept.
df = pd.DataFrame(data_normalized, columns=cols)

mean:  [-0. -0. -0. ...  0. -0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


In [45]:
# Display the standardised feature frame (pandas truncates the rendering).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1284,1286,1287,1288,1290,1291,1292,1293,1294,1295
0,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
1,2.638978,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
2,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,1.253998,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
3,0.083497,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
4,-0.171157,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,0.815751,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5975,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5976,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5977,-0.399006,1.751266,-0.041708,4.392104,-0.049629,0.786031,0.774382,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5978,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,2.150145,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,10.211144,-0.027166,-0.012933,-0.099383,26.322022,-0.012933,-0.044327


## Applying Euclidean distance

In [46]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction with each other
    (NumPy arrays, pandas Series, or plain numbers). The result is the
    square root of the sum of squared element-wise differences.
    """
    difference = x - y
    squared_sum = np.sum(difference ** 2)
    return np.sqrt(squared_sum)

In [47]:
# Row position of the original song. It is computed positionally so that it
# is always valid for `iloc`, whatever labels the index of `ids` carries
# (the previous label-based lookup was only correct for a 0..n-1 index).
# Raises IndexError if the id is not present.
originalSongIndex = int(np.flatnonzero(ids.to_numpy() == originalSongId)[0])

# Standardised feature vector of the original song.
originalSongVector = df.iloc[originalSongIndex, :]

***Looking up the cover song's feature vector***

In [48]:
# Row position of the cover song, computed positionally so it is valid for
# `iloc` regardless of the index labels of `ids`. Raises IndexError if the
# id is not present.
coverIndex = int(np.flatnonzero(ids.to_numpy() == coverSongId)[0])

# Standardised feature vector of the cover song.
coverSongVector = df.iloc[coverIndex, :]

In [49]:
ranking = []
for index in range(0, len(df)):
    ranking.append([ids[index], euclidean_distance(df.iloc[index, :], originalSongVector)])  

In [50]:
# Jaccard-style overlap (in percent, 2 decimals) between the sets of distinct
# values found in the two vectors.
# NOTE(review): this compares which values occur, not which feature they
# occur in — confirm that this is the intended similarity measure.
originalValues = set(originalSongVector.tolist())
coverValues = set(coverSongVector.tolist())
sharedCount = len(originalValues & coverValues)
totalCount = float(len(originalValues | coverValues))
res = round(sharedCount / totalCount * 100, 2)
res

95.79

In [51]:
# One row per song: column 0 is the song id, column 1 its Euclidean distance
# to the original song.
ranksDF = pd.DataFrame(ranking)

In [52]:
# Give the columns names. Note that `rank` holds the raw distance
# (smaller = closer to the original song), not an ordinal rank.
ranksDF = ranksDF.rename(columns={0: "id", 1: "rank"})

In [53]:
# Display the still unsorted id/distance table.
ranksDF

Unnamed: 0,id,rank
0,23,14.102755
1,105,16.796361
2,118,23.421437
3,120,38.550025
4,292,9.972757
...,...,...
5975,6002496,11.611831
5976,6002497,20.903976
5977,6002498,102.568810
5978,6002499,79.490761


In [54]:
# Order the songs from closest to farthest from the original song.
ranksDF = ranksDF.sort_values("rank", ascending=True)

In [55]:
# After sorting, renumber the rows 0..n-1 so the index equals the position in
# the ranking. Rebinding instead of `inplace=True` avoids mutating the frame
# behind the back of anything else that references it.
ranksDF = ranksDF.reset_index(drop=True)

In [56]:
# Ten closest songs. The first row is the original song itself (distance 0).
ranksDF.head(10)

Unnamed: 0,id,rank
0,180849,0.0
1,569357,5.471591
2,1065079,5.7398
3,672021,5.742266
4,2846348,5.849941
5,1148352,6.07113
6,263779,6.114807
7,1025154,6.145947
8,2065851,6.155223
9,1048915,6.158998


In [57]:
# Relative standing of each row in the sorted ranking, as a fraction:
# the first row gets 1.0 and the last row 1/n.
# It is derived from the row position rather than from the index labels, so
# it no longer depends on the index having just been reset (wrapping the
# values in a Series would align on labels and could yield NaN otherwise).
ranksDF['percentage'] = 1 - np.arange(len(ranksDF)) / len(ranksDF)

In [58]:
# Ranking rows of all songs in the first cover group. Ids from `listCovers`
# that have no row in `ranksDF` simply do not appear in the result.
ranksDF.loc[ranksDF['id'].isin(listCovers)]

Unnamed: 0,id,rank,percentage
0,180849,0.0,1.0
33,5000019,6.820573,0.994482
111,6001027,7.553511,0.981438
129,1686718,7.638776,0.978428
151,5000017,7.728774,0.974749
154,5000018,7.743021,0.974247
189,5000013,7.884639,0.968395
202,5000012,7.923901,0.966221
263,5000011,8.102214,0.95602
435,5000001,8.495844,0.927258


In [59]:
# Ranking row of the selected cover song.
ranksDF.loc[ranksDF['id'] == coverSongId]

Unnamed: 0,id,rank,percentage
111,6001027,7.553511,0.981438


In [60]:
# Thirty closest songs, now including the `percentage` column.
ranksDF.head(30)

Unnamed: 0,id,rank,percentage
0,180849,0.0,1.0
1,569357,5.471591,0.999833
2,1065079,5.7398,0.999666
3,672021,5.742266,0.999498
4,2846348,5.849941,0.999331
5,1148352,6.07113,0.999164
6,263779,6.114807,0.998997
7,1025154,6.145947,0.998829
8,2065851,6.155223,0.998662
9,1048915,6.158998,0.998495


In [31]:
# To extract the covers' info, change the path in Utils if needed.
# NOTE(review): '0300' is presumably the experiment id (the output shows
# id_experiment 300) — confirm against transformToPercentage.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, so its output comes from an earlier run; re-run the notebook top
# to bottom before relying on it.
ranks = transformToPercentage(ranksDF.head(10), '0300')
ranks

Unnamed: 0,id_chord,id_spotify_track,name,release_date,percentage,rank,position,id_experiment
0,180849,3jButwtJMLx3Ub61BfRiHh,the scientist,2022-06-03,100.0,0.0,0,300
1,569357,7kxfWvj6u9oWQ5C36kMtGb,watch over you,2007-01-01,94.528,5.471591,1,300
2,1065079,2U8g9wVcUu9wsg6i7sFSv8,every teardrop is a waterfall,2011-10-24,94.26,5.7398,2,300
3,672021,4wWPjMdb7owUVi5S43yFtJ,give me jesus,2006-01-01,94.258,5.742266,3,300
4,2846348,4j5ffIFh7bFT7GZciP1TCy,10000 hours,2021-08-13,94.15,5.849941,4,300
5,1148352,2R6UrJ8uWbSIiHWmvRQvN8,whiskey in the jar,1998-01-01,93.929,6.07113,5,300
6,263779,6aXKbU2QGsOms8UT3eUOg1,catch the wind,1965-05-14,93.885,6.114807,6,300
7,1025154,67xq6C6IdzUgCOaCrP1nM0,the stable song,2016-06-10,93.854,6.145947,7,300
8,2065851,43IbCqMcMvZi1v16LFAE3n,safe,2017-06-01,93.845,6.155223,8,300
9,1048915,4bcgRbGeq4IejesjsiHyTH,little bird,2011-09-09,93.841,6.158998,9,300
