# 06-03

- Transition Matrices
- Centering and Scaling
- Euclidean distance
- Remove Constant Features


In [28]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyBoxCoxFeatures, scalerMinMax

In [29]:
# Choose the cover group to analyse; 12 is a hard-coded key/index into the
# collection returned by loadCoverGroups().
cover = loadCoverGroups()
listCovers = cover[12]
# The first two entries of the group are used as the original song id and
# the cover song id respectively.
originalSongId, coverSongId = listCovers[0], listCovers[1]

In [30]:
# Load the three objects returned by the Utils helper; only
# transitionMatrices is used in the cells visible in this notebook section.
spotifyFeatures, transitionMatrices, combined = loadDataFrames()

In [31]:
# Number of rows in the transition-matrix table.
len(transitionMatrices)

5980

In [32]:
# Sanity check: sum each row, then add the row sums into one grand total.
# A total close to the row count is consistent with every row summing to ~1.
transitionMatrices.sum(axis = 1).sum(axis=0)

5979.999999999996

In [33]:
# Keep the index values as an explicit 'id' column: the index itself is
# discarded by reset_index(drop=True) a few cells further down.
transitionMatrices['id'] = transitionMatrices.index

In [34]:
# Show the row whose 'id' matches the original song.
transitionMatrices.loc[transitionMatrices['id'] == originalSongId]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1286,1287,1288,1290,1291,1292,1293,1294,1295,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6001918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6001918


In [35]:
# Reduce the feature set with the Pipeline helper.
# NOTE(review): the helper is defined outside this notebook — presumably it
# drops columns whose value never varies; confirm that it keeps the 'id'
# column, which later cells rely on.
transitionMatrices = removeConstantFeatures(transitionMatrices)

In [36]:
# Replace the id-based index with a fresh 0..n-1 index so that row labels and
# row positions coincide (later cells use index labels as positions).
# Rebinding the name is used instead of inplace=True: it produces the same
# frame without mutating an object that other names may still reference.
transitionMatrices = transitionMatrices.reset_index(drop=True)

In [37]:
# Split identifiers from features. The 'id' column is selected and removed by
# name rather than by position (previously "everything but the last column"),
# so a change in column order can no longer leak the ids into the feature
# matrix or silently discard a real feature.
ids = transitionMatrices['id']
df = transitionMatrices.drop(columns=['id'])

## Combining Centering and Scaling

In [38]:
# Standardize every feature column to zero mean and unit variance.
cols = df.columns
scaler = StandardScaler()
data_normalized = scaler.fit_transform(df)

# Sanity check: per-column mean should print as 0 and std as 1.
print('mean: ', np.round(data_normalized.mean(axis=0), 2))
print('std: ', np.round(data_normalized.std(axis=0), 2))

# Wrap the scaled array back into a DataFrame with the original column labels.
df = pd.DataFrame(data=data_normalized, columns=cols)

mean:  [-0. -0. -0. ...  0. -0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


In [39]:
# Display the standardized feature matrix (pandas truncates the rendering).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1284,1286,1287,1288,1290,1291,1292,1293,1294,1295
0,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
1,2.638978,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
2,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,1.253998,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
3,0.083497,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
4,-0.171157,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,0.815751,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5975,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5976,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,-0.266736,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5977,-0.399006,1.751266,-0.041708,4.392104,-0.049629,0.786031,0.774382,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,-0.053313,-0.027166,-0.012933,-0.099383,-0.034484,-0.012933,-0.044327
5978,-0.399006,-0.091843,-0.041708,-0.048451,-0.049629,-0.161063,2.150145,-0.161475,-0.038295,-0.075173,...,-0.025699,-0.042078,-0.073782,10.211144,-0.027166,-0.012933,-0.099383,26.322022,-0.012933,-0.044327


## Applying Euclidean distance

In [40]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x, y : array-like
        Objects that support element-wise subtraction and exponentiation,
        such as NumPy arrays or pandas Series of equal length.

    Returns
    -------
    float
        The square root of the summed squared element-wise differences.
    """
    delta = x - y
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [41]:
# Locate the original song: take the index label of its entry in `ids` and
# use it as a row position in `df`. This is valid because both objects carry
# a 0..n-1 index at this point.
originalSongIndex = ids.loc[ids == originalSongId].index[0]
originalSongVector = df.iloc[originalSongIndex]

***Locating the cover song***

In [42]:
# Locate the cover song the same way as the original: index label from `ids`,
# then the matching row of `df` by position.
coverIndex = ids.loc[ids == coverSongId].index[0]
coverSongVector = df.iloc[coverIndex]

In [43]:
# Pair every song id with its Euclidean distance to the original song.
# The original song itself is included and gets distance 0.
ranking = [
    [ids[row], euclidean_distance(df.iloc[row], originalSongVector)]
    for row in range(len(df))
]

In [44]:
# Jaccard overlap (in percent, 2 decimals) between the sets of values that
# occur in the two song vectors.
# NOTE(review): set() discards positions and duplicates, so this compares
# which distinct standardized values occur, not which features agree —
# confirm this is the intended similarity measure.
originalValues = set(originalSongVector.tolist())
coverValues = set(coverSongVector.tolist())
sharedValues = originalValues & coverValues
allValues = originalValues | coverValues
res = round(len(sharedValues) / len(allValues) * 100, 2)
res

93.22

In [45]:
# Turn the list of [id, distance] pairs into a two-column DataFrame
# (columns are auto-named 0 and 1 until the rename in the next cell).
ranksDF = pd.DataFrame(ranking)

In [46]:
# Give the columns readable names. Note that "rank" holds the Euclidean
# distance to the original song, not an ordinal rank; the ordinal position
# only emerges once the frame is sorted by this column.
ranksDF = ranksDF.rename(columns={0: "id", 1: "rank"})

In [47]:
# Display the unsorted id/distance table (pandas truncates the rendering).
ranksDF

Unnamed: 0,id,rank
0,23,22.209682
1,105,22.635670
2,118,27.869411
3,120,42.019390
4,292,18.616568
...,...,...
5975,6002496,18.713955
5976,6002497,21.762610
5977,6002498,103.668554
5978,6002499,80.905899


In [48]:
# Order songs from closest to farthest from the original song
# (ascending distance, stored in the "rank" column).
ranksDF = ranksDF.sort_values(by="rank")

In [49]:
# Renumber rows 0..n-1 in sorted order so the index equals each song's
# position in the ranking. Rebinding the name is used instead of inplace=True:
# it produces the same frame without mutating the previous object.
ranksDF = ranksDF.reset_index(drop=True)

In [50]:
# Ten nearest songs; row 0 is the original song itself (distance 0).
ranksDF.head(10)

Unnamed: 0,id,rank
0,6001918,0.0
1,6000373,5.825279
2,6001599,9.112269
3,6001782,9.924445
4,6002002,10.597484
5,6002487,10.795067
6,6000844,10.940581
7,6001270,10.989959
8,6001512,11.002944
9,6000167,12.656258


In [51]:
# Fraction of songs ranked at or below each row: 1.0 for the closest row,
# approaching 0 for the farthest. Computed from the row position instead of
# the index labels, so the result stays correct even if the index is not a
# clean 0..n-1 range (the previous index-derived, label-aligned Series
# silently produced wrong values in that case).
ranksDF['percentage'] = 1 - np.arange(len(ranksDF)) / len(ranksDF)

In [52]:
# Show where every song of the selected cover group landed in the ranking.
ranksDF.loc[ranksDF['id'].isin(listCovers)]

Unnamed: 0,id,rank,percentage
0,6001918,0.0,1.0
5573,6001997,66.391605,0.06806


In [53]:
# Show the ranking row of the cover song only.
ranksDF.loc[ranksDF['id'] == coverSongId]

Unnamed: 0,id,rank,percentage
5573,6001997,66.391605,0.06806


In [54]:
# Thirty nearest songs, now including the 'percentage' column.
ranksDF.head(30)

Unnamed: 0,id,rank,percentage
0,6001918,0.0,1.0
1,6000373,5.825279,0.999833
2,6001599,9.112269,0.999666
3,6001782,9.924445,0.999498
4,6002002,10.597484,0.999331
5,6002487,10.795067,0.999164
6,6000844,10.940581,0.998997
7,6001270,10.989959,0.998829
8,6001512,11.002944,0.998662
9,6000167,12.656258,0.998495
