# 06-06

- Transition matrices + continuous features
    - Remove Constant Features
    - BoxCox
    - Center and Scaling
    - Euclidean Distance
- Categorical One Hot Encoded Features
    - Jaccard
- Weighted rank combining the Euclidean and Jaccard results

In [35]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyJaccard

In [36]:
# Work with the first cover group only: its element 0 is used as the
# "original" query song and its element 2 as the cover we track in the rankings.
cover = loadCoverGroups()
listCovers = cover[0]
originalSongId = listCovers[0]
coverSongId = listCovers[2]

In [37]:
# Load the unified feature table and discard its first column by position
# (presumably a row index written out by an earlier to_csv — TODO confirm).
df = pd.read_csv('./data/unified/05-allDataContinousCategoricalTransition.csv').iloc[:, 1:]

# Names of the one-hot encoded columns live in the '0' column of a helper CSV;
# 'id' is appended so the song key travels with the categorical slice.
encodedFeaturesNames = [
    *pd.read_csv('./data/unified/05-encodedFeatureNames.csv')['0'].values.tolist(),
    'id',
]


In [38]:
# Inspect the column labels of the loaded table.
df.columns

Index(['index', 'id', 'danceability', 'energy', 'speechiness', 'acousticness',
       'liveness', 'valence', 'tempo', '0',
       ...
       'mood_5', 'mood_6', 'mood_7', 'mood_8', 'mood_9', 'mood_10', 'mood_11',
       'instrumentalness_cat_0', 'instrumentalness_cat_1',
       'instrumentalness_cat_2'],
      dtype='object', length=1076)

In [39]:
# Categorical (one-hot) slice, ordered by song id.
encodedSorted = df[encodedFeaturesNames].sort_values('id')

# `ids` is captured before the index reset, so it keeps the sorted frame's
# original index labels, whereas `dfE` below is renumbered 0..n-1.
ids = encodedSorted.id

# Feature matrix only: drop the key column and renumber the rows.
dfE = encodedSorted.drop(columns=['id']).reset_index(drop=True)

In [40]:
# Pass the one-hot matrix through the Pipeline helper; going by its name it
# drops columns that never vary (implementation is in the Pipeline notebook).
dfE = removeConstantFeatures(dfE)

In [41]:
# Look up the position and one-hot feature vector of the original song and of
# the tracked cover within the categorical matrix.
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, dfE)
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, dfE)

In [42]:
# Jaccard worked example: [0,1,1] vs [1,0,1] share 1 of 3 active positions,
# so the difference (distance) is 2/3 ~ 0.66 and the similarity is 1/3 ~ 0.33.
# NOTE(review): the recorded output runs from 0.0 up to 1.0, which suggests
# `rank` holds the distance (lower = more similar) — confirm in Pipeline.applyJaccard.
ranksJaccard = applyJaccard(dfE, ids, originalSongVector)

In [43]:
# Display the Jaccard result table (columns: id, rank).
ranksJaccard

Unnamed: 0,id,rank
0,106522,0.0
1,1029726,0.0
2,3653915,0.0
3,6001258,0.0
4,466649,0.0
...,...,...
5972,6000126,1.0
5973,783974,1.0
5974,6000825,1.0
5975,6001510,1.0


In [44]:
# How many songs fall on each distinct Jaccard value, listed in ascending value order.
ranksJaccard['rank'].value_counts().sort_index()

0.000000      10
0.333333     216
0.571429    1413
0.750000    3023
0.888889    1237
1.000000      78
Name: rank, dtype: int64

## Continuous features

In [45]:
# Take 'id' back out of the encoded-name list so that the key column survives
# the column difference below. This mutates the list: re-running the cell
# raises ValueError because 'id' is no longer present.
encodedFeaturesNames.remove('id')

# Everything that is not a one-hot column, ordered by song id.
# NOTE(review): the df.columns listing shows an 'index' column; unless it is in
# encodedFeaturesNames it remains here and is treated as a feature downstream
# (scaling + Euclidean distance) — confirm that is intended.
continuousSorted = df[df.columns.difference(encodedFeaturesNames)].sort_values('id')

# Same convention as the categorical slice: `ids` keeps the pre-reset index
# labels, the feature matrix is renumbered 0..n-1 without the key column.
ids = continuousSorted.id
df = continuousSorted.drop(columns=['id']).reset_index(drop=True)

## Pipeline

In [46]:
# Continuous pipeline: drop constant columns, then center and scale.
# NOTE(review): the notebook outline lists a BoxCox step and
# applyBoxCoxAllFeatures is imported, but it is not called here — confirm
# whether skipping it is deliberate.
df = centerAndScale(removeConstantFeatures(df))

mean:  [ 0. -0. -0. ... -0.  0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


### Original Song & Cover Song

In [47]:
# Same lookup as in the categorical section, now against the scaled continuous
# matrix; this overwrites the index/vector variables assigned there.
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, df)
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, df)

In [48]:
# Euclidean result table (id, rank) for the original song against the scaled matrix.
# The stray `distance` line in the recorded output presumably comes from a
# print inside the helper — confirm in Pipeline.applyEuclideanDistance.
ranksDF  = applyEuclideanDistance(df, ids, originalSongVector)

distance


In [49]:
# First ten rows of the Euclidean table; in the recorded output the original
# song itself is first with distance 0.0.
ranksDF.head(10)

Unnamed: 0,id,rank
0,180849,0.0
1,1025154,6.2462
2,569357,6.269484
3,2065851,6.395168
4,1884102,6.478077
5,1210073,6.501521
6,263779,6.518148
7,672021,6.538485
8,1065079,6.544171
9,1048915,6.658681


In [50]:
# Where does the tracked cover land in the Euclidean table?
ranksDF.loc[ranksDF['id'] == coverSongId]

Unnamed: 0,id,rank
73,1686718,7.919422


In [51]:
# Positions of every member of the cover group in the Euclidean table.
ranksDF.loc[ranksDF['id'].isin(listCovers)]

Unnamed: 0,id,rank
0,180849,0.0
42,5000019,7.569236
69,6001027,7.876343
73,1686718,7.919422
94,5000017,8.061349
133,5000013,8.223692
152,5000018,8.328912
174,5000011,8.411096
218,5000012,8.554556
351,5000001,8.910452


## Weighted rank

In [52]:
# Join the two result tables on song id. Both carry a `rank` column, so pandas'
# default suffixes apply: rank_x is the Jaccard value (left frame), rank_y the
# Euclidean value (right frame). The combined score is their plain sum, and the
# table is re-sorted by it with a fresh 0..n-1 index.
# NOTE(review): no weights or rescaling are applied. In the recorded output
# rank_x lies in [0, 1] while rank_y is roughly 6-9 for the nearest songs, so
# the Euclidean term dominates the sum — consider normalising both before adding.
ranks = (
    pd.merge(ranksJaccard, ranksDF, on='id')
    .assign(new_rank=lambda merged: merged['rank_x'] + merged['rank_y'])
    .sort_values(by='new_rank')
    .reset_index(drop=True)
)


In [53]:
# Ten best combined scores (lowest new_rank first).
ranks.head(10)

Unnamed: 0,id,rank_x,rank_y,new_rank
0,180849,0.0,0.0,0.0
1,2065851,0.333333,6.395168,6.728501
2,1025154,0.571429,6.2462,6.817629
3,672021,0.333333,6.538485,6.871819
4,569357,0.75,6.269484,7.019484
5,1884102,0.571429,6.478077,7.049506
6,3504560,0.333333,6.725566,7.058899
7,1210073,0.571429,6.501521,7.07295
8,1048915,0.571429,6.658681,7.23011
9,263779,0.75,6.518148,7.268148


In [54]:
# Position of the tracked cover after combining both scores.
ranks.loc[ranks['id'] == coverSongId]

Unnamed: 0,id,rank_x,rank_y,new_rank
71,1686718,0.571429,7.919422,8.490851


In [55]:
# Positions of every member of the cover group after combining both scores.
ranks[ranks['id'].isin(listCovers)]

Unnamed: 0,id,rank_x,rank_y,new_rank
0,180849,0.0,0.0,0.0
19,6001027,0.0,7.876343,7.876343
21,5000019,0.333333,7.569236,7.902569
62,5000017,0.333333,8.061349,8.394682
71,1686718,0.571429,7.919422,8.490851
108,5000013,0.571429,8.223692,8.79512
130,5000018,0.571429,8.328912,8.900341
196,5000011,0.75,8.411096,9.161096
220,5000001,0.333333,8.910452,9.243786
244,5000012,0.75,8.554556,9.304556
