# 06-06

- Transition matrices + continous
    - Remove Constant Features
    - BoxCox
    - Center and Scaling
    - Euclidean Distance
- Categorical One Hot Encoded Features
    - Jaccard
- Weighted rank result of euclidean and jackard

In [22]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyJaccard

In [23]:
cover = loadCoverGroups()
listCovers = cover[12]
originalSongId = listCovers[0]
coverSongId = listCovers[1]

In [24]:
df = pd.read_csv('./data/unified/05-allDataContinousCategoricalTransition.csv')
df = df.iloc[:, 1:]
encodedFeaturesNames = pd.read_csv('./data/unified/05-encodedFeatureNames.csv')
encodedFeaturesNames = encodedFeaturesNames['0'].values.tolist()
encodedFeaturesNames.append('id')


In [25]:
df.columns

Index(['index', 'id', 'danceability', 'energy', 'speechiness', 'acousticness',
       'liveness', 'valence', 'tempo', '0',
       ...
       'mood_5', 'mood_6', 'mood_7', 'mood_8', 'mood_9', 'mood_10', 'mood_11',
       'instrumentalness_cat_0', 'instrumentalness_cat_1',
       'instrumentalness_cat_2'],
      dtype='object', length=1076)

In [26]:
dfE = df[encodedFeaturesNames]
dfE = dfE.sort_values('id')
ids = dfE.id
dfE.drop(['id'], axis=1, inplace=True)
dfE.reset_index(drop=True, inplace=True)

In [27]:
dfE = removeConstantFeatures(dfE)

In [28]:
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, dfE)
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, dfE)

In [29]:
##[0,1,1], [1,0,1] -> 0.66 Diff -> 0.33 Similarity
ranksJaccard = applyJaccard(dfE, ids, originalSongVector)

In [30]:
ranksJaccard

Unnamed: 0,id,rank
0,1863781,0.0
1,1191744,0.0
2,279519,0.0
3,6001901,0.0
4,6001918,0.0
...,...,...
5972,6000816,1.0
5973,6000815,1.0
5974,1194750,1.0
5975,700572,1.0


In [31]:
ranksJaccard['rank'].value_counts().sort_index()

0.000000      15
0.333333     222
0.571429    1793
0.750000    2681
0.888889    1169
1.000000      97
Name: rank, dtype: int64

## Continous features

In [32]:
encodedFeaturesNames.remove('id')
df = df[df.columns.difference(encodedFeaturesNames)]
df = df.sort_values('id')
ids = df.id
df.drop(['id'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

## Pipeline

In [33]:
df = removeConstantFeatures(df)
df = centerAndScale(df)

mean:  [ 0. -0. -0. ... -0.  0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


### Original Song & Cover Song

In [34]:
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, df)
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, df)

In [35]:
ranksDF  = applyEuclideanDistance(df, ids, originalSongVector)

distance


In [36]:
ranksDF.head(10)

Unnamed: 0,id,rank
0,6001918,0.0
1,6000373,6.706965
2,6001599,9.431517
3,6001782,10.159282
4,6002002,10.950974
5,6001512,11.302348
6,6001270,11.441441
7,6000844,11.593494
8,6002487,12.627946
9,6001624,13.064715


In [37]:
ranksDF[ranksDF['id']==coverSongId]

Unnamed: 0,id,rank
5568,6001997,66.453697


In [38]:
ranksDF[ranksDF['id'].isin(listCovers)]

Unnamed: 0,id,rank
0,6001918,0.0
5568,6001997,66.453697


## Weighted rank

In [39]:
ranks = pd.merge(ranksJaccard,ranksDF, on='id')
ranks['new_rank'] = ranks['rank_x'] + ranks['rank_y']
ranks = ranks.sort_values(by='new_rank')
ranks.reset_index(drop=True, inplace=True)


In [40]:
ranks.head(10)

Unnamed: 0,id,rank_x,rank_y,new_rank
0,6001918,0.0,0.0,0.0
1,6000373,0.75,6.706965,7.456965
2,6001599,0.75,9.431517,10.181517
3,6001782,0.571429,10.159282,10.730711
4,6002002,0.571429,10.950974,11.522403
5,6001512,0.75,11.302348,12.052348
6,6001270,0.75,11.441441,12.191441
7,6000844,0.888889,11.593494,12.482383
8,6002487,0.333333,12.627946,12.96128
9,6001624,0.75,13.064715,13.814715


In [41]:
ranks[ranks['id']==coverSongId]

Unnamed: 0,id,rank_x,rank_y,new_rank
5565,6001997,0.333333,66.453697,66.78703


In [42]:
ranks.loc[ranks.id.isin(listCovers)]

Unnamed: 0,id,rank_x,rank_y,new_rank
0,6001918,0.0,0.0,0.0
5565,6001997,0.333333,66.453697,66.78703
