# Using continuous / categorical / transition matrices to get recommendations / Selective Box-Cox

# 06-10

- KNN Clustering
- Using the categories from KNN to select a subset of the data
- Continuous Features (Transition Matrices, High-Level Features)
    - Box-Cox
    - Centering and Scaling
    - Remove Constant Features
    - Euclidean Distance
- Categorical Features
    - One-Hot Encoding
    - Jaccard

In [35]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyBoxCoxFeatures,applyJaccard, getEuclideanJaccardDf 

In [36]:
# Load the clustered-transitions table and keep only the song id and its
# 'category' column; nothing else from this file is used below.
transitionMatrices = (
    pd.read_csv('./data/unified/07-transitionsClustered.csv')
    .loc[:, ['id', 'category']]
)

In [37]:
# Load the cover groups and use group 12 for this experiment.
cover = loadCoverGroups()
listCovers = cover[12]
# Entry 0 of the group is treated as the original song, entry 1 as its cover.
originalSongId, coverSongId = listCovers[0], listCovers[1]

In [38]:
# Show the id picked as the cover song.
coverSongId

6001997

In [39]:
# Load the unified feature table and discard the two leftover index columns
# written by earlier export steps.
df = (
    pd.read_csv("./data/unified/05-allDataContinousCategoricalTransition.csv")
    .drop(columns=['Unnamed: 0', 'index'])
)

In [40]:
# Preview the id -> category table.
transitionMatrices

Unnamed: 0,id,category
0,23,1
1,105,0
2,118,2
3,120,1
4,292,2
...,...,...
5975,6002496,0
5976,6002497,1
5977,6002498,1
5978,6002499,1


In [41]:
# Attach each song's category to its feature row (inner join on 'id', so songs
# without a category entry are dropped).
df = df.merge(transitionMatrices, on='id', how='inner')

In [42]:
# Look up the category of the original song, then keep only the songs that
# share that category.
categoryOriginalSong = df.loc[df['id'] == originalSongId, 'category'].values[0]
df = df.loc[df['category'] == categoryOriginalSong]

In [43]:
# Preview the rows that share the original song's category.
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2,category
0,23,0.494,0.756,0.0432,0.001400,0.1640,0.589,133.337,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,120,0.319,0.807,0.0388,0.001040,0.1310,0.601,118.097,0.029412,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
16,836,0.511,0.517,0.0328,0.130000,0.0740,0.433,75.670,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
17,1334,0.330,0.905,0.0497,0.000233,0.0712,0.259,163.142,0.104478,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
22,2147,0.311,0.815,0.0443,0.253000,0.6080,0.700,147.173,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,6002494,0.713,0.798,0.2190,0.366000,0.2460,0.721,176.073,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
5973,6002497,0.725,0.583,0.0427,0.300000,0.0665,0.762,79.984,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
5974,6002498,0.399,0.787,0.0499,0.019700,0.0685,0.572,117.089,0.000000,0.015385,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
5975,6002499,0.376,0.435,0.0264,0.513000,0.1380,0.304,80.002,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [44]:
# The category was only needed to select the subset; it is not a feature.
df = df.drop('category', axis=1)

In [45]:
# Preview df after dropping the 'category' column.
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,23,0.494,0.756,0.0432,0.001400,0.1640,0.589,133.337,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,120,0.319,0.807,0.0388,0.001040,0.1310,0.601,118.097,0.029412,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16,836,0.511,0.517,0.0328,0.130000,0.0740,0.433,75.670,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17,1334,0.330,0.905,0.0497,0.000233,0.0712,0.259,163.142,0.104478,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22,2147,0.311,0.815,0.0443,0.253000,0.6080,0.700,147.173,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,6002494,0.713,0.798,0.2190,0.366000,0.2460,0.721,176.073,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5973,6002497,0.725,0.583,0.0427,0.300000,0.0665,0.762,79.984,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5974,6002498,0.399,0.787,0.0499,0.019700,0.0685,0.572,117.089,0.000000,0.015385,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5975,6002499,0.376,0.435,0.0264,0.513000,0.1380,0.304,80.002,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
# Same preview again; df has not changed since the previous cell.
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,23,0.494,0.756,0.0432,0.001400,0.1640,0.589,133.337,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,120,0.319,0.807,0.0388,0.001040,0.1310,0.601,118.097,0.029412,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16,836,0.511,0.517,0.0328,0.130000,0.0740,0.433,75.670,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17,1334,0.330,0.905,0.0497,0.000233,0.0712,0.259,163.142,0.104478,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22,2147,0.311,0.815,0.0443,0.253000,0.6080,0.700,147.173,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,6002494,0.713,0.798,0.2190,0.366000,0.2460,0.721,176.073,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5973,6002497,0.725,0.583,0.0427,0.300000,0.0665,0.762,79.984,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5974,6002498,0.399,0.787,0.0499,0.019700,0.0685,0.572,117.089,0.000000,0.015385,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5975,6002499,0.376,0.435,0.0264,0.513000,0.1380,0.304,80.002,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [47]:
## Order rows by song id, keep the ids aside, and leave only feature columns
# Sort first and renumber afterwards so that `ids` and the feature rows share
# the same 0..n-1 index. (Resetting the index before sorting left `ids` with
# the pre-sort labels whenever the input was not already ordered by id, so the
# label of an id no longer matched the position of its feature row.)
df = df.sort_values('id').reset_index(drop=True)
df['id'] = df['id'].astype('int64')
# Song ids, indexed by the row position of the corresponding feature vector
ids = df['id']
# 'id' is an identifier, not a feature: remove it from the feature matrix
df = df.drop(columns=['id'])

In [48]:
# Preview the feature matrix: 'id' is no longer a column and rows are numbered from 0.
df

Unnamed: 0,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,2,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,0.494,0.756,0.0432,0.001400,0.1640,0.589,133.337,0.000000,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.319,0.807,0.0388,0.001040,0.1310,0.601,118.097,0.029412,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.511,0.517,0.0328,0.130000,0.0740,0.433,75.670,0.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.330,0.905,0.0497,0.000233,0.0712,0.259,163.142,0.104478,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.311,0.815,0.0443,0.253000,0.6080,0.700,147.173,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,0.713,0.798,0.2190,0.366000,0.2460,0.721,176.073,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2421,0.725,0.583,0.0427,0.300000,0.0665,0.762,79.984,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2422,0.399,0.787,0.0499,0.019700,0.0685,0.572,117.089,0.000000,0.015385,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2423,0.376,0.435,0.0264,0.513000,0.1380,0.304,80.002,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Pipeline

In [49]:
# Split the feature matrix into two frames: one for the Euclidean-distance
# ranking and one for the Jaccard ranking. Presumably continuous vs. one-hot
# columns, as the previews below suggest; confirm in Pipeline.
dfEuclidean, dfJaccard = getEuclideanJaccardDf(df)

In [50]:
# Preview the frame used for the Euclidean ranking.
dfEuclidean

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1004,1005,...,997,998,999,acousticness,danceability,energy,liveness,speechiness,tempo,valence
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.001400,0.494,0.756,0.1640,0.0432,133.337,0.589
1,0.029412,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.001040,0.319,0.807,0.1310,0.0388,118.097,0.601
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.130000,0.511,0.517,0.0740,0.0328,75.670,0.433
3,0.104478,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000233,0.330,0.905,0.0712,0.0497,163.142,0.259
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.115942,0.253000,0.311,0.815,0.6080,0.0443,147.173,0.700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.366000,0.713,0.798,0.2460,0.2190,176.073,0.721
2421,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.300000,0.725,0.583,0.0665,0.0427,79.984,0.762
2422,0.000000,0.015385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.019700,0.399,0.787,0.0685,0.0499,117.089,0.572
2423,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.513000,0.376,0.435,0.1380,0.0264,80.002,0.304


In [51]:
# Preview the frame used for the Jaccard ranking.
dfJaccard

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2421,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2422,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2423,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [52]:
# Preprocess the Euclidean features. The steps run in this order; judging by
# their names they drop constant columns, apply Box-Cox to every feature, then
# centre and scale (TODO confirm against Pipeline).
# NOTE(review): the notebook title mentions "Selective Box Cox", but the
# selective variant is disabled and Box-Cox is applied to all features:
# dfEuclidean = applyBoxCoxFeatures(dfEuclidean, ['speechiness', 'acousticness', 'liveness', 'valence'])
dfEuclidean = dfEuclidean.copy()  # start from a copy of the frame built above
for transform in (removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale):
    dfEuclidean = transform(dfEuclidean)

mean:  [ 0. -0. -0. ... -0. -0.  0.]
std:  [1. 1. 1. ... 1. 1. 1.]


### Original Song & Cover Song

In [53]:
# Resolve the original and the cover song (in that order) to their row index
# and feature vector in the preprocessed Euclidean matrix.
(originalSongIndex, originalSongVector), (coverSongIndex, coverSongVector) = (
    getIndexAndVector(songId, ids, dfEuclidean)
    for songId in (originalSongId, coverSongId)
)

### Get Rankings Euclidean

In [54]:
# Rank every song by its Euclidean distance to the original song's vector.
# In the displayed result the 'rank' column holds the distance itself
# (0.0 for the original song), not an ordinal position.
ranksDFEuclidean  = applyEuclideanDistance(dfEuclidean, ids, originalSongVector)
ranksDFEuclidean

distance


Unnamed: 0,id,rank
0,6001918,0.000000
1,6002017,9.553860
2,59609,9.684525
3,6001670,9.739011
4,717911,9.928647
...,...,...
2420,6000959,167.600004
2421,6000790,169.264712
2422,6000742,170.538807
2423,6000793,181.242260


In [55]:
# Row of the original song in the Euclidean ranking.
ranksDFEuclidean.loc[ranksDFEuclidean['id'] == originalSongId]

Unnamed: 0,id,rank
0,6001918,0.0


In [56]:
# Row of the cover song in the Euclidean ranking.
ranksDFEuclidean.loc[ranksDFEuclidean['id'] == coverSongId]

Unnamed: 0,id,rank
2111,6001997,43.171696


In [57]:
# Rows of every song in the selected cover group in the Euclidean ranking.
ranksDFEuclidean[ranksDFEuclidean['id'].isin(listCovers)]

Unnamed: 0,id,rank
0,6001918,0.0
2111,6001997,43.171696


### Get Rankings Jaccard

In [58]:
# Resolve the original and the cover song (in that order) to their row index
# and feature vector in the Jaccard frame. This rebinds the same four names
# that were used for the Euclidean lookup.
(originalSongIndex, originalSongVector), (coverSongIndex, coverSongVector) = (
    getIndexAndVector(songId, ids, dfJaccard)
    for songId in (originalSongId, coverSongId)
)

In [59]:
# Rank every song by its Jaccard score against the original song's vector.
ranksDFJaccard  = applyJaccard(dfJaccard, ids, originalSongVector)

In [60]:
# Preview the Jaccard ranking.
ranksDFJaccard

Unnamed: 0,id,rank
0,6002474,0.0
1,1863781,0.0
2,6001901,0.0
3,6002056,0.0
4,6001718,0.0
...,...,...
2420,6000196,1.0
2421,6000198,1.0
2422,6000767,1.0
2423,390605,1.0


In [61]:
# Row of the cover song in the Jaccard ranking.
ranksDFJaccard.loc[ranksDFJaccard['id'] == coverSongId]

Unnamed: 0,id,rank
9,6001997,0.333333


In [62]:
# Rows of every song in the selected cover group in the Jaccard ranking.
ranksDFJaccard[ranksDFJaccard['id'].isin(listCovers)]

Unnamed: 0,id,rank
7,6001918,0.0
9,6001997,0.333333


### Experiment to unify ranks

In [63]:
# Join both rankings on song id. Both frames have a 'rank' column, so pandas
# suffixes them: rank_x comes from the Jaccard frame (left) and rank_y from the
# Euclidean frame (right).
ranks = ranksDFJaccard.merge(ranksDFEuclidean, on='id')

In [64]:
# Combined score: plain sum of the Jaccard value (rank_x) and the Euclidean
# distance (rank_y).
# NOTE(review): in the results shown in this notebook rank_x lies in [0, 1]
# while rank_y reaches ~181, so the sum is dominated by the Euclidean term.
# Consider bringing both onto a common scale before adding them.
ranks['new_rank'] = ranks['rank_x'].add(ranks['rank_y'])

In [65]:
# Order by the combined score (smallest first) and renumber the rows so the
# index reflects the position in the unified ranking.
ranks = ranks.sort_values(by='new_rank').reset_index(drop=True)

In [66]:
# Ten best-ranked songs under the combined score.
ranks.iloc[:10]

Unnamed: 0,id,rank_x,rank_y,new_rank
0,6001918,0.0,0.0,0.0
1,6002017,0.571429,9.55386,10.125289
2,59609,0.75,9.684525,10.434525
3,6001670,0.75,9.739011,10.489011
4,717911,0.75,9.928647,10.678647
5,6001853,0.75,10.040716,10.790716
6,6001700,0.571429,10.858766,11.430195
7,6000119,0.75,10.769494,11.519494
8,9344,0.888889,10.944382,11.833271
9,951051,0.75,11.119114,11.869114


In [67]:
# Rows of every song in the selected cover group in the unified ranking.
ranks[ranks['id'].isin(listCovers)]

Unnamed: 0,id,rank_x,rank_y,new_rank
0,6001918,0.0,0.0,0.0
2102,6001997,0.333333,43.171696,43.505029


In [68]:
# Row of the cover song in the unified ranking.
ranks.loc[ranks['id'] == coverSongId]

Unnamed: 0,id,rank_x,rank_y,new_rank
2102,6001997,0.333333,43.171696,43.505029
