# Using continuous / categorical / transition matrices to get recommendations / Selective Box-Cox

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyBoxCoxFeatures,applyJaccard, getEuclideanJaccardDf 

In [3]:
# Load the clustered transition data, keeping only each song's id and its
# cluster label (`category`).
transitionMatrices = (
    pd.read_csv('./data/unified/07-transitionsClustered.csv')
    .loc[:, ['id', 'category']]
)

In [4]:
transitionMatrices


Unnamed: 0,id,category
0,23,1
1,105,0
2,118,2
3,120,1
4,292,2
...,...,...
5975,6002496,0
5976,6002497,1
5977,6002498,1
5978,6002499,1


In [721]:
# Load the unified feature table and discard the 'Unnamed: 0' and 'index'
# columns (presumably leftovers from an earlier save/reset_index — verify).
df = (
    pd.read_csv("./data/unified/05-allDataContinousCategoricalTransition.csv")
    .drop(columns=['Unnamed: 0', 'index'])
)

In [722]:
transitionMatrices

Unnamed: 0,id,category
0,23,1
1,105,0
2,118,2
3,120,1
4,292,2
...,...,...
5975,6002496,2
5976,6002497,1
5977,6002498,1
5978,6002499,1


In [723]:
# Attach each song's `category` label by joining on `id`.
# This is the default inner join: songs missing from either frame are dropped.
df = df.merge(transitionMatrices, on='id')

In [724]:
originalSongId = 180849  # The Scientist Original
# Look up the category of the original song, then keep only the songs
# that share that category.
categoryOriginalSong = df.loc[df['id'] == originalSongId, 'category'].iloc[0]
df = df.loc[df['category'] == categoryOriginalSong]

In [725]:
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2,category
1,105,0.582,0.744,0.0336,0.000458,0.0405,0.674,133.344,0.185185,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
14,816,0.536,0.928,0.0589,0.015400,0.2000,0.389,136.258,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
15,830,0.526,0.665,0.0243,0.407000,0.1690,0.285,158.493,0.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
19,1395,0.561,0.593,0.0264,0.003270,0.0967,0.574,80.282,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
20,1433,0.481,0.638,0.0276,0.153000,0.1540,0.497,79.064,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5933,6002455,0.737,0.742,0.0441,0.218000,0.3160,0.386,119.966,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
5935,6002457,0.749,0.780,0.0636,0.019600,0.0901,0.905,92.007,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0
5953,6002476,0.770,0.750,0.0785,0.190000,0.0618,0.609,99.997,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
5954,6002477,0.700,0.710,0.0532,0.327000,0.1290,0.582,167.971,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [726]:
# Songs whose id lies strictly between 5,000,000 and 6,000,000.
df[df['id'].gt(5000000) & df['id'].lt(6000000)]

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2,category
3481,5000001,0.255,0.343,0.031,0.927,0.164,0.116,140.527,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3482,5000007,0.273,0.205,0.0316,0.941,0.171,0.0796,148.225,0.077922,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3483,5000008,0.506,0.481,0.0264,0.482,0.2,0.224,148.626,0.011905,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3484,5000011,0.503,0.123,0.0333,0.879,0.114,0.28,145.843,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3485,5000012,0.306,0.229,0.0331,0.807,0.112,0.156,85.492,0.048387,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3486,5000013,0.334,0.252,0.0287,0.722,0.0907,0.214,153.879,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3487,5000017,0.453,0.295,0.0291,0.518,0.293,0.14,146.022,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3488,5000018,0.525,0.292,0.026,0.91,0.0795,0.133,75.009,0.032258,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3489,5000019,0.549,0.283,0.0284,0.661,0.357,0.163,76.992,0.084746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3490,5000020,0.493,0.27,0.0278,0.244,0.123,0.176,73.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [727]:
# `category` was only needed for the filtering step; remove it from the features.
df = df.drop(columns='category')

In [728]:
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
1,105,0.582,0.744,0.0336,0.000458,0.0405,0.674,133.344,0.185185,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
14,816,0.536,0.928,0.0589,0.015400,0.2000,0.389,136.258,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15,830,0.526,0.665,0.0243,0.407000,0.1690,0.285,158.493,0.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19,1395,0.561,0.593,0.0264,0.003270,0.0967,0.574,80.282,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,1433,0.481,0.638,0.0276,0.153000,0.1540,0.497,79.064,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5933,6002455,0.737,0.742,0.0441,0.218000,0.3160,0.386,119.966,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5935,6002457,0.749,0.780,0.0636,0.019600,0.0901,0.905,92.007,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5953,6002476,0.770,0.750,0.0785,0.190000,0.0618,0.609,99.997,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5954,6002477,0.700,0.710,0.0532,0.327000,0.1290,0.582,167.971,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [729]:
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
1,105,0.582,0.744,0.0336,0.000458,0.0405,0.674,133.344,0.185185,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
14,816,0.536,0.928,0.0589,0.015400,0.2000,0.389,136.258,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15,830,0.526,0.665,0.0243,0.407000,0.1690,0.285,158.493,0.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19,1395,0.561,0.593,0.0264,0.003270,0.0967,0.574,80.282,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,1433,0.481,0.638,0.0276,0.153000,0.1540,0.497,79.064,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5933,6002455,0.737,0.742,0.0441,0.218000,0.3160,0.386,119.966,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5935,6002457,0.749,0.780,0.0636,0.019600,0.0901,0.905,92.007,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5953,6002476,0.770,0.750,0.0785,0.190000,0.0618,0.609,99.997,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5954,6002477,0.700,0.710,0.0532,0.327000,0.1290,0.582,167.971,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [730]:
## Sort by id, keep the ids aside, and leave only feature columns in df
# The index is reset *after* sorting so that `ids` and `df` share the same
# 0..n-1 index. Resetting before the sort leaves `ids` carrying pre-sort
# labels whenever the rows are not already ordered by id, which would
# misalign `ids` with the rows of `df`.
df = df.sort_values('id').reset_index(drop=True)
# Saving the song ids (as int64) so rows can be mapped back to songs later
ids = df['id'].astype('int64')
df = df.drop(columns='id')

In [731]:
df

Unnamed: 0,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,2,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,0.582,0.744,0.0336,0.000458,0.0405,0.674,133.344,0.185185,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.536,0.928,0.0589,0.015400,0.2000,0.389,136.258,0.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.526,0.665,0.0243,0.407000,0.1690,0.285,158.493,0.181818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.561,0.593,0.0264,0.003270,0.0967,0.574,80.282,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.481,0.638,0.0276,0.153000,0.1540,0.497,79.064,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.737,0.742,0.0441,0.218000,0.3160,0.386,119.966,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1318,0.749,0.780,0.0636,0.019600,0.0901,0.905,92.007,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1319,0.770,0.750,0.0785,0.190000,0.0618,0.609,99.997,0.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1320,0.700,0.710,0.0532,0.327000,0.1290,0.582,167.971,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [732]:
# Confirm the original song is still present after filtering.
# Uses `originalSongId` instead of repeating the literal id.
ids[ids == originalSongId]

163    180849
Name: id, dtype: int64

## Pipeline

In [733]:
# Split the features into two frames, one per distance measure. Judging by the
# outputs displayed below, dfEuclidean holds the numeric columns (audio features
# plus the numbered columns, presumably transition-matrix entries) and dfJaccard
# holds the one-hot columns (key_*, mood_*, instrumentalness_cat_*).
dfEuclidean, dfJaccard = getEuclideanJaccardDf(df)

In [734]:
dfEuclidean

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1004,1005,...,997,998,999,acousticness,danceability,energy,liveness,speechiness,tempo,valence
0,0.185185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000458,0.582,0.744,0.0405,0.0336,133.344,0.674
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015400,0.536,0.928,0.2000,0.0589,136.258,0.389
2,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.407000,0.526,0.665,0.1690,0.0243,158.493,0.285
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.003270,0.561,0.593,0.0967,0.0264,80.282,0.574
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.153000,0.481,0.638,0.1540,0.0276,79.064,0.497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.218000,0.737,0.742,0.3160,0.0441,119.966,0.386
1318,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019600,0.749,0.780,0.0901,0.0636,92.007,0.905
1319,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.190000,0.770,0.750,0.0618,0.0785,99.997,0.609
1320,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.327000,0.700,0.710,0.1290,0.0532,167.971,0.582


In [735]:
dfJaccard

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1318,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1319,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [736]:
# Run the Euclidean-side preprocessing helpers on a copy, in order:
# removeConstantFeatures -> applyBoxCoxAllFeatures -> centerAndScale.
# Selective alternative to the all-features Box-Cox step (currently disabled):
#   applyBoxCoxFeatures(dfEuclidean, ['speechiness', 'acousticness', 'liveness', 'valence'])
dfEuclidean = (
    dfEuclidean.copy()
    .pipe(removeConstantFeatures)
    .pipe(applyBoxCoxAllFeatures)
    .pipe(centerAndScale)
)

### Original Song & Cover Song

In [737]:
# Fetch the row index and feature vector of the reference (original) song and
# of one cover song from the preprocessed Euclidean frame.
originalSongId = 180849 # The Scientist Original
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, dfEuclidean)
coverSongId = 5000011 # cover version (presumably of The Scientist — TODO confirm)
coverSongIndex,coverSongVector = getIndexAndVector(coverSongId, ids, dfEuclidean)

### Get Rankings Euclidean

In [738]:
# Score every song against the original song's vector in the Euclidean frame.
# In the output, `rank` appears to be a distance sorted ascending: the original
# song itself comes first with 0.0.
ranksDFEuclidean  = applyEuclideanDistance(dfEuclidean, ids, originalSongVector)
ranksDFEuclidean

Unnamed: 0,id,rank
0,180849,0.000000
1,610562,29.834360
2,732460,30.000729
3,6001027,30.239626
4,886255,30.282131
...,...,...
1317,6002189,93.288911
1318,980591,95.755992
1319,2054794,97.803672
1320,6000881,102.568123


In [739]:
# Sanity check: the original song should be at distance 0 from itself.
# Uses `originalSongId` instead of repeating the literal id.
ranksDFEuclidean[ranksDFEuclidean['id'] == originalSongId]

Unnamed: 0,id,rank
0,180849,0.0


In [740]:
# Position and Euclidean score of the chosen cover song.
ranksDFEuclidean.loc[ranksDFEuclidean['id'].eq(coverSongId)]

Unnamed: 0,id,rank
584,5000011,31.402126


In [741]:
# Euclidean scores for songs whose id lies strictly between 5,000,000 and 5,900,000.
ranksDFEuclidean[
    ranksDFEuclidean['id'].gt(5000000)
    & ranksDFEuclidean['id'].lt(5900000)
]

Unnamed: 0,id,rank
12,5000019,30.474782
18,5000017,30.516912
65,5000013,30.702777
258,5000012,30.953369
584,5000011,31.402126
754,5000001,31.92395
860,5000018,32.721263
1148,5000008,43.619753
1172,5000007,45.691859
1203,5000020,49.364774


### Get Rankings Jaccard

In [742]:
# Same lookups as for the Euclidean ranking, but against the Jaccard frame.
# Note that this overwrites originalSongVector / coverSongVector.
originalSongId = 180849 # The Scientist Original
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, dfJaccard)
coverSongId = 5000011 # cover version (presumably of The Scientist — TODO confirm)
coverSongIndex,coverSongVector = getIndexAndVector(coverSongId, ids, dfJaccard)

In [743]:
# Score every song against the original song's vector in the Jaccard frame.
# Presumably a Jaccard distance: in the output the original song scores 0.0
# and the largest values shown stay below 1.
ranksDFJaccard  = applyJaccard(dfJaccard, ids, originalSongVector)

In [744]:
ranksDFJaccard

Unnamed: 0,id,rank
0,180849,0.000000
1,6001027,0.000000
2,466649,0.000000
3,6001258,0.000000
4,450492,0.000000
...,...,...
1317,6001654,0.888889
1318,1899241,0.888889
1319,6001660,0.888889
1320,362641,0.888889


In [745]:
# Position and Jaccard score of the chosen cover song.
ranksDFJaccard.loc[ranksDFJaccard['id'].eq(coverSongId)]

Unnamed: 0,id,rank
638,5000011,0.75


In [746]:
# Euclidean score of the same cover song, shown again for comparison with
# its Jaccard score.
ranksDFEuclidean.loc[ranksDFEuclidean['id'].eq(coverSongId)]

Unnamed: 0,id,rank
584,5000011,31.402126


### Cover songs

In [747]:
# Euclidean scores restricted to ids above 5,000,000 (the id range this
# section treats as cover songs).
ranksDFEuclidean[ranksDFEuclidean['id'].gt(5000000)]

Unnamed: 0,id,rank
3,6001027,30.239626
12,5000019,30.474782
18,5000017,30.516912
32,6000372,30.582644
47,6001393,30.647782
...,...,...
1314,6000247,89.688992
1315,6000768,90.372164
1317,6002189,93.288911
1320,6000881,102.568123


In [748]:
# Jaccard scores restricted to ids above 5,000,000 (the id range this
# section treats as cover songs).
ranksDFJaccard[ranksDFJaccard['id'].gt(5000000)]

Unnamed: 0,id,rank
1,6001027,0.000000
3,6001258,0.000000
6,5000001,0.333333
8,5000017,0.333333
9,5000019,0.333333
...,...,...
1307,6001617,0.888889
1310,6000652,0.888889
1313,6000174,0.888889
1317,6001654,0.888889


<font color="red"> TODO: apply weight, or simply add rankings</font>

### Experiment to unify ranks

In [749]:
# Put both scores side by side, one row per song. With pandas' default
# suffixes, `rank_x` is the Jaccard score and `rank_y` the Euclidean one.
ranks = ranksDFJaccard.merge(ranksDFEuclidean, on='id')

In [750]:
# Combined score = Jaccard score + Euclidean score, unweighted.
# NOTE(review): in the outputs shown, the Jaccard values stay below 1 while the
# Euclidean ones are roughly 30-100, so this plain sum is dominated by the
# Euclidean term.
ranks = ranks.assign(new_rank=ranks['rank_x'] + ranks['rank_y'])

In [751]:
# Order songs by the combined score (smallest first) and renumber the rows
# so the index reflects the new position.
ranks = ranks.sort_values(by='new_rank').reset_index(drop=True)

In [752]:
# Combined ranking restricted to ids above 5,000,000.
ranks[ranks['id'].gt(5000000)]

Unnamed: 0,id,rank_x,rank_y,new_rank
1,6001027,0.000000,30.239626,30.239626
5,5000019,0.333333,30.474782,30.808115
6,5000017,0.333333,30.516912,30.850246
36,6000372,0.571429,30.582644,31.154072
60,5000013,0.571429,30.702777,31.274206
...,...,...,...,...
1314,6000247,0.750000,89.688992,90.438992
1315,6000768,0.750000,90.372164,91.122164
1317,6002189,0.750000,93.288911,94.038911
1320,6000881,0.750000,102.568123,103.318123


In [753]:
ranks.head(10)

Unnamed: 0,id,rank_x,rank_y,new_rank
0,180849,0.0,0.0,0.0
1,6001027,0.0,30.239626,30.239626
2,732460,0.571429,30.000729,30.572157
3,610562,0.75,29.83436,30.58436
4,466649,0.0,30.807642,30.807642
5,5000019,0.333333,30.474782,30.808115
6,5000017,0.333333,30.516912,30.850246
7,872773,0.333333,30.562395,30.895729
8,450492,0.0,30.91231,30.91231
9,1909308,0.571429,30.367868,30.939296


In [754]:
# Combined ranking for songs whose id lies strictly between 5,000,000 and 5,900,000.
ranks[
    ranks['id'].gt(5000000)
    & ranks['id'].lt(5900000)
]

Unnamed: 0,id,rank_x,rank_y,new_rank
5,5000019,0.333333,30.474782,30.808115
6,5000017,0.333333,30.516912,30.850246
60,5000013,0.571429,30.702777,31.274206
310,5000012,0.75,30.953369,31.703369
609,5000011,0.75,31.402126,32.152126
650,5000001,0.333333,31.92395,32.257283
840,5000018,0.571429,32.721263,33.292692
1148,5000008,0.571429,43.619753,44.191181
1173,5000007,0.75,45.691859,46.441859
1200,5000020,0.333333,49.364774,49.698107
