# Using continuous / categorical / transition-matrix features to get recommendations / Selective Box-Cox

In [3]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyBoxCoxFeatures

In [45]:
# Load the clustered transitions and keep only the two columns needed for the
# join below: the song id and its `category` (presumably the cluster label — confirm).
transitionMatrices = pd.read_csv('./data/07-transitionsClustered.csv').loc[:, ['id', 'category']]

In [46]:
# Load the combined feature table and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv — confirm).
data = pd.read_csv('./data/05-allDataContinousCategoricalTransition.csv').drop('Unnamed: 0', axis=1)

In [50]:
# Attach each song's `category` to its features; the default inner join keeps
# only ids present in both tables.
df = data.merge(transitionMatrices, on='id')

In [53]:
# The 'index' column is not a feature, so remove it before any distance computation.
df = df.drop('index', axis=1)

In [54]:
originalSongId = 180849 # The Scientist Original

# Find the category assigned to the original song. The merge above is an inner
# join, so the song may be absent; fail with a clear message rather than an
# opaque IndexError from indexing an empty array.
originalSongCategories = df.loc[df['id'] == originalSongId, 'category']
if originalSongCategories.empty:
    raise ValueError(
        f"Song id {originalSongId} not found in the merged data; cannot determine its category"
    )
categoryOriginalSong = originalSongCategories.iloc[0]

# Restrict the candidate pool to songs that share the original song's category.
df = df[df['category'] == categoryOriginalSong]

In [55]:
# Show the rows with id above 5,000,000 that remain in this category.
# NOTE(review): these ids appear to be the cover songs (see the "Cover songs" section) — confirm.
df.loc[df['id'] > 5000000]

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2,category
3189,5000001,0.255,0.343,0.031,0.927,0.164,0.116,140.527,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
3190,5000007,0.273,0.205,0.0316,0.941,0.171,0.0796,148.225,0.077922,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
3191,5000008,0.506,0.481,0.0264,0.482,0.2,0.224,148.626,0.011905,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3192,5000011,0.503,0.123,0.0333,0.879,0.114,0.28,145.843,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3193,5000012,0.306,0.229,0.0331,0.807,0.112,0.156,85.492,0.048387,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
3194,5000013,0.334,0.252,0.0287,0.722,0.0907,0.214,153.879,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3195,5000017,0.453,0.295,0.0291,0.518,0.293,0.14,146.022,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2
3196,5000018,0.525,0.292,0.026,0.91,0.0795,0.133,75.009,0.032258,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3197,5000019,0.549,0.283,0.0284,0.661,0.357,0.163,76.992,0.084746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2
3198,5000020,0.493,0.27,0.0278,0.244,0.123,0.176,73.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2


In [56]:
# `category` is now identical for every remaining row, so it is no longer needed.
df = df.drop('category', axis=1)

In [57]:
# Renumber rows 0..n-1 after filtering; the old index labels are discarded.
df = df.reset_index(drop=True)

In [58]:
# Inspect the frame after filtering to the original song's category and resetting the index.
df

Unnamed: 0,id,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,...,mood_5,mood_6,mood_7,mood_8,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,105,0.582,0.744,0.0336,0.000458,0.0405,0.674,133.344,0.185185,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,816,0.536,0.928,0.0589,0.015400,0.2000,0.389,136.258,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,830,0.526,0.665,0.0243,0.407000,0.1690,0.285,158.493,0.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1395,0.561,0.593,0.0264,0.003270,0.0967,0.574,80.282,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1433,0.481,0.638,0.0276,0.153000,0.1540,0.497,79.064,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,5000013,0.334,0.252,0.0287,0.722000,0.0907,0.214,153.879,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
900,5000017,0.453,0.295,0.0291,0.518000,0.2930,0.140,146.022,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
901,5000018,0.525,0.292,0.0260,0.910000,0.0795,0.133,75.009,0.032258,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
902,5000019,0.549,0.283,0.0284,0.661000,0.3570,0.163,76.992,0.084746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [59]:
## Sort by song id, keep the ids aside, and move them into the index
# Resetting the index after sorting keeps `ids` positionally aligned with the
# rows of `df` (position i in `ids` is row i of `df`) even when the input was
# not already sorted by id.
df = df.sort_values('id').reset_index(drop=True)
df['id'] = df['id'].astype('int64')
# Save the song ids before they become the index of the feature matrix
ids = df['id']
df = df.set_index('id')

In [60]:
# Inspect the saved song ids (taken from the `id` column before it became the index).
ids

0          105
1          816
2          830
3         1395
4         1433
        ...   
899    5000013
900    5000017
901    5000018
902    5000019
903    5000020
Name: id, Length: 904, dtype: int64

## Pipeline

In [61]:
# Preprocessing, innermost call first: remove constant features, apply Box-Cox
# to the four listed columns only (the "selective" part), then center and scale.
# NOTE(review): the helpers come from the Pipeline notebook; their exact behavior
# is not visible here — confirm there.
df = centerAndScale(
    applyBoxCoxFeatures(
        removeConstantFeatures(df),
        ['speechiness', 'acousticness', 'liveness', 'valence'],
    )
)

In [62]:
# Inspect the feature matrix after the preprocessing pipeline.
df

Unnamed: 0,danceability,energy,speechiness,acousticness,liveness,valence,tempo,0,1,2,...,mood_4,mood_5,mood_6,mood_7,mood_9,mood_10,mood_11,instrumentalness_cat_0,instrumentalness_cat_1,instrumentalness_cat_2
0,0.461078,0.752810,-0.185045,-1.787120,-2.887895,1.195615,0.436969,2.390288,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,4.047791,-0.276026,-0.888875,-0.613149,1.582364
1,0.116399,1.590304,1.530708,-0.999153,0.700590,0.117197,0.537443,-0.481398,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,2.484875,-0.129896,-0.247048,-0.276026,-0.888875,-0.613149,1.582364
2,0.041469,0.393235,-2.026056,0.709880,0.436133,-0.399141,1.304100,2.338075,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,-0.247048,-0.276026,-0.888875,-0.613149,1.582364
3,0.303724,0.065520,-1.470517,-1.432035,-0.606252,0.856543,-1.392595,-0.481398,-0.056332,-0.033278,...,-0.234185,-0.150414,1.701926,-0.402435,-0.129896,-0.247048,-0.276026,-0.888875,-0.613149,1.582364
4,-0.295717,0.270341,-1.198820,0.046334,0.280980,0.569798,-1.434591,-0.481398,-0.056332,-0.033278,...,-0.234185,-0.150414,1.701926,-0.402435,-0.129896,-0.247048,-0.276026,1.125017,-0.613149,-0.631966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,-1.397191,-1.486574,-0.973909,1.181148,-0.744273,-0.822877,1.145010,-0.481398,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,-0.247048,-0.276026,1.125017,-0.613149,-0.631966
900,-0.505522,-1.290855,-0.897243,0.899950,1.228109,-1.370370,0.874103,-0.481398,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,-0.247048,-0.276026,-0.888875,1.630926,-0.631966
901,0.033976,-1.304510,-1.567922,1.391409,-1.041659,-1.430684,-1.574407,0.018832,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,-0.247048,-0.276026,1.125017,-0.613149,-0.631966
902,0.213808,-1.345474,-1.033142,1.104101,1.466098,-1.184334,-1.506033,0.832764,-0.056332,-0.033278,...,-0.234185,-0.150414,-0.587570,-0.402435,-0.129896,-0.247048,-0.276026,-0.888875,1.630926,-0.631966


### Original Song & Cover Song

In [63]:
# Same id as the one used earlier to select the category; repeated here so this
# section can be read on its own.
originalSongId = 180849 # The Scientist Original
# NOTE(review): presumably returns the song's position in `ids` and its row of
# features from `df` — confirm against getIndexAndVector in the Pipeline notebook.
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, df)

In [69]:
coverSongId = 1686718 # The Scientist (cover version)
# Looked up with the same helper as the original song.
# NOTE(review): presumably returns the song's position in `ids` and its row of
# features from `df` — confirm against getIndexAndVector in the Pipeline notebook.
coverSongIndex,coverSongVector = getIndexAndVector(coverSongId, ids, df)

### Get Rankings

In [70]:
# Score every song in `df` against the original song's vector.
# NOTE(review): the `rank` column looks like a Euclidean distance, smaller = closer
# (the original song itself scores 0.0) — confirm in the Pipeline notebook.
ranksDF  = applyEuclideanDistance(df, ids, originalSongVector)

In [71]:
# First ten rows of the ranking; the original song itself is first with rank 0.0.
ranksDF.head(10)

Unnamed: 0,id,rank
0,180849,0.0
1,672021,23.48424
2,872773,23.63864
3,1025154,23.644112
4,450492,23.681565
5,5000019,23.77525
6,1456194,23.79714
7,1818565,23.813695
8,1884102,23.828943
9,5000017,23.836085


In [72]:
# Where does the chosen cover song land in the ranking?
ranksDF.loc[ranksDF['id'] == coverSongId]

Unnamed: 0,id,rank
45,1686718,24.211236


### Cover songs

In [73]:
# Ranking rows for every song with id above 5,000,000.
# NOTE(review): these ids appear to be the cover songs, per this section's heading — confirm.
ranksDF.loc[ranksDF['id'] > 5000000]

Unnamed: 0,id,rank
5,5000019,23.77525
9,5000017,23.836085
43,5000018,24.184886
48,5000013,24.221052
257,5000012,24.918546
265,5000001,24.937101
438,5000011,25.367075
808,5000020,38.674365
845,5000008,45.929655
880,5000007,56.87715
