# Using continous / categorical / transition matrices to get recommendations

In [59]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance, loadCoverGroups
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from ipynb.fs.full.Pipeline import removeConstantFeatures, applyBoxCoxAllFeatures, centerAndScale, applyEuclideanDistance, getIndexAndVector, applyJaccard

In [60]:
df = pd.read_csv('./data/unified/05-allDataContinousCategoricalTransition.csv')
df = df.iloc[:, 1:]
encodedFeaturesNames = pd.read_csv('./data/unified/05-encodedFeatureNames.csv')
encodedFeaturesNames = encodedFeaturesNames['0'].values.tolist()
encodedFeaturesNames.append('id')


In [61]:
df.columns

Index(['index', 'id', 'danceability', 'energy', 'speechiness', 'acousticness',
       'liveness', 'valence', 'tempo', '0',
       ...
       'mood_5', 'mood_6', 'mood_7', 'mood_8', 'mood_9', 'mood_10', 'mood_11',
       'instrumentalness_cat_0', 'instrumentalness_cat_1',
       'instrumentalness_cat_2'],
      dtype='object', length=1076)

In [62]:
dfE = df[encodedFeaturesNames]
dfE = dfE.sort_values('id')
ids = dfE.id
dfE.drop(['id'], axis=1, inplace=True)
dfE.reset_index(drop=True, inplace=True)

In [63]:
dfE = removeConstantFeatures(dfE)

In [64]:
originalSongId = 180849 # The Scientist Original
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, dfE)
coverSongId = 1686718 # THe Scientist by Boyce Avenue
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, dfE)

In [65]:
##[0,1,1], [1,0,1] -> 0.66 Diff -> 0.33 Similarity
ranksJaccard = applyJaccard(dfE, ids, originalSongVector)

In [66]:
ranksJaccard

Unnamed: 0,id,rank
0,106522,0.0
1,1029726,0.0
2,3653915,0.0
3,6001258,0.0
4,466649,0.0
...,...,...
5972,6000126,1.0
5973,783974,1.0
5974,6000825,1.0
5975,6001510,1.0


In [67]:
ranksJaccard['rank'].value_counts().sort_index()

0.000000      10
0.333333     216
0.571429    1413
0.750000    3023
0.888889    1237
1.000000      78
Name: rank, dtype: int64

<br><br><br><br><br><br><br><br><br><br><br><br><br>

## Continous features

In [68]:
encodedFeaturesNames.remove('id')
df = df[df.columns.difference(encodedFeaturesNames)]
df = df.sort_values('id')
ids = df.id
df.drop(['id'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

## Pipeline

In [69]:
df = removeConstantFeatures(df)
df = centerAndScale(df)

### Original Song & Cover Song

In [70]:
originalSongId = 180849 # The Scientist Original
originalSongIndex,originalSongVector = getIndexAndVector(originalSongId, ids, df)
coverSongId = 1686718 # THe Scientist by Boyce Avenue
coverIndex,coverSongVector = getIndexAndVector(coverSongId, ids, df)

In [71]:
ranksDF  = applyEuclideanDistance(df, ids, originalSongVector)

In [72]:
ranksDF.head(10)

Unnamed: 0,id,rank
0,180849,0.0
1,1025154,6.2462
2,569357,6.269484
3,2065851,6.395168
4,1884102,6.478077
5,1210073,6.501521
6,263779,6.518148
7,672021,6.538485
8,1065079,6.544171
9,1048915,6.658681


In [73]:
ranksDF[ranksDF['id']==coverSongId]

Unnamed: 0,id,rank
73,1686718,7.919422


## Rank for original Song 450

In [74]:
ranksDF[ranksDF['id'] > 5000000]

Unnamed: 0,id,rank
42,5000019,7.569236
69,6001027,7.876343
71,6001580,7.918286
84,6000268,8.002573
94,5000017,8.061349
...,...,...
5972,6002253,172.409981
5973,6000743,172.978867
5974,6002278,177.214129
5975,6001117,187.673974


# Next steps
    1) Get a weighted recommendation with jac and euclidean
    2) Do experimentation with Gower
    3) Get result with KNN clustering
    4)
