In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt 
import math
from ipynb.fs.full.Utils import createKDeplot, loadDataFrames, dropFeaturesWithNoVariance
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.spatial.distance import jaccard
from sklearn.metrics import jaccard_score
from scipy.spatial import distance
import gower
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
def removeConstantFeatures(df):
    """Return ``df`` restricted to the columns that are not constant.

    Every column is compared against its own first-row value; a column is
    kept when at least one row differs from that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. The first row (``df.iloc[0]``) is used as the
        reference, so the frame needs at least one row.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``df``, in their original order.
    """
    reference_row = df.iloc[0]
    # True for each column holding at least one value unlike its first one.
    column_varies = df.ne(reference_row).any()
    return df.loc[:, column_varies]

In [3]:
def applyBoxCoxAllFeatures(df):
    """Apply a shifted Box-Cox transform to every column of ``df``.

    Each column is shifted by +1 and then passed to ``scipy.stats.boxcox``,
    which estimates the lambda parameter per column; only the transformed
    values (first element of its return value) are kept.

    The input frame is left untouched: the work is done on a copy, so running
    the same notebook cell twice does not shift/transform the data twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame. After the +1 shift every value must be strictly
        positive, otherwise ``scipy.stats.boxcox`` raises ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index, holding the transformed
        values.
    """
    transformed = df.copy()
    for colName in transformed.columns.tolist():
        # The +1 shift presumably exists so that zeros become strictly
        # positive, which Box-Cox requires.
        shifted = transformed[colName] + 1
        transformed[colName] = stats.boxcox(shifted)[0]
    return transformed

In [4]:
def applyBoxCoxFeatures(df, features):
    """Apply a shifted Box-Cox transform to the selected columns of ``df``.

    Each column named in ``features`` is shifted by +1 and then passed to
    ``scipy.stats.boxcox`` (lambda estimated per column); only the transformed
    values are kept. Columns not listed in ``features`` are returned as-is.

    The input frame is left untouched: the work is done on a copy, so running
    the same notebook cell twice does not shift/transform the data twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to transform.
    features : iterable of column labels
        Columns to transform. After the +1 shift their values must be
        strictly positive, otherwise ``scipy.stats.boxcox`` raises
        ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    transformed = df.copy()
    for feature in features:
        # The +1 shift presumably exists so that zeros become strictly
        # positive, which Box-Cox requires.
        shifted = transformed[feature] + 1
        transformed[feature] = stats.boxcox(shifted)[0]
    return transformed

In [5]:
def centerAndScale(df):
    """Standardize every column of ``df`` with ``StandardScaler``.

    The scaler is fitted on ``df`` and applied to that same data. The
    per-column mean and standard deviation of the result (rounded to two
    decimals) are printed as a sanity check.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to standardize.

    Returns
    -------
    pandas.DataFrame
        New frame with the original column labels. It is rebuilt from the
        transformed array without an explicit index, so it carries a default
        integer index rather than the index of ``df``.
    """
    column_labels = df.columns
    standardized = StandardScaler().fit(df).transform(df)

    print('mean: ', standardized.mean(axis=0).round(2))
    print('std: ', standardized.std(axis=0).round(2))

    return pd.DataFrame(standardized, columns=column_labels)

In [6]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or pandas Series).
    """
    difference = x - y
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

In [7]:
def applyEuclideanDistance(df, ids, originalSongVector):
    """Rank every row of ``df`` by Euclidean distance to a reference vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame, one candidate per row.
    ids : sequence (list, array or pandas.Series)
        Identifier of each row of ``df``, matched **by position**: the i-th
        id belongs to the i-th row. Must hold at least ``len(df)`` entries.
    originalSongVector : pandas.Series or array-like
        Reference vector. A Series is aligned with the columns of ``df`` by
        label; a plain array must have one value per column.

    Returns
    -------
    pandas.DataFrame
        Columns ``"id"`` and ``"rank"`` (the distance), sorted by ascending
        distance with a fresh 0..n-1 index. An empty ``df`` yields an empty
        frame that still has both columns.
    """
    # Pair ids with rows positionally. Indexing a Series with ``ids[i]`` is a
    # label lookup, which fails or mispairs rows when ``ids`` does not carry a
    # default 0..n-1 index.
    id_values = np.asarray(ids)[:len(df)]

    # Vectorized equivalent of computing the distance row by row.
    squared_diffs = (df - originalSongVector) ** 2
    distances = np.sqrt(squared_diffs.sum(axis=1))

    ranksDF = pd.DataFrame({"id": id_values, "rank": distances.to_numpy()})
    ranksDF = ranksDF.sort_values(by="rank").reset_index(drop=True)
    return ranksDF

In [8]:
def getIndexAndVector(id_chord, ids, df):
    """Locate ``id_chord`` in ``ids`` and return its position and feature row.

    Parameters
    ----------
    id_chord : scalar
        Identifier to look up.
    ids : sequence (list, array or pandas.Series)
        Identifiers matched with the rows of ``df`` by position.
    df : pandas.DataFrame
        Feature frame, one row per entry of ``ids``.

    Returns
    -------
    tuple
        ``(songIndex, songVector)`` where ``songIndex`` is the integer
        position of the first match and ``songVector`` is
        ``df.iloc[songIndex, :]``.

    Raises
    ------
    IndexError
        If ``id_chord`` does not occur in ``ids``.
    """
    # Resolve a *position*, not an index label: the result is fed to
    # ``df.iloc``, so using the Series label would select the wrong row (or
    # fail) whenever ``ids`` does not carry a default 0..n-1 index.
    matches = np.flatnonzero(np.asarray(ids) == id_chord)
    if matches.size == 0:
        raise IndexError(f"{id_chord!r} not found in ids")
    songIndex = int(matches[0])
    songVector = df.iloc[songIndex, :]
    return songIndex, songVector

In [9]:
def applyJaccard(df, ids, originalSongVector):
    """Rank every row of ``df`` by Jaccard dissimilarity to a reference vector.

    Uses ``scipy.spatial.distance.jaccard`` on the row values, compared
    element by element in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame, one candidate per row.
    ids : sequence (list, array or pandas.Series)
        Identifier of each row of ``df``, matched **by position**. Must hold
        at least ``len(df)`` entries.
    originalSongVector : pandas.Series or numpy.ndarray
        Reference vector (anything exposing ``.tolist()``) with one value per
        column of ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"id"`` and ``"rank"`` (the dissimilarity), sorted ascending
        with a fresh 0..n-1 index. An empty ``df`` yields an empty frame that
        still has both columns.
    """
    # Pair ids with rows positionally; ``ids[i]`` on a Series is a label
    # lookup and breaks when the Series index is not 0..n-1.
    id_values = np.asarray(ids)[:len(df)]

    # The reference list does not change between rows, so build it once.
    reference = originalSongVector.tolist()
    scores = [jaccard(reference, row.tolist()) for _, row in df.iterrows()]

    ranksDF = pd.DataFrame({"id": id_values, "rank": np.asarray(scores, dtype=float)})
    ranksDF = ranksDF.sort_values(by="rank").reset_index(drop=True)
    return ranksDF

In [10]:
def applyGower(df,ids,id,n):
    """Return the ``n`` rows of ``df`` closest to a given item by Gower distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame, one candidate per row.
    ids : sequence (list, array or pandas.Series)
        Identifier of each row of ``df``, matched **by position**.
    id : scalar
        Identifier of the reference item; its first occurrence in ``ids`` is
        used. (The name shadows the ``id`` builtin but is kept because it is
        part of the call signature.)
    n : int
        Number of nearest rows requested from ``gower.gower_topn``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"id"`` and ``"values"`` (the distance reported by
        ``gower_topn``), in the order ``gower_topn`` returns them.

    Raises
    ------
    IndexError
        If ``id`` does not occur in ``ids``.
    """
    id_values = np.asarray(ids)
    matches = np.flatnonzero(id_values == id)
    if matches.size == 0:
        raise IndexError(f"{id!r} not found in ids")
    position = int(matches[0])

    # Select the reference row by position (kept as a one-row frame). Label
    # based ``df.loc[pos:pos]`` only coincides with this when ``df`` has a
    # default 0..n-1 index.
    elementToAnalyze = df.iloc[[position]]
    topn = gower.gower_topn(elementToAnalyze, df, n = n)

    # 'index' holds row positions within ``df``; map them straight to ids
    # instead of relying on an implicit merge key and on ``ids`` being a
    # Series named "id".
    neighbour_positions = np.asarray(topn['index'], dtype=int)
    ranksDFGower = pd.DataFrame({
        'id': id_values[neighbour_positions],
        'values': np.asarray(topn['values']),
    })
    return ranksDFGower

In [11]:
def getEuclideanJaccardDf(df, encodedFeaturesPath="./data/unified/05-encodedFeatureNames.csv"):
    """Split ``df`` into its non-encoded and encoded feature columns.

    The names of the encoded features are read from a CSV file, from the
    column whose header is ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in the CSV file.
    encodedFeaturesPath : str or path-like, optional
        Location of the CSV file listing the encoded feature names. Defaults
        to the path that was previously hard-coded, so existing calls behave
        the same.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dfEuclidean, dfJaccard)``: the columns *not* listed in the file
        (ordered as returned by ``Index.difference``, i.e. sorted) and the
        listed columns (in file order).
    """
    encodedFeaturesNames = pd.read_csv(encodedFeaturesPath)['0'].tolist()
    dfEuclidean = df[df.columns.difference(encodedFeaturesNames)]
    dfJaccard = df[encodedFeaturesNames]
    return (dfEuclidean, dfJaccard)

In [12]:
def applyCosineDistance(df, ids, originalSongVector):
    """Rank every row of ``df`` by cosine distance to a reference vector.

    Uses ``scipy.spatial.distance.cosine`` row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame, one candidate per row.
    ids : sequence (list, array or pandas.Series)
        Identifier of each row of ``df``, matched **by position**. Must hold
        at least ``len(df)`` entries.
    originalSongVector : array-like
        Reference vector with one value per column of ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"id"`` and ``"rank"`` (the distance), sorted ascending with
        a fresh 0..n-1 index. An empty ``df`` yields an empty frame that
        still has both columns.
    """
    # Pair ids with rows positionally; ``ids[i]`` on a Series is a label
    # lookup and breaks when the Series index is not 0..n-1.
    id_values = np.asarray(ids)[:len(df)]

    scores = [
        distance.cosine(df.iloc[position, :], originalSongVector)
        for position in range(len(df))
    ]

    ranksDF = pd.DataFrame({"id": id_values, "rank": np.asarray(scores, dtype=float)})
    ranksDF = ranksDF.sort_values(by="rank").reset_index(drop=True)
    return ranksDF

In [13]:
def scalerMinMax(df):
    """Rescale every column of ``df`` with ``MinMaxScaler``.

    The scaler is fitted on ``df`` and applied to that same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to rescale.

    Returns
    -------
    pandas.DataFrame
        New frame holding the scaled values. It is built from the transformed
        array without passing column or index labels, so both are default
        integer ranges rather than the labels of ``df``.
    """
    fitted_scaler = MinMaxScaler()
    fitted_scaler.fit(df)
    scaled_values = fitted_scaler.transform(df)
    scaled_frame = pd.DataFrame(scaled_values)
    return scaled_frame.copy()