In [1]:
import numpy as np
import pandas as pd
from random import seed
from random import random
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
#Get data

def get_data(path):
    return pd.read_csv(path)

In [3]:
columns = ['danceability', 'energy',
       'key', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
                'popularity', 'time_signature', 'year']

cc = get_data("centroids_top features.csv")
dat = get_data("combined_spotify_full_with labels_top features.csv")

In [4]:
scaler = preprocessing.MinMaxScaler()
data_l = dat[columns]
X_scaled = scaler.fit_transform(data_l)
df_scaled = pd.DataFrame(X_scaled, columns = columns)
    
df_scaled['id'] = dat['id']
df_scaled['k_cluster'] = dat['k_cluster']

In [5]:
#Standardizing data
#Inputs are the full dataset (dataframe) and the selected song (dataframe)
#Output is the standardized dataset, plus the new song,
#without the columns that are unfit for standardization
#We'll probably want filters for "explicit" and "mode" (they both have only 0 and 1 values)

def standardize(data, song):
    
    #If song not in data, add it and assign dummy cluster value
    if song['id'] not in list(data['id']):
        song['k_cluster'] = 100
        data = data.append(song)

    min_max_scaler = preprocessing.MinMaxScaler()
    data_l = data[columns]
    X_scaled = min_max_scaler.fit_transform(data_l)
    df_scaled = pd.DataFrame(X_scaled, columns = columns)
    
    df_scaled['id'] = data['id']
    df_scaled['k_cluster'] = data['k_cluster']
    
    return df_scaled

In [6]:
#Get the cluster assignment for the selected song
#Inputs are the standardized data set, a dataframe containing the standardized cluster centers, 
#and the data for the new song (dataframe)
#Output is the number of the closest cluster

def song_cluster(data_stand, cc, song):
    
    song = data_stand[data_stand['id'] == song['id']].squeeze()
    columns_cc = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'valence', 'tempo']
    
    if song['k_cluster'] != 100:
        
        return song['k_cluster']
        
    else:

        #Calculates distance between song and each cc.
        distance_calc = np.sqrt(np.sum((cc[columns_cc] - song[columns_cc].squeeze())**2, axis = 1))

        #Zips distance_calc with cluster number and sorts ascending
        sorted_list = sorted(list(zip(cc['k_cluster'], distance_calc)), key = lambda tup: tup[1])

        return sorted_list[0][0]

In [7]:
#Filter dataset to cluster recommended
#Inputs are cluster number and the standardized dataset
#Output is the filtered dataset

def cluster_filter(c_num, sds):
    return sds[(sds['k_cluster'] == c_num) | (sds['k_cluster'] == 100)]


In [8]:
#Get the top ten songs from the selected cluster
#Inputs are the standardized cluster-filtered dataset, the full non-standardized dataset, 
#the chosen song, and the multipliers for each standardized factor
#Output is the list of the top ten songs (all info)


def top_ten(data_cf, data_full, song, dance = 25, energy = 25, key = 25, loud = 25,
             speech = 20, acous = 10, inst = 20, live = 10, val = 20, temp = 25, dur = 25,
             pop = 10, time = 20, year = 25):
    
    #If song not in data_full, add it and assign dummy cluster value
    if song['id'] not in list(data_full['id']):
        song['k_cluster'] = 100
        data_full = data_full.append(song)
        
    #Assign the columns their proper multipliers
    data_cf["danceability"] = data_cf["danceability"] * (((.75 * dance) + 25) / 25)
    data_cf["energy"] = data_cf["energy"] * (((.75 * energy) + 25) / 25)
    data_cf["key"] = data_cf["key"] * (((.75 * key) + 25) / 25)
    data_cf["loudness"] = data_cf["loudness"] * (((.75 * loud) + 25) / 25)
    data_cf["speechiness"] = data_cf["speechiness"] * (((.8 * speech) + 20) / 20)
    data_cf["acousticness"] = data_cf["acousticness"] * (((.9 * acous) + 10) / 10)
    data_cf["instrumentalness"] = data_cf["instrumentalness"] * (((.8 * inst) + 20) / 20)
    data_cf["liveness"] = data_cf["liveness"] * (((.9 * live) + 10) / 10)
    data_cf["valence"] = data_cf["valence"] * (((.8 * val) + 20) / 20)
    data_cf["tempo"] = data_cf["tempo"] * (((.75 * temp) + 25) / 25)
    data_cf["duration_ms"] = data_cf["duration_ms"] * (((.75 * dur) + 25) / 25)
    data_cf["popularity"] = data_cf["popularity"] * (((.9 * pop) + 10) / 10)
    data_cf["time_signature"] = data_cf["time_signature"] * (((.8 * time) + 20) / 20)
    data_cf["year"] = data_cf["year"] * (((.75 * year) + 25) / 25)
     
    #Get the standardized song info
    song = data_cf[data_cf['id'] == song['id']]
        
    distance_calc = np.sqrt(np.sum((data_cf[columns] - song[columns].squeeze())**2, 
                                   axis = 1))
    
    top_10 = sorted(list(zip(distance_calc.index, distance_calc)), key = lambda tup: tup[1])[1:11]
    
    indicies = [i[0] for i in top_10]
    
    data_dist = data_full.iloc[indicies, :]
    data_dist['distance_calc'] = [i[1] for i in top_10]
    
    return data_dist

In [9]:
#Puts all of the preceeding functions together
#Takes as input a path, song, standardized cluster centers,
#and multipliers for standardized factors
#Output is the top ten song recommendations

def all_together(path, song, cc, dance = 25, energy = 25, key = 25, loud = 25,
             speech = 20, acous = 10, inst = 20, live = 10, val = 20, temp = 25, dur = 25,
             pop = 10, time = 20, year = 25):
    data_full = get_data(path)
    data_s = standardize(data_full, song)
    c_num = song_cluster(data_s, cc, song)
    data_cf = cluster_filter(c_num, data_s)
    tt = top_ten(data_cf, data_full, song, dance, energy, key, speech, acous,
              inst, live, val, temp, dur, pop, time, year)
    return tt

In [10]:
#Puts all of the preceeding functions together
#Takes as input the standardized data set, full data set,
#song, standardized cluster centers,
#and multipliers for standardized factors
#Output is the top ten song recommendations

def all_together_2(sds, data_full, song, cc, dance = 25, energy = 25, key = 25, loud = 25,
             speech = 20, acous = 10, inst = 20, live = 10, val = 20, temp = 25, dur = 25,
             pop = 10, time = 20, year = 25):
    c_num = song_cluster(sds, cc, song)
    data_cf = cluster_filter(c_num, sds)
    tt = top_ten(data_cf, data_full, song, dance, energy, key, speech, acous,
              inst, live, val, temp, dur, pop, time, year)
    return tt

In [11]:
#Test

ind = dat.iloc[int(random() * len(dat)), :]
ind2 = dat.iloc[37774, :]

pd.options.mode.chained_assignment = None

all_together_2(df_scaled, dat, ind2, cc, acous = 100, energy = 100)

Unnamed: 0,grp,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,...,liveness,valence,tempo,duration_ms,release_date,popularity,time_signature,year,k_cluster,distance_calc
1221427,24428,6SvmZn0i3anxzQ5a0X1CJ2,Cold Rain,CSN,63AF6wD9Mt1vQgKPNZaAXx,"['Crosby, Stills & Nash']",2pdvghEHZJtgSXZ7cvNLou,10,1,0,...,0.1,0.172,115.884,154600,1977-06-17,29,3,1977,1,0.240736
1254875,25097,1Iht13clFnNSltZ5F73n9w,Old Man - Remastered,Sail Away (Expanded & Remastered),7ojNQckNp7Tj2BkLJCiiUL,['Randy Newman'],3HQyFCFFfJO3KKBlUfZsyW,6,1,0,...,0.12,0.175,129.674,163107,1972-05-01,25,3,1972,1,0.245055
37778,755,2WPHpOOrSoDBOEnV8Cv76w,Famous Blue Raincoat,The Best Of Leonard Cohen,06TH14tYg5Xy6bauQtgxnt,['Leonard Cohen'],['5l8VQNuIg0turYE1VtM9zV'],8,1,0,...,0.109,0.218,120.191,306960,1975,36,3,1975,1,0.253707
1332829,26656,6rf0Kgia2yt5j80glOBzpB,If I Had Only Known,For My Broken Heart,2j2u43lPOti5W1btEw8NN5,['Reba McEntire'],02rd0anEWfMtF7iMku9uor,10,1,0,...,0.0934,0.172,129.179,240360,1991-01-01,38,3,1991,1,0.294322
1283135,25662,69LpTXgwTVeIDfAcozZ8n5,Half As Much,Sentimentally Yours,0GhlY0htAsFNb0eHO9y1lZ,"['Patsy Cline', 'The Jordanaires']",7dNsHhGeGU5MV01r06O8gK,10,1,0,...,0.0912,0.302,134.302,150467,1962,33,3,1962,1,0.329223
1299323,25986,3qwK66prhlovqNtdzCVv9x,Monday Morning,A Song Will Rise,3p7zMW9o6XPVZ2y8kNc4uF,"['Peter, Paul and Mary']",6yrBBtqX2gKCHCrZOYBDrB,8,1,0,...,0.105,0.188,105.345,198360,1965,34,3,1965,1,0.33673
126639,2532,52X0pEfTb5zmGYK2Yuaq6i,Famous Blue Raincoat,The Essential Leonard Cohen,2t2sAWQl1NwwHy793LyEfR,['Leonard Cohen'],['5l8VQNuIg0turYE1VtM9zV'],8,1,0,...,0.107,0.162,123.995,307093,2002-10-22,33,3,2002,1,0.352559
1346421,26928,4yt0ktA4S9ihg0PIyTyrEE,The Movie,An American Prayer,7qj6lZyChZxDGAqzTArHHQ,['Jim Morrison'],1QB4oo4JbSRdxNyidIuD0W,10,1,1,...,0.145,0.0909,134.931,95987,1978,35,3,1978,1,0.3604
1078252,21565,2OpEjOrVbb3BxpU2n4h6QV,Sycamore Trees,Twin Peaks: Fire Walk With Me - Soundtrack,7KTThkarRqhJX5DVkDOo7Z,['Jimmy Scott'],['5T7u5bYO62WTYOk5RBZwjo'],3,1,0,...,0.105,0.137,121.116,234507,1992,39,3,1992,1,0.360564
883607,17672,47xju1lDX28RGes3CEBNUw,Borrowed Time,Trail of Souls,34ZyoeGB37YIiTNyu625Sk,"['Solveig Slettahjell', 'Knut Reiersrud', 'In ...","['4uhYXIelKBw9dPR7TfX9sG', '6Q8A9S9o98GqVGzDdi...",1,1,0,...,0.104,0.249,128.544,279267,2015-11-27,33,3,2015,1,0.389761
