In [1]:
import pandas as pd
import numpy as np
import random
from utils import get_dfs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import os
import csv

In [2]:
CLUSTERS = 25

# Build model for prediction
# get_dfs() comes from the project-local `utils` module; it is used here as
# returning (full data, clustering features, holdout set).
data_df, audio_features_df, holdout_df = get_dfs()
# Remove the columns that are not used as clustering features.
audio_features_df = audio_features_df.drop(
    columns=["mode", "key", "loudness", "duration_ms", "track_popularity"])
# Standardize the remaining features. `transformer` keeps the statistics
# learned from this data so that other data can be scaled the same way.
transformer = StandardScaler()
scaled_audio_features = transformer.fit_transform(audio_features_df)
# n_init is pinned explicitly: scikit-learn changed the default from 10 to
# 'auto', and relying on the default emits a FutureWarning and makes the
# clustering depend on the installed version.
k_means_model = KMeans(init='k-means++', n_clusters=CLUSTERS, n_init=10,
                       random_state=0).fit(scaled_audio_features)
# NOTE(review): assumes data_df and audio_features_df are row-aligned (same
# rows, same order) so that labels_ can be attached positionally - confirm
# against utils.get_dfs.
data_df['cluster'] = k_means_model.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['track_album_release_date'] = pd.to_datetime(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['track_album_release_date'].dt.year
  super()._check_params_vs_input(X, default_n_init=10)


In [3]:
# Define audio-columns
# NOTE(review): these are expected to be the same features, in the same order,
# as the columns the scaler and K-means model were fitted on - confirm.
audio_columns = ['danceability', 'energy', 'speechiness',
                 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Scale audio-columns with the scaler that was already fitted on the training
# features in the model-building cell. Fitting a new scaler on the holdout set
# would standardize it with different mean/std than the data the cluster
# centroids were learned from, so predicted clusters would not be comparable.
# NOTE: run this cell once per fresh get_dfs() call; re-running it on its own
# would scale the already-scaled holdout values a second time.
holdout_scaled_values = transformer.transform(holdout_df[audio_columns])

# Replace non-scaled values with scaled values
holdout_numerical_df = pd.DataFrame(
    holdout_scaled_values, columns=audio_columns, index=holdout_df.index)
holdout_df = pd.concat(
    [holdout_df.drop(columns=audio_columns), holdout_numerical_df], axis=1)

# Method for recommending a song based on track_id from holdout_df
def recommend_song(track_id, min_popularity=70, n_candidates=20, random_state=None):
    """Recommend a random popular song from the same cluster as a holdout song.

    Looks the song up in the module-level ``holdout_df``, predicts its cluster
    with ``k_means_model`` and samples one row of ``data_df`` that is in the
    same cluster, has ``track_popularity >= min_popularity`` and has a
    different ``track_id``. Progress and the result are printed.

    Parameters
    ----------
    track_id : str
        ``track_id`` of a song present in ``holdout_df``.
    min_popularity : int, default 70
        Minimum ``track_popularity`` a candidate must have.
    n_candidates : int, default 20
        Number of candidate songs to print for manual evaluation.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample`` to make the pick reproducible.

    Returns
    -------
    pandas.DataFrame or None
        One-row frame with the recommended song, or ``None`` when no song in
        the predicted cluster satisfies the filters.

    Raises
    ------
    ValueError
        If ``track_id`` is not found in ``holdout_df``.
    """
    audio_columns = ['danceability', 'energy', 'speechiness',
                     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

    # Find song in holdout_df; fail with a clear message instead of an
    # IndexError from .iloc[0] on an empty selection.
    song = holdout_df.loc[holdout_df['track_id'] == track_id]
    if song.empty:
        raise ValueError(f"track_id {track_id!r} not found in holdout_df")
    print(
        f"Listening to {song['track_name'].iloc[0]} by artist {song['track_artist'].iloc[0]}")

    # Predict the cluster of the song. A plain array is passed, as in the
    # original call; only the first row is used if track_id is duplicated.
    song_audio_features = song[audio_columns]
    predicted_cluster = k_means_model.predict(
        song_audio_features.to_numpy())[0]

    # Find sufficiently popular songs in the same cluster, excluding the song itself
    same_cluster = data_df['cluster'] == predicted_cluster
    popular_enough = data_df['track_popularity'].ge(min_popularity)
    other_track = data_df['track_id'] != track_id
    cluster_songs = data_df.loc[same_cluster & popular_enough & other_track]

    # DataFrame.sample() raises on an empty frame, so bail out explicitly.
    if cluster_songs.empty:
        print(
            f"No songs with popularity >= {min_popularity} found in cluster {predicted_cluster}\n")
        return None

    # Print to evaluate candidate songs
    print(f"{n_candidates} Candidate songs for recommendation:")
    print(cluster_songs.head(n_candidates)[['track_name', 'track_artist']])

    # Pick a random song from the reduced df
    recommended_song = cluster_songs.sample(random_state=random_state)
    print(
        f"Recommended song is {recommended_song['track_name'].iloc[0]} by artist {recommended_song['track_artist'].iloc[0]}\n")
    return recommended_song


# Demo run: draw a random holdout song 20 times and print a recommendation for each.
for _ in range(20):
    random_holdout_row = holdout_df.sample()
    recommend_song(random_holdout_row['track_id'].iloc[0])

Listening to Hunnybee by artist Unknown Mortal Orchestra
20 Candidate songs for recommendation:
                                              track_name       track_artist
10072                                              Patek              Ozuna
13473                 The Logical Song - Remastered 2010         Supertramp
3616                                  TRUE - Single Edit     Spandau Ballet
18317                                           Underdog        Alicia Keys
1298                                        Dance Monkey        Tones and I
17220  Ain't Nobody (Loves Me Better) (feat. Jasmine ...        Felix Jaehn
17685                                            Distant               Maes
1765                        Somebody That I Used To Know              Gotye
2687                               We Don't Talk Anymore       Charlie Puth
133                                Closer (feat. Halsey)   The Chainsmokers
12148   Crazy Little Thing Called Love - Remastered 2011            

In [4]:
# Method for continuously supplying track_id to recommend songs. Meant for testing recommender
def start_recommender():
    """Interactively read track ids and print a recommendation for each one.

    Prompts until the user enters ``q``, ``quit`` or ``exit`` (case-insensitive)
    or the input stream ends / is interrupted. Ids not present in the
    module-level ``holdout_df`` are reported and the prompt is repeated.
    Returns ``None``.
    """
    quit_commands = {'q', 'quit', 'exit'}
    while True:
        print("Please supply a valid track-id from holdout_df (or 'q' to quit):")
        try:
            # Strip whitespace so pasted ids with stray spaces/newlines still match
            track_id = input().strip()
        except (EOFError, KeyboardInterrupt):
            # No more input available or the user interrupted: stop cleanly
            break
        if track_id.lower() in quit_commands:
            break
        # Find song in df
        song = holdout_df.loc[(holdout_df['track_id'] == track_id)]
        if song.empty:
            print('Song not found. Please try again')
            continue
        recommend_song(track_id=track_id)

#Uncomment and run to start recommender
#start_recommender()