In [1]:
import pandas as pd
import ast

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction import FeatureHasher

from scipy.spatial.distance import cdist
import numpy as np

In [2]:
df = pd.read_csv('data/data_clean.csv')

In [3]:
df['artists'] = df['artists'].apply(ast.literal_eval)

In [4]:
df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'id', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'name', 'popularity', 'speechiness', 'tempo'],
      dtype='object')

In [5]:
DF_COLUMNS = ['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo']
DF_COLUMNS_MODEL = ['valence', 'year', 'acousticness','artists', 'danceability',
         'duration_ms', 'energy', 'instrumentalness', 'key',
         'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

In [6]:

# Kolom yang akan di-hash
hash_columns = 'artists'

# Inisialisasi FeatureHasher untuk kolom tertentu
hasher = FeatureHasher(n_features=10, input_type="string")

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Inisialisasi KMeans
kmeans = KMeans(n_clusters=10)

# Definisikan transformer untuk hashing kolom tertentu
hash_transformer = ('hasher', hasher, hash_columns)

# Definisikan transformer untuk kolom yang tidak di-hash
remainder_transformer = ('passthrough', 'remainder', slice(0, 0))

# Gabungkan transformer untuk semua kolom
preprocessor = ColumnTransformer(
    transformers=[hash_transformer],
    remainder='passthrough')

# Gabungkan preprocessor dengan algoritma clustering (KMeans)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', scaler),
    ('kmeans', kmeans)
])

In [7]:
pipeline.fit(df[DF_COLUMNS_MODEL])

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
# export pipeline
import joblib
joblib.dump(pipeline, 'model/pipeline.pkl')

['model/pipeline.pkl']

In [9]:
import yaml
import spotipy
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials

stream= open("streamlit/spotify/spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])

sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [10]:
def get_song_spotify(song_name, data):
    log = open('log.txt','w')
    try:
        result = sp.search(q=song_name, limit=1)
    except:
        log.write('Error: Failed to search song')
        return None

    if result['tracks']['items'] == []:
        return None

    song = result['tracks']['items'][0]

    id = song['id']

    try:
        song_data = data[(data['id'] == id)].iloc[0]
        return song_data
    except:
        try:
            audio_features = sp.audio_features(id)
            song_data = {
                'valence': audio_features[0]['valence'],
                "year" : result['tracks']['items'][0]['album']['release_date'][:4],
                'acousticness': audio_features[0]['acousticness'],
                'artists': list(map(lambda x: x['name'], result['tracks']['items'][0]['artists'])),
                'danceability': audio_features[0]['danceability'],
                'duration_ms': audio_features[0]['duration_ms'],
                'energy': audio_features[0]['energy'],
                'id': id,
                'instrumentalness': audio_features[0]['instrumentalness'],
                'key': audio_features[0]['key'],
                'liveness': audio_features[0]['liveness'],
                'loudness': audio_features[0]['loudness'],
                'mode': audio_features[0]['mode'],
                'name': result['tracks']['items'][0]['name'],
                'popularity': result['tracks']['items'][0]['popularity'],
                'speechiness': audio_features[0]['speechiness'],
                'tempo': audio_features[0]['tempo']
            }
        except:
            log.write('Error: Failed to get audio features from Spotify')
            return None

        return pd.DataFrame([song_data], columns=DF_COLUMNS).iloc[0]

In [11]:
def get_song_data(song_name, data):
    try:
        song_data = data[(data['name'] == song_name)].iloc[0]
        return song_data
    except:
        return get_song_spotify(song_name,data)


In [12]:
def df_song_data(list_song_name,data):
    rows_song_data = list()
    for song_name in list_song_name:
        rows_song_data.append(get_song_data(song_name,data))

    return pd.DataFrame(rows_song_data,columns=DF_COLUMNS)

In [13]:

def songs_recommendation(list_song_name,data,num_rec=10):

    song_data_input = df_song_data(list_song_name,df)

    vector = pipeline.named_steps["preprocessor"].transform(df_song_data(list_song_name,data))
    vector = pipeline.named_steps["scaler"].transform(vector)

    vector = vector.mean(axis=0)

    predicted_cluster = pipeline.named_steps["kmeans"].predict([vector])

    cluster_data = df[pipeline.named_steps["kmeans"].labels_ == predicted_cluster[0]]

    vector_cluster = pipeline.named_steps["preprocessor"].transform(cluster_data)
    vector_cluster = pipeline.named_steps["scaler"].transform(vector_cluster)

    distance = cdist([vector],vector_cluster)
    index = list(np.argsort(distance)[:, :num_rec][0])

    recsongs = cluster_data.iloc[index]
    recsongs = recsongs[~recsongs['name'].isin(list_song_name)]
    return song_data_input,recsongs


In [14]:
input,output = songs_recommendation(['505'],df)

In [15]:
input

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
17006,0.234,2007,0.00237,[Arctic Monkeys],0.52,253587,0.852,58ge6dfP91o9oXMzq3XkIS,5.8e-05,0,0.0733,-5.866,1,505,76,,0.0543,140.267


In [16]:
output

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo
18509,0.293,2014,0.000501,[Young the Giant],0.466,244693,0.763,5Az8KU81g2aLBbJN67F2CI,0.0,0,0.0973,-4.448,1,Mind Over Matter,69,0.0318,154.914
18039,0.37,2012,0.126,[One Direction],0.514,200400,0.727,6M31fPFCYB8Job3MCjjrDV,0.0,0,0.0978,-6.131,1,They Don't Know About Us,79,0.0492,147.917
108337,0.285,2018,0.103,[BTS],0.541,275696,0.661,2tMBCYj22KxtmkAACIvvk4,0.0,1,0.0779,-5.838,1,Magic Shop,72,0.0889,149.926
17405,0.205,2008,0.000354,[Nickelback],0.536,252653,0.89,06T10fEzN8ZCcqzQZYA184,0.00165,0,0.133,-5.222,1,Gotta Be Somebody,63,0.0601,115.998
140275,0.285,2018,0.103,[BTS],0.541,275696,0.661,22ryCgQttpV7oCrn7llGRu,0.0,1,0.0779,-5.838,1,Magic Shop,68,0.0889,149.926
17373,0.261,2008,0.00401,[Theory of a Deadman],0.415,215400,0.781,5xyACR2lzIyzFepF4qlAas,0.0,1,0.126,-6.947,1,Not Meant to Be,58,0.0357,142.966
37677,0.361,2015,0.0928,[One Direction],0.546,196933,0.657,1ZWLWVqeEMWMKTlteS0yLH,0.0,1,0.119,-4.787,1,Love You Goodbye,70,0.0332,133.188
17340,0.306,2008,0.131,[Taylor Swift],0.617,236267,0.741,1D4PL9B8gOg78jiHg3FvBb,0.0,2,0.0772,-3.97,1,Love Story,79,0.0311,118.984
36264,0.385,2008,0.033,[Nickelback],0.489,262627,0.748,1xQZtbipNLyP0e0hihE5F5,0.0,2,0.144,-5.488,1,I'd Come for You,59,0.0311,147.998
