# Import Library

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

# Load Dataset

In [3]:
# Load dataset
df = pd.read_csv("spotify-2023.csv", encoding='latin1')

In [4]:
df

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,My Mind & Me,Selena Gomez,1,2022,11,3,953,0,91473363,61,...,144,A,Major,60,24,39,57,0,8,3
949,Bigger Than The Whole Sky,Taylor Swift,1,2022,10,21,1180,0,121871870,4,...,166,F#,Major,42,7,24,83,1,12,6
950,A Veces (feat. Feid),"Feid, Paulo Londra",2,2022,11,3,573,0,73513683,2,...,92,C#,Major,80,81,67,4,0,8,6
951,En La De Ella,"Feid, Sech, Jhayco",3,2022,10,20,1320,0,133895612,29,...,97,C#,Major,82,67,77,8,0,12,5


# Pre-Processing

In [5]:
# Preprocessing
for col in ['streams', 'in_deezer_playlists', 'in_shazam_charts']:
    df[col] = df[col].astype(str).str.replace(",", "")
    df[col] = pd.to_numeric(df[col], errors='coerce')

for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

for col in df.select_dtypes(include='number').columns:
    df[col].fillna(df[col].median(), inplace=True)

categorical_features = ['artist(s)_name', 'key', 'mode']
numerical_features = [
    'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
    'instrumentalness_%', 'liveness_%', 'speechiness_%'
]

df['text_features'] = df[categorical_features].agg(' '.join, axis=1)

scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(df[numerical_features])
numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=numerical_features)

full_features = pd.concat([df[['track_name', 'text_features']].reset_index(drop=True), numerical_scaled_df], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

# Vectorization

In [6]:
# TF-IDF vectorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(full_features['text_features'])

# Combine TF-IDF and numeric features
final_matrix = hstack([tfidf_matrix, numerical_scaled_df.values])

# Compute similarity
similarity = cosine_similarity(final_matrix)

# Calculate Similarity

In [7]:
def recommend(track_name, top_n=10):
    track_name_lower = track_name.lower()
    track_names_lower = full_features['track_name'].str.lower()

    if track_name_lower not in track_names_lower.values:
        return f"Lagu '{track_name}' tidak ditemukan dalam dataset."

    idx = track_names_lower[track_names_lower == track_name_lower].index[0]
    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_scores = sim_scores[1:top_n+1]
    top_indices = [i for i, _ in top_scores]
    scores = [score for _, score in top_scores]

    result = df.iloc[top_indices][['track_name', 'artist(s)_name']].copy()
    result['similarity_score'] = scores

    return result


# Penggunaan

In [8]:
import ipywidgets as widgets
from IPython.display import display, clear_output

def interactive_recommend():
    print("Sistem Rekomendasi Lagu - Content Based")

    input_box = widgets.Text(
        description="Judul lagu:",
        placeholder="Masukkan judul lagu",
        layout=widgets.Layout(width='50%')
    )

    output_area = widgets.Output()

    def on_submit(change):
        with output_area:
            clear_output()
            user_input = change['new'].strip()
            results = recommend(user_input)
            print("\nRekomendasi:")
            print(results)

    input_box.observe(on_submit, names='value')
    display(input_box, output_area)

# Panggil fungsi ini di cell Jupyter
interactive_recommend()


Sistem Rekomendasi Lagu - Content Based


Text(value='', description='Judul lagu:', layout=Layout(width='50%'), placeholder='Masukkan judul lagu')

Output()