In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Data Understanding

- track_id: ID Spotify trek  
- artists: Nama artis yang membawakan, dipisahkan ; jika lebih dari satu  
- album_name: Nama album  
- track_name: Nama trek  
- popularity: Popularitas dari 0-100, berdasarkan jumlah pemutaran dan seberapa baru pemutaran tersebut  
- duration_ms: Durasi trek dalam milidetik  
- explicit: Ada lirik eksplisit (true/false)  
- danceability: Kemampuan untuk menari (0.0-1.0)  
- energy: Tingkat energi (0.0-1.0)  
- key: Kunci nada (0-11, -1 jika tidak terdeteksi)  
- loudness: Kekerasan dalam dB  
- mode: Mode mayor (1) atau minor (0)  
- speechiness: Kemiripan dengan ucapan (0.0-1.0)  
- acousticness: Keaslian akustik (0.0-1.0)  
- instrumentalness: Tanpa vokal (0.0-1.0)  
- liveness: Kemungkinan live (0.0-1.0)  
- valence: Positivitas musik (0.0-1.0)  
- tempo: Tempo dalam BPM  
- time_signature: Tanda birama (3-7)  
- track_genre: Genre musiknya

In [None]:
data = pd.read_csv('data/spotify_dataset.csv')
data.sample(5)

In [None]:
data.info()

In [None]:
# Drop the leftover index column that came along with the CSV. Reassigning
# (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data.info()

In [None]:
# Names of all int64/float64 columns; reused below for the boxplots,
# histograms, correlation heatmap and the MinMax scaling step.
# NOTE(review): this is computed before `explicit` is cast to int further down,
# so `explicit` is presumably not part of this list — confirm that is intended.
num_cols = data.select_dtypes(include=['int64', 'float64']).columns
num_cols

In [None]:
data.describe()

## Duplicate

In [None]:
# Check duplicates
data.duplicated().sum()

In [None]:
data['track_id'].duplicated().sum()

In [None]:
data['track_id'].nunique()

In [None]:
data['track_name'].duplicated().sum()

## Missing Value

In [None]:
# Check for missing values
data.isnull().sum()

## Outliers

In [None]:
# One horizontal boxplot per numeric column to eyeball outliers.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=data[col], orient='h', ax=ax)
    ax.set_title(f'Boxplot of {col}', fontsize=20)
    ax.set_xlabel(col)
    plt.show()

## EDA

### Numerik

In [None]:
num_cols_data = data[num_cols]
num_cols_data.head()

In [None]:
num_cols_data.hist(
    bins=30,
    figsize=(20, 15),
    color='green',
    edgecolor='black'
)
plt.suptitle('Distribution of Numerical Features', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
data_sorted_by_popularity = data.sort_values(by='popularity', ascending=False)
data_sorted_by_popularity.head()

### Categorical

In [None]:
category_cols = data.select_dtypes(include=['object', 'bool']).columns
category_cols

In [None]:
categoty_cols_data = data[category_cols]
categoty_cols_data.sample(5)

In [None]:
data['track_genre'].unique()

In [None]:
data.groupby('track_genre').size().reset_index(name='count').sort_values(by='count', ascending=False).head(10)

In [None]:
data.groupby('track_genre').size().reset_index(name='count').sort_values(by='count', ascending=True).head(10)

In [None]:
plt.figure(figsize=(20, 6))
plt.title('Distribution of Track Genres')
sns.countplot(data=data, x='track_genre', order=data['track_genre'].value_counts().index, color='green')
plt.xticks(rotation=90)
plt.show()

In [None]:
data['explicit'].value_counts()

### Bivariate

In [None]:
# Mean popularity per genre, highest first, as a two-column frame
# (track_genre, popularity) with a fresh 0..n-1 index.
genre_popularity = (
    data.groupby('track_genre')['popularity']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
genre_popularity.head(10)

In [None]:
plt.figure(figsize=(10, 6))
plt.title('Distribution of Track Popularity')
plt.bar(genre_popularity['track_genre'].head(10), genre_popularity['popularity'].head(10), color='green')
plt.xlabel('Track Genre')
plt.ylabel('Popularity')
plt.xticks(rotation=90)
plt.show()

### Heatmap

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(num_cols_data.corr(), annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Heatmap of Numerical Features", fontsize=16)
plt.show()

# Data Preparation

## Data Cleaning

### Duplicate

When a track appears more than once, we keep the most popular entry.

In [None]:
# Sorting by popularity first means keep='first' retains the most popular
# row for every track_id.
data = (
    data.sort_values('popularity', ascending=False)
    .drop_duplicates(subset=['track_id'], keep='first')
    .reset_index(drop=True)
)
# Sanity check: the number of remaining duplicate track_ids.
data['track_id'].duplicated().sum()

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# Collapse rows that share both title and artists; the first occurrence in
# the current row order is the one that is kept.
song_key = ['track_name', 'artists']
data = data.drop_duplicates(subset=song_key, keep='first').reset_index(drop=True)
# Sanity check: the number of remaining duplicate (track_name, artists) pairs.
data.duplicated(subset=song_key).sum()

In [None]:
data.shape

### Missing Value

In [None]:
data = data.dropna()
data.shape

### Outlier

Terdapat anomali pada kolom durasi: beberapa lagu berdurasi sangat pendek dan beberapa lainnya sangat panjang. Oleh karena itu, dilakukan pembersihan dengan menetapkan batas bawah dan batas atas durasi lagu yang akan digunakan sebagai data.

Pada kasus ini, lagu yang digunakan dibatasi pada durasi minimal 45 detik (45.000 ms) dan maksimal 10 menit (600.000 ms).

Standar ini dibuat berdasarkan karakteristik lagu yang umumnya diterima.

Pada data lain tidak dilakukan pembersihan outlier dikarenakan fitur yang dimiliki merupakan daya tarik dari lagu dan terlalu berharga apabila dihapus.

In [32]:
def convert_to_minutes(miliseconds):
    """Format a duration given in milliseconds as an ``M:SS`` string.

    Fractions of a second are truncated, e.g. 61999 ms -> "1:01".
    """
    whole_minutes, leftover_seconds = divmod(miliseconds / 1000, 60)
    return f"{int(whole_minutes)}:{int(leftover_seconds):02d}"

In [None]:
average_duration = data['duration_ms'].mean()
print(f"Average duration of tracks: {average_duration} ms or {convert_to_minutes(average_duration)}")
min_duration = data['duration_ms'].min()
print(f"Minimum duration of tracks: {min_duration} ms or {convert_to_minutes(min_duration)}")
max_duration = data['duration_ms'].max()
print(f"Maximum duration of tracks: {max_duration} ms or {convert_to_minutes(max_duration)}")

In [None]:
data.sort_values('duration_ms', ascending=False).head(10)

In [None]:
data.sort_values('duration_ms', ascending=True).head(10)

In [36]:
lower_bound = 45000   # 45 seconds, in milliseconds
upper_bound = 600000  # 10 minutes, in milliseconds

# Keep only tracks whose duration lies inside [lower_bound, upper_bound]
# (both ends inclusive).
data_filtered = data[data['duration_ms'].between(lower_bound, upper_bound)]

In [None]:
data_filtered.describe()

In [None]:
# Take an explicit copy (so later column assignments do not act on a slice of
# the unfiltered frame) and rebuild a 0..n-1 index: the recommenders below use
# DataFrame index labels to address rows of the positional `pca_features`
# array, which only works when labels and row positions coincide.
data = data_filtered.copy().reset_index(drop=True)
data.shape

In [None]:
# Encode the boolean flag as 0/1. `assign` returns a new frame, which avoids
# pandas' SettingWithCopyWarning when `data` is a filtered slice.
data = data.assign(explicit=data['explicit'].astype(int))
data['explicit'].sample(5)

## Normalization

In [None]:
scaler = MinMaxScaler()
# Work on a copy so the unscaled values in `data` stay available
# (the recommenders below read `popularity` from `data`, not from the copy).
data_scaled = data.copy()
# Rescale every column listed in num_cols; the scaler is fitted and applied
# on the same rows.
data_scaled[num_cols] = scaler.fit_transform(data[num_cols])
data_scaled[num_cols].describe()

# PCA

In [41]:
selected_cols = ['explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness','valence', 'tempo', 'time_signature']

In [None]:
music_features_data = data_scaled[selected_cols]
music_features_data.sample(5)

In [None]:
from sklearn.decomposition import PCA
# A float n_components in (0, 1) asks scikit-learn to keep as many components
# as are needed to explain that fraction of the variance (here 95%).
n_components = 0.95
pca = PCA(n_components=n_components)
# pca_features is a plain array: row i corresponds to music_features_data.iloc[i].
pca_features = pca.fit_transform(music_features_data)

# Compare dimensionality before and after the projection.
print(f"Original feature shape: {music_features_data.shape}")
print(f"PCA feature shape: {pca_features.shape}")

# Modelling

In [44]:
output_columns = ['track_id', 'track_name', 'artists', 'popularity', 'track_genre']

## Popularity Based Recommendations

In [45]:
def popularity_recommendation(input_song_name, num_recommendations=5):
    """Recommend the most popular tracks that are at least as popular as the seed.

    Parameters
    ----------
    input_song_name : str
        Exact ``track_name`` of the seed song.
    num_recommendations : int, default 5
        Maximum number of tracks to return.

    Returns
    -------
    (pandas.DataFrame, list)
        The recommended rows restricted to ``output_columns`` (most popular
        first) and the list of their ``track_id`` values.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``track_name == input_song_name``.
    """
    is_input_song = data['track_name'] == input_song_name
    if not is_input_song.any():
        raise ValueError(f"The song '{input_song_name}' is not in the dataset.")

    # Several rows may share a title; the first match is the reference.
    track_popularity = data.loc[is_input_song, 'popularity'].iloc[0]

    # Tracks at least as popular as the seed, excluding the seed itself so a
    # song is never recommended to its own listener (same rule as the
    # cluster-based recommender).
    candidates = data[~is_input_song & (data['popularity'] >= track_popularity)]
    most_popular_tracks = candidates.sort_values(by='popularity', ascending=False)

    df = most_popular_tracks[output_columns].head(num_recommendations)
    ids = df['track_id'].tolist()

    return df, ids

## KNN Based Recommendations

Digunakan model KNN dengan metric cosine untuk menghitung jarak antar fitur lagu antara satu dengan yang lain

In [46]:
from sklearn.neighbors import NearestNeighbors
def knn_based_recommendations(input_song_name, num_recommendations=5):
    """Recommend the tracks closest to the seed song in PCA feature space.

    Neighbours are found with cosine distance on ``pca_features``.

    Parameters
    ----------
    input_song_name : str
        Exact ``track_name`` of the seed song.
    num_recommendations : int, default 5
        Maximum number of tracks to return.

    Returns
    -------
    (pandas.DataFrame, list)
        ``output_columns`` plus a ``distance`` column, nearest first, and the
        list of the recommended ``track_id`` values.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``track_name == input_song_name``.
    """
    matches = np.flatnonzero((data['track_name'] == input_song_name).to_numpy())
    if matches.size == 0:
        raise ValueError(f"The song '{input_song_name}' is not in the dataset.")

    # Row *position* of the first match. pca_features is a plain array, so it
    # must be addressed by position, not by the DataFrame's index label.
    input_song_pos = matches[0]

    # Fetch extra neighbours so enough rows remain after removing the seed
    # and duplicate titles; never ask for more rows than exist.
    n_neighbors = min(num_recommendations + 10, len(pca_features))
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn_model.fit(pca_features)

    input_song_vector = pca_features[input_song_pos].reshape(1, -1)
    distances, indices = knn_model.kneighbors(input_song_vector)

    # Remove the seed by position instead of assuming it is the first hit:
    # another track with the same direction in feature space can tie at 0.
    not_self = indices[0] != input_song_pos
    recommendations = data.iloc[indices[0][not_self]][output_columns].copy()
    recommendations['distance'] = distances[0][not_self]

    # drop_duplicates / sort_values return new frames, so keep their result.
    recommendations = (
        recommendations
        .drop_duplicates(subset=['track_name'])
        .sort_values(by='distance', ascending=True)
    )

    df = recommendations.head(num_recommendations)
    ids = df['track_id'].tolist()

    return df, ids

## Cluster Based Recommendation

In [47]:
from sklearn.cluster import KMeans

In [None]:
cluster_counts = range(5, 51, 5)  # candidate k values: 5, 10, ..., 50

# Within-cluster sum of squares (KMeans inertia) for every candidate k.
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(pca_features)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

Dari grafik elbow di atas, diasumsikan nilai `n_clusters` yang paling optimal adalah 10.

In [None]:
# Fix the seed and n_init (same settings as the elbow search) so cluster
# labels, and every recommendation derived from them, are reproducible.
kmeans = KMeans(n_clusters=10, random_state=42, n_init='auto')
clusters = kmeans.fit_predict(pca_features)
# `assign` builds a new frame, so no column is written onto a filtered slice.
data = data.assign(cluster=clusters)
data[['track_name', 'artists', 'cluster']].sample(5)

In [None]:
data['cluster'].value_counts()

In [None]:
data[data['cluster'] == 0].track_genre.value_counts().head(10)

In [52]:
def cluster_based_recommendations(input_song_name, num_recommendations=5):
    """Recommend the most popular tracks from the seed song's KMeans cluster.

    Parameters
    ----------
    input_song_name : str
        Exact ``track_name`` of the seed song.
    num_recommendations : int, default 5
        Maximum number of tracks to return.

    Returns
    -------
    (pandas.DataFrame, list)
        Rows restricted to ``output_columns`` (most popular first) and the
        list of their ``track_id`` values. Tracks with the seed's title are
        excluded.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``track_name == input_song_name``. This
        matches the other recommenders; previously an unknown title surfaced
        as a bare IndexError.
    """
    is_input_song = data['track_name'] == input_song_name
    if not is_input_song.any():
        raise ValueError(f"The song '{input_song_name}' is not in the dataset.")

    # Cluster of the first row that matches the title.
    track_cluster = data.loc[is_input_song, 'cluster'].iloc[0]

    cluster_tracks = data[data['cluster'] == track_cluster].sort_values(by='popularity', ascending=False)
    cluster_tracks = cluster_tracks[cluster_tracks['track_name'] != input_song_name]
    df = cluster_tracks[output_columns].head(num_recommendations)
    ids = df['track_id'].tolist()

    return df, ids


## Hybrid Recommendation

In [135]:
def hybrid_recommendation(input_song_name, num_recommendations=5):
    """Blend KNN similarity, cluster membership and popularity into one ranking.

    The nearest neighbours of the seed (cosine distance on ``pca_features``)
    are restricted to the seed's cluster when possible, then ranked by
    ``0.6 * similarity + 0.4 * popularity / 100``.

    Parameters
    ----------
    input_song_name : str
        Exact ``track_name`` of the seed song.
    num_recommendations : int, default 5
        Maximum number of tracks to return.

    Returns
    -------
    (pandas.DataFrame, list)
        Rows restricted to ``output_columns`` (best hybrid score first) and
        the list of their ``track_id`` values.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``track_name == input_song_name``.
    """
    matches = np.flatnonzero((data['track_name'] == input_song_name).to_numpy())
    if matches.size == 0:
        raise ValueError(f"The song '{input_song_name}' is not in the dataset.")

    # Row *position* of the first match. pca_features is a plain array, so it
    # must be addressed by position, not by the DataFrame's index label.
    input_song_pos = matches[0]

    # Fetch extra neighbours so the cluster filter has something to work with;
    # never ask for more rows than exist.
    n_neighbors = min(num_recommendations + 10, len(pca_features))
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn_model.fit(pca_features)

    input_song_vector = pca_features[input_song_pos].reshape(1, -1)
    distances, indices = knn_model.kneighbors(input_song_vector)

    # Remove the seed by position instead of assuming it is the first hit.
    not_self = indices[0] != input_song_pos
    neighbours = data.iloc[indices[0][not_self]].copy()
    neighbours['distance'] = distances[0][not_self]

    # Prefer neighbours that share the seed's cluster.
    input_song_cluster = data['cluster'].iloc[input_song_pos]
    candidate_songs = neighbours[neighbours['cluster'] == input_song_cluster].copy()

    # If no neighbour is in the cluster, fall back to the unfiltered list.
    if candidate_songs.empty:
        candidate_songs = neighbours.copy()

    # Cosine distance is lower-is-better while popularity is higher-is-better.
    # Convert distance to similarity so both terms point the same way before
    # sorting in descending order (the raw distance used to reward the least
    # similar tracks).
    candidate_songs['hybrid_score'] = (
        0.6 * (1 - candidate_songs['distance'])
        + 0.4 * (candidate_songs['popularity'] / 100)
    )
    candidate_songs = candidate_songs.sort_values(by='hybrid_score', ascending=False)

    top_recommendations = candidate_songs[output_columns].head(num_recommendations)
    ids = top_recommendations['track_id'].tolist()

    return top_recommendations, ids

# Evaluation

### define ground truth by genre

In [96]:
def get_ground_truth_by_genre(df, input_track_id):
    """Return every track that shares the genre of ``input_track_id``.

    The genre is read from the first row of ``df`` whose ``track_id`` equals
    ``input_track_id``. The result is the pair (rows of that genre restricted
    to ``output_columns``, list of their ``track_id`` values); the input
    track itself is part of the result.
    """
    is_input_track = df['track_id'] == input_track_id
    genre = df.loc[is_input_track, 'track_genre'].values[0]
    same_genre = df.loc[df['track_genre'] == genre, output_columns]
    return same_genre, same_genre['track_id'].tolist()

In [None]:
# `input_song_id` is only assigned in the per-song sections further down, so
# on a fresh kernel this cell used to fail with NameError. Look the example
# track up here so the demo runs top to bottom.
example_song_id = data.loc[data['track_name'] == 'Blinding Lights', 'track_id'].iloc[0]
genre_truth, genre_truth_ids = get_ground_truth_by_genre(data, example_song_id)
genre_truth

## Scoring

In [98]:
from sklearn.metrics import ndcg_score

In [99]:
def precision_at_k(y_true, y_pred, k):
    """Share of the first ``k`` predictions that appear in ``y_true``.

    The denominator is always ``k``, even when fewer than ``k`` predictions
    are supplied.
    """
    relevant = set(y_true)
    hits = sum(1 for item in y_pred[:k] if item in relevant)
    return hits / k

def recall_at_k(y_true, y_pred, k):
    """Share of the distinct relevant items found in the first ``k`` predictions.

    Returns 0 when ``y_true`` is empty.
    """
    relevant = set(y_true)
    if not relevant:
        return 0
    hits = sum(1 for item in y_pred[:k] if item in relevant)
    return hits / len(relevant)

def f1_score_at_k(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``; 0 when both are 0."""
    total = precision + recall
    if not total:
        return 0
    return 2 * precision * recall / total

def average_precision(y_true, y_pred, k):
    """Average precision of the first ``k`` predictions.

    Precision is accumulated at every rank where a relevant item appears and
    divided by ``min(number of distinct relevant items, k)``.

    Returns 0 when there are no relevant items or when ``k <= 0`` (the latter
    used to raise ZeroDivisionError).
    """
    # A set gives constant-time membership tests and, like recall_at_k,
    # counts each relevant id once.
    relevant = set(y_true)
    denominator = min(len(relevant), k)
    if denominator <= 0:
        return 0

    score = 0.0
    hits = 0
    for rank, item in enumerate(y_pred[:k], start=1):
        if item in relevant:
            hits += 1
            score += hits / rank
    return score / denominator

def evaluate_all(y_true, y_pred, k=5):
    """Compute Precision, Recall, F1, MAP and NDCG at ``k`` for one ranking.

    Parameters
    ----------
    y_true : list
        Ids considered relevant.
    y_pred : list
        Recommended ids, best first. May hold fewer than ``k`` items.
    k : int, default 5
        Cut-off rank.

    Returns
    -------
    dict
        Keys ``Precision@k``, ``Recall@k``, ``F1@k``, ``MAP@k`` and
        ``NDCG@k``, each rounded to 4 decimals.
    """
    p = precision_at_k(y_true, y_pred, k)
    r = recall_at_k(y_true, y_pred, k)
    f1 = f1_score_at_k(p, r)
    ap = average_precision(y_true, y_pred, k)

    # NDCG with binary relevance (1 if the recommended id is relevant).
    relevant = set(y_true)
    gains = [1 if item in relevant else 0 for item in y_pred[:k]]
    if len(gains) < 2:
        # ndcg_score is not usable on rankings with fewer than two items;
        # with zero or one item the score is simply that item's relevance.
        ndcg = float(sum(gains))
    else:
        # Size the rank-based scores from the predictions actually present so
        # both arrays have the same length when fewer than k ids were returned.
        rank_scores = [1.0 / (rank + 1) for rank in range(len(gains))]
        ndcg = ndcg_score([gains], [rank_scores])

    return {
        'Precision@{}'.format(k): round(p, 4),
        'Recall@{}'.format(k): round(r, 4),
        'F1@{}'.format(k): round(f1, 4),
        'MAP@{}'.format(k): round(ap, 4),
        'NDCG@{}'.format(k): round(ndcg, 4)
    }

## Blinding Lights by The Weeknd

In [None]:
input_song_name = 'Blinding Lights'
# Keep only the first row whose title matches the seed song.
song = data[data['track_name'] == input_song_name].head(1)
song

In [None]:
input_song_id = song['track_id'].iloc[0]
input_song_id

In [None]:
pop_pred, pop_ids = popularity_recommendation(input_song_name, num_recommendations=5)
pop_pred

In [None]:
knn_pred, knn_ids = knn_based_recommendations(input_song_name, num_recommendations=5)
knn_pred

In [None]:
cluster_pred, cluster_ids = cluster_based_recommendations(input_song_name, num_recommendations=5)
cluster_pred

In [None]:
hybrid_pred, hybrid_ids = hybrid_recommendation(input_song_name, num_recommendations=5)
hybrid_pred

In [120]:
# Build the genre ground truth for the song evaluated in this section, so the
# scores never depend on whatever an earlier cell left in `genre_truth_ids`.
genre_truth, genre_truth_ids = get_ground_truth_by_genre(data, input_song_id)

results_1 = [
    {'Model': model_name, **evaluate_all(genre_truth_ids, predicted_ids)}
    for model_name, predicted_ids in [
        ('Popularity', pop_ids),
        ('KNN', knn_ids),
        ('Clustering', cluster_ids),
        ('Hybrid', hybrid_ids),
    ]
]

In [None]:
eval_df_1 = pd.DataFrame(results_1)
eval_df_1

## Out of Phase by Proem

In [None]:
input_song_name = 'Out of Phase'
# Keep only the first row whose title matches the seed song.
song = data[data['track_name'] == input_song_name].head(1)
song

In [None]:
input_song_id = song['track_id'].iloc[0]
input_song_id

In [None]:
pop_pred, pop_ids = popularity_recommendation(input_song_name, num_recommendations=5)
pop_pred

In [None]:
knn_pred, knn_ids = knn_based_recommendations(input_song_name, num_recommendations=5)
knn_pred

In [None]:
cluster_pred, cluster_ids = cluster_based_recommendations(input_song_name, num_recommendations=5)
cluster_pred

In [None]:
hybrid_pred, hybrid_ids = hybrid_recommendation(input_song_name, num_recommendations=5)
hybrid_pred[['track_name', 'artists']]

In [143]:
# Recompute the genre ground truth for *this* song. Without this step the
# metrics would be scored against the ground truth of the previously
# evaluated track.
genre_truth, genre_truth_ids = get_ground_truth_by_genre(data, input_song_id)

results_2 = [
    {'Model': model_name, **evaluate_all(genre_truth_ids, predicted_ids)}
    for model_name, predicted_ids in [
        ('Popularity', pop_ids),
        ('KNN', knn_ids),
        ('Clustering', cluster_ids),
        ('Hybrid', hybrid_ids),
    ]
]

In [None]:
eval_df_2 = pd.DataFrame(results_2)
eval_df_2

Dari hasil evaluasi, model yang paling baik adalah model berbasis cluster.