In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
song_df = pd.read_csv('../data/songs_database.csv')
tags_df = pd.read_csv('../data/track_tags.csv')

final_df = pd.merge(song_df, tags_df[["ID", "Tags"]], on="ID")

final_df.to_csv('../data/merged_tracks.csv',index=False)

In [3]:
final_df.head()

Unnamed: 0,Title,Artist,Album,ReleaseDate,ID,Key,BPM,Acousticness,Popularity,Happiness,Danceability,Instrumentalness,Energy,Speechiness,Loudness,Cover,Tags
0,God's Plan,Drake,Scorpion,2018-06-29,6DCZcSspjsKoFjzjrWoCdn,G Major,77,0.0332,78,0.357,0.754,8.3e-05,0.449,0.109,-9.211,https://i.scdn.co/image/ab67616d0000b273f907de...,"2018, Hip-Hop, rap, 2010s, MySpotigramBot"
1,NOKIA,Drake,$ome $exy $ongs 4 U,2025-02-14,2u9S9JJ6hTZS3Vf22HOZKg,F Major,106,0.247,83,0.508,0.537,0.0,0.727,0.15,-4.184,https://i.scdn.co/image/ab67616d0000b273fcb7f8...,"pop rap, Drake, atlanta bass, pop, electro"
2,MEET YOUR PADRE,"PARTYNEXTDOOR, Drake, Chino Pacas",$ome $exy $ongs 4 U,2025-02-14,0o5p5zzRN84q6qVbOVONPM,B Minor,128,0.667,73,0.72,0.71,2.2e-05,0.596,0.0524,-8.555,https://i.scdn.co/image/ab67616d0000b273fcb7f8...,"Drake, partynextdoor, Chino Pacas, pop rap, 2025"
3,DIE TRYING,"PARTYNEXTDOOR, Drake, Yebba",$ome $exy $ongs 4 U,2025-02-14,0NUqi0ps17YpLUC3kgsZq0,B Major,110,0.146,81,0.665,0.763,0.0,0.531,0.0328,-6.154,https://i.scdn.co/image/ab67616d0000b273fcb7f8...,"Drake, partynextdoor, YEBBA, indie pop, rnb"
4,GIMME A HUG,Drake,$ome $exy $ongs 4 U,2025-02-14,5Nz7hI3cCOHmMR4vSLJ1An,C# Major,143,0.0643,79,0.245,0.457,0.0,0.671,0.0979,-5.53,https://i.scdn.co/image/ab67616d0000b273fcb7f8...,"Drake, rnb, trap, pop rap, 2025"


In [4]:
DISPLAY_COLS = ["Title","Artist","Album","ReleaseDate","ID", "Key","BPM"]
NUMERIC_COLS = ["Acousticness", "Popularity", "Happiness", "Danceability", "Instrumentalness","Energy","Speechiness", "Loudness"]

scaler = StandardScaler()
df_scaled = scaler.fit_transform(final_df[NUMERIC_COLS])

vectorizer = TfidfVectorizer()
tag_embeddings = vectorizer.fit_transform(final_df["Tags"]).toarray()

track_vectors = np.hstack((df_scaled, tag_embeddings))


In [5]:
d = track_vectors.shape[1]
index = faiss.IndexFlatL2(d)

index.add(np.array(track_vectors).astype('float32'))

track_metadata = final_df[["ID","Title","Artist","Album"] + DISPLAY_COLS]

In [6]:
def find_similar_tracks(song_id, top_k=3):
    song_idx = final_df[final_df["ID"] == song_id].index[0]  # finds index of the song
    query_vector = np.array([track_vectors[song_idx]]).astype('float32')

    distances, indices = index.search(query_vector, top_k)  # FAISS search

    results = track_metadata.iloc[indices[0]].reset_index(drop=True)
    results["Distance"] = distances[0]  # Lower distance = more similar

    return results

# find similar songs to "God's Plan"
find_similar_tracks("2oHM9KAdFdFvDCaE4V6RST", top_k=5)


Unnamed: 0,ID,Title,Artist,Album,Title.1,Artist.1,Album.1,ReleaseDate,ID.1,Key,BPM,Distance
0,2oHM9KAdFdFvDCaE4V6RST,Hate Bein' Sober,"Chief Keef, 50 Cent, Wiz Khalifa",Finally Rich,Hate Bein' Sober,"Chief Keef, 50 Cent, Wiz Khalifa",Finally Rich,2012-01-01,2oHM9KAdFdFvDCaE4V6RST,A♭ Major,132,0.0
1,6ZWalyzfVcNCc1XwKnnyyn,Not Like Us,Kendrick Lamar,Not Like Us,Not Like Us,Kendrick Lamar,Not Like Us,2024-05-03,6ZWalyzfVcNCc1XwKnnyyn,C# Major,101,2.604012
2,0TlLq3lA83rQOYtrqBqSct,Nonstop,Drake,Scorpion,Nonstop,Drake,Scorpion,2018-06-29,0TlLq3lA83rQOYtrqBqSct,G Major,155,2.776838
3,4Kup1kDTLoJaUMgpGFPqau,Married To The Game,Future,Project E.T.,Married To The Game,Future,Project E.T.,2016-10-13,4Kup1kDTLoJaUMgpGFPqau,C# Major,143,2.824402
4,6DCZcSspjsKoFjzjrWoCdn,God's Plan,Drake,Scorpion,God's Plan,Drake,Scorpion,2018-06-29,6DCZcSspjsKoFjzjrWoCdn,G Major,77,3.018773
