In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import json

In [26]:
# Load the per-track audio features and the per-track tag strings.
song_df = pd.read_csv("../data/songs_database.csv")
tags_df = pd.read_csv("../data/track_tags.csv")

# Attach each track's tag string by joining on the track ID.
# how="inner" is pandas' default, spelled out here: tracks without a row in
# the tags file are dropped from final_df.
final_df = song_df.merge(tags_df.loc[:, ["ID", "Tags"]], how="inner", on="ID")

# Persist the merged table (without the DataFrame index).
final_df.to_csv("../data/merged_tracks.csv", index=False)

In [27]:
# Preview the first five merged rows.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index
# that was saved into songs_database.csv — TODO confirm and drop it at load time.
final_df.head(n=5)

Unnamed: 0,Title,Artist,Album,ReleaseDate,ID,Key,BPM,Acousticness,Popularity,Happiness,Danceability,Instrumentalness,Energy,Speechiness,Loudness,Cover,Tags
0,Buttons,The Pussycat Dolls,PCD,2005-01-01,3BxWKCI06eQ5Od8TY2JBeA,D Major,211,0.178,70,0.408,0.569,0.0,0.821,0.267,-4.38,https://i.scdn.co/image/ab67616d0000b2738cc441...,"pop, dance, sexy, rnb, The Pussycat Dolls"
1,Top Of The World,The Pussycat Dolls,Top Of The World,2009-02-03,6r5OaOArztjvur2lSqumAH,G Minor,122,0.00246,41,0.498,0.74,0.0,0.916,0.115,-4.091,https://i.scdn.co/image/ab67616d0000b273183e24...,"pop, dance, The Pussycat Dolls, female vocalis..."
2,Buttons,The Pussycat Dolls,PCD,2005-01-01,4lZbw23ts5G4nlzGgQbAxc,D Major,211,0.183,62,0.478,0.546,0.0,0.817,0.285,-4.362,https://i.scdn.co/image/ab67616d0000b273babcb2...,"pop, dance, sexy, rnb, The Pussycat Dolls"
3,Sports car,Tate McRae,So Close To What,2025-02-20,5UJbgR4XF4y1DvbkxEqe8S,B Minor,107,0.0505,88,0.886,0.861,0.00139,0.815,0.0349,-3.672,https://i.scdn.co/image/ab67616d0000b273ec58cd...,"pop, best of 2025, electropop, soty, fernandac..."
4,tate mcrae,C. Beck,tate mcrae,2025-02-21,3BK1yV08DDjzMu5jVnSciA,C Major,138,0.032,9,0.505,0.754,0.0,0.7,0.138,-5.698,https://i.scdn.co/image/ab67616d0000b273cd9505...,No tags


In [28]:
# Columns shown to the user vs. columns used as numeric similarity features.
DISPLAY_COLS = ["Title","Artist","Album","ReleaseDate","ID", "Key","BPM"]
NUMERIC_COLS = ["Acousticness", "Popularity", "Happiness", "Danceability", "Instrumentalness","Energy","Speechiness", "Loudness"]

# Standardise the audio features so every column has zero mean / unit variance
# and none of them dominates the L2 distance because of its scale.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(final_df[NUMERIC_COLS])
# StandardScaler disregards NaN while fitting but passes it through transform;
# one NaN would turn every distance involving that row into NaN. After scaling
# the column mean is 0, so replacing NaN with 0.0 is mean imputation. This is a
# no-op when the data has no missing values.
df_scaled = np.where(np.isnan(df_scaled), 0.0, df_scaled)

# TF-IDF over the comma-separated tag strings. A missing Tags value (NaN) would
# make TfidfVectorizer raise, so treat it as an empty document instead.
# NOTE(review): the "No tags" placeholder is tokenised like real tags, so
# tag-less tracks share the tokens "no" and "tags" — confirm this is intended.
vectorizer = TfidfVectorizer()
tag_documents = final_df["Tags"].fillna("").astype(str)
tag_embeddings = vectorizer.fit_transform(tag_documents).toarray()

# One dense row per track: [scaled numeric features | TF-IDF tag weights].
track_vectors = np.hstack((df_scaled, tag_embeddings))


In [29]:
# Exact (brute-force) L2 index over the combined feature vectors.
d = track_vectors.shape[1]
index = faiss.IndexFlatL2(d)

# The vectors are handed to FAISS as a contiguous float32 matrix.
index.add(np.ascontiguousarray(track_vectors, dtype=np.float32))

# Row-aligned metadata used to label search results. DISPLAY_COLS already
# contains ID/Title/Artist/Album, so concatenating the two lists verbatim
# selected those columns twice and produced duplicate column labels.
# dict.fromkeys de-duplicates while keeping first-seen order.
metadata_cols = list(dict.fromkeys(["ID", "Title", "Artist", "Album"] + DISPLAY_COLS))
track_metadata = final_df[metadata_cols]

In [30]:
def find_similar_tracks(song_id, top_k=3):
    """Return the ``top_k`` nearest neighbours of a track in the FAISS index.

    The query vector is the indexed vector of the track itself, so the result
    normally contains the query track too (at distance 0).

    Parameters
    ----------
    song_id : value compared against ``final_df["ID"]``
        ID of the track to use as the query. If the ID occurs more than once,
        the first occurrence is used.
    top_k : int, default 3
        Number of neighbours to return. Values larger than the number of
        indexed vectors are clamped to that number.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``track_metadata`` (fresh 0..n-1 index) with an
        extra ``Distance`` column; lower distance means more similar.

    Raises
    ------
    IndexError
        If ``song_id`` is not present in ``final_df["ID"]``.
    """
    # Resolve the track to a *positional* row number. track_vectors and
    # track_metadata are addressed by position, so using the index label (as
    # .index[0] would) is only correct while final_df has a default RangeIndex.
    matches = np.flatnonzero((final_df["ID"] == song_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"song_id {song_id!r} not found in final_df['ID']")
    song_pos = int(matches[0])

    # Shape (1, d) float32 query, as passed to index.search.
    query_vector = np.ascontiguousarray(
        track_vectors[song_pos:song_pos + 1], dtype=np.float32
    )

    # Asking FAISS for more neighbours than it holds pads the result with -1,
    # which .iloc would silently interpret as "last row".
    k = min(int(top_k), int(index.ntotal))
    distances, indices = index.search(query_vector, k)  # FAISS search

    results = track_metadata.iloc[indices[0]].reset_index(drop=True)
    results["Distance"] = distances[0]  # Lower distance = more similar

    return results

In [31]:
def generate_visualization_json(track_vectors, final_df, out_path):
    """Project the track vectors to 3-D with UMAP and dump them as JSON.

    Args:
        track_vectors: Feature matrix with one row per track, passed to
            ``UMAP.fit_transform``.
        final_df: DataFrame providing the ``ID``, ``Title`` and ``Artist``
            columns; its rows are paired with the rows of ``track_vectors``.
        out_path: Destination file. It is overwritten with a JSON list of
            records with the keys ID, Title, Artist, x, y, z.
    """
    # Fixed random_state, as in every run of this notebook.
    projector = umap.UMAP(n_components=3, random_state=42)
    coords = projector.fit_transform(track_vectors)

    points = final_df[["ID", "Title", "Artist"]].copy()
    for axis_no, axis_name in enumerate(("x", "y", "z")):
        points[axis_name] = coords[:, axis_no]

    records = points.to_dict(orient="records")
    with open(out_path, "w") as handle:
        json.dump(records, handle, indent=2)


In [32]:
# Write the 3-D embedding of every track to the client's static data folder.
generate_visualization_json(
    track_vectors=track_vectors,
    final_df=final_df,
    out_path="../client/static/data/visualization_embeddings.json",
)

  warn(
