In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from umap.umap_ import UMAP
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the track table and the tag table from the shared data directory.
song_df = pd.read_csv('../data/songs_database.csv')
tags_df = pd.read_csv('../data/track_tags.csv')

# Attach each track's tag string by joining on the track ID. The join is an
# inner join (pandas' default), so tracks without a row in the tag table are
# dropped from the result.
tag_columns = tags_df[["ID", "Tags"]]
final_df = song_df.merge(tag_columns, how="inner", on="ID")

# Persist the merged table (without the row index) for reuse.
final_df.to_csv('../data/merged_tracks.csv', index=False)

In [3]:
# Preview the first five merged rows as a sanity check on the join.
final_df.head(n=5)

Unnamed: 0,Title,Artist,Album,ReleaseDate,ID,Key,BPM,Acousticness,Popularity,Happiness,Danceability,Instrumentalness,Energy,Speechiness,Loudness,Cover,Tags
0,Losing It,FISHER,Losing It,2018-07-13,6ho0GyrWZN3mhi9zVRW7xi,D Major,125,0.00181,74,0.643,0.76,0.703,0.964,0.0577,-5.844,https://i.scdn.co/image/ab67616d0000b2739367c1...,"electronic, dance, techno, tech house, 666"
1,Losing It - Radio Edit,FISHER,Losing It (Radio Edit),2018-10-25,2KXwJZv1pRQFQ95Kj60jBN,D Major,125,0.00176,59,0.744,0.729,0.66,0.965,0.0624,-5.711,https://i.scdn.co/image/ab67616d0000b273ebeb52...,"party, 2018 single, bbc radio1 playlist 2018"
2,Piece Of Your Heart,"MEDUZA, Goodboys",MEDUZA,2023-10-13,5RDUCbvwPZdmjFbhZ5EY4N,B♭ Minor,124,0.0405,69,0.631,0.677,0.000153,0.744,0.0296,-6.806,https://i.scdn.co/image/ab67616d0000b2738d7f33...,"House, EDM, electronic, british, dance"
3,Not Like Us,Kendrick Lamar,Not Like Us,2024-05-04,6AI3ezQ4o3HUoP6Dhudph3,C# Major,101,0.0107,96,0.214,0.898,0.0,0.472,0.0776,-7.001,https://i.scdn.co/image/ab67616d0000b2731ea0c6...,"MySpotigramBot, Diss, Hip-Hop, diss track, hip..."
4,Back To U,"SLANDER, William Black",Back To U,2020-04-24,19m23w2ANVhtB7rApM6pbN,E Major,150,0.645,58,0.272,0.586,0.0,0.499,0.049,-6.627,https://i.scdn.co/image/ab67616d0000b273df3423...,melodic dubstep


In [4]:
# Metadata columns carried alongside each track in search results.
DISPLAY_COLS = ["Title","Artist","Album","ReleaseDate","ID", "Key","BPM"]
# Numeric columns that are standardised and used as similarity features.
NUMERIC_COLS = ["Acousticness", "Popularity", "Happiness", "Danceability", "Instrumentalness","Energy","Speechiness", "Loudness"]

# Standardise the numeric features to zero mean / unit variance so that no
# single column dominates the distance computation.
# NOTE(review): StandardScaler passes NaNs through unchanged; confirm the
# numeric columns have no missing values before indexing.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(final_df[NUMERIC_COLS])

# TF-IDF encode the free-text tag strings. TfidfVectorizer rejects NaN
# documents, so tracks with missing tags are encoded as an empty string
# (an all-zero TF-IDF row) instead of aborting the whole cell.
vectorizer = TfidfVectorizer()
tag_embeddings = vectorizer.fit_transform(final_df["Tags"].fillna("")).toarray()

# One row per track: [scaled numeric features | dense TF-IDF tag weights].
track_vectors = np.hstack((df_scaled, tag_embeddings))


In [5]:
# Dimensionality of the combined feature vectors.
d = track_vectors.shape[1]
# Flat L2 index over the track vectors.
index = faiss.IndexFlatL2(d)

# FAISS expects a C-contiguous float32 matrix.
index.add(np.ascontiguousarray(track_vectors, dtype='float32'))

# DISPLAY_COLS already contains ID/Title/Artist/Album, so a plain list
# concatenation would select those columns twice and produce a frame with
# duplicate column labels. De-duplicate while keeping first-seen order.
metadata_cols = list(dict.fromkeys(["ID","Title","Artist","Album"] + DISPLAY_COLS))
track_metadata = final_df[metadata_cols]

In [6]:
def find_similar_tracks(song_id, top_k=3, include_query=False):
    """Return the tracks closest to ``song_id`` in the FAISS index.

    Parameters
    ----------
    song_id : str
        Value of the ``ID`` column identifying the query track. If the ID
        occurs more than once in ``final_df``, the first occurrence is used.
    top_k : int, default 3
        Maximum number of tracks to return.
    include_query : bool, default False
        When False the query track itself (always its own nearest neighbour,
        at distance 0) is left out of the results. Pass True to keep it.

    Returns
    -------
    pandas.DataFrame
        Rows of ``track_metadata`` for the nearest tracks, ordered from most
        to least similar, with an extra ``Distance`` column holding the
        distance reported by the index (lower = more similar). Fewer than
        ``top_k`` rows are returned when the index holds fewer tracks.

    Raises
    ------
    IndexError
        If ``song_id`` is not present in ``final_df``.
    """
    # Resolve the row *position* (not the index label) so the lookup into
    # track_vectors stays correct whatever final_df's index looks like.
    matches = np.flatnonzero((final_df["ID"] == song_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No track with ID {song_id!r} found in final_df")
    song_idx = int(matches[0])

    query_vector = np.asarray(track_vectors[song_idx], dtype='float32').reshape(1, -1)

    wanted = max(int(top_k), 0)
    # Request one extra neighbour when the query itself will be discarded,
    # and never more than the index holds: FAISS pads missing results with
    # label -1, which .iloc would silently resolve to the last row.
    k = min(wanted + (0 if include_query else 1), index.ntotal)
    if k <= 0:
        empty = track_metadata.iloc[[]].reset_index(drop=True)
        empty["Distance"] = np.empty(0, dtype='float32')
        return empty

    distances, indices = index.search(query_vector, k)  # FAISS search

    keep = indices[0] >= 0  # drop FAISS "no result" padding
    if not include_query:
        keep &= indices[0] != song_idx
    neighbour_idx = indices[0][keep][:wanted]
    neighbour_dist = distances[0][keep][:wanted]

    results = track_metadata.iloc[neighbour_idx].reset_index(drop=True)
    results["Distance"] = neighbour_dist  # Lower distance = more similar

    return results

In [7]:
def generate_visualization_json(track_vectors, final_df, out_path):
    """Write a 3-D UMAP projection of the track vectors to a JSON file.

    Parameters
    ----------
    track_vectors : array-like
        Feature matrix with one row per track, in the same row order as
        ``final_df``.
    final_df : pandas.DataFrame
        Track table; its ``ID``, ``Title`` and ``Artist`` columns are copied
        into the output.
    out_path : str
        Destination file. It is opened in write mode, so existing content
        is overwritten.

    Returns
    -------
    None
        The result is written to ``out_path`` as a JSON list with one object
        per track containing ``ID``, ``Title``, ``Artist``, ``x``, ``y``, ``z``.
    """
    # Fixed random_state keeps the layout reproducible between runs.
    coords = UMAP(n_components=3, random_state=42).fit_transform(track_vectors)

    # Copy so the caller's frame is not modified when the axes are added.
    points = final_df[["ID", "Title", "Artist"]].copy()
    for column, axis_name in enumerate(("x", "y", "z")):
        points[axis_name] = coords[:, column]

    with open(out_path, "w") as handle:
        records = points.to_dict(orient="records")
        json.dump(records, handle, indent=2)


In [8]:
# Export the 3-D embedding of every track to the client's static data folder.
generate_visualization_json(
    track_vectors=track_vectors,
    final_df=final_df,
    out_path="../client/static/data/visualization_embeddings.json",
)

  warn(
