In [1]:
import pandas as pd
import torch
from torch import nn
import numpy as np
from pynndescent import NNDescent

In [19]:
df = pd.read_csv('spotify_dataset.csv')
df.head(5)

Unnamed: 0,Artist(s),song,text,Length,emotion,Genre,Album,Release Date,Key,Tempo,Loudness (db),Time signature,Explicit,Popularity,Energy,Danceability,Positiveness,Speechiness,Liveness,Acousticness,Instrumentalness,Good for Party,Good for Work/Study,Good for Relaxation/Meditation,Good for Exercise,Good for Running,Good for Yoga/Stretching,Good for Driving,Good for Social Gatherings,Good for Morning Routine,Similar Artist 1,Similar Song 1,Similarity Score 1,Similar Artist 2,Similar Song 2,Similarity Score 2,Similar Artist 3,Similar Song 3,Similarity Score 3
0,!!!,Even When the Waters Cold,Friends told her she was better off at the bot...,03:47,sadness,hip hop,Thr!!!er,29th April 2013,D min,105,-6.85db,4/4,No,40,83,71,87,4,16,11,0,0,0,0,0,0,0,0,0,0,Corey Smith,If I Could Do It Again,0.986061,Toby Keith,Drinks After Work,0.983719,Space,Neighbourhood,0.983236
1,!!!,One Girl / One Boy,"Well I heard it, playing soft From a drunken b...",04:03,sadness,hip hop,Thr!!!er,29th April 2013,A# min,117,-5.75db,4/4,No,42,85,70,87,4,32,0,0,0,0,0,0,0,0,0,0,0,Hiroyuki Sawano,BRE@TH//LESS,0.995409,When In Rome,Heaven Knows,0.990905,Justice Crew,Everybody,0.984483
2,!!!,Pardon My Freedom,"Oh my god, did I just say that out loud? Shoul...",05:51,joy,hip hop,Louden Up Now,8th June 2004,A Maj,121,-6.06db,4/4,No,29,89,71,63,8,64,0,20,0,0,0,1,0,0,0,0,0,Ricky Dillard,More Abundantly Medley Live,0.993176,Juliet,Avalon,0.965147,The Jacksons,Lovely One,0.956752
3,!!!,Ooo,[Verse 1] Remember when I called you on the te...,03:44,joy,hip hop,As If,16th October 2015,A min,122,-5.42db,4/4,No,24,84,78,97,4,12,12,0,0,0,0,1,0,0,0,0,0,Eric Clapton,Man Overboard,0.992749,Roxette,Don't Believe In Accidents,0.991494,Tiwa Savage,My Darlin,0.990381
4,!!!,Freedom 15,[Verse 1] Calling me like I got something to s...,06:00,joy,hip hop,As If,16th October 2015,F min,123,-5.57db,4/4,No,30,71,77,70,7,10,4,1,0,0,0,1,0,0,0,0,0,Cibo Matto,Lint Of Love,0.98161,Barrington Levy,Better Than Gold,0.981524,Freestyle,Its Automatic,0.981415


In [3]:
df = df.drop(columns=[
    'text', 'Key', 'Time signature', 'Explicit',
])

In [4]:
df.rename(columns={
    'Artist(s)': 'artist',
    'song': 'song_title',
    'Length': 'length',
    'emotion': 'emotion',
    'Genre': 'genre',
    'Album': 'album',
    'Release Date': 'release_date',
    'Tempo': 'tempo',
    'Loudness (db)': 'loudness',
    'Popularity': 'popularity',
    'Energy': 'energy',
    'Danceability': 'danceability',
    'Positiveness': 'positiveness',
    'Speechiness': 'speechiness',
    'Liveness': 'liveness',
    'Acousticness': 'acousticness',
    'Instrumentalness': 'instrumentalness',
    'Good for Party': 'party',
    'Good for Work/Study': 'work_study',
    'Good for Relaxation/Meditation': 'relaxation_meditation',
    'Good for Exercise': 'exercise',
    'Good for Running': 'running',
    'Good for Yoga/Stretching': 'yoga_stretching',
    'Good for Driving': 'driving',
    'Good for Social Gatherings': 'social_gatherings',
    'Good for Morning Routine': 'morning_routine',
    'Similar Artist 1': 'similar_artist_1',
    'Similar Song 1': 'similar_song_1',
    'Similarity Score 1': 'similarity_score_1',
    'Similar Artist 2': 'similar_artist_2',
    'Similar Song 2': 'similar_song_2',
    'Similarity Score 2': 'similarity_score_2',
    'Similar Artist 3': 'similar_artist_3',
    'Similar Song 3': 'similar_song_3',
    'Similarity Score 3': 'similarity_score_3'
}, inplace=True)

df.head(0)

Unnamed: 0,artist,song_title,length,emotion,genre,album,release_date,tempo,loudness,popularity,...,morning_routine,similar_artist_1,similar_song_1,similarity_score_1,similar_artist_2,similar_song_2,similarity_score_2,similar_artist_3,similar_song_3,similarity_score_3


In [5]:
df['genre_list'] = df['genre'].str.split(',')
df_exploded = df.explode('genre_list')
df_exploded['genre_list'] = df_exploded['genre_list'].str.strip()
genre_dummies = pd.get_dummies(df_exploded['genre_list'], prefix='genre')
genre_encoded = genre_dummies.groupby(df_exploded.index).sum()
df = df.drop(columns=['genre', 'genre_list']).join(genre_encoded)

emotion_dummies = pd.get_dummies(df['emotion'], prefix='emotion', dtype=int)
df = pd.concat([df.drop(columns=['emotion']), emotion_dummies], axis=1)


In [6]:
pd.set_option('display.max_columns', None)
df.filter(like='genre_').head()

Unnamed: 0,genre_acoustic,genre_alt-country,genre_alternative,genre_alternative rock,genre_ambient,genre_black metal,genre_blues,genre_britpop,genre_chillout,genre_chillwave,genre_christian,genre_classic rock,genre_classical,genre_cloud rap,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death metal,genre_deathcore,genre_disco,genre_doom metal,genre_dream pop,genre_drum and bass,genre_dub,genre_dubstep,genre_electro,genre_electronic,genre_electropop,genre_emo,genre_emo rap,genre_experimental,genre_folk,genre_funk,genre_garage rock,genre_gospel,genre_grime,genre_grunge,genre_hard rock,genre_hardcore,genre_heavy metal,genre_hip hop,genre_hip-hop,genre_house,genre_indie,genre_indie pop,genre_indie rock,genre_industrial,genre_j-pop,genre_jazz,genre_k-pop,genre_latin,genre_lo-fi,genre_math rock,genre_melodic death metal,genre_metal,genre_metalcore,genre_new wave,genre_nu metal,genre_pop,genre_pop punk,genre_pop rock,genre_post-hardcore,genre_post-punk,genre_power metal,genre_progressive metal,genre_progressive rock,genre_psychedelic,genre_psychedelic rock,genre_punk,genre_punk rock,genre_rap,genre_reggae,genre_reggaeton,genre_rnb,genre_rock,genre_screamo,genre_shoegaze,genre_soul,genre_soundtrack,genre_swing,genre_synthpop,genre_techno,genre_thrash metal,genre_trance,genre_trap,genre_trip-hop,genre_worship
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
df.filter(like='emotion_').head()

Unnamed: 0,emotion_Love,emotion_True,emotion_anger,emotion_angry,emotion_confusion,emotion_fear,emotion_interest,emotion_joy,emotion_love,emotion_pink,emotion_sadness,emotion_surprise,emotion_thirst
0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0


In [8]:
semantic_groups = {
    'love': ['emotion_love', 'emotion_Love', 'emotion_True'],
    'anger': ['emotion_angry', 'emotion_anger'],
    'sadness': ['emotion_sadness'],
    'joy': ['emotion_joy'],
    'fear': ['emotion_fear'],
    'surprise': ['emotion_surprise'],
    'confusion': ['emotion_confusion'],
    'interest': ['emotion_interest'],
    'thirst': ['emotion_thirst'],
    'pink': ['emotion_pink']  # unclear, keep separate unless you specify mapping
}

for unified_name, cols in semantic_groups.items():
    existing_cols = [c for c in cols if c in df.columns]
    if existing_cols:
        # Create merged column first
        df[f'emotion_{unified_name}'] = df[existing_cols].sum(axis=1).clip(upper=1)
        
        # Drop only the original columns, exclude the new merged one
        cols_to_drop = [c for c in existing_cols if c != f'emotion_{unified_name}']
        df.drop(columns=cols_to_drop, inplace=True)


In [9]:
df.filter(like='emotion_').head()

Unnamed: 0,emotion_anger,emotion_confusion,emotion_fear,emotion_interest,emotion_joy,emotion_love,emotion_pink,emotion_sadness,emotion_surprise,emotion_thirst
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0


In [10]:
# List of genre and emotion columns (from your column names)
genre_cols = [
    'genre_acoustic', 'genre_alt-country', 'genre_alternative', 'genre_alternative rock', 'genre_ambient',
    'genre_black metal', 'genre_blues', 'genre_britpop', 'genre_chillout', 'genre_chillwave', 'genre_christian',
    'genre_classic rock', 'genre_classical', 'genre_cloud rap', 'genre_comedy', 'genre_country', 'genre_dance',
    'genre_dancehall', 'genre_death metal', 'genre_deathcore', 'genre_disco', 'genre_doom metal', 'genre_dream pop',
    'genre_drum and bass', 'genre_dub', 'genre_dubstep', 'genre_electro', 'genre_electronic', 'genre_electropop',
    'genre_emo', 'genre_emo rap', 'genre_experimental', 'genre_folk', 'genre_funk', 'genre_garage rock',
    'genre_gospel', 'genre_grime', 'genre_grunge', 'genre_hard rock', 'genre_hardcore', 'genre_heavy metal',
    'genre_hip hop', 'genre_hip-hop', 'genre_house', 'genre_indie', 'genre_indie pop', 'genre_indie rock',
    'genre_industrial', 'genre_j-pop', 'genre_jazz', 'genre_k-pop', 'genre_latin', 'genre_lo-fi', 'genre_math rock',
    'genre_melodic death metal', 'genre_metal', 'genre_metalcore', 'genre_new wave', 'genre_nu metal', 'genre_pop',
    'genre_pop punk', 'genre_pop rock', 'genre_post-hardcore', 'genre_post-punk', 'genre_power metal',
    'genre_progressive metal', 'genre_progressive rock', 'genre_psychedelic', 'genre_psychedelic rock',
    'genre_punk', 'genre_punk rock', 'genre_rap', 'genre_reggae', 'genre_reggaeton', 'genre_rnb', 'genre_rock',
    'genre_screamo', 'genre_shoegaze', 'genre_soul', 'genre_soundtrack', 'genre_swing', 'genre_synthpop',
    'genre_techno', 'genre_thrash metal', 'genre_trance', 'genre_trap', 'genre_trip-hop', 'genre_worship'
]

emotion_cols = [
    'emotion_anger', 'emotion_confusion', 'emotion_fear', 'emotion_interest', 'emotion_joy',
    'emotion_love', 'emotion_pink', 'emotion_sadness', 'emotion_surprise', 'emotion_thirst'
]

good_for_cols = ['party', 'work_study', 'relaxation_meditation', 'exercise', 'running', 'yoga_stretching', 'driving', 'social_gatherings', 'morning_routine']

# Make sure these columns exist in your DataFrame
genre_cols = [col for col in genre_cols if col in df.columns]
emotion_cols = [col for col in emotion_cols if col in df.columns]
good_for_cols = [col for col in good_for_cols if col in df.columns]

# Convert to int (just to be safe)
df[genre_cols] = df[genre_cols].astype(int)
df[emotion_cols] = df[emotion_cols].astype(int)
df[good_for_cols] = df[good_for_cols].astype(int)

# Combine genre columns into a list in a new 'genre' column
df['genre'] = df[genre_cols].values.tolist()

# Combine emotion columns into a list in a new 'emotion' column
df['emotion'] = df[emotion_cols].values.tolist()

# Combine good_for columns into a list in a new 'good_for' column
df['good_for'] = df[good_for_cols].values.tolist()

# If you want, you can drop the original genre and emotion columns after combining
# df.drop(columns=genre_cols + emotion_cols, inplace=True)

# Now df has two columns 'genre' and 'emotion' with multi-hot encoded lists


  df['genre'] = df[genre_cols].values.tolist()
  df['emotion'] = df[emotion_cols].values.tolist()
  df['good_for'] = df[good_for_cols].values.tolist()


In [11]:
df[['genre', 'emotion', 'good_for']].head()

Unnamed: 0,genre,emotion,good_for
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"


In [12]:
numeric_cols = ['tempo', 'loudness', 'popularity', 'energy', 'danceability',
                'positiveness', 'speechiness', 'liveness', 'acousticness', 'instrumentalness']

genre_cols = [col for col in df.columns if col.startswith("genre_")]
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
goodfor_cols = [col for col in df.columns if col.startswith("party") or 
                col.startswith("work_study") or 
                col.startswith("relaxation") or 
                col.startswith("exercise") or 
                col.startswith("running") or 
                col.startswith("yoga") or 
                col.startswith("driving") or 
                col.startswith("social") or 
                col.startswith("morning")]

df["loudness"] = df["loudness"].astype(str).str.replace("db", "", case=False).str.strip().astype(float)

df[numeric_cols] = df[numeric_cols].fillna(0.0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepare_embeddingbag_inputs_vectorized(df, col_group):
    data = df[col_group].values.astype(np.int64)
    row_ids, col_ids = np.nonzero(data)
    indices_tensor = torch.tensor(col_ids, dtype=torch.long)
    offsets = torch.zeros(len(df), dtype=torch.long)
    np.add.at(offsets.numpy(), row_ids, 1)
    offsets = torch.cumsum(offsets, dim=0) - offsets
    return indices_tensor.to(device), offsets.to(device)

X_numeric = torch.tensor(df[numeric_cols].astype(float).values, dtype=torch.float32).to(device)
genre_indices, genre_offsets = prepare_embeddingbag_inputs_vectorized(df, genre_cols)
emotion_indices, emotion_offsets = prepare_embeddingbag_inputs_vectorized(df, emotion_cols)
goodfor_indices, goodfor_offsets = prepare_embeddingbag_inputs_vectorized(df, goodfor_cols)


In [13]:
class MusicEmbeddingModel(nn.Module):
    def __init__(self, num_numeric, genre_vocab, emotion_vocab, goodfor_vocab, emb_dim=32):
        super().__init__()
        self.genre_emb = nn.EmbeddingBag(genre_vocab, emb_dim, mode='mean')
        self.emotion_emb = nn.EmbeddingBag(emotion_vocab, emb_dim, mode='mean')
        self.goodfor_emb = nn.EmbeddingBag(goodfor_vocab, emb_dim, mode='mean')
        self.linear_numeric = nn.Linear(num_numeric, emb_dim)

    def forward(self, x_numeric, genre_idx, genre_off, emotion_idx, emotion_off, goodfor_idx, goodfor_off):
        genre_vec = self.genre_emb(genre_idx, genre_off)
        emotion_vec = self.emotion_emb(emotion_idx, emotion_off)
        goodfor_vec = self.goodfor_emb(goodfor_idx, goodfor_off)
        numeric_vec = self.linear_numeric(x_numeric)
        return torch.cat([numeric_vec, genre_vec, emotion_vec, goodfor_vec], dim=1)

In [14]:
model = MusicEmbeddingModel(
    num_numeric=len(numeric_cols),
    genre_vocab=len(genre_cols),
    emotion_vocab=len(emotion_cols),
    goodfor_vocab=len(goodfor_cols),
    emb_dim=32
).to(device)

In [15]:
def batch_embedding_inputs(indices, offsets, batch_start, batch_end):
    batch_offsets = offsets[batch_start:batch_end]
    next_offset = offsets[batch_end] if batch_end < len(offsets) else len(indices)
    batch_indices = indices[batch_offsets[0]:next_offset]
    batch_offsets = batch_offsets - batch_offsets[0]
    return batch_indices, batch_offsets

batch_size = 1024
embedding_batches = []

for i in range(0, len(df), batch_size):
    batch_end = min(i + batch_size, len(df))
    xb = X_numeric[i:batch_end]

    genre_idx_batch, genre_off_batch = batch_embedding_inputs(genre_indices, genre_offsets, i, batch_end)
    emotion_idx_batch, emotion_off_batch = batch_embedding_inputs(emotion_indices, emotion_offsets, i, batch_end)
    goodfor_idx_batch, goodfor_off_batch = batch_embedding_inputs(goodfor_indices, goodfor_offsets, i, batch_end)

    with torch.no_grad():
        emb = model(
            xb,
            genre_idx_batch, genre_off_batch,
            emotion_idx_batch, emotion_off_batch,
            goodfor_idx_batch, goodfor_off_batch
        )
        embedding_batches.append(emb.cpu())

final_embeddings = torch.cat(embedding_batches)
print("Final Embeddings Shape:", final_embeddings.shape)

Final Embeddings Shape: torch.Size([551443, 128])


In [16]:
data = final_embeddings.cpu().numpy()
index = NNDescent(data, metric='cosine', n_neighbors=5)


In [17]:
n = 100 # Index of the song to query

indices, distances = index.query(data[n:n+1], k=5)
print("Indices:", indices)
print("Distances:", distances)

Indices: [[   100    189     35    203 100730]]
Distances: [[-1.19209305e-07  2.22916410e-05  2.25300480e-05  1.39328165e-03
   2.85866879e-03]]


In [18]:
for i in indices.flatten():
    print(df.iloc[i][['artist', 'song_title', 'album']].to_dict())

{'artist': '"Weird Al" Yankovic', 'song_title': 'Parody', 'album': 'Bad Hair Day'}
{'artist': '"Weird Al" Yankovic', 'song_title': 'George Of The Jungle', 'album': 'Bad Hair Day'}
{'artist': '"Weird Al" Yankovic', 'song_title': 'Amish Paradise', 'album': 'Bad Hair Day'}
{'artist': '"Weird Al" Yankovic', 'song_title': 'Confessions Part Iii', 'album': 'Bad Hair Day'}
{'artist': 'CoryaYo', 'song_title': 'Freakin Chill', 'album': '1995'}
