In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [3]:
# Read the cleaned Spotify feature table whose text column will be embedded
cleaned_csv_path = '../data/SpotifyFeatures_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
print("Shape of data:", df.shape)
# Preview the first five strings that will be fed to the encoder
df.loc[:, 'combined_text'].head(5)

Shape of data: (232725, 19)


0    artist: henri salvador, track: cest beau de fa...
1    artist: martin  les fes, track: perdu davance ...
2    artist: joseph williams, track: dont let me be...
3    artist: henri salvador, track: dismoi monsieur...
4    artist: fabien nataf, track: ouverture, genre:...
Name: combined_text, dtype: object

In [4]:
# Pre-trained SBERT checkpoint, chosen as a fast and accurate option for semantic similarity
sbert_checkpoint = 'all-MiniLM-L6-v2'
model = SentenceTransformer(sbert_checkpoint)


In [5]:
# One input string per song, pulled from the combined_text column
sentences = df['combined_text'].to_list()

# Encode every song with SBERT, 64 sentences per batch, with a progress bar
embeddings = model.encode(
    sentences,
    batch_size=64,
    show_progress_bar=True,
)


Batches:   0%|          | 0/3637 [00:00<?, ?it/s]

In [6]:
# Persist the embedding matrix as a .npy file so it can be reloaded later
np.save(file='../data/song_embeddings.npy', arr=embeddings)
print("Embeddings saved! Shape:", embeddings.shape)


Embeddings saved! Shape: (232725, 384)


In [7]:
# Sanity-check the embedding matrix: it must be 2-D with 384 columns.
# The check is an assertion so that a wrong dimensionality stops the notebook
# instead of depending on someone reading the printed output.
expected_dim = 384
assert embeddings.ndim == 2 and embeddings.shape[1] == expected_dim, (
    f"Unexpected embedding shape {embeddings.shape}; expected (n_songs, {expected_dim})"
)

# Show the first vector and its length for a quick visual inspection
print("First embedding:", embeddings[0])
print("Embedding dimension:", len(embeddings[0]))


First embedding: [-4.59806286e-02  3.83640500e-03 -1.54102594e-02  1.69851780e-02
  3.99520397e-02  1.09667987e-01  1.54757509e-02  2.08055973e-02
  4.40118499e-02 -3.88103947e-02 -1.50918392e-02 -6.32003546e-02
 -1.15240943e-02 -2.19610613e-02 -1.50136864e-02  5.69209903e-02
  2.53227092e-02  3.62377353e-02  3.16388831e-02  1.72649659e-02
  1.00223519e-01 -4.89110723e-02 -3.15917730e-02  6.89062178e-02
 -8.14694539e-02 -4.33037244e-02  1.81132229e-03 -3.26072723e-02
 -2.02372745e-02 -5.14487736e-02  2.85512861e-02  8.76619071e-02
 -4.14345339e-02 -2.25103665e-02 -8.66812654e-03 -3.22976038e-02
 -1.62538383e-02 -6.62098229e-02 -1.36952838e-02  1.60175115e-02
 -1.13608949e-02  2.27641053e-02 -8.83905813e-02 -4.84199151e-02
  5.83298318e-02 -1.91822574e-02  4.78833914e-02  2.44655609e-02
 -3.92212672e-03  4.69572246e-02  5.60280355e-03  6.04092656e-03
 -5.97719625e-02 -2.35150568e-02 -4.66874130e-02 -2.11050287e-02
  8.43854472e-02 -2.65096854e-02  1.20799877e-01 -2.65941210e-03
 -3.5342

In [15]:


# Reload the full embedding matrix from disk. This cell still relies on `df`
# being the DataFrame that those embeddings were computed from.
embeddings = np.load('../data/song_embeddings.npy')

# Row i of the matrix is used as the vector for row i of df, so the two must
# have the same number of rows; a stale .npy file would otherwise pair songs
# with the wrong vectors without any error.
if len(embeddings) != len(df):
    raise ValueError(
        f"Embedding rows ({len(embeddings)}) do not match DataFrame rows ({len(df)}); "
        "regenerate '../data/song_embeddings.npy' from the current DataFrame."
    )

# Number of songs to keep in the random sample
N = 15000

# Fixed random_state keeps the sample reproducible across runs
df_sample = df.sample(n=N, random_state=42)

# The NumPy matrix is indexed by row *position*, whereas df_sample.index holds
# index *labels*. Translate labels to positions so the selection stays correct
# even if df no longer has a default 0..n-1 index (e.g. after filtering).
sample_indices = df.index.get_indexer(df_sample.index).tolist()

# Select only the embeddings of the sampled songs, in the same order as df_sample
embeddings_sample = embeddings[sample_indices]

# Save the sampled DataFrame and its embeddings; rows of the CSV and rows of
# the .npy file correspond one-to-one by order.
df_sample.to_csv('../data/SpotifyFeatures_sample.csv', index=False)
np.save('../data/song_embeddings_sample.npy', embeddings_sample)