In [2]:
import numpy as np
import faiss
import pandas as pd


In [3]:
# Load song embeddings and the original DataFrame for reference
embeddings = np.load('../data/song_embeddings.npy')
df = pd.read_csv('../data/SpotifyFeatures_cleaned.csv')
print("Embeddings shape:", embeddings.shape)
print("DF shape:", df.shape)


Embeddings shape: (232725, 384)
DF shape: (232725, 19)


In [4]:
# Normalize embeddings for cosine similarity search
embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)


In [5]:
# Build FAISS index for inner product (cosine similarity)
dimension = embeddings_norm.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine when vectors are normalized
index.add(embeddings_norm)
print("Number of vectors indexed:", index.ntotal)


Number of vectors indexed: 232725


In [6]:
# Example: Find top-5 similar songs to the first song in the dataset
query_vector = embeddings_norm[0].reshape(1, -1)
D, I = index.search(query_vector, k=5)

# Show results
print("Query Song:", df.iloc[0]['track_name'], "by", df.iloc[0]['artist_name'])
print("\nTop 5 Recommendations:")
for idx in I[0][1:]:
    print(f"- {df.iloc[idx]['track_name']} by {df.iloc[idx]['artist_name']} (Genre: {df.iloc[idx]['genre']})")
print("\n")
# Find the first Pop song
pop_idx = df[df['genre'] == 'Pop'].index[0]
print("Index of first Pop song:", pop_idx)
print(df.iloc[pop_idx][['track_name', 'artist_name', 'genre']])
# Set to pop_idx or hiphop_idx as needed
query_idx = pop_idx  # or hiphop_idx
query_vector = embeddings_norm[query_idx].reshape(1, -1)
D, I = index.search(query_vector, k=6)

print("Query Song:", df.iloc[query_idx]['track_name'], "by", df.iloc[query_idx]['artist_name'])
print("\nTop 5 Recommendations:")
for idx in I[0][1:]:
    print(f"- {df.iloc[idx]['track_name']} by {df.iloc[idx]['artist_name']} (Genre: {df.iloc[idx]['genre']})")


Query Song: C'est beau de faire un Show by Henri Salvador

Top 5 Recommendations:
- Faire des ronds dans l'eau by Henri Salvador (Genre: Movie)
- La Vie C’est La Vie by Henri Salvador (Genre: Movie)
- À Cannes cet été by Henri Salvador (Genre: Movie)
- C'est étonnant, c'est Cannes by Henri Salvador (Genre: Movie)


Index of first Pop song: 107802
track_name     break up with your girlfriend, i'm bored
artist_name                               Ariana Grande
genre                                               Pop
Name: 107802, dtype: object
Query Song: break up with your girlfriend, i'm bored by Ariana Grande

Top 5 Recommendations:
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance)
- Break Free by Ariana Grande (Genre: Pop)
- Break Your Heart Right Back by Ariana Grande (Genre: Pop)
- make up by Ariana Grande (Genre: Pop)
- bad idea by Ariana Grande (Genre: Pop)


In [7]:
faiss.write_index(index, '../data/faiss_song.index')
print("FAISS index saved!")


FAISS index saved!


In [8]:
#Hybrid Ranking (Similarity + Popularity or More)
#Choose your query index (example: first Pop song)
query_idx = pop_idx  # (or any song index you want)
query_vector = embeddings_norm[query_idx].reshape(1, -1)

# Get top-k candidates from FAISS (excluding query itself)
k = 20  # Get more than 5 so hybrid ranking can reorder them
D, I = index.search(query_vector, k=k+1)
candidate_indices = I[0][1:]           # indices of candidate songs
similarity_scores = D[0][1:]           # similarity scores (cosine similarity)


In [9]:
# Get popularity scores for these candidates
popularity_scores = df.iloc[candidate_indices]['popularity'].values

# Normalize to 0-1 (min-max scaling)
pop_min = popularity_scores.min()
pop_max = popularity_scores.max()
pop_norm = (popularity_scores - pop_min) / (pop_max - pop_min + 1e-8)


In [10]:
# Set weights (adjust as you like)
weight_similarity = 0.7
weight_popularity = 0.3

# Calculate hybrid score for each candidate
hybrid_scores = (weight_similarity * similarity_scores) + (weight_popularity * pop_norm)


In [11]:
# Sort candidates by hybrid score (descending)
sorted_indices = np.argsort(-hybrid_scores)
top_n = 5  # Number of recommendations to show

print("Hybrid Top 5 Recommendations:")
for idx in sorted_indices[:top_n]:
    song_idx = candidate_indices[idx]
    song = df.iloc[song_idx]
    print(f"- {song['track_name']} by {song['artist_name']} (Genre: {song['genre']}, Popularity: {song['popularity']})")


Hybrid Top 5 Recommendations:
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Popularity: 99)
- bad idea by Ariana Grande (Genre: Pop, Popularity: 91)
- make up by Ariana Grande (Genre: Pop, Popularity: 87)
- Into You by Ariana Grande (Genre: Pop, Popularity: 84)
- Side To Side by Ariana Grande (Genre: Pop, Popularity: 84)


In [12]:
# --- Parameters ---
query_idx = pop_idx  # or any other song index you want
k = 20               # candidates to consider before re-ranking
top_n = 5            # how many final recommendations to show

# --- Hybrid ratio settings to test ---
ratios = [
    (0.9, 0.1),
    (0.7, 0.3),
    (0.5, 0.5)
]

# --- Get FAISS candidates ---
query_vector = embeddings_norm[query_idx].reshape(1, -1)
D, I = index.search(query_vector, k=k+1)
candidate_indices = I[0][1:]      # candidate song indices (excluding query itself)
similarity_scores = D[0][1:]

# --- Normalize popularity ---
popularity_scores = df.iloc[candidate_indices]['popularity'].values
pop_min = popularity_scores.min()
pop_max = popularity_scores.max()
pop_norm = (popularity_scores - pop_min) / (pop_max - pop_min + 1e-8)

# --- Show recommendations for each ratio ---
print(f"Query Song: {df.iloc[query_idx]['track_name']} by {df.iloc[query_idx]['artist_name']} (Genre: {df.iloc[query_idx]['genre']})\n")

for w_sim, w_pop in ratios:
    hybrid_scores = (w_sim * similarity_scores) + (w_pop * pop_norm)
    sorted_indices = np.argsort(-hybrid_scores)
    print(f"--- Top {top_n} Recommendations (Similarity: {w_sim}, Popularity: {w_pop}) ---")
    for idx in sorted_indices[:top_n]:
        song_idx = candidate_indices[idx]
        song = df.iloc[song_idx]
        print(f"- {song['track_name']} by {song['artist_name']} (Genre: {song['genre']}, Popularity: {song['popularity']})")
    print()


Query Song: break up with your girlfriend, i'm bored by Ariana Grande (Genre: Pop)

--- Top 5 Recommendations (Similarity: 0.9, Popularity: 0.1) ---
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Popularity: 99)
- bad idea by Ariana Grande (Genre: Pop, Popularity: 91)
- make up by Ariana Grande (Genre: Pop, Popularity: 87)
- Break Free by Ariana Grande (Genre: Pop, Popularity: 78)
- Into You by Ariana Grande (Genre: Pop, Popularity: 84)

--- Top 5 Recommendations (Similarity: 0.7, Popularity: 0.3) ---
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Popularity: 99)
- bad idea by Ariana Grande (Genre: Pop, Popularity: 91)
- make up by Ariana Grande (Genre: Pop, Popularity: 87)
- Into You by Ariana Grande (Genre: Pop, Popularity: 84)
- Side To Side by Ariana Grande (Genre: Pop, Popularity: 84)

--- Top 5 Recommendations (Similarity: 0.5, Popularity: 0.5) ---
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Po

In [13]:
# Get valence and danceability for candidates
valence_scores = df.iloc[candidate_indices]['valence'].values
danceability_scores = df.iloc[candidate_indices]['danceability'].values

# Normalize to 0-1
valence_norm = (valence_scores - valence_scores.min()) / (valence_scores.max() - valence_scores.min() + 1e-8)
danceability_norm = (danceability_scores - danceability_scores.min()) / (danceability_scores.max() - danceability_scores.min() + 1e-8)


In [14]:
# 1 if genre matches, else 0 (can use 0.5 for partial matches)
query_genre = df.iloc[query_idx]['genre']
genre_match = (df.iloc[candidate_indices]['genre'] == query_genre).astype(float).values


In [15]:
# Fixed hybrid weights for all future recommendations
weight_similarity = 0.6
weight_popularity = 0.15
weight_valence = 0.1
weight_danceability = 0.1
weight_genre = 0.05


In [16]:
hybrid_scores = (
    weight_similarity * similarity_scores +
    weight_popularity * pop_norm +
    weight_valence * valence_norm +
    weight_danceability * danceability_norm +
    weight_genre * genre_match
)


In [17]:
sorted_indices = np.argsort(-hybrid_scores)
top_n = 5

print("Hybrid Top 5 Recommendations (Main Blend):")
for idx in sorted_indices[:top_n]:
    song_idx = candidate_indices[idx]
    song = df.iloc[song_idx]
    print(f"- {song['track_name']} by {song['artist_name']} (Genre: {song['genre']}, Popularity: {song['popularity']}, Valence: {song['valence']:.2f}, Dance: {song['danceability']:.2f}, GenreMatch: {genre_match[idx]})")


Hybrid Top 5 Recommendations (Main Blend):
- bad idea by Ariana Grande (Genre: Pop, Popularity: 91, Valence: 0.57, Dance: 0.85, GenreMatch: 1.0)
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Popularity: 99, Valence: 0.34, Dance: 0.73, GenreMatch: 0.0)
- make up by Ariana Grande (Genre: Pop, Popularity: 87, Valence: 0.48, Dance: 0.67, GenreMatch: 1.0)
- Side To Side by Ariana Grande (Genre: Pop, Popularity: 84, Valence: 0.61, Dance: 0.65, GenreMatch: 1.0)
- Problem by Ariana Grande (Genre: Pop, Popularity: 77, Valence: 0.62, Dance: 0.66, GenreMatch: 1.0)


In [18]:
# Parameters
query_idx = pop_idx  # Or any song index you want
k = 20               # Number of FAISS candidates
top_n = 5            # How many recommendations to show

# Hybrid weight settings to test (all should sum to 1)
ratios = [
    # (similarity, popularity, valence, danceability, genre)
    (0.5, 0.2, 0.15, 0.1, 0.05),
    (0.6, 0.15, 0.1, 0.1, 0.05),
    (0.4, 0.3, 0.1, 0.1, 0.1),
]

# Get FAISS candidates
query_vector = embeddings_norm[query_idx].reshape(1, -1)
D, I = index.search(query_vector, k=k+1)
candidate_indices = I[0][1:]      # candidate song indices (excluding query itself)
similarity_scores = D[0][1:]

# Normalize additional features
popularity_scores = df.iloc[candidate_indices]['popularity'].values
pop_min = popularity_scores.min(); pop_max = popularity_scores.max()
pop_norm = (popularity_scores - pop_min) / (pop_max - pop_min + 1e-8)

valence_scores = df.iloc[candidate_indices]['valence'].values
valence_norm = (valence_scores - valence_scores.min()) / (valence_scores.max() - valence_scores.min() + 1e-8)

danceability_scores = df.iloc[candidate_indices]['danceability'].values
danceability_norm = (danceability_scores - danceability_scores.min()) / (danceability_scores.max() - danceability_scores.min() + 1e-8)

query_genre = df.iloc[query_idx]['genre']
genre_match = (df.iloc[candidate_indices]['genre'] == query_genre).astype(float).values

print(f"Query Song: {df.iloc[query_idx]['track_name']} by {df.iloc[query_idx]['artist_name']} (Genre: {df.iloc[query_idx]['genre']})\n")

for w_sim, w_pop, w_val, w_dance, w_genre in ratios:
    hybrid_scores = (
        w_sim * similarity_scores +
        w_pop * pop_norm +
        w_val * valence_norm +
        w_dance * danceability_norm +
        w_genre * genre_match
    )
    sorted_indices = np.argsort(-hybrid_scores)
    print(f"--- Top {top_n} Recommendations (sim: {w_sim}, pop: {w_pop}, val: {w_val}, dance: {w_dance}, genre: {w_genre}) ---")
    for idx in sorted_indices[:top_n]:
        song_idx = candidate_indices[idx]
        song = df.iloc[song_idx]
        print(f"- {song['track_name']} by {song['artist_name']} (Genre: {song['genre']}, Pop: {song['popularity']}, Valence: {song['valence']:.2f}, Dance: {song['danceability']:.2f}, GenreMatch: {genre_match[idx]})")
    print()


Query Song: break up with your girlfriend, i'm bored by Ariana Grande (Genre: Pop)

--- Top 5 Recommendations (sim: 0.5, pop: 0.2, val: 0.15, dance: 0.1, genre: 0.05) ---
- bad idea by Ariana Grande (Genre: Pop, Pop: 91, Valence: 0.57, Dance: 0.85, GenreMatch: 1.0)
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Pop: 99, Valence: 0.34, Dance: 0.73, GenreMatch: 0.0)
- make up by Ariana Grande (Genre: Pop, Pop: 87, Valence: 0.48, Dance: 0.67, GenreMatch: 1.0)
- Side To Side by Ariana Grande (Genre: Pop, Pop: 84, Valence: 0.61, Dance: 0.65, GenreMatch: 1.0)
- Greedy by Ariana Grande (Genre: Pop, Pop: 72, Valence: 0.84, Dance: 0.62, GenreMatch: 1.0)

--- Top 5 Recommendations (sim: 0.6, pop: 0.15, val: 0.1, dance: 0.1, genre: 0.05) ---
- bad idea by Ariana Grande (Genre: Pop, Pop: 91, Valence: 0.57, Dance: 0.85, GenreMatch: 1.0)
- break up with your girlfriend, i'm bored by Ariana Grande (Genre: Dance, Pop: 99, Valence: 0.34, Dance: 0.73, GenreMatch: 0.0)
- make 

In [19]:
# --- Evaluation: Simulated User Recall@5 ---
#
# To evaluate our recommender system, we simulate users who like multiple songs from the same artist
# (e.g., all George Strait or Ariana Grande tracks).
# For each simulated user, we:
#   - Randomly pick an artist with enough songs.
#   - Select a set of "liked" songs by that artist.
#   - For each song, use it as a query and check if other liked songs appear in the top-5 recommendations.
#   - Compute Recall@5: fraction of a user's remaining liked songs retrieved in top-5 recommendations.
# This gives a realistic measure of how well the system surfaces other relevant favorites for a user,
# which is especially important in music or content recommendation settings.


import random

num_users = 5
songs_per_user = 10
k = 5
recall_scores = []

# Find all artists with enough songs
artists = df['artist_name'].value_counts()
good_artists = artists[artists > songs_per_user].index.tolist()

for user in range(num_users):
    artist = random.choice(good_artists)
    artist_songs = df[df['artist_name'] == artist].index.tolist()
    liked_indices = random.sample(artist_songs, songs_per_user)
    
    for i, query_idx in enumerate(liked_indices):
        ground_truth = set(liked_indices[:i] + liked_indices[i+1:])
        
        query_vector = embeddings_norm[query_idx].reshape(1, -1)
        D, I = index.search(query_vector, 20)
        candidate_indices = I[0][1:]

        # Hybrid scoring as before
        popularity_scores = df.iloc[candidate_indices]['popularity'].values
        pop_min = popularity_scores.min(); pop_max = popularity_scores.max()
        pop_norm = (popularity_scores - pop_min) / (pop_max - pop_min + 1e-8)
        valence_scores = df.iloc[candidate_indices]['valence'].values
        valence_norm = (valence_scores - valence_scores.min()) / (valence_scores.max() - valence_scores.min() + 1e-8)
        danceability_scores = df.iloc[candidate_indices]['danceability'].values
        danceability_norm = (danceability_scores - danceability_scores.min()) / (danceability_scores.max() - danceability_scores.min() + 1e-8)
        query_genre = df.iloc[query_idx]['genre']
        genre_match = (df.iloc[candidate_indices]['genre'] == query_genre).astype(float).values

        hybrid_scores = (
            weight_similarity * D[0][1:] +
            weight_popularity * pop_norm +
            weight_valence * valence_norm +
            weight_danceability * danceability_norm +
            weight_genre * genre_match
        )
        sorted_indices = np.argsort(-hybrid_scores)
        top_indices = [candidate_indices[idx] for idx in sorted_indices[:k]]
        hits = len(set(top_indices) & ground_truth)
        recall = hits / min(k, len(ground_truth)) if ground_truth else 0
        recall_scores.append(recall)
        
        # Print queries and recommendations for illustration
        if user == 0 and i < 2:
            print("Query:", df.iloc[query_idx]['track_name'], "| Top recommendations:")
            for ti in top_indices:
                print("-", df.iloc[ti]['track_name'])

print(f"Average Recall@{k}: {np.mean(recall_scores):.2f}")


Query: Angel (Live) | Top recommendations:
- No Angel
- Angel (Live)
- Angel - Recorded At Spotify Studios NYC
- No Angel
- That Was Then
Query: Forever, For Always, For Love | Top recommendations:
- Love, Love, Love
- Angel (Live)
- Steady Love
- Always In My Head
- That Was Then
Average Recall@5: 0.51


In [21]:


# Load sample DataFrame and embeddings
df_sample = pd.read_csv('../data/SpotifyFeatures_sample.csv')
embeddings_sample = np.load('../data/song_embeddings_sample.npy')

# Normalize embeddings (for cosine similarity search)
embeddings_norm = embeddings_sample / np.linalg.norm(embeddings_sample, axis=1, keepdims=True)

# Build FAISS index (Inner Product is cosine similarity if vectors are normalized)
index = faiss.IndexFlatIP(embeddings_norm.shape[1])
index.add(embeddings_norm.astype('float32'))

# Save the index
faiss.write_index(index, '../data/faiss_song_sample.index')

print("FAISS sample index created and saved!")


FAISS sample index created and saved!
