In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from flask import Flask, request, jsonify
import faiss  # Optimized similarity search

In [2]:
# Load the song catalogue from the CSV shipped alongside the notebook.
# Later cells read its 'name', 'artists', 'genre' and 'lyrics' columns.
df = pd.read_csv(filepath_or_buffer="dataset/lyrics.csv")

In [3]:
# Trim leading/trailing whitespace from every column label so that lookups
# such as df['name'] do not fail on headers like ' name'.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels

In [4]:
# Build TF-IDF features from the song titles (not the lyrics).
# English stop words are dropped and the vocabulary is capped at 10,000 terms.
# NOTE(review): recommend_songs later transforms *lyrics* with this
# title-fitted vectorizer — confirm that cross-field matching is intended.
song_titles = df['name'].fillna("")  # a missing title becomes an empty document
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(song_titles)

In [5]:
# Keep the TF-IDF features in compressed-sparse-row form. copy=False (the
# constructor default) lets SciPy reuse the existing buffers when it can.
song_features = csr_matrix(tfidf_matrix, copy=False)

In [6]:
# Faiss consumes dense float32 arrays. Cast while the matrix is still sparse so
# only the stored non-zero values are converted and the dense array is
# allocated once, directly as float32. Densifying first and casting afterwards
# yields the same values but needs an extra full-size dense array in the
# sparse matrix's original dtype.
song_features_array = song_features.astype('float32').toarray()

In [7]:
# Build a flat Faiss index that scores candidates by inner product.
# The index width is the number of TF-IDF features; the vectors added are the
# dense float32 rows, so index id i corresponds to row i of the feature array.
feature_dim = song_features.shape[1]
index = faiss.IndexFlatIP(feature_dim)
index.add(song_features_array)


In [8]:
# Report how many vectors the index currently holds.
indexed_count = index.ntotal
print(f"✅ Faiss index built with {indexed_count} songs.")

✅ Faiss index built with 28372 songs.


In [9]:
# Sanity check on the dense feature array: (rows, TF-IDF features).
dense_shape = song_features_array.shape
print(dense_shape)

(28372, 10000)


In [10]:
# Columns handed back to the caller by every recommend_* function.
_RESULT_COLUMNS = ['name', 'artists', 'genre', 'similarity_score']


def _search_index(text, df, vectorizer, index, top_n):
    """
    Vectorize `text`, query `index`, and return the matching rows of `df`.

    Shared by all recommend_* functions so the search logic lives in one place.

    Args:
        text: Query text to vectorize.
        df: DataFrame whose row positions correspond to the ids stored in `index`.
        vectorizer: Fitted vectorizer exposing `transform`.
        index: Search index exposing a Faiss-style `search(vectors, k)`.
        top_n: Maximum number of neighbours to request.

    Returns:
        A copy of the hit rows of `df`, in the order returned by the index,
        with an added 'similarity_score' column holding the index's scores.
        May contain fewer than `top_n` rows.
    """
    # The index is queried with a dense float32 row vector.
    query_vector = vectorizer.transform([text]).toarray().astype('float32')
    scores, positions = index.search(query_vector, top_n)
    scores = np.asarray(scores)[0]
    positions = np.asarray(positions)[0]

    # Faiss pads the result with label -1 when it has fewer than top_n hits.
    # Passing -1 to iloc would silently return the LAST row of df as a bogus
    # recommendation, so drop those slots.
    found = positions >= 0
    hits = df.iloc[positions[found]].copy()
    hits['similarity_score'] = scores[found]
    return hits


def recommend_songs(query, df, vectorizer, index, top_n=20):
    """
    Recommend up to `top_n` songs for a query that may be a song name, an
    artist name, or free text.

    Flow:
      1. Case-insensitive exact match of the (whitespace-stripped) query
         against df['name'], then against df['artists'].
      2. On a song match, the stored lyrics of the first matching row that has
         lyrics are used as the search text; on an artist match, all of that
         artist's non-missing lyrics joined together are used.
      3. Results are filtered to the matched row's genre. If nothing survives
         the filter, the user is prompted (blocking `input()`) for lyrics and
         `recommend_lyrics_song` is used instead.
      4. If there is no match, or the match has no usable lyrics, the raw
         query is searched via `recommend_nearest_songs`.

    Args:
        query: Song name, artist name, or arbitrary text.
        df: DataFrame with 'name', 'artists', 'genre' and 'lyrics' columns,
            row-aligned with `index`.
        vectorizer: Fitted vectorizer used to embed the search text.
        index: Faiss-style index to search.
        top_n: Number of neighbours requested from the index.

    Returns:
        DataFrame with columns name, artists, genre, similarity_score.
    """
    normalized_query = query.strip().lower()

    song_match = df[df['name'].str.lower() == normalized_query]
    if not song_match.empty:
        # Skip matching rows whose lyrics are missing: a NaN document cannot
        # be vectorized.
        lyric_rows = song_match.dropna(subset=['lyrics'])
        if lyric_rows.empty:
            song_lyrics, song_genre = "", None
        else:
            song_lyrics = lyric_rows['lyrics'].iloc[0]
            song_genre = lyric_rows['genre'].iloc[0]
        found_message = "Song is present in the dataset. Here are the recommendations:"
    else:
        # Only scan the artist column when the title lookup found nothing.
        artist_match = df[df['artists'].str.lower() == normalized_query]
        if artist_match.empty:
            print("Here are the nearest recommended songs:")
            return recommend_nearest_songs(query, df, vectorizer, index, top_n)
        song_lyrics = ' '.join(artist_match['lyrics'].dropna().values)
        song_genre = artist_match['genre'].values[0]
        found_message = "Artist is present in the dataset. Here are the recommendations:"

    if not song_lyrics.strip():
        # The match has no lyrics to search with; an empty document carries no
        # terms to rank by, so search on the query text itself instead.
        print("Here are the nearest recommended songs:")
        return recommend_nearest_songs(query, df, vectorizer, index, top_n)

    print(found_message)

    recommendations = _search_index(song_lyrics, df, vectorizer, index, top_n)

    # Keep only neighbours that share the matched row's genre.
    relevant_recommendations = recommendations[recommendations['genre'] == song_genre]

    if relevant_recommendations.empty:
        print("The recommended songs are not fully relevant. After Providing the lyrics of the song for better recommendations.")
        # NOTE: blocks waiting for console input; not usable from a
        # non-interactive caller.
        lyrics = input("Enter lyrics of the song: ")
        return recommend_lyrics_song(lyrics, df, vectorizer, index, top_n)

    print("All the recommended songs are most relevant.")
    return relevant_recommendations[_RESULT_COLUMNS]


def recommend_lyrics_song(lyrics, df, vectorizer, index, top_n=20):
    """
    Recommend songs for user-supplied lyrics (e.g. a song not in the dataset).

    Args:
        lyrics: Lyrics text to embed with `vectorizer`.
        df: DataFrame row-aligned with `index`.
        vectorizer: Fitted vectorizer used to embed the lyrics.
        index: Faiss-style index to search.
        top_n: Number of neighbours requested from the index.

    Returns:
        DataFrame with columns name, artists, genre, similarity_score.
    """
    recommendations = _search_index(lyrics, df, vectorizer, index, top_n)

    print("The possible recommended songs are like this:")
    return recommendations[_RESULT_COLUMNS]


def recommend_nearest_songs(query, df, vectorizer, index, top_n=20):
    """
    Recommend the songs whose indexed vectors are closest to the raw query text.

    Args:
        query: Arbitrary query text to embed with `vectorizer`.
        df: DataFrame row-aligned with `index`.
        vectorizer: Fitted vectorizer used to embed the query.
        index: Faiss-style index to search.
        top_n: Number of neighbours requested from the index.

    Returns:
        DataFrame with columns name, artists, genre, similarity_score.
    """
    recommendations = _search_index(query, df, vectorizer, index, top_n)

    print("The possible recommended songs are :")
    return recommendations[_RESULT_COLUMNS]


In [12]:
# Ask what kind of query is coming, then read the query text itself.
query_type = input("What are you providing? (song/artist/lyrics): ").strip().lower()
# Strip stray whitespace so an exact title/artist lookup is not defeated by it.
song_query = input(f"Enter the {query_type}: ").strip()

if query_type == "lyrics":
    # The user declared free-form lyrics: go straight to the lyrics search and
    # skip the exact title/artist lookup that recommend_songs performs first.
    recommendations = recommend_lyrics_song(song_query, df, vectorizer, index, top_n=5)
else:
    # Song or artist (or anything else): recommend_songs tries title, then
    # artist, then falls back to a nearest-text search.
    recommendations = recommend_songs(song_query, df, vectorizer, index, top_n=5)
print(recommendations)


Song is present in the dataset. Here are the recommendations:
The recommended songs are not fully relevant. After Providing the lyrics of the song for better recommendations.
The possible recommended songs are like this:
                           name         artists    genre  similarity_score
22680          saw her standing       ballyhoo!   reggae          0.382465
3333   i saw him standing there         tiffany      pop          0.382465
5759                  flashback   calvin harris      pop          0.342065
10492              party lights    junior brown  country          0.340556
8676    you can close your eyes  linda ronstadt  country          0.322255
