In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the lyrics CSV, relative to the notebook's working directory.
# Point this at wherever your copy of the file lives.
dataset_path = '../data/spotify_songs.csv'

# Read the whole file into a DataFrame; later cells use its
# 'text', 'artist', 'song' and 'link' columns.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Build the corpus: one lyrics string per row, in the same order as `df`, so a
# position in `texts` maps straight back to a row via df.iloc.
# Missing lyrics are replaced with an empty string *before* the string cast:
# a bare astype(str) can turn NaN into the literal document "nan" (polluting the
# TF-IDF vocabulary) or leave NaN in the list, depending on the pandas version.
# Rows are filled rather than dropped so positional alignment with `df` is kept.
texts = df['text'].fillna('').astype(str).tolist()

In [4]:
# Build a TF-IDF model with scikit-learn's default settings, then learn the
# vocabulary/IDF weights from the lyrics and produce the document-term matrix
# in a single pass. Row i of `tfidf_matrix` corresponds to texts[i].
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=texts)

In [5]:
def find_best_match(query: str) -> tuple:
    """
    Locate the song whose lyrics are closest to a lyric snippet.

    The snippet is lowercased, projected into the TF-IDF space of the
    module-level `vectorizer`, and compared by cosine similarity against
    every row of the module-level `tfidf_matrix`.

    Parameters
    ----------
    query : str
        Lyric snippet to search for.

    Returns
    -------
    tuple
        (best_index, best_score): the row position in `tfidf_matrix` with the
        highest similarity, and that similarity value. When several rows tie
        for the maximum, the first one is returned (argmax behaviour).
    """
    # Lowercase the snippet and vectorize it as a one-document batch.
    query_vector = vectorizer.transform([query.lower()])

    # cosine_similarity yields a (1, n_songs) array; keep its single row.
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Position of the highest score, and the score itself.
    best_index = scores.argmax()
    return best_index, scores[best_index]

In [7]:
# Lyric fragment to look up; swap in any other snippet to try a different search.
query_lyric = "raised in the shadows of an old cotton mill Back when believin' was the style Small town"

# Run the TF-IDF lookup: row position of the closest song plus its cosine score.
best_index, score = find_best_match(query=query_lyric)

# Fetch the matching row by position and report artist, title, link and score,
# one item per line.
best_match = df.iloc[best_index]
print(
    "Best Match:",
    "Artist: " + str(best_match['artist']),
    "Song: " + str(best_match['song']),
    "Link: " + str(best_match['link']),
    "Similarity Score: " + str(score),
    sep="\n",
)

Best Match:
Artist: John Mellencamp
Song: Small Town
Link: /j/john+mellencamp/small+town_20074448.html
Similarity Score: 0.34295381003113634
