In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages (this only needs to succeed once per environment).
# 'stopwords' backs the stopword filter in clean_text; 'punkt' is not referenced
# by the code visible in this notebook.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)


In [None]:
# Location of the lyrics CSV, relative to the working directory; point this at your own copy
dataset_path = '../data/spotify_songs.csv'

# Read the whole CSV into memory as a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# df.head() 
# df.shape
# df.info()

In [None]:
# Lazily-built resources shared by every clean_text call (see _text_resources)
_STOPWORD_SET = None
_STEMMER = None


def _text_resources():
    """Return the cached (stopword set, stemmer) pair, building each on first use."""
    global _STOPWORD_SET, _STEMMER
    if _STOPWORD_SET is None:
        # A frozenset gives constant-time membership tests; the original scanned
        # a freshly loaded list for every word of every document.
        _STOPWORD_SET = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = SnowballStemmer('english')
    return _STOPWORD_SET, _STEMMER


def clean_text(text):
    """
    Normalise a piece of lyric text for TF-IDF matching.

    Steps, in order: lowercase, delete every character that is not an ASCII
    letter or whitespace, split on whitespace, drop English stopwords, and
    Snowball-stem the remaining words.

    Parameters:
        text (str): Raw text. Must be a string; other types fail on .lower().

    Returns:
        str: The surviving stems joined by single spaces ('' if nothing survives).
    """
    stopword_set, stemmer = _text_resources()

    # Lowercase, then delete (not replace) anything that is not a letter or whitespace.
    # NOTE(review): because characters are deleted, "don't" becomes "dont" and
    # "heart-broken" becomes "heartbroken" -- confirm this is the intended tokenisation.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # str.split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so no separate whitespace-collapsing pass is needed.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword_set]

    # Rejoin the stems into a single string
    return ' '.join(words)


In [None]:
# Keep the untouched lyrics in 'text_raw' the first time this cell runs. Cleaning
# always starts from that copy, so re-running the cell never stems text that was
# already stemmed, and the original lyrics stay available for display.
if 'text_raw' not in df.columns:
    # Missing lyrics become '' rather than the literal string 'nan'
    df['text_raw'] = df['text'].fillna('').astype(str)

# Replace 'text' with its cleaned form (lowercased, stopwords removed, stemmed)
df['text'] = df['text_raw'].apply(clean_text)

# Character count of each cleaned lyric
df['text_length'] = df['text'].str.len()

# Display the first few rows to inspect changes
df.head()


In [None]:
# Corpus of cleaned lyrics, one document per song, in DataFrame row order
texts = df['text'].tolist()

# Fit a TF-IDF model over unigrams and bigrams, ignoring terms that occur in
# fewer than two songs. No stop_words argument is passed because clean_text
# has already removed stopwords.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(texts)


In [None]:
def find_best_match(query):
    """
    Locate the single song whose lyrics are most similar to a lyric snippet.

    The snippet is passed through clean_text, projected with the already-fitted
    TF-IDF vectorizer, and scored against every row of tfidf_matrix by cosine
    similarity.

    Returns a (row_index, similarity_score) pair for the highest-scoring row.
    """
    snippet_vector = vectorizer.transform([clean_text(query)])

    # One row of scores (shape 1 x n_songs) because a single query was transformed
    scores = cosine_similarity(snippet_vector, tfidf_matrix)

    # argmax operates on the flattened array; with one row that is the column index
    top_row = scores.argmax()
    return top_row, scores[0, top_row]


In [None]:
# Lyric fragment to look up (edit freely)
query_lyric = "Listen girl, I don't know where to start Cause every word I say is straight from the heart I've"

# Row position and cosine score of the closest song
best_index, score = find_best_match(query_lyric)

# Pull that song's row so its metadata can be shown
best_match = df.iloc[best_index]

# One print call, one field per line; !s applies the same str() conversion
# that print uses for its arguments.
print(
    "Best Match:",
    f"Song Row: {best_index!s}",
    f"Artist: {best_match['artist']!s}",
    f"Song: {best_match['song']!s}",
    f"Similarity Score: {score!s}",
    sep="\n",
)


In [None]:
# Extra detail for the best match. The 'text' column holds the output of
# clean_text, so the "Lyrics" line shows the processed text, not the original wording.
print(
    f"Link: {best_match['link']!s}",
    f"Lyrics: {best_match['text']!s}",
    sep="\n",
)

In [None]:
import numpy as np

def find_top_matches(query, top_n=3):
    """
    Return the top_n songs most similar to a lyric snippet.

    The snippet is cleaned with clean_text, vectorized with the fitted TF-IDF
    vectorizer, and compared to every row of tfidf_matrix by cosine similarity.

    Parameters:
        query (str): Lyric snippet to search for.
        top_n (int): Maximum number of matches to return. Values <= 0 give an
            empty list; values larger than the corpus return every song.

    Returns:
        list[tuple]: (song_index, similarity_score) pairs ordered from highest
        to lowest score. Equal scores are ordered by ascending row index, so
        the first entry agrees with the row chosen by argmax in find_best_match.
    """
    # A negative top_n would otherwise slice [:-k] and return almost the whole corpus
    if top_n <= 0:
        return []

    # Clean the query text using the same cleaning function as the corpus
    query_vector = vectorizer.transform([clean_text(query)])

    # Row 0 holds the scores of the single query against all songs
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Stable sort on the negated scores: descending by score, ties kept in
    # ascending row order (reversing an ascending sort would flip tie order)
    top_indices = np.argsort(-similarities, kind='stable')[:top_n]
    top_scores = similarities[top_indices]

    return list(zip(top_indices, top_scores))

# Reuses query_lyric from the earlier example cell; to try another snippet, e.g.
# query_lyric = "sing us a song you're the piano man sing us a song tonight"

# Three closest songs, best first
top_matches = find_top_matches(query_lyric, top_n=3)

# Show one block of details per match, separated by a blank line
print("Top Three Matches:")
for idx, score in top_matches:
    match = df.iloc[idx]
    print(
        f"\nSong Row: {idx!s}",
        f"Artist: {match['artist']!s}",
        f"Song: {match['song']!s}",
        # f"Link: {match['link']!s}",
        # f"Lyrics: {match['text']!s}",
        f"Similarity Score: {score!s}",
        sep="\n",
    )
