In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the song table and reduce it to the three fields the model uses.
#
# The frame is built in one non-mutating chain instead of selecting columns
# and then calling dropna(inplace=True) on the result: in-place edits on a
# frame derived from another frame can trigger pandas' SettingWithCopyWarning.
# Columns are renamed through an explicit mapping rather than by position, so
# the rename cannot silently mislabel data if the selection order changes.
# The original row index labels are kept (no reset_index).
df = (
    pd.read_csv("spotify_songs.csv")
    .loc[:, ['song', 'artist', 'text']]
    .rename(columns={
        'song': 'track_name',
        'artist': 'artist_name',
        'text': 'lyrics',
    })
    # Drop any row with a missing title, artist or lyric text.
    .dropna()
)


In [2]:
def clean_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, then every character that is neither an ASCII
    letter ``a``-``z`` nor whitespace is deleted (not replaced by a space),
    so digits, punctuation and accented letters disappear and whitespace is
    left exactly as it was.

    Parameters
    ----------
    text : str
        Raw text, e.g. song lyrics or a search query.

    Returns
    -------
    str
        The normalised text.
    """
    return re.sub(r'[^a-z\s]', '', text.lower())

# Normalise every lyric once up front; queries go through the same
# clean_text function at search time so both sides share one vocabulary.
df['lyrics_clean'] = df['lyrics'].map(clean_text)

# Fit the TF-IDF vocabulary on the cleaned lyrics (English stop words
# removed) and keep the resulting document-term matrix for later lookups.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['lyrics_clean'])

def search_song(input_lyrics):
    """Find the song whose lyrics are most similar to a lyric fragment.

    The fragment is cleaned with ``clean_text``, projected into the fitted
    TF-IDF space with the module-level ``vectorizer``, and compared against
    every row of ``tfidf_matrix`` by cosine similarity.

    Parameters
    ----------
    input_lyrics : str
        A fragment of lyrics to look up.

    Returns
    -------
    pandas.Series or None
        The ``track_name`` and ``artist_name`` of the best-scoring row of
        ``df``, or ``None`` when no song has a similarity above zero (for
        example an empty query, or one made only of stop words or of words
        absent from the corpus).
    """
    input_lyrics = clean_text(input_lyrics)
    input_vector = vectorizer.transform([input_lyrics])

    # One query row in, so take row 0 of the similarity result to get a flat
    # array with one score per song.
    scores = cosine_similarity(input_vector, tfidf_matrix)[0]
    index = scores.argmax()

    # If every score is 0, argmax() still answers 0, which would present the
    # first song of the dataset as a "match". Report no match instead.
    if scores[index] <= 0:
        return None

    # tfidf_matrix rows are in the same positional order as df, hence iloc.
    return df.iloc[index][['track_name', 'artist_name']]


In [3]:
# Demo lookup: search the corpus for a single lyric fragment and print the
# best-matching track and artist.
query = "we were both young when I first saw you"
best_match = search_song(query)
print(best_match)


track_name     When I Was Young
artist_name         Tina Turner
Name: 53366, dtype: object
