In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv("Spotify Million Song Dataset.csv")
df = df[['artist', 'song', 'text']]
df = df.dropna()
df.shape

(57650, 3)

In [3]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(preprocess_text)
df[['artist', 'song', 'clean_text']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,artist,song,clean_text
0,ABBA,Ahe's My Kind Of Girl,look face wonderful face means something speci...
1,ABBA,"Andante, Andante",take easy please touch gently like summer even...
2,ABBA,As Good As New,ill never know go put lousy rotten show boy to...
3,ABBA,Bang,making somebody happy question give take learn...
4,ABBA,Bang-A-Boomerang,making somebody happy question give take learn...


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(train_df['clean_text'])
X_test = vectorizer.transform(test_df['clean_text'])

In [6]:
def predict_song(query):
    query = preprocess_text(query)
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, X_train)
    index = similarities.argmax()
    return train_df.iloc[index]['song'], train_df.iloc[index]['artist']

In [13]:
query = "she makes me feel fine who could ever believe"
predict_song(query)

("Ahe's My Kind Of Girl", 'ABBA')

In [11]:
sample_query = "look at her face its a wonderful face"
predict_song(sample_query)

('Face To Face', 'Foreigner')

In [8]:
correct = 0
total = 100

for i in range(total):
    query = test_df.iloc[i]['text']
    true_song = test_df.iloc[i]['song']
    predicted_song, _ = predict_song(query)
    if predicted_song == true_song:
        correct += 1

accuracy = correct / total
accuracy

0.12

In [10]:
user_input = input("Enter a snippet of song lyrics: ")

predicted_song, predicted_artist = predict_song(user_input)

print("Predicted Song Title:", predicted_song)
print("Predicted Artist:", predicted_artist)

Enter a snippet of song lyrics: she makes me feel fine who could ever believe
Predicted Song Title: Ahe's My Kind Of Girl
Predicted Artist: ABBA
