In [1]:
import os
import json
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('src/spotify_millsongdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.shape

(57650, 4)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [5]:
df.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/z/zwan/heartsong_20148991.html,Chestnuts roasting on an open fire \r\nJack F...
freq,191,35,1,6


In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
top_artists = df['artist'].value_counts().head(10)
print("Top 10 artist")
print(top_artists)

Top 10 artist
artist
Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
Alabama             187
Cher                187
Reba Mcentire       187
Chaka Khan          186
Dean Martin         186
Name: count, dtype: int64


In [8]:
df = df.sample(1000)

df = df.drop('link',axis=1).reset_index(drop=True)

In [9]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/sankar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sankar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sankar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def preprocess_words(text):
    #renove the special characters 
    text = re.sub(r'[^a-zA-Z\s]',"",text)
    # convert all word in lowercase
    text = text.lower()
    # tonkenzies the text
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words ]

    return " ".join(tokens)

In [12]:
df['clean_text'] = df['text'].apply(preprocess_words)

In [13]:
df.head()

Unnamed: 0,artist,song,text,clean_text
0,Procol Harum,Mabel,Don't eat green meat it ain't good for you \r...,dont eat green meat aint good know killed brot...
1,Nazareth,Somebody To Roll,"Goin' out, got some time to use \r\nBaby, bab...",goin got time use baby baby baby cant lose tru...
2,Isley Brothers,Don't Be Jealous,"I want to love you, yeah, over and over again ...",want love yeah want see see youre like like be...
3,Nazareth,Crack Me Up,"Hello, feelin' happy, got peace of mind? \r\n...",hello feelin happy got peace mind hello kimosa...
4,Otis Redding,A Lover's Question,"Does she love me, with all her heart \r\nShou...",love heart worry apart lovers question id like...


In [14]:
tfidd_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidd_vectorizer.fit_transform(df['clean_text'])

In [15]:
consin_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [16]:
def recommmand_song(song_name,consin_sim = consin_sim,df=df,top_n =5):
    #find the index of the song 
    idx = df[df['song'].str.lower() == song_name.lower()].index
    if len(idx) == 0 :
        return "Song not found in the dataset"
    idx = idx[0]

    #get simalarity scores
    sim_scores = list(enumerate(consin_sim[idx]))
    sim_scores = sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores = sim_scores[1:top_n+1]

    #get song indices
    song_indices = [i[0] for i in sim_scores]

    # Return top n similar songs
    return df[['artist','song']].iloc[song_indices]

In [17]:
df['song'][2]

"Don't Be Jealous"