In [1]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pickle


In [2]:
df = pd.read_csv("spotify_millsongdata.csv")
# df = pd.read_excel('song.xlsx')

In [3]:
# checking if there is any null value or not in a sheet
df.shape
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [4]:
# Deleting a link column
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
# Text Cleaning/ Text Preprocessing
df['text'] = df['text'].str.lower().replace(r'\w\s', ' ').replace(r'\n', ' ', regex=True) 

In [6]:
df.head(5)

Unnamed: 0,artist,song,text
0,Bonnie Raitt,My Opening Farewell,a lady stands before an open window \r lookin...
1,John Mellencamp,Martha Say,martha say she don't need no stinking man maki...
2,Roy Orbison,I Never Knew,i never knew how much i'd miss your touch till...
3,Carly Simon,After The Storm,"heat's up, tea's brewed \r clothes strewn aro..."
4,Wet Wet Wet,High On The Happy Side,everything seemed the same tonite \r and then...


In [7]:
#Taking a sample of 35000 instead of 53000
# df = df.sample(40000)

In [8]:
stemmer = PorterStemmer()

In [9]:
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [10]:
tokenization("You are beautiful, beautiful")

'you are beauti , beauti'

In [11]:
df['text']= df['text'].apply(lambda x: tokenization(x))

In [12]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [13]:
matrix = tfid.fit_transform(df['text'])

In [14]:
similar = cosine_similarity(matrix)

In [15]:
similar[0]

array([1.        , 0.20689398, 0.06424373, ..., 0.01334277, 0.06365192,
       0.04651034], shape=(5000,))

In [16]:
# df[df['song']=="Money Money Money"].index[0]

In [17]:
# Recommendor function
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key= lambda x:x[1])
    song = []
    for s_id in distance[:21]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [19]:
pickle.dump(similar, open("similarity.pkl", "wb"))

In [20]:
pickle.dump(df, open("df.pkl", "wb"))