In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
df['text'][0]

"dream on   yesterday is gone   and it's clear   i'll never get out of here   no dice   got something in my eyes   it might be a tear      she never shoots her gun   she only keeps it just for fun   i wanna call her bluff   but i never get close enough   though i love to hear her heartbeat      dream on   yesterday is gone   and it's clear   i'll never get out of here   no dice   got something in my eyes   it might be a tear      eighteen times today   i've been wishin' my day away   and it's hard to say   if i'm windin' up ok   but i pray i'll hear her heartbeat      dream on   yesterday is gone   and outta year   i'll never get out of here   no dice   got something in my eyes   it might be a tear      in my ear i hear the heartbeat   in my ear i hear the heartbeat      dream on   yesterday is gone   and it's clear   i'll never get out of here   no dice   got something in my eyes   it might be a tear      dream on   yesterday is gone   and outta year   i'll never get out of here   no 

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [9]:
df['text']

0       dream on   yesterday is gone   and it's clear ...
1       and here's to you, mrs. robinson   jesus loves...
2       tell me the story, tell me the legend   tell m...
3       get thee behind me, satan   i want to resist  ...
4       watch the watch the way i walk   can't you thi...
                              ...                        
4995    hallelujah      now i lay, i lay me down to sl...
4996    stick by me and i'll stick by you   stick by m...
4997    at the age of eighteen   i had had enough   an...
4998    (for your love, for your love)   when i think ...
4999    feeling good bout the way i am,   how come you...
Name: text, Length: 5000, dtype: object

In [10]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [13]:
similarity[0]

array([1.        , 0.03111502, 0.0204055 , ..., 0.06756874, 0.02417805,
       0.04587847])

In [14]:
df['song'][0]

'Dream On'

In [15]:
df[df['song']=='Heart Of Hearts']

Unnamed: 0,artist,song,text
76,Electric Light Orchestra,Heart Of Hearts,vers they say there 's gold under the stone th...


# recommedation function

In [16]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [17]:
recommendation('Heart Of Hearts')

["Here's My Heart",
 'Where Is Your Heart',
 "You'll Be In My Heart",
 'Heart Of Gold',
 'Stay Away',
 'Open Your Heart',
 'If Anybody Had A Heart',
 'Have You Ever',
 'Heart By Heart',
 'I Feel For You',
 'Miles Apart',
 "Don't Close Your Heart",
 'Each Time You Break My Heart',
 'Nothing But The Truth',
 'Heartbeat',
 'Stone Cold Heart',
 'The Least You Can Do',
 'Tell It To Your Heart',
 'More Than',
 "Don't Be Cruel"]

In [18]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))