In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
df['text'][0]

"well i'm here looking through an old picture frame   just waiting for the perfect view   i hope something special will step in to my life   another fine edition of you   a pin-up done in shades of blue   sometimes you find a yearning for the quiet life   the country air and all its joys   but badgers couldn't compensate at twice the price      for just another night with the boys oh yeah   and boys will be boys, will be boys   they say love's a gamble, hard to win, easy lose   and while sun shines you'd better make hay   so if life is your table and fate is the wheel   then let the chips fall where they may   in modern times the modern way      and as i was drifting past the lorelei   i heard those slinky sirens wail, whooo   so look out sailor when you hear them croon   you'll never been the same again oh no   their crazy music drives you insane, this way   so love, leave me. do what you will   who knows what tomorrow might bring'   learn from your mistakes is my only advice      and

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [9]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [12]:
similarity[0]

array([1.        , 0.02679526, 0.03129713, ..., 0.04494107, 0.07196879,
       0.00629037])

In [28]:
df['song'][0]

'Editions Of You'

In [29]:
df[df['song']=="Singer's Song"]

Unnamed: 0,artist,song,text
2,Tom T. Hall,Singer's Song,now that i know what i know whi did the learn ...


In [26]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [27]:
recommendation("Singer's Song")

['Build Me Up, Break Me Down',
 'My Song For You (live In Japan)',
 'Thank You',
 'Sing Your Praise To The Lord',
 'A Song For You',
 "A Song I'd Like To Sing",
 'A Song For You',
 'Killing Me Softly',
 'I Can See It',
 'Happy Song',
 'Portland Rain',
 'One X One',
 'Freedom Song',
 'Stand For Something',
 'Let The Little Boy Sing',
 "This Woman's Work",
 'Turn Around',
 'Good Morning Starshine',
 'If You Want',
 'Shoot You Down']

In [30]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [None]:
# import pickle
# pickle.dump(similarity,open('similarity.pkl','wb'))
# pickle.dump(df,open('df.pkl','wb'))