In [126]:
import pandas as pd
import numpy as np

In [127]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [128]:
df.shape

(57650, 4)

In [129]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [130]:
df.shape

(5000, 3)

In [131]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [132]:
df['text'][0]

"i had another dream of you last night   sailing on a sea of blue   you were grieving for someone else   i was grieving for you      miss fantasy   it may be you don't remember me   but i remember you      in between the shadow and the sun   in between the black and blue   from a window i heard your voice   telling me you were through      miss fantasy   it may be you don't remember me   but i remember you      miss fantasy   i can see you don't remember me   but i remember you      and do you still tremble when i fall   will you still answer when i call   would you still love to linger up against the wall   everyone whispers when you go   into the silence of their love   10, 000 voices crying over with the show      everybody saying shine your light   everybody knows it's true   your the queen of the underground   never leaving a clue      miss fantasy   it may be you don't remember me   but i remember you      miss fantasy   i can see you don't remember me   but i remember you      i

In [133]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [134]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [135]:
df['text']

0       i had anoth dream of you last night sail on a ...
1       [ fabol ] shawti you autumn time cool but they...
2       give it to me , i 'm worth it babi , i 'm wort...
3       you infiltr my everi hour you bug me like a fl...
4       it 's alright with me if it 's alright with yo...
                              ...                        
4995    in the church one day you will get hurt in the...
4996    alway and forev each moment with you is just l...
4997    oh lord , there 's just so much to be done . o...
4998    been hidin ' out and layin ' low it 's noth ne...
4999    babi i can be your sweet babi when thing get c...
Name: text, Length: 5000, dtype: object

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [137]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [138]:
similarity[0]

array([1.        , 0.01155649, 0.0084784 , ..., 0.01166811, 0.00961447,
       0.01132876])

In [139]:
df['song'][0]

'Miss Fantasy'

In [140]:
df[df['song']=='Autumn Leaves']

Unnamed: 0,artist,song,text
2015,Barbra Streisand,Autumn Leaves,"et le vent do nord , le emport dan la nuit bla..."


# recommedation function

In [141]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [142]:
recommendation('Autumn Leaves')

["Ne T'enfuis Pas",
 'Lolita',
 'La Do Do La Do',
 'I Hate',
 'All Of A Sudden',
 'Everything Is Sound',
 'Ann',
 'Nobody Else But You',
 'I Love You',
 "Halfway 'round The World",
 'Nuevo Dia',
 'Golden Autumn Day',
 'Les Yeux Ouverts',
 'The Gates Of Delirium',
 'Let It Go',
 'Leave Me Alone',
 "Le Ballet D'or",
 'Ares Qui',
 'Darlington County',
 'If I Only Had The Words']

In [143]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))