In [1]:
import numpy as np
import pandas as pd


In [2]:
df=pd.read_csv("songdata.csv")
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['artist']

0                    Hooverphonic
1                        Lou Reed
2           Red Hot Chili Peppers
3                     Johnny Cash
4                    Peter Cetera
                  ...            
4995                        Queen
4996    Frankie Goes To Hollywood
4997                 Judas Priest
4998                       Zero 7
4999                    Tom Waits
Name: artist, Length: 5000, dtype: object

In [7]:
df['song']

0                   Out Of Sight
1                   Make Up Mind
2              I Could Have Lied
3                     Clementine
4         Still Getting Over You
                  ...           
4995    Thank God It's Christmas
4996      Ferry Cross The Mersey
4997                Burn In Hell
4998           The Space Between
4999           I Beg Your Pardon
Name: song, Length: 5000, dtype: object

In [8]:
df['text'][0]

"Tell me why  \nDo you always forgive the things I do to you  \nYou're too good  \nOr am I not too bad  \nThose six years  \nWere the best that ever happened to me and you  \nNo regrets after all this fear  \n  \nWe'll always be best friends  \nSomething between you and me  \nWe'll always be best friends  \nSomething between you and me  \n  \nSo tell me why  \nI can't find any satisfaction somewhere else  \nNot good enough  \nOr am I way too bad  \nCause those six years  \nI consumed all your energy  \nAnd I didn't replace  \nSad enough  \nI can't give you what you need  \n  \nWe'll always be best friends  \nSomething between you and me  \nWe'll always be best friends  \nSomething between you and me  \nWe'll always be best friends  \nSomething between you and me  \nWe'll always be best friends  \nSomething between you and me  \n  \nSix years  \nSuch a long time  \nI can't give you what you need  \nSix years  \nSuch a long time  \nSomething special between you and me  \nYou and me  \n  

In [9]:
df['text'] = (
    df['text']
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\n', ' ', regex=True)
)


In [10]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)


In [14]:
similarity[0]

array([1.        , 0.02230579, 0.03966967, ..., 0.02513311, 0.10542   ,
       0.00800099])

In [15]:
sorted(list(enumerate(similarity[0])),reverse=False,key=lambda x:x[1])

[(4, 0.0),
 (43, 0.0),
 (53, 0.0),
 (59, 0.0),
 (105, 0.0),
 (109, 0.0),
 (120, 0.0),
 (137, 0.0),
 (144, 0.0),
 (149, 0.0),
 (175, 0.0),
 (181, 0.0),
 (204, 0.0),
 (232, 0.0),
 (264, 0.0),
 (276, 0.0),
 (292, 0.0),
 (327, 0.0),
 (329, 0.0),
 (334, 0.0),
 (341, 0.0),
 (355, 0.0),
 (358, 0.0),
 (371, 0.0),
 (384, 0.0),
 (385, 0.0),
 (411, 0.0),
 (426, 0.0),
 (428, 0.0),
 (437, 0.0),
 (443, 0.0),
 (453, 0.0),
 (455, 0.0),
 (457, 0.0),
 (466, 0.0),
 (484, 0.0),
 (495, 0.0),
 (506, 0.0),
 (523, 0.0),
 (552, 0.0),
 (561, 0.0),
 (566, 0.0),
 (569, 0.0),
 (583, 0.0),
 (602, 0.0),
 (614, 0.0),
 (633, 0.0),
 (640, 0.0),
 (649, 0.0),
 (651, 0.0),
 (720, 0.0),
 (744, 0.0),
 (757, 0.0),
 (782, 0.0),
 (790, 0.0),
 (817, 0.0),
 (902, 0.0),
 (904, 0.0),
 (941, 0.0),
 (942, 0.0),
 (943, 0.0),
 (944, 0.0),
 (945, 0.0),
 (1022, 0.0),
 (1036, 0.0),
 (1056, 0.0),
 (1069, 0.0),
 (1077, 0.0),
 (1081, 0.0),
 (1103, 0.0),
 (1119, 0.0),
 (1120, 0.0),
 (1125, 0.0),
 (1131, 0.0),
 (1142, 0.0),
 (1146, 0.0),
 (11

In [30]:
def recommendation(song):
    idx=df[df['song'] ==song].index[0]
    distances =sorted(list(enumerate(similarity[0])),reverse=False,key=lambda x:x[1])

    songs=[]
    for i in distances[1:21]:
        songs.append(df.iloc[i[0]].song)
    
    return songs

In [31]:
df.song[1]

'Make Up Mind'

In [32]:
recommendation('Make Up Mind')

['Fly On The Windscreen',
 'You Were The Star',
 'Dear One',
 "The Father's Song",
 'Loveletter From Space',
 'Cubit Cubitan',
 'Judas Be My Guide',
 'Amazing Grace',
 "I Sing Just To Know That I'm Alive",
 'Crazy',
 'Mother Machree',
 'Hold On To Me',
 "Ain't No More Cane",
 'Too Drunk To Dream',
 "There's A New Moon Over My Shoulder",
 'I',
 'Sacrament Of Wilderness',
 'Tragedy',
 'Kau Datang Lagi',
 'El Condor Pasa']

In [34]:
import pickle
pickle.dump(df,open('df.pkl','wb'))
pickle.dump(df,open('similarity.pkl','wb'))