In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
df['text'][0]

"see that long line of people who keep standing in the rain   lookin' tired, gettin' cold   and that signal light they're waiting for, don't ever seem to chang   killing time, ain't it slow.   watch them bending with the burden of the pennies that they save   marking time, growin' old   slowly marching by the numbers to the freedom of the grave   killing time, ain't it slow   that's the closest thing to living that they're ever gonna know   and their good times ain't no better than their bad   'cause they think they need a signal light to tell them they can go   killing time, ain't it sad.   don't feel sorry for those people who keep standing in the rain   they don't mind, they don't know   'cause it never crossed their minds they got a reason to complain   killing time, ain't it slow.   they're not haunted by the visions that they never dared to see   and they'll never miss the dreams they've never had   they're condemned to go on living in a penitentiary   for killing time, ain't it 

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/alish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [14]:
similarity[0]

array([1.        , 0.01101989, 0.00219602, ..., 0.00249187, 0.06345591,
       0.13766079])

In [15]:
df[df['song']==''].index[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

# recommedation function

In [16]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [18]:
recommendation('Alma Mater')

["It Wasn't Very Long Ago",
 'Stabbed In The Back',
 'I Must Be Dreaming',
 'Just A Feeling',
 'I Dreamed A Dream',
 'Lets Get Together Again',
 "Doin' Just Fine",
 "You're Only In Love",
 'Here Is The House',
 'Be Myself Again',
 "Leavin' On Your Mind",
 'Box Of Rain',
 'New Shoes',
 'If I Knew Then',
 'Snowed In At Wheeler Street',
 'Let Me Love You',
 'A New Star',
 'Happy Xmas (War Is Over)',
 'All Things Are Possible',
 'Any Other Way']

In [19]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))