In [1]:
import pandas as pd
import numpy as np

In [2]:
df  = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.shape

(57650, 4)

In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df = df.drop('link',axis=1).reset_index(drop=True)
df = df.sample(5000, random_state=42).reset_index(drop=True)


In [7]:
df['text'][0]

"Like to have you 'round  \r\nWith all the lies that you make  \r\nThe things or darkness and you  \r\nSome people say, have just a taste  \r\nRight or wrong, you might get burned  \r\nWhat you gain is what you learn  \r\n  \r\nGot one too many women  \r\nDon't know quite which way to go  \r\nThey're all gettin' so expensive  \r\nWhen they walk by themselves  \r\nRight or wrong, don't regret  \r\nWhat you went for is what you get  \r\n  \r\nNo point in bitter tears  \r\nWhen someone else has cut you down  \r\n'Cause there's a time for leavin'  \r\nAnd there's a time for stickin' around, hey  \r\nRight or wrong, you've got to live  \r\nSo what you collect is what you give\r\n\r\n"

In [8]:
df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', ' ').replace(r'\n', ' ', regex=True)
df['text']


0       like to have you 'round  \r with all the lies ...
1       this little light of mine (light of mine),  \r...
2       she says she's no good with words but i'm wors...
3       hey mama, mama, come a look at sister,  \r she...
4       i see it all through my window it seems.  \r n...
                              ...                        
4995    i hope that you're the one  \r if not, you are...
4996    when evening chores are over at the ranch hous...
4997    don't sing me lullabies  \r i won't close my e...
4998    now that you've been bought and sold  \r you d...
4999    lately, i can't sleep at night  \r i can't tur...
Name: text, Length: 5000, dtype: object

In [24]:
# Tokenization and further processing can be done here
df.sample()

Unnamed: 0,artist,song,text
829,Nina Simone,Can't Get Out Of This Mood,all day long befor my eye come littl vision of...


In [10]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
stemmer = PorterStemmer()

In [12]:
def token(text):
    tokens = word_tokenize(text)
    a = [stemmer.stem(token) for token in tokens ]
    return " ".join(a)

In [13]:
df['text'] = df['text'].apply(token)

In [14]:
df['text']

0       like to have you 'round with all the lie that ...
1       thi littl light of mine ( light of mine ) , i ...
2       she say she 's no good with word but i 'm wors...
3       hey mama , mama , come a look at sister , she ...
4       i see it all through my window it seem . never...
                              ...                        
4995    i hope that you 're the one if not , you are t...
4996    when even chore are over at the ranch hous on ...
4997    do n't sing me lullabi i wo n't close my eye ,...
4998    now that you 've been bought and sold you do n...
4999    late , i ca n't sleep at night i ca n't turn o...
Name: text, Length: 5000, dtype: object

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [16]:
tfidf_matrix = tfidf.fit_transform(df['text'])
tfidf_matrix.shape

(5000, 17820)

In [18]:
similar = cosine_similarity(tfidf_matrix)

In [19]:
similar[0]

array([1.        , 0.        , 0.03546045, ..., 0.0240463 , 0.06534677,
       0.02792487])

In [25]:
df[df['song'] == "Can't Get Out Of This Mood"].index[0]

np.int64(829)

In [37]:

def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similar[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:6]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [38]:
recommendation("Can't Get Out Of This Mood")

['Little Romance',
 'Moods Of Mary',
 'I Need A Man To Love',
 'Heartbreaker',
 "We Can't Move To This"]

In [39]:
import pickle
pickle.dump(df, open('df.pkl', 'wb'))
pickle.dump(similar, open('similar.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))