In [2]:
import pandas as pd

#### Reading the dataset

In [42]:
df = pd.read_csv("spotify_millsongdata.csv")

#### Checking the dataset values using head and tail (Top and last 5 records)

In [43]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [44]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


#### checking the shape of dataset

In [45]:
df.shape

(57650, 4)

#### Checking for any Null or duplicate values

In [46]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [74]:
df.duplicated().sum()

0

#### Taking the sample data of random 10000 records/cols

In [47]:
df =df.sample(10000).drop('link', axis=1).reset_index(drop=True)

In [50]:
df.head()

Unnamed: 0,artist,song,text
0,Bob Dylan,Gospel Plow,Mary wore three links of chain \r\nEvery link...
1,OneRepublic,Counting Stars,"Lately I've been, I've been losing sleep \r\n..."
2,Billie Holiday,No More,You ain't gonna bother me no more \r\nNo how ...
3,Kelly Clarkson,Sober,I don't know \r\nThis could break my heart or...
4,Moody Blues,Stop,So you talked to my girl \r\nWhen I don't wan...


In [51]:
df.tail()

Unnamed: 0,artist,song,text
9995,Air Supply,Secret Agent,Well I've got friends in the states \r\nAnd I...
9996,The White Stripes,Truth Doesn't Make A Noise,My baby's got a heart of stone \r\nCan't you ...
9997,Jackson Browne,"Oh, My Love","Oh, my love for the first time in my life \r\..."
9998,Michael Buble,You'll Never Know,"You'll never know just how much I love you, \..."
9999,Engelbert Humperdinck,Funny Familiar Forgotten Feelings,"Last night, quietly, she walked through my min..."


In [52]:
df['text'][0]

"Mary wore three links of chain  \r\nEvery link was Jesus name  \r\nKeep your hand on that plow, hold on  \r\nOh Lord, Oh Lord, keep your hand on that plow, hold on.  \r\n  \r\nMary, Mark, Luke and John  \r\nAll these prophets are dead and gone  \r\nKeep your hand on that plow, hold on  \r\nOh Lord, Oh Lord, keep your hand on that plow, hold on.  \r\n  \r\nWell, I've never been to heaven  \r\nBut I've been told streets up there  \r\nAre lined with gold  \r\nKeep your hand on that plow, hold on  \r\nOh Lord, Oh Lord, keep your hand on that plow, hold on  \r\nOh Lord, Oh Lord, keep your hand on that plow, hold on  \r\nOh Lord, Oh Lord, keep your hand on that plow, hold on.\r\n\r\n"

In [53]:
# df = df.sample(5000)

In [54]:
df.shape

(10000, 3)

#### Text Cleaning/ Text Preprocessing

In [55]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [56]:
df.tail()

Unnamed: 0,artist,song,text
9995,Air Supply,Secret Agent,well i've got friends in the states \r and i'...
9996,The White Stripes,Truth Doesn't Make A Noise,my baby's got a heart of stone \r can't you p...
9997,Jackson Browne,"Oh, My Love","oh, my love for the first time in my life \r ..."
9998,Michael Buble,You'll Never Know,"you'll never know just how much i love you, \..."
9999,Engelbert Humperdinck,Funny Familiar Forgotten Feelings,"last night, quietly, she walked through my min..."


In [75]:
df['text'][0]

"mari wore three link of chain everi link wa jesu name keep your hand on that plow , hold on oh lord , oh lord , keep your hand on that plow , hold on . mari , mark , luke and john all these prophet are dead and gone keep your hand on that plow , hold on oh lord , oh lord , keep your hand on that plow , hold on . well , i 've never been to heaven but i 've been told street up there are line with gold keep your hand on that plow , hold on oh lord , oh lord , keep your hand on that plow , hold on oh lord , oh lord , keep your hand on that plow , hold on oh lord , oh lord , keep your hand on that plow , hold on ."

#### Making the tokenization method for our model

In [57]:
import nltk
#nltk.download('punkt')
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt to C:\Users\Sarojkumar
[nltk_data]     Lal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
str = "You are beautiful, beauty, beauti."
tokenization(str)

'you are beauti , beauti , beauti .'

In [58]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [61]:
similarity[0]

array([1.        , 0.0175739 , 0.00328583, ..., 0.10411553, 0.01146239,
       0.02335684])

In [66]:
df[df['song'] == 'Secret Agent']

Unnamed: 0,artist,song,text
9995,Air Supply,Secret Agent,well i 've got friend in the state and i 've g...


In [67]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [69]:
recommendation("Secret Agent")

['Secret Agent Man',
 'Lights And Sounds',
 'Ever Lonely',
 'Lap Of Luxury',
 "If You've Got The Time",
 'Friends Will Be Friends',
 'Luxury',
 'Code Of Silence',
 'All I Wanna Do In Life',
 'You And Your Friend',
 'Code Of Love',
 'Show Some Respect',
 'I Got Money Now',
 'No Doubt About Love',
 'Keep On Trying',
 "Don't Know How (Not To Love You)",
 'God Put A Smile On Your Face',
 'Krafty',
 'I Get Up',
 'The Bottom Line']

In [71]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))