**Importing Modules**

In [2]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
nltk.download('punkt')

ModuleNotFoundError: No module named 'pandas'

**Creating Dataframe**

In [None]:
df = pd.read_csv("dataset/spotify_millsongdata.csv")

In [None]:
print("The first 5 data of dataframe")
df.head(5)

The first 5 data of dataframe


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [None]:
print("The last 5 data of dataframe")
df.tail(5)

The last 5 data of dataframe


Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [None]:
print("The dimension of the dataframe is")
df.shape

The dimension of the dataframe is


(57650, 4)

**Checking for Null values in dataframe**

In [None]:
print("The null values present in the dataframe")
df.isnull().sum()

The null values present in the dataframe


artist    0
song      0
link      0
text      0
dtype: int64

**Since we are concentrating for recommending songs we will drop link column**

In [None]:
df = df.drop('link', axis=1).reset_index(drop=True)
print("The first 5 data of dataframe")
df.head(5)

The first 5 data of dataframe


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


**Since we are going to find the similarity between songs using lyrics let's preprocess it**

In [None]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

df['text'] = df['text'].apply(lambda x: tokenization(x))

print("The dataframe after text processing")
df.head(10)

The dataframe after text processing


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face , it 's a wonder face and it ..."
1,ABBA,"Andante, Andante","take it easi with me , pleas touch me gentli l..."
2,ABBA,As Good As New,i 'll never know whi i had to go whi i had to ...
3,ABBA,Bang,make somebodi happi is a question of give and ...
4,ABBA,Bang-A-Boomerang,make somebodi happi is a question of give and ...
5,ABBA,Burning My Bridges,"well , you hoot and you holler and you make me..."
6,ABBA,Cassandra,down in the street they 're all sing and shout...
7,ABBA,Chiquitita,"chiquitita , tell me what 's wrong you 're enc..."
8,ABBA,Crazy World,"i wa out with the morn sun could n't sleep , s..."
9,ABBA,Crying Over You,i 'm waitin ' for you babi i 'm sit all alon i...


**Finding out the similarity matrix**

In [None]:
df = df.sample(30000)

tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

print("The similarity of first song with others is")
similarity[0]

The similarity of first song with others is


array([1.        , 0.04767397, 0.00978979, ..., 0.04900975, 0.05589482,
       0.05070984])

**Recommender function to recommend songs by ranking the similarity matrix**

In [None]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

print("The recommended songs for Andante, Andante")
recommendation('Andante, Andante')

The recommended songs for Andante, Andante


['We Shall Not Be Moved',
 'Churchill Speech',
 'Second Movement: Andante',
 'Como Estais Amigos',
 'Nessun Dorma',
 'Ghost Dance',
 'Sweet By And By',
 'Wings Of Grace',
 'Love The Way You Do So',
 'High',
 'Praise The War Machine',
 'Notes To The Future',
 'Psalm 27',
 'Psalm 23',
 'In The Sweet By And By',
 'All Good Things (Intro)',
 'Lion Of Judah (Conquering Lion)',
 'Watching, Waiting',
 'I Shall Sing',
 'Thou Shall']

**Creating Model file**

In [None]:
pickle.dump(similarity,open('model/similarity.pkl','wb'))
pickle.dump(df,open('model/df.pkl','wb'))