## Importing Required Packages

In [4]:
import numpy as np
import pandas as pd

## Load the data

In [33]:
df = pd.read_csv("songdata.csv")

## EDA

In [6]:
# read first 10 records
df.head(10)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \nYou're enc..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \nCouldn't sle...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \nI'm sitting all al...


In [7]:
# read last 10 records
df.tail(10)

Unnamed: 0,artist,song,link,text
57640,Zebrahead,The Setup,/z/zebrahead/the+setup_10198494.html,Lie to me \nTell me that everything will be a...
57641,Ziggy Marley,Freedom Road,/z/ziggy+marley/freedom+road_20531174.html,"That's why I'm marching, yes, I'm marching, \..."
57642,Ziggy Marley,Friend,/z/ziggy+marley/friend_20673508.html,[Chorus] \nI wanna thank you for the things y...
57643,Ziggy Marley,G7,/z/ziggy+marley/g7_20531173.html,Seven richest countries in the world \nThem h...
57644,Ziggy Marley,Generation,/z/ziggy+marley/generation_20531171.html,Many generation have passed away \nFighting f...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \nLet the angels fly l...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \nMore power \nPower to...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \nis something i'll believe \nf...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \nam i frightened \nwhere can ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \nmake yourself at home \ni'm a bit ...


In [8]:
# get the general info about the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [9]:
df.shape

(57650, 4)

## Checking for null values

In [10]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

## Removing unwanted columns

In [34]:
df = df.drop('link',axis=1).reset_index(drop=True)

In [12]:
df.head(10)

Unnamed: 0,artist,song,text
0,Bee Gees,King And Country,"All the love of my mother, I offer to you \nA..."
1,Will Smith,Yes Yes Y'all,Yes \nAt the start of the new jiggyness \nWi...
2,George Strait,I'd Like To Have That One Back,Heard somebody speak her name \nThey said she...
3,Bette Midler,All I Need Is The Girl,I pretend I'm at home getting dressed for a da...
4,Nick Cave,"Babe, You Turn Me On","Stay by me, stay by me \nYou are the one, my ..."
5,Dave Matthews Band,Spaceman,"Probably get it wrong just to get it right, gi..."
6,Within Temptation,The Other Half (Of Me),"Over the hills lies a new beginning, over the ..."
7,Neil Young,Driveby,It's a random kind of thing \nCame upon a del...
8,Leonard Cohen,Here It Is,Here is your crown \nAnd your seal and rings ...
9,Foreigner,Died In Your Arm Tonight,"Oh I, I just died in your arms tonight \nIt m..."


## Checking Duplicate rows

In [13]:
# Count total duplicates
total_duplicates = df.duplicated().sum()
print(f'Total duplicate rows: {total_duplicates}')

Total duplicate rows: 0


In [14]:
df['text'][0]

"All the love of my mother, I offer to you  \nAnd I'll try so very hard , to get you through  \nIf lay down my life, for my King and country men  \nWould it change you for the better  \nIt better be all right  \nTo be always like children  \nAfraid of the night  \n  \nIf I gave up the cause , what would you give me  \nto make up for all the things that I would lose  \nIf I lay down my life , for my King and country men  \nWould it change you for the better  \nIt better be all right  \nTo be always like children  \nAfraid of the night  \n  \nWell you better get busy , get it over and done  \nI was here when you arrived , I'll be here when you're gone  \nIf I give up my life for my King and country men  \nWould it change you for the better  \nIt better be all right  \nTo be never like people  \nAfraid of the night  \n  \nDo do do , etc ... (fade out)\n\n"

## Text Preprocessing

In [35]:
df['text'] = df['text'].str.lower().replace(r'^\w\s',' ').replace(r'\n',' ',regex=True)
df['text']

0        look at her face, it's a wonderful face   and ...
1        take it easy with me, please   touch me gently...
2        i'll never know why i had to go   why i had to...
3        making somebody happy is a question of give an...
4        making somebody happy is a question of give an...
                               ...                        
57645    irie days come on play   let the angels fly le...
57646    power to the workers   more power   power to t...
57647    all you need   is something i'll believe   fla...
57648    northern star   am i frightened   where can i ...
57649    come in   make yourself at home   i'm a bit la...
Name: text, Length: 57650, dtype: object

In [36]:
df

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face and ..."
1,ABBA,"Andante, Andante","take it easy with me, please touch me gently..."
2,ABBA,As Good As New,i'll never know why i had to go why i had to...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,irie days come on play let the angels fly le...
57646,Ziggy Marley,Hand To Mouth,power to the workers more power power to t...
57647,Zwan,Come With Me,all you need is something i'll believe fla...
57648,Zwan,Desire,northern star am i frightened where can i ...


## Importing nltk

In [17]:
import nltk 
from nltk.stem.porter import PorterStemmer

## Creating Instance

In [18]:
stemmer = PorterStemmer()

In [19]:
# Function for Tokenization and Stemming
def token(txt): 
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [20]:
token("you are beautiful, beauty")

'you are beauti , beauti'

## Prepocess every text in the DataFrame 

In [37]:
df['text'].apply(lambda x: token(x))

0        look at her face , it 's a wonder face and it ...
1        take it easi with me , pleas touch me gentli l...
2        i 'll never know whi i had to go whi i had to ...
3        make somebodi happi is a question of give and ...
4        make somebodi happi is a question of give and ...
                               ...                        
57645    iri day come on play let the angel fli let the...
57646    power to the worker more power power to the wo...
57647    all you need is someth i 'll believ flashlight...
57648    northern star am i frighten where can i go to ...
57649    come in make yourself at home i 'm a bit late ...
Name: text, Length: 57650, dtype: object

## Importing Tf-idf and cosine-similarity

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

## Removing Stop words

In [23]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [38]:
matrix = tfid.fit_transform(df['text'])
matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3095478 stored elements and shape (57650, 82077)>

## Calculating similarity 

In [25]:
similar = cosine_similarity(matrix)

In [26]:
similar[0]

array([1.        , 0.03204656, 0.05821039, ..., 0.02450503, 0.00178709,
       0.00824724])

## Recommender Function

In [27]:
def recommender(song_name): 
    idx = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x:x[1]) 
    song = []
    for s_id in distance[1:6]: 
        song.append(df.iloc[s_id[0]].song) 
    return song

In [28]:
recommender("Believe")

['Believe In Life',
 'Say You Really Want Me',
 'I Feel For You',
 "Don't Believe What You Read",
 'Nobody']

## Downlading the pickle files

In [29]:
import pickle

In [30]:
with open("similarity.pkl", "wb") as file:
    pickle.dump(similar, file)

In [31]:
with open("df.pkl", "wb") as file:
    pickle.dump(df, file)