Importing necessary libraries

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("ex.csv")
data.head(3)

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
0,Aankh Marey,"Kumar Sanu, Mika Singh, Neha Kakkar",BollywoodDance,Simmba,8.8/10
1,Coca Cola,"Neha Kakkar, Tony Kakkar",BollywoodDanceRomantic,Luka Chuppi,9.0/10
2,Apna Time Aayega,Ranveer Singh,BollywoodDance,Gully Boy,9.7/10


Data preprocessing

In [4]:
data.dropna(inplace=True)

In [5]:
data.isnull().sum()

Song-Name         0
Singer/Artists    0
Genre             0
Album/Movie       0
User-Rating       0
dtype: int64

In [6]:
data.describe()

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
count,2407,2407,2407,2407,2407
unique,2327,1214,20,900,40
top,Tere Naina,Mohammed Rafi,BollywoodDance,Bajirao Mastani,9.4/10
freq,3,55,1220,10,278


In [7]:
data.duplicated().sum()

16

In [8]:
data.drop_duplicates(inplace=True)
print(f"Duplicates after removing:{data.duplicated().sum()}")

Duplicates after removing:0


In [9]:
l = []
for i in data['User-Rating']:
    l.append(i[:3])

data['User-Rating'] = l


In [10]:
data.tail(5)

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
2415,Jana Tumhare Pyar Mein,Mukesh,BollywoodDance,Sasural,6.2
2416,Tum Jaise Bigde Babu Se,Lata Mangeshkar,BollywoodDance,Jab Pyar Kisi Se Hota Hai,7.2
2417,O Yaad Nahi Bhool Gaya,"Lata Mangeshkar, Suresh Wadkar",BollywoodDance,Lamhe,7.5
2418,Ladi Re Ladi Tujhse Aankh Jo Ladi,Jagjit Kaur,BollywoodDance,Shola Aur Shabnam,6.5
2419,Mummy Aur Daddy Mein Ladai Ho Gayi,Asha Bhosle,BollywoodDance,Shola Aur Shabnam,6.6


In [11]:
data['Album/Movie'] = data['Album/Movie'].str.replace(" ","")
data['Singer/Artists'] = data['Singer/Artists'].str.replace(" ", "")


In [12]:
data.head(5)

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
0,Aankh Marey,"KumarSanu,MikaSingh,NehaKakkar",BollywoodDance,Simmba,8.8
1,Coca Cola,"NehaKakkar,TonyKakkar",BollywoodDanceRomantic,LukaChuppi,9.0
2,Apna Time Aayega,RanveerSingh,BollywoodDance,GullyBoy,9.7
3,Mungda,"JyoticaTangri,Shaan,SubhroGanguly",BollywoodDance,TotalDhamaal,9.1
4,Tere Bin,"AseesKaur,RahatFatehAliKhan,TanishkBagchi",BollywoodRomantic,Simmba,9.2


In [13]:
data['tags'] = data['Singer/Artists']+' '+ data['Genre']+ ' '+data['Album/Movie']+ data['User-Rating']

In [14]:
df = data[['Song-Name','tags']]
df.head(5)

Unnamed: 0,Song-Name,tags
0,Aankh Marey,"KumarSanu,MikaSingh,NehaKakkar BollywoodDance ..."
1,Coca Cola,"NehaKakkar,TonyKakkar BollywoodDanceRomantic L..."
2,Apna Time Aayega,RanveerSingh BollywoodDance GullyBoy9.7
3,Mungda,"JyoticaTangri,Shaan,SubhroGanguly BollywoodDan..."
4,Tere Bin,"AseesKaur,RahatFatehAliKhan,TanishkBagchi Boll..."


In [15]:
df['tags'] = df['tags'].apply(lambda x:x.lower())

Changing 'tags' into vectors

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000)

vectors = cv.fit_transform(df['tags']).toarray()
vectors.shape


(2391, 1956)

Finding similarities

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
#similarity[0][0]

In [18]:
df.rename(columns={'Song-Name': 'title'},inplace=True)

Recommendation function 

In [19]:
def recommend(music):
    music_index = df[df['title']==music].index[0]
    distance = similarity[music_index]
    music_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x:x[1])[1:6]
    for i in music_list:
        print(df.iloc[i[0]].title)
    

In [20]:
recommend('Apna Time Aayega')

Mere Gully Mein
Doori
Dilnashin Dilnashin
Hai Junoon
Ladka Yeh Kehta Hai


Saving the model

In [21]:
import pickle
pickle.dump(df,open('music.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))
