**Many of us love the songs of Taylor Swift right? Why not try to build a simple song-recommender based on it?**


For making this, I have taken major inspiration from the following blog post- https://towardsdatascience.com/the-abc-of-building-a-music-recommender-system-part-i-230e99da9cad

# 1. Importing Necessary Values

In [6]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer #for TFIDF 


In [7]:
def clean_lyric(lyric): #first layer of cleaning(remo)
    if '[' in lyric:
        return ''
    else:
        return lyric

## 2. Loading Data file

In [8]:
df = pd.read_csv('final_taylor_swift_lyrics.tsv', sep= '\t')


In [9]:
print(df.loc[1])

index                                                           0
album                                                       Lover
song_title                                                 Lover 
lyric           We could leave the Christmas lights up 'til Ja...
line_number                                                     1
release_date                                           2019-08-16
Name: 1, dtype: object


# 3. Removing Unnecessary Columns

In [10]:
df=df.drop(['release_date','index','line_number','album'],axis=1)

In [11]:
org=df #to store a copy

In [12]:
print(df)

                              song_title  \
0                                 Lover    
1                                 Lover    
2                                 Lover    
3                                 Lover    
4                                 Lover    
...                                  ...   
16279  Picture to Burn (Live From SoHo)    
16280  Picture to Burn (Live From SoHo)    
16281  Picture to Burn (Live From SoHo)    
16282  Picture to Burn (Live From SoHo)    
16283  Picture to Burn (Live From SoHo)    

                                                   lyric  
0                                              [Verse 1]  
1      We could leave the Christmas lights up 'til Ja...  
2               And this is our place, we make the rules  
3      And there's a dazzling haze, a mysterious way ...  
4               Have I known you 20 seconds or 20 years?  
...                                                  ...  
16279                       Burn, burn, burn, baby, burn  

# 4. Data Cleaning

In [13]:
spec_chars = ["!",'"',"#","%","&","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    df['lyric'] = df['lyric'].str.replace(char, ' ')
df['lyric'] = df['lyric'].str.replace("'", '')

# 5.Making an array of song name and lyrics

In [14]:
base=[]
s_name=df.loc[0][0]
st=""
for i, j in df.iterrows(): 
    if(j[0]==s_name):
        st+=" "+j[1] #adding lines to the string of a song
    else:
        base.append([s_name,st]) #adding the song to array
        s_name=j[0]
        st=""
        
        
        
        

In [15]:
len(base) #No of songs

342

In [16]:
frame= DataFrame (base,columns=['name','lyrics']) #Converting back to data frame

In [None]:
frame.head()

# 6. Using Mathematical Similarity calculation using TFIDF and cosine

In [12]:
# Initialize tfidf vectorizer
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

# Fit and transform 
tfidf_matrix = tfidf.fit_transform(frame['lyrics'])

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(tfidf_matrix)
similarities = {}
for i in range(len(cosine_similarities)):
    # Now we'll sort each element in cosine_similarities and get the indexes of the songs. 
    similar_indices = cosine_similarities[i].argsort()[:-50:-1] 
    # After that, we'll store in similarities each name of the 50 most similar songs.
    # Except the first one that is the same song.
    similarities[frame['name'].iloc[i]] = [(cosine_similarities[i][x], frame['name'][x]) for x in similar_indices][1:]


In [14]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)
        
        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} with {round(recom_song[i][0], 3)} similarity score") 
            print("--------------------")
        
    def recommend(self, recommendation):
        # Get song to find recommendations for
        song = recommendation['song']
        # Get number of songs to recommend
        number_songs = recommendation['number_songs']
        # Get the number of songs most similars from matrix similarities
        recom_song = self.matrix_similar[song][:number_songs]
        # print each item
        self._print_message(song=song, recom_song=recom_song)



# 7. Recommendation System

In [15]:
# Instantiate class
recommedations = ContentBasedRecommender(similarities)



In [17]:
#Commenting this cell because it requires dynamic input and cant show output here

inp= input("Enter the song name:")
print("")
print("Which one of these songs you want :")
for i,j in frame.iterrows():
    if (j[0].find(inp)>=0):
       print(i,j[0])
        
print("")
num=input("enter the song number ")
num=int(num)
num_son= input("enter the number of recommendations needed ")
num_son= int(num_son)
    
    

Enter the song name:Lover

Which one of these songs you want :
0 Lover 
335 City of Lover : Setlist 
336 Lover (Live from Paris) 

enter the song number 0
enter the number of recommendations needed 5


In [None]:
# num=0 #ID for Lover
# num_son = 3 #top 3 songs

In [18]:
# Create dict to pass
recommendation = {
    "song": frame['name'].iloc[num],
    "number_songs":  num_son
}

# Recommend
recommedations.recommend(recommendation)

The 5 recommended songs for Lover  are:
Number 1:
Lover (Live from Paris)  with 1.0 similarity score
--------------------
Number 2:
AMAs Artist of the Decade Performance  with 0.21 similarity score
--------------------
Number 3:
Wildest Dreams / Enchanted  with 0.163 similarity score
--------------------
Number 4:
Better Off  with 0.148 similarity score
--------------------
Number 5:
New Romantics  with 0.136 similarity score
--------------------


# A better recommendation system using SpaCy(NLP Library)

In [2]:
import spacy
# Load the spacy model that you have installed
nlp = spacy.load('en_core_web_md')
# process a sentence using the model
doc = nlp("This is some text that I am processing with Spacy")
# It's that simple - all of the vectors and words are assigned after this point
# Get the vector for 'text':
doc[3].vector
# Get the mean vector for the entire sentence (useful for sentence classification etc.)
doc.vector

array([-5.36412969e-02,  2.79353321e-01, -1.05259977e-01, -1.76284965e-02,
        1.34550199e-01,  1.92671806e-01,  5.50469756e-03, -2.39132687e-01,
       -4.06342074e-02,  1.78010297e+00, -1.80772960e-01,  1.02661893e-01,
        6.84069991e-02, -5.09319194e-02, -7.65837058e-02, -3.77540514e-02,
        8.24129581e-03,  1.37752008e+00, -1.78934380e-01, -5.76109104e-02,
        1.66338980e-02, -3.62196006e-02, -7.48579949e-02,  4.40651290e-02,
       -2.65241470e-02,  2.41529979e-02,  9.79370065e-03, -1.13990309e-03,
        1.59522101e-01, -1.56648397e-01, -9.12139937e-02,  9.11872908e-02,
        1.07169405e-01, -1.08843103e-01, -7.94988051e-02, -4.74919155e-02,
       -1.60613850e-01, -2.82304995e-02, -1.03425637e-01, -1.14933215e-01,
        1.62531182e-01, -1.01342008e-01,  2.17013666e-03,  3.47881988e-02,
       -6.34927005e-02,  2.44374484e-01, -3.01910043e-02, -1.46046979e-02,
       -1.06488302e-01,  6.26319647e-03, -1.30655810e-01,  7.04905912e-02,
       -4.86716032e-02,  

In [28]:
dict={} #Dictionary of all song names and lyrics
for i,j in frame.iterrows():
    dict[j[0]]=j[1]

In [40]:

inp= input("Enter the song name:")
print("")
print("Which one of these songs you want :")
for i,j in frame.iterrows():
    if (j[0].find(inp)>=0):
       print(i,j[0])
        
print("")
num=input("enter the song number ")
num=int(num)
num_son= input("enter the number of recommendations needed ")
num_son= int(num_son)
    
    

Enter the song name:Lover

Which one of these songs you want :
0 Lover 
335 City of Lover : Setlist 
336 Lover (Live from Paris) 

enter the song number 0
enter the number of recommendations needed 5


In [41]:
curr_song= nlp(frame.loc[num][1])
sim_dict= {} #making similarity matrix
for i,j in frame.iterrows():
    doc= nlp(j[1])
    sim_dict[j[0]]=curr_song.similarity(doc)
    if (i==num):
        sim_dict[j[0]]=0 #cancelling the case of same song
    



  """


In [42]:
from collections import Counter 
  
k = Counter(sim_dict) 
  
# Finding 3 highest values 
high = k.most_common(num_son)  
print("The top recommended songs are:") 
print("Keys: Values") 
  
for i in high: 
    print(i[0]," :",i[1]," ") 

The top recommended songs are:
Keys: Values
Lover (Live from Paris)   : 0.9999087908620512  
Last Kiss   : 0.9892939286438064  
My Turn To Be Me   : 0.9870838557177137  
​peace   : 0.9867175527944216  
My Cure   : 0.9858845119147734  
