In [1]:
#import all the required libraries
#Use TfidfVectorizer from the Scikit-learn package
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
#This dataset contains name, artist, and lyrics for 57650 songs in English.
songs = pd.read_csv('abcdata.csv')
songs.head()
print(len(songs))

57650


In [2]:
#Because of the dataset being so big, we are going to resample only 5000 random songs.
songs = songs.sample(n=5000).drop('link', axis=1).reset_index(drop=True)
songs['text'] = songs['text'].str.replace(r'\n', '')#We can notice also the presence of \n in the text, so we are going to remove it.
#we use TF-IDF vectorizer that calculates the TF-IDF score for each song lyric, word-by-word. 
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf.fit_transform(songs['text'])

  songs['text'] = songs['text'].str.replace(r'\n', '')


In [3]:
#We now need to calculate the similarity of one lyric to another. We are going to use cosine similarity.
#We want to calculate the cosine similarity of each item with every other item in the dataset. So we just pass the tfidf_matrix as argument.
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(tfidf_matrix) 

similarities = {}
for i in range(len(cosine_similarities)):
    ## Now we'll sort each element in cosine_similarities and get the indexes of the songs.
    similar_indices = cosine_similarities[i].argsort()[:-50:-1] 
    # After that, we'll store in similarities each name of the 50 most similar songs.
    # Except the first one that is the same song.
    similarities[songs['song'].iloc[i]] = [(cosine_similarities[i][x], songs['song'][x], 
                                           songs['artist'][x]) for x in similar_indices][1:]

In [4]:
tfidf_matrix.shape

(5000, 25287)

In [5]:
cosine_similarities.shape

(5000, 5000)

In [6]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix
    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)       
        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} by {recom_song[i][2]} with {round(recom_song[i][0], 3)} similarity score") 
            print("--------------------") 
    def recommend(self, recommendation):
            # Get song to find recommendations for
            song = recommendation['song']
            # Get number of songs to recommend
            number_songs = recommendation['number_songs']
            # Get the number of songs most similars from matrix similarities
            recom_song = self.matrix_similar[song][:number_songs]
            # print each item
            self._print_message(song=song, recom_song=recom_song)

In [7]:
recommedations = ContentBasedRecommender(similarities)

In [8]:
recommendation = {
    "song": songs['song'].iloc[120],
    "number_songs": 15
}

In [9]:
recommedations.recommend(recommendation)

The 15 recommended songs for Speed King are:
Number 1:
Speed King by Deep Purple with 0.753 similarity score
--------------------
Number 2:
Good Golly Miss Molly by Status Quo with 0.322 similarity score
--------------------
Number 3:
Rockin' In The Free World by Billy Joel with 0.246 similarity score
--------------------
Number 4:
Around And Around by Chuck Berry with 0.224 similarity score
--------------------
Number 5:
Rock Me Baby by Otis Redding with 0.21 similarity score
--------------------
Number 6:
Clock Strikes Ten by Cheap Trick with 0.204 similarity score
--------------------
Number 7:
Around And Around by Bruce Springsteen with 0.203 similarity score
--------------------
Number 8:
Rock 'n' Roll Star by Oasis with 0.188 similarity score
--------------------
Number 9:
You'll Never Find Another Love Like Mine by Michael Buble with 0.181 similarity score
--------------------
Number 10:
Drive by Our Lady Peace with 0.18 similarity score
--------------------
Number 11:
We Rock b