In [180]:
import pandas as pd

In [181]:
# Reading the data and creating data frame
df = pd.read_csv("spotify_millsongdata.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [182]:
# Dropping the link column because it is not needed in this project
df = df.drop("link", axis=1)
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [183]:
# Size of the data frame for reference
df.shape

(57650, 3)

# Text Cleaning

### In this section, we will clean the text column to get it ready for tokenization.

In [184]:
# Lowercasing all letters so we do treat the same words with differents the same as well.
df["text"] = df["text"].str.lower()
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face \r\na..."
1,ABBA,"Andante, Andante","take it easy with me, please \r\ntouch me gen..."
2,ABBA,As Good As New,i'll never know why i had to go \r\nwhy i had...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


In [185]:
# Replacing new line characters with blank spaces
df["text"] = df["text"].replace(r"\n", " ", regex=True)
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face \r an..."
1,ABBA,"Andante, Andante","take it easy with me, please \r touch me gent..."
2,ABBA,As Good As New,i'll never know why i had to go \r why i had ...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


# Tokenization

In [186]:
import nltk
from nltk.stem.porter import PorterStemmer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download("punkt")

In [187]:
# Creating our tokenization function
stemmer = PorterStemmer()

def tokenization(text):
    tokens = nltk.word_tokenize(text)
    word_list = [stemmer.stem(i) for i in tokens]
    
    return " ".join(word_list)

In [188]:
# Selecting a random song from the data frame and passing it through tokenization
import random
index = random.choice(df["text"].index)
print("Artist:", df["artist"][index], "\tSong:", df["song"][index])
tokenization(df["text"][index])

Artist: Cher 	Song: It All Adds Up Now


"the way you look at her is the way you use to look at me when thing were differ too mani chang have gone down strang vibrat all around make me think perhap our love ai n't right so ask your heart and soul they will sure know if you still love me no break down is too big for you not to dig the kind of love i 'm put down it all add up now two and two are realli four it all add up now my heart know the score put me in your love bag pleas do n't say i am a drag i gave you more than life itself so if you can walk away love will come anoth day in a new disguis mayb you 're the kind of guy all you want to do is make me cri sometim i wonder whi today it all add up now two and two are realli four it all add up now my heart know the score"

## Sample Data Frame for Computational Ease

In [189]:
df = df.sample(20000).reset_index(drop=True)

In [190]:
# Running this cell might take around 3-5 minutes
# Applying the tokenization funcion to the whole data set
tokenized_data = df["text"].apply(lambda x: tokenization(x))

# Vectorizing and Cosine Similarity

In [191]:
# pip install numpy==1.25.2

If the cell below returns and error, it is because a numpy version older than 1.26.0 is needed.
Please run the line above if that is the case.

In [192]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [193]:
# Vectorization of the words
tfid = TfidfVectorizer(analyzer="word", stop_words="english")

In [194]:
sparse_matrix = tfid.fit_transform(tokenized_data)

In [195]:
similarity = cosine_similarity(sparse_matrix)

# Recommendation

In [196]:
# Creating the function to come up with recommendations
def recommendation(song_df, num_song):
    idx = df[df["song"] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    artists = []
    songs = []
    for m_id in distances[1:(num_song+1)]:
        songs.append(df.iloc[m_id[0]].song)
        artists.append(df.iloc[m_id[0]].artist)
        
    return artists, songs

In [197]:
# Getting a input from the user
while True:
    try:
        num = int(input("How many songs do you want us to recommend: "))
        break
    except ValueError:
        print("\nPlease enter an integer value.")

song = input("\nGive us a song: ")
while song not in list(df["song"]):
    print("\nPlease select a song from the data set, e.g. {}".format(df["song"][random.choice(df["text"].index)]))
    song = input("Give us a song: ")

# Serving our listener the suggestions
print("\n\nDiscoverify recommends you these songs:\n")
artists, songs = recommendation(song, num)
for i in range(num):
    print(artists[i], "-", songs[i])



Discoverify recommends you these songs:

Wilson Pickett - Hey Jude
The Temptations - Hey Jude
Beach Boys - Be With Me
America - No Fortune
P!nk - Delirium
