In [132]:
import pandas as pd

In [133]:
# Reading the data and creating data frame
df = pd.read_csv("spotify_millsongdata.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [134]:
# Dropping the link column because it is not needed in this project
df = df.drop("link", axis=1)
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [135]:
# Size of the data frame for reference
df.shape

(57650, 3)

# Text Cleaning

### In this section, we will clean the text column to get it ready for tokenization.

In [136]:
# Lowercasing all letters so we do treat the same words with differents the same as well.
df["text"] = df["text"].str.lower()
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face \r\na..."
1,ABBA,"Andante, Andante","take it easy with me, please \r\ntouch me gen..."
2,ABBA,As Good As New,i'll never know why i had to go \r\nwhy i had...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


In [137]:
# Replacing new line characters with blank spaces
df["text"] = df["text"].replace(r"\n", " ", regex=True)
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face \r an..."
1,ABBA,"Andante, Andante","take it easy with me, please \r touch me gent..."
2,ABBA,As Good As New,i'll never know why i had to go \r why i had ...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


# Tokenization

In [138]:
# pip install nltk
import nltk
from nltk.stem.porter import PorterStemmer

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download("punkt")

In [139]:
# Creating our tokenization function
stemmer = PorterStemmer()

def tokenization(text):
    tokens = nltk.word_tokenize(text)
    word_list = [stemmer.stem(i) for i in tokens]
    
    return " ".join(word_list)

In [140]:
# Selecting a random song from the data frame and passing it through tokenization
import random
index = random.choice(df["text"].index)
print("Artist:", df["artist"][index], "\tSong:", df["song"][index])
tokenization(df["text"][index])

Artist: Dean Martin 	Song: Always In My Heart


"you are alway in my heart even though you 're far away i can hear the music of the song i sang to you you are alway in my heart and when sky abov are gray i rememb that you care and then and there the sun break through just befor i go to sleep there 's a rendezv i keep and the dream i alway meet help me forget we 're far apart i do n't know exactli when dear but i 'm sure we 'll meet again dear and my darl 'til we do you are alway in my heart and the dream i alway meet help me forget were far apart i do n't know exactli when dear but i 'm sure we 'll meet again dear and my darl 'til we do you are alway in my heart"

## Sample Data Frame for Computational Ease

In [141]:
df = df.sample(20000).reset_index(drop=True)

In [142]:
# Running this cell might take around 3-5 minutes
# Applying the tokenization funcion to the whole data set
tokenized_data = df["text"].apply(lambda x: tokenization(x))

# Vectorizing and Cosine Similarity

In [143]:
# pip install numpy==1.25.2

If the cell below returns and error, it is because a numpy version older than 1.26.0 is needed.
Please run the line above if that is the case.

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [145]:
# Vectorization of the words
tfid = TfidfVectorizer(analyzer="word", stop_words="english")

In [146]:
sparse_matrix = tfid.fit_transform(tokenized_data)

In [147]:
similarity = cosine_similarity(sparse_matrix)

# Recommendation

In [148]:
# Creating the function to come up with recommendations
def recommendation(song_df, num_song):
    idx = df[df["song"] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    artists = []
    songs = []
    for m_id in distances[1:(num_song+1)]:
        songs.append(df.iloc[m_id[0]].song)
        artists.append(df.iloc[m_id[0]].artist)
        
    return artists, songs

In [149]:
# Getting a input from the user
while True:
    try:
        num = int(input("How many songs do you want us to recommend: "))
        break
    except ValueError:
        print("\nPlease enter an integer value.")

song = input("\nGive us a song: ")
while song not in list(df["song"]):
    print("\nPlease select a song from the data set, e.g. {}".format(df["song"][random.choice(df["text"].index)]))
    song = input("Give us a song: ")

# Serving our listener the suggestions
print("\n\nDiscoverify recommends you these songs:\n")
artists, songs = recommendation(song, num)
for i in range(num):
    print(artists[i], "-", songs[i])


Please enter an integer value.

Please select a song from the data set, e.g. Hero

Please select a song from the data set, e.g. Blue


Discoverify recommends you these songs:

Grateful Dead - Hey Jude
Ed Sheeran - Where We Land
Bob Seger - Gone
Isley Brothers - Speechless (From Life Soundtrack)
America - Someday Woman
