In [1]:
import pandas as pd

In [2]:
# Load the lyrics CSV from the working directory into a pandas data frame
# and preview its first rows
csv_path = "spotify_millsongdata.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
# Dropping the link column because it is not needed in this project.
# errors="ignore" keeps the cell safe to re-run: once "link" has been removed,
# a plain drop would raise KeyError on the second execution.
df = df.drop(columns="link", errors="ignore")
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [4]:
# Row and column counts of the data frame, shown here for reference
n_rows, n_cols = df.shape
(n_rows, n_cols)

(57650, 3)

# Text Cleaning

### In this section, we will clean the text column to get it ready for tokenization.

In [5]:
# Convert every lyric to lower case so that the same word written with
# different capitalisation is treated as one and the same word.
df = df.assign(text=df["text"].str.lower())
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face \nand..."
1,ABBA,"Andante, Andante","take it easy with me, please \ntouch me gentl..."
2,ABBA,As Good As New,i'll never know why i had to go \nwhy i had t...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


In [6]:
# Swap every newline character in the lyrics for a blank space
df["text"] = df["text"].replace(regex=r"\n", value=" ")
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"look at her face, it's a wonderful face and ..."
1,ABBA,"Andante, Andante","take it easy with me, please touch me gently..."
2,ABBA,As Good As New,i'll never know why i had to go why i had to...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


# Tokenization

In [7]:
# pip install nltk
import nltk
from nltk.stem.porter import PorterStemmer

import ssl

# NOTE(review): this replaces the ssl module's default HTTPS context factory
# with the unverified one, i.e. certificate verification is switched off for
# the rest of the kernel session. Presumably only needed to get the
# nltk.download call below past certificate errors -- confirm, and avoid
# running it when no download is required.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download("punkt")

In [8]:
# Building the tokenization function around one shared Porter stemmer
stemmer = PorterStemmer()

def tokenization(text):
    """Split ``text`` into NLTK word tokens, stem each one and re-join them.

    Returns a single string in which the stemmed tokens are separated by
    one blank space.
    """
    stemmed_tokens = map(stemmer.stem, nltk.word_tokenize(text))
    return " ".join(stemmed_tokens)

In [9]:
# Pick one song at random and run its lyrics through the tokenization function
import random
index = random.choice(df["text"].index)
picked = df.loc[index]  # the whole row, looked up once by its index label
print("Artist:", picked["artist"], "\tSong:", picked["song"])
tokenization(picked["text"])

Artist: Horrible Histories 	Song: The Monks' Song


"welcom to our monasteri pleas have a chair good to see you monk so deep in prayer onc the pray 's finish your chore must be done the main rule of a good monk 's life is no fun we have to pray eight time a day seven day a week and copi all these manuscript in write so antiqu i 'll finish off thi letter ' a ' onc i 've plough our field milk the cow , muck the sow , veget peel i 'll be do that and pray too veri well gent , i 'll bid you adieu advenium regnum tuum ( okay brother , i think he 's gone ) now the bishop 's not around throw off these religi gown hunki , chunki , funki monk get down ! it 's not all hymn and prayin ' it 's not all work and no playin ' so let 's start misbehavin ' and get with the funk ! we love to have a parti eat food that is hearti let 's get the boozin ' start drunk like a monk ! play that monki music , funk boy ! just want to check that dure my absenc you 're honour your mealtim vow of silenc althought we did n't oughter we like to hunt and slaughter do n't 

## Sample Data Frame for Computational Ease

In [10]:
# Work on a fixed-size random sample so the later steps stay fast.
# random_state makes the sample (and every result derived from it) repeatable
# across kernel restarts; min() avoids the ValueError that sample() raises
# when asked for more rows than the frame holds.
SAMPLE_SIZE = 3000
RANDOM_SEED = 42
df = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [11]:
# This cell can take roughly 3-5 minutes to finish
# Run the tokenization function over the lyrics of every song in the data frame
tokenized_data = df["text"].apply(tokenization)

# Vectorizing and Cosine Similarity

In [12]:
# pip install numpy==1.25.2

If the cell below returns an error, it is because a NumPy version older than 1.26.0 is needed.
Please uncomment and run the line above if that is the case.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Set up the TF-IDF vectorizer: word-level features, with scikit-learn's
# built-in English stop-word list filtered out.
# NOTE(review): the lyrics are stemmed before they reach this vectorizer, so
# stop words whose stem differs from the dictionary form (e.g. "thi", "veri")
# presumably slip past the list -- confirm and consider stemming the list too.
tfid = TfidfVectorizer(
    stop_words="english",
    analyzer="word",
)

In [15]:
# Fit the vectorizer on the stemmed lyrics and transform them in one step;
# the result is the TF-IDF matrix that the similarity computation consumes.
sparse_matrix = tfid.fit_transform(tokenized_data)

In [16]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# recommendation() reads similarity[idx] as the scores of every song
# against the song at row position idx.
similarity = cosine_similarity(sparse_matrix)

# Recommendation

In [17]:
# Creating the function to come up with recommendations
def recommendation(song_df, num_song):
    """Return the songs whose lyrics are most similar to a given song.

    Relies on the notebook-level ``df`` (columns ``artist`` and ``song``) and
    on ``similarity``, which is read by row position of ``df``.

    Parameters
    ----------
    song_df : str
        Title to look up in ``df["song"]``. If several rows share the title,
        the first one is used.
    num_song : int
        Maximum number of recommendations. Values below 1 give empty lists,
        and at most ``len(df) - 1`` songs can be returned.

    Returns
    -------
    tuple[list, list]
        ``(artists, songs)`` -- parallel lists ordered from the most to the
        least similar song. The query song itself is never included.

    Raises
    ------
    IndexError
        If ``song_df`` is not present in ``df["song"]``.
    """
    # Row *positions* (not index labels) of the matching titles, because both
    # ``similarity`` and ``.iloc`` below are positional.
    hits = (df["song"] == song_df).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"Song {song_df!r} is not in the data set.")
    idx = int(hits[0])

    scores = similarity[idx]
    # Leave the query song out explicitly instead of assuming it sorts first:
    # a song with identical lyrics at a lower position ties at the top and
    # would otherwise push the query itself into the recommendations.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    # Clamp at 0 so a negative count cannot turn into a "[:-k]" slice.
    top = ranked[:max(num_song, 0)]

    artists = df["artist"].iloc[top].tolist()
    songs = df["song"].iloc[top].tolist()

    return artists, songs

In [22]:
# Getting an input from the user: keep asking until a positive integer is given
while True:
    try:
        num = int(input("How many songs do you want us to recommend: "))
    except ValueError:
        print("\nPlease enter an integer value.")
        continue
    if num > 0:
        break
    # Zero or a negative count cannot produce any recommendation
    print("\nPlease enter a positive integer value.")

# Build the lookup structures once instead of on every loop iteration
song_titles = df["song"].tolist()
known_songs = set(song_titles)

song = input("\nGive us a song: ")
while song not in known_songs:
    # Suggest a random title from the data set to help the user along
    print("\nPlease select a song from the data set, e.g. {}".format(random.choice(song_titles)))
    song = input("Give us a song: ")

# Serving our listener the suggestions
print("\n\nDiscoverify recommends you these songs:\n")
artists, songs = recommendation(song, num)
# Iterate over what was actually returned: the data set may hold fewer songs
# than requested, and indexing by range(num) would then raise IndexError.
for artist, title in zip(artists, songs):
    print(artist, "-", title)

How many songs do you want us to recommend: wsdgf

Please enter an integer value.
How many songs do you want us to recommend: adf

Please enter an integer value.
How many songs do you want us to recommend: 56

Give us a song: iodıjfvx

Please select a song from the data set, e.g. Oh, Such A Stranger
Give us a song: sdf

Please select a song from the data set, e.g. Have I Told You Lately That I Love You
Give us a song: sdf

Please select a song from the data set, e.g. Dreidel
Give us a song: Dreidel


Discoverify recommends you these songs:

Extreme - Stop The World
Veruca Salt - Get Back
Christina Aguilera - Make The World Move
Lionel Richie - Can't Slow Down
Lloyd Cole - Forest Fire
Reo Speedwagon - She's Gonna' Love Me
Tears For Fears - Falling Down
Proclaimers - Spinning Around In The Air
Kirk Franklin - Gonna Be A Lovely Day
Alabama - Life's Too Short To Love This Fast
Tim McGraw - One Of These Days
Overkill - Wish You Were Dead
Pet Shop Boys - What Have I Done To Deserve This?
Ufo