In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


# Text Cleaning

In [3]:
df["text"] = df["text"].str.lower()
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face \r\na..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please \r\ntouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go \r\nwhy i had...
3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...


In [4]:
df["text"] = df["text"].replace(r"\n", " ").replace(r"^a-ZA-Z0-9", " ")
df

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face \r\na..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please \r\ntouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go \r\nwhy i had...
3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,irie days come on play \r\nlet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,power to the workers \r\nmore power \r\npowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


# Sample for easier calculation (temporary)

In [5]:
df1 = df.sample(1000)
df1.head()

Unnamed: 0,artist,song,link,text
9956,Judy Garland,America The Beautiful,/j/judy+garland/america+the+beautiful_20823582...,"o beautiful, for spacious skies \r\nfor amber..."
51486,Scorpions,Taxman Woman,/s/scorpions/taxman+woman_20122588.html,i wake up in the morning i'm thinking of you ...
20011,Ugly Kid Joe,Would You Like To Be There,/u/ugly+kid+joe/would+you+like+to+be+there_101...,somewhere the evening sun \r\nis falling down...
50297,Ray Charles,Come Live With Me,/r/ray+charles/come+live+with+me_20862207.html,come live with me \r\nand won't you be my lov...
28958,David Bowie,Fill Your Heart,/d/david+bowie/fill+your+heart_20036792.html,fill your heart with love today \r\ndon't pla...


# Tokenization

In [6]:
# pip install nltk
import nltk
from nltk.stem.porter import PorterStemmer
"""
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("punkt")
"""

'\nimport ssl\n\ntry:\n    _create_unverified_https_context = ssl._create_unverified_context\nexcept AttributeError:\n    pass\nelse:\n    ssl._create_default_https_context = _create_unverified_https_context\n\nnltk.download("punkt")\n'

In [7]:
stemmer = PorterStemmer()

def tokenization(text):
    token = nltk.word_tokenize(text)
    word_list = [stemmer.stem(i) for i in token]
    
    return " ".join(word_list)

In [8]:
# As an example
import random
index = random.choice(df1["text"].index)
print("Artist:", df1["artist"][index], "\t\tSong:", df1["song"][index])
tokenization(df1["text"][index])

Artist: Stevie Wonder 		Song: Overjoyed


"over time , i 've been build my castl of love just for two , though you never knew you were my reason i 've gone much too far for you now to say that i 've got to throw my castl away over dream , i have pick out a perfect come true though you never knew it wa of you i 've been dream the sandman ha come from too far away for you to say come back some other day and though you do n't believ that they do they do come true for did my dream come true when i look at you and mayb too , if you would believ you too might be overjoy , over love , over me over heart , i have pain turn everi stone just to find , i had found what i 've search to discov i 've come much too far for me now to find the love that i sought can never be mine and though you do n't believ that they do they do come true for did my dream come true when i look at you and mayb too , if you would believ you too might be overjoy , over love , over me and though the odd say improb what do they know for in romanc all true love need

In [9]:
temp_data = df1["text"].apply(lambda x: tokenization(x))

# Vectorizing

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
tfid = TfidfVectorizer(analyzer="word", stop_words="english")

In [12]:
sparse_matrix = tfid.fit_transform(temp_data)

In [13]:
cosine_similarity(sparse_matrix)

array([[1.        , 0.01142527, 0.02808229, ..., 0.02908   , 0.00861305,
        0.03506434],
       [0.01142527, 1.        , 0.00300579, ..., 0.05682585, 0.00125073,
        0.00763164],
       [0.02808229, 0.00300579, 1.        , ..., 0.02569148, 0.        ,
        0.02926133],
       ...,
       [0.02908   , 0.05682585, 0.02569148, ..., 1.        , 0.03291719,
        0.00761902],
       [0.00861305, 0.00125073, 0.        , ..., 0.03291719, 1.        ,
        0.00352387],
       [0.03506434, 0.00763164, 0.02926133, ..., 0.00761902, 0.00352387,
        1.        ]])