In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Deep Purple,Before Time Began,The daylight fades and the stars comes out \r...
1,Crowded House,Distant Sun,Tell me all the things you would change \r\nI...
2,Eric Clapton,Modern Girl,[Chorus:] \r\nShe's a modern girl in a modern...
3,Pet Shop Boys,Memory Of The Future,You seem to be inevitable to me \r\nLike a me...
4,Dave Matthews Band,Joy Ride,Like a dog with the television on \r\nStaring...
5,Violent Femmes,Ugly,Crossed the path then I followed your face \r...
6,Indigo Girls,Lifeblood,Another night in a succession \r\nThinly glue...
7,Ella Fitzgerald,Come Rain Or Come Shine,"I'm gonna love you, like nobody's loved you \..."
8,NOFX,Soul Doubt,Sometimes I feel my life is going 'round in ci...
9,Z-Ro,Where Is The Love,[Chorus] \r\nWhere is the love - 2x \r\n[Z-R...


In [9]:
df['text'][0]

"The daylight fades and the stars comes out  \r\nBut there's never much to talk about  \r\nAnother day goes rushing by and here we are just wondering why  \r\nWe have to belong with a sense of purpose  \r\nThat's all we need, without them we're worthless  \r\nAh, we've come so far, hm, but then again  \r\nAll we see is more of the same  \r\nIt's getting crowded here we agree no doubt  \r\nOh dear brother what's it all about?  \r\n  \r\nHey you over there why don't you throw down your spear?  \r\nWe all need some help right now  \r\nWhere are we going from here?  \r\nThe way things are working out  \r\nIt won't be too long before we have to move  \r\nBetter start thinking about it  \r\nWhile there's still time to choose  \r\n  \r\nEvery day of my life I discover  \r\nSomeone murdering my sisters and brothers  \r\nIn the name of some god or another  \r\nWhat do you know  \r\n  \r\nFor the first precious few it's time to go  \r\nWhat might have been we'll never know  \r\nAll those bad ide

In [10]:
# df = df.sample(5000)

In [11]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [14]:
import nltk
nltk.download('punkt_tab')
df['text'] = df['text'].apply(lambda x: tokenization(x))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\roy_t\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.04963475, 0.08565448, ..., 0.03893911, 0.05465243,
       0.02514732], shape=(5000,))

In [18]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text


In [19]:
def recommendation(song_df):
    if song_df not in df['song'].values:
        print(f"Song '{song_df}' not found in the dataset. Please enter an existing song.")
        return []
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

In [20]:
recommendation('Crying Over You')

Song 'Crying Over You' not found in the dataset. Please enter an existing song.


[]

In [21]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))