In [6]:
import pandas as pd


In [7]:
data= pd.read_csv('./spotify_millsongdata.csv')


In [8]:
data.head(10)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \r\nI'm sitting all ...


In [9]:
data.shape

(57650, 4)

In [10]:
data.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [11]:
data= data.drop('link', axis=1).reset_index(drop=True)
data.head(10)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,I'm waitin' for you baby \r\nI'm sitting all ...


In [12]:
data=data.sample(20000).reset_index(drop=True)
data.shape

(20000, 3)

In [13]:
data.head(10)

Unnamed: 0,artist,song,text
0,George Harrison,Poor Little Girl,Poor little girl \r\nWith her head in the air...
1,Cat Stevens,God Is The Light,How great the wonder of the heaven \r\nAnd th...
2,Steely Dan,Negative Girl,She's lost she's late \r\nShe's zooming on a ...
3,Bonnie Raitt,Walking The Dog,"Mary Mac, dressed in black \r\nSilver buttons..."
4,Out Of Eden,Day Like Today,"Woke up this morning, jumped out of bed, \r\n..."
5,Rihanna,We All Want Love,"We all, we all, we all, we all, we all \r\nWe..."
6,Overkill,Half Past Dead,No wasted \r\nNo wanting the can't have \r\n...
7,Wiz Khalifa,Star Of The Show,Yeah \r\nUgh \r\nUgh \r\nUgh \r\n \r\nSee...
8,System Of A Down,Chic 'n' Stu,Walk into the refrigerator \r\nDoor's closed ...
9,Aerosmith,Just Push Play,She gave you a flower \r\nThe one that God ga...


 Data Preprocessing


In [14]:
data['text']= data['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n\r', ' ', regex=True)


In [15]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
stemmer= PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/priteshdube/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
def tokenization(txt):
  tokens= nltk.word_tokenize(txt)
  stemming= [stemmer.stem(w) for w in tokens]
  return " ".join(stemming)


In [37]:
data['text']= data['text'].apply(lambda x: tokenization(x))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
Tfid= TfidfVectorizer(analyzer='word', stop_words='english')
matrix= Tfid.fit_transform(data['text'])
# similarity= cosine_similarity(matrix)

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Convert TF-IDF matrix to tensor for PyTorch
X = torch.tensor(matrix.toarray(), dtype=torch.float32)

# Data loader
dataset = TensorDataset(X)
loader = DataLoader(dataset, batch_size=256, shuffle=True)


In [39]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


In [40]:
model = AutoEncoder(X.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(10):  
    total_loss = 0
    for batch in loader:
        xb = batch[0]
        output = model(xb)
        loss = loss_fn(output, xb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 3.2021
Epoch 2, Loss: 0.0022
Epoch 3, Loss: 0.0022
Epoch 4, Loss: 0.0022
Epoch 5, Loss: 0.0022
Epoch 6, Loss: 0.0022
Epoch 7, Loss: 0.0022
Epoch 8, Loss: 0.0022
Epoch 9, Loss: 0.0022
Epoch 10, Loss: 0.0022


In [42]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract learned embeddings
with torch.no_grad():
    embeddings = model.encoder(X).numpy()

# Use learned embeddings for similarity
similarity = cosine_similarity(embeddings)



In [47]:
similarity[5]

array([0.99999225, 0.9999969 , 0.99999297, ..., 0.99999064, 0.9999981 ,
       0.99999326], dtype=float32)

In [50]:
data[data['song']=='Helpless'].index[0]

data.head(10)

Unnamed: 0,artist,song,text,cluster
0,George Harrison,Poor Little Girl,poor littl girl with her head in the air there...,5
1,Cat Stevens,God Is The Light,how great the wonder of the heaven and the tim...,7
2,Steely Dan,Negative Girl,she 's lost she 's late she 's zoom on a couch...,7
3,Bonnie Raitt,Walking The Dog,"mari mac , dress in black silver button up and...",5
4,Out Of Eden,Day Like Today,"woke up thi morn , jump out of bed , hit my he...",9
5,Rihanna,We All Want Love,"we all , we all , we all , we all , we all we ...",4
6,Overkill,Half Past Dead,no wast no want the ca n't have untast seem ev...,5
7,Wiz Khalifa,Star Of The Show,yeah ugh ugh ugh see i 've been go on and on f...,1
8,System Of A Down,Chic 'n' Stu,walk into the refrig door 's close light are o...,5
9,Aerosmith,Just Push Play,she gave you a flower the one that god gave he...,5


Recommender

In [55]:
def recommend(song):
    
    if song not in data['song'].values:
        return ["Song not found. Try another."]
     
    idx = data[data['song'] == song].index[0]
    distance = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
    song_list = []
    for s in distance[1:11]:
        song_list.append(f"{data.iloc[s[0]].song} by {data.iloc[s[0]].artist}")
    return song_list


In [56]:
recommend('Helpless')

['Avalon Of The Heart by Van Morrison',
 "Don't Gimme No Lip by Pearl Jam",
 'Boogie Man by Kenny Loggins',
 'I Tried Love by Robbie Williams',
 'I Love You by Van Morrison',
 'Put Yer Money Where Yer Mouth Is by Oasis',
 'Onward by Yes',
 'Waiting For The Sunrise by Yoko Ono',
 "Don't Count The Waves by Yoko Ono",
 'Rock And Roll All Nite by Poison']

In [61]:
import pickle


In [62]:
pickle.dump(embeddings, open('embeddings.pkl', 'wb'))


In [63]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))


In [64]:
pickle.dump(data, open('data.pkl', 'wb'))