In [1]:
from random import sample
from itertools import chain

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

### Prepare data

In [2]:
df = pd.read_csv("../data/rec_test_assignment_playlist2track.csv")

In [3]:
df.head()

Unnamed: 0,playlist_id,track_id,track_uri
0,0,0,spotify:track:1r0faljjM2b876iNoaDUh5
1,1,1,spotify:track:1JO1xLtVc8mWhIoE3YaCL0
2,2,2,spotify:track:3SuzCeGoNOWmbWOoDMou0B
3,3,3,spotify:track:4GJAd1nBylAEbw1dZDVzEQ
4,4,4,spotify:track:5cbpoIu3YjoOwbBDGUEp3P


In [4]:
n_track = df['track_id'].unique().shape[0]
n_playlist = df['playlist_id'].unique().shape[0]
print('tracks: {}, playlists: {}'.format(n_track, n_playlist))

tracks: 169548, playlists: 714818


In [5]:
N_TOP = 10

playlists_df = df.groupby('playlist_id')['track_id'].agg(neighbours=list)
playlists_df['n_neighbours'] = playlists_df['neighbours'].apply(len)
playlists_df = playlists_df[playlists_df['n_neighbours'] >= N_TOP].drop('n_neighbours', axis=1)

playlists = playlists_df['neighbours'].to_numpy()

In [6]:
playlists.size

9107

In [7]:
n_playlist_samples = N_TOP // 2

triplets_id = []
for i in range(playlists.size):
    for _ in range(n_playlist_samples):
        anchor, positive = sample(playlists[i], 2)

        actual_idx = list(range(playlists.size))
        actual_idx.remove(i)
        negative = sample(sample(actual_idx, 1), 1)[0]

        if anchor != positive and anchor != negative and positive != negative:
            triplets_id.append((anchor, positive, negative))

In [8]:
triplets_id = list(set(triplets_id))
id2idx = {id: k for k, id in enumerate(set(chain.from_iterable(triplets_id)))}

triplets_idx = []
for t in triplets_id:
    triplets_idx.append((id2idx[t[0]], id2idx[t[1]], id2idx[t[2]]))

In [9]:
train_triplets, test_triplets = train_test_split(triplets_idx, test_size=0.2)

### Train

In [10]:
dim = 32
n_items = max(id2idx.values()) + 1

In [16]:
class Embedder(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(n_items, dim)
        self.linear = nn.Linear(dim, dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.linear(self.embedding(x)))

In [17]:
triplet_loss = nn.TripletMarginLoss()
embedder = Embedder()
opt = torch.optim.Adam(embedder.parameters(), lr=1e-3)

In [18]:
train_loader = DataLoader(train_triplets, batch_size=1, shuffle=True)
test_loader = DataLoader(test_triplets, batch_size=1, shuffle=True)

In [19]:
epochs = 5

In [20]:
train_losses, test_losses = [], []

for epoch in range(epochs):
    embedder.train()
    
    train_loss = 0
    for anchor_id, positive_id, negative_id in train_loader:
        anchor_vec = embedder(anchor_id)
        positive_vec = embedder(positive_id)
        negative_vec = embedder(negative_id)
        
        loss = triplet_loss(anchor_vec, positive_vec, negative_vec)
        loss.backward()

        opt.step()
        opt.zero_grad()

        train_loss += loss.item()
        
    embedder.eval()
    
    with torch.no_grad():
        test_loss = 0
        for test_anchor_id, test_positive_id, test_negative_id in test_loader:
            test_loss += triplet_loss(embedder(anchor_id), 
                                      embedder(positive_id),
                                      embedder(negative_id))

    train_loss /= len(train_loader)
    test_loss /= len(test_loader)
    
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    
    print(f"Epoch: {epoch + 1} Train Loss: {train_loss} Test Loss: {test_loss}")

Epoch: 1 Train Loss: 0.9422046206564183 Test Loss: 1.5612562894821167
Epoch: 2 Train Loss: 0.6238275112810108 Test Loss: 0.9004877805709839
Epoch: 3 Train Loss: 0.3898550919330759 Test Loss: 0.0
Epoch: 4 Train Loss: 0.24782800332026028 Test Loss: 0.0
Epoch: 5 Train Loss: 0.1603849930574165 Test Loss: 0.4498909115791321
