In [2]:
import networkx
import geoopt
import torch
import torch.nn as nn
import numpy as np
import random
import logging

In [104]:
class ManifoldEmbedding(nn.Module):
    
    def __init__(self, manifold, num_embeddings, embedding_dim, dtype=torch.double, requires_grad=True, weights=None):
        super().__init__()
        if dtype != torch.double:
            logging.warning("Double precision is recommended for embeddings on manifold")
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self._manifold = manifold
        if weights is None:
            data = torch.zeros((num_embeddings, embedding_dim), dtype=dtype)
            data = geoopt.ManifoldTensor(data, manifold=self._manifold)
            self.w = geoopt.ManifoldParameter(data, requires_grad=requires_grad)
            self.reset_parameters()
        else:
            raise NotImplementedError()
            
    def forward(self, x):
        s0 = x.shape
        ws = self.w[x.view(-1)]
        return ws.view(*s0, self.embedding_dim)
    
    def reset_parameters(self) -> None:
        nn.init.normal_(self.w.data)
        self.w.data[:] = self._manifold.retr(torch.zeros(self.embedding_dim), self.w.data)
        
        
class LorentzEmbedding(ManifoldEmbedding):
    
    def __init__(self, num_embeddings, embedding_dim, k=1.0, **kwargs):
        manifold = geoopt.manifolds.Lorentz(k, learnable=False)
        super().__init__(manifold, num_embeddings, embedding_dim, **kwargs)
        
        
class LorentzSkipGram(nn.Module):
    
    def __init__(self, k=1.0):
        super().__init__()
        self._manifold = geoopt.manifolds.Lorentz(k)
        self.x0 = torch.zeros(10)
        
    def forward(self, a, b):
        return self._manifold.inner(self.x0, a, b)
    

class SGNSLoss(nn.Module):
    
    def __init__(self, reduction="mean"):
        super().__init__()
        self.reduction = reduction
        
    def forward(self, y_, y):
        y.masked_fill_(y == 0, -1)
        loss = torch.log(torch.sigmoid(y*y_))
        if self.reduction is None:
            return loss
        elif self.reduction == "mean":
            return loss.mean()
        elif self.rediction == "sum":
            return loss.sum()
        raise NotImplementedError()

In [236]:
import lightning as pl


class Model(pl.LightningModule):
    
    def __init__(self, num_embeddings, embedding_dim, theta, k=1.0):
        super().__init__()
        self.embd = LorentzEmbedding(num_embeddings, embedding_dim, k)
        self.sg = LorentzSkipGram(k)
        self.loss_fn = SGNSLoss()
        self.theta = theta
        
    def training_step(self, batch, batch_idx):
        x, y = batch
        e1, e2 = x[:, 0], x[:, 1]
        z = self.sg(e1, e2) + self.theta
        return self.loss_fn(z, y)
        
    def configure_optimizers(self):
        optimizer = geoopt.optim.RiemannianAdam(self.parameters(), 1e-3)
        return optimizer


In [237]:
import random

def skip_gram(x, i, w):
    return x[i], x[max(0, i-w):i] + x[i+1:i+w+1]

class SkipGramWithNegativeSampling:
    
    def __init__(self, window, vocabulary, negative=5, negative_probs=None):
        self.window = window
        self.vocabulary = vocabulary
        self.negative = negative
        self.negative_probs = negative_probs
        
    def sample_negatives(self, query):
        if self.negative == 0:
            return []
        assert self.negative_probs is None, "Weighted sampling not implemented yet!"
        items = set(query)
        randoms = random.choices(self.vocabulary, k=len(items)*self.negative)
        return zip(list(items)*self.negative, randoms)
        
    def __call__(self, x):
        grams = [skip_gram(x, i, self.window) for i in range(len(x))]
        batches = [[w,c] for w,context in grams for c in context]
        negatives = list(self.sample_negatives(x))
        labels = [1] * len(batches) + [0] * len(negatives)
        return batches + negatives, labels
    
    
class ToTensor:
    
    def __init__(self, *dtypes):
        self.dtypes = dtypes
        
    def __call__(self, x):
        assert isinstance(x, tuple)
        assert len(x) == len(self.dtypes), f"Number of inputs {len(x)} does not match number of specified data types {len(self.dtypes)}"
        return tuple(torch.tensor(xi, dtype=di) for xi, di in zip(x, self.dtypes))

In [238]:
import sys
sys.path.insert(0, "../src/")
from models.transformer.loader import PlaylistDataset
from models.transformer.transform import *

In [239]:
import os

# utils to create this file list

def get_file_list(base):
    return [os.path.join(base, f) for f in os.listdir(base) if ".json" in f]

files = get_file_list("../data/processed/")
len(files)

20

In [109]:
"""Compute and save song frequencies
from collections import Counter
from tqdm import tqdm
import json

songs = Counter()
for f in tqdm(files):
    with open(f) as f:
        data = json.load(f)["playlists"]
        for pl in data:
            songs.update(pl)
            
with open("../data/frequencies.json", "w") as f:
    f.write(json.dumps(dict(songs)))
len(songs)
"""

100%|███████████████████████████████████████████| 20/20 [00:26<00:00,  1.30s/it]


In [240]:
import json

MIN_FREQ = 5

with open("../data/frequencies.json") as f:
    frequencies = json.load(f)

frequencies = dict(filter(lambda item: item[1] >= MIN_FREQ, frequencies.items()))
songs = list(set(frequencies.keys()))
song2idx = {s: i for i,s in enumerate(songs)}
idx2song = {i: s for s,i in song2idx.items()}
len(frequencies)

599341

In [241]:
def collate_fn(data):
    pairs, labels = list(zip(*data))
    return torch.cat(pairs, dim=0), torch.cat(labels, dim=0)

tf = Compose(
    RemoveUnknownTracks(songs),
    TrackURI2Idx(song2idx),
    SkipGramWithNegativeSampling(5, list(song2idx.values()), 10),
    ToTensor(torch.long, torch.float)
)
dataset = PlaylistDataset(files, 50_000, transform=tf)
loader = torch.utils.data.DataLoader(dataset, batch_size=5, collate_fn=collate_fn)

In [242]:
NUM_EMBEDDING = len(songs)
EMBEDDING_DIM = 16
THETA = 0

model = Model(NUM_EMBEDDING, EMBEDDING_DIM, THETA)

In [243]:
trainer = pl.Trainer()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [244]:
trainer.fit(model, loader)


  | Name    | Type             | Params
---------------------------------------------
0 | embd    | LorentzEmbedding | 9.6 M 
1 | sg      | LorentzSkipGram  | 1     
2 | loss_fn | SGNSLoss         | 0     
---------------------------------------------
9.6 M     Trainable params
2         Non-trainable params
9.6 M     Total params
38.358    Total estimated model params size (MB)


Epoch 0:   0%|                                       | 0/200000 [00:00<?, ?it/s]

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [254]:
x, y = dataset[5]
z = model.embd(x)
z1, z2 = z[:, 0], z[:, 1]
scores = model.sg(z1, z2)
#model.loss_fn(scores, y)
scores

tensor([ -1327.6856,   -619.8574,  -1169.5585,   -951.0880,  -1444.6864,
         -1327.6856,   -454.4581,   -698.6418,  -1590.7139,  -1490.3971,
          -921.1452,   -619.8574,   -454.4581,   -578.9538,   -463.1366,
          -338.1800,   -332.7604,   -177.5664,  -1169.5585,   -698.6418,
          -578.9538,  -1164.9486,  -1405.0359,  -1027.7635,   -258.9241,
         -2227.0105,   -951.0880,  -1590.7139,   -463.1366,  -1164.9486,
         -1279.5378,   -806.0576,   -322.7680,  -2158.7980,   -414.1183,
         -1444.6864,  -1490.3971,   -338.1800,  -1405.0359,  -1279.5378,
          -706.1082,   -469.4789,   -925.6813,   -318.5709,   -392.6118,
          -921.1452,   -332.7604,  -1027.7635,   -806.0576,   -706.1082,
          -437.6573,   -766.5232,   -197.0147,   -148.8135,   -237.3889,
          -177.5664,   -258.9241,   -322.7680,   -469.4789,   -437.6573,
          -791.6209,   -117.4669,   -135.0781,   -232.8197,   -219.0742,
         -2227.0105,  -2158.7980,   -925.6813,   -7