In [1]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
tqdm.pandas()

In [59]:
df = pd.read_csv("data/ml-1m/ratings.dat", sep="::", names=["uid", "sid", "rating", "timestamp"])

  df = pd.read_csv("data/ml-1m/ratings.dat", sep="::", names=["uid", "sid", "rating", "timestamp"])


### The ratings data

In [4]:
df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Converting ratings to implicit

In [60]:
df = df[df['rating'] >= 0]
df.head()
df.size

4000836

### Filter Triplets?

In [61]:
item_sizes = df.groupby('sid').size()
good_items = item_sizes.index[item_sizes >= 0]
df = df[df['sid'].isin(good_items)]

user_sizes = df.groupby('uid').size()
good_users = user_sizes.index[user_sizes >= 5]
df = df[df['uid'].isin(good_users)]

df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Densifying index

In [62]:
umap = {u: i for i, u in enumerate(set(df['uid']))}
smap = {s: i for i, s in enumerate(set(df['sid']))}
df['uid'] = df['uid'].map(umap)
df['sid'] = df['sid'].map(smap)

df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,0,1104,5,978300760
1,0,639,3,978302109
2,0,853,3,978301968
3,0,3177,4,978300275
4,0,2162,5,978824291


### Splitting Dataframe into train, test and val

In [63]:
user_group = df.groupby('uid')
user2items = user_group.progress_apply(lambda x: list(x.sort_values('timestamp')['sid']))

100%|██████████| 6040/6040 [00:01<00:00, 5321.58it/s]


In [64]:
user_count = len(umap)
train, val, test = {}, {}, {}
for user in range(user_count):
    items = user2items[user]
    train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]


In [65]:
dataset = {
    'train': train,
    'val': val,
    'test': test,
    'umap': umap,
    'smap': smap
}



In [66]:
folder_name = '{}_min_rating{}-min_uc{}-min_sc{}-split{}' \
            .format('ml-1m', 3, 5, 0, 'leave_one_out')

with open('data/{}.pkl'.format(folder_name), 'wb') as f:
    pickle.dump(dataset, f)

### Get negative train samples

In [2]:
max_len = 26

In [3]:
from pathlib import Path


dataset_folder = Path(f'data/lstr/seqlen-{max_len}')
dataset_filename = '{}_min_rating{}-min_uc{}-min_sc{}-split{}'.format('ml-1m', 3, 5, 0, 'leave_one_out')
dataset_file = dataset_folder.joinpath(dataset_filename + '.pkl')

### Load Dataset from pkl

In [5]:
dataset: dict[str, dict[str, list[int]]] = pickle.load(open(dataset_file, 'rb'))
train = dataset['train']
val = dataset['val']
test = dataset['test']
umap = dataset['umap']
smap = dataset['smap']
user_count = len(umap)
item_count = len(smap)

In [4]:
from tqdm import trange


def generate_negative_samples(seed: int, sample_size: int = 100) -> dict[str, list[int]]:
    np.random.seed(seed)
    negative_samples = {}
    for user in trange(user_count):
        if isinstance(train[user][1], tuple):
            seen = set([x[0] for x in train[user]])
            seen.update([x[0] for x in val[user]])
            seen.update(x[0] for x in test[user])
        else:
            seen = set(train[user])
            seen.update(val[user])
            seen.update(test[user])
        
        samples = []
        for _ in range(sample_size):
            item = np.random.choice(item_count) + 1
            while item in seen or item in samples:
                item = np.random.choice(item_count) + 1
            samples.append(item)
        
        negative_samples[user] = samples
    
    return negative_samples

In [5]:
negative_train_samples = generate_negative_samples(0, 0)
negative_test_samples = generate_negative_samples(42)

100%|██████████| 6040/6040 [00:00<00:00, 177632.53it/s]
100%|██████████| 6040/6040 [00:06<00:00, 874.73it/s]


### Save Negative train samples

In [6]:
negative_test_save_file_path = '{}-sample_size{}-seed{}-{}.pkl'.format('random', max_len, 42, 'test')
negative_test_save_file = dataset_folder.joinpath(negative_test_save_file_path)

In [54]:

with negative_test_save_file.open('wb') as f:
    pickle.dump(negative_train_samples, f)


In [7]:
with negative_test_save_file.open('rb') as f:
    negative_test_samples = pickle.load(f)

In [8]:
import torch
import torch.utils.data as data_utils
import torch.backends.cudnn as cudnn
import pytorch_lightning as pl
import torch.nn as nn
import random

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [10]:
class BertTrainDataset(data_utils.Dataset):
    def __init__(self, u2seq: dict[str, list[int]], max_len: int, mask_prob: float, mask_token: int, num_items: int, rng: random.Random):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = mask_token
        self.num_items = num_items
        self.rng = rng

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)

        tokens: list[int] = []
        labels: list[int] = []
        for s in seq:
            prob = self.rng.random()
            if prob < self.mask_prob:
                prob /= self.mask_prob

                if prob < 0.8:
                    tokens.append(self.mask_token)
                elif prob < 0.9:
                    tokens.append(self.rng.randint(1, self.num_items))
                else:
                    tokens.append(s)

                labels.append(s)
            else:
                tokens.append(s)
                labels.append(0)

        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]

        mask_len = self.max_len - len(tokens)

        tokens = [0] * mask_len + tokens
        labels = [0] * mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def _getseq(self, user):
        return self.u2seq[user]


In [9]:
# train_ds = BertTrainDataset(train, 100, 0.15, item_count+1, item_count, random.Random(0))

### Eval Dataset (Val and Test)

In [11]:
class BertEvalDataset(data_utils.Dataset):
    def __init__(self, u2seq: dict[str, list[int]], u2answer: dict[str, list[int]], max_len: int, mask_token: int, negative_samples: dict[str, list[int]]) -> None:
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.negative_samples = negative_samples

    def __len__(self) -> int:
        return len(self.users)

    def __getitem__(self, index) -> tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        user = self.users[index]
        seq = self.u2seq[user]
        answer = self.u2answer[user]
        negs = self.negative_samples[user]

        candidates = answer + negs
        labels = [1] * len(answer) + [0] * len(negs)

        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [0] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(candidates), torch.LongTensor(labels)


In [12]:
def fix_random_seed_as(random_seed: int) -> None:
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    np.random.seed(random_seed)
    cudnn.deterministic = True
    cudnn.benchmark = False

In [13]:
class PositionalEmbedding(pl.LightningModule):
    def __init__(self, max_len: int, d_model: int) -> None:
        super().__init__()

        # Compute the positional encodings once in log space.
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size = x.size(0)
        return self.pe.weight.unsqueeze(0).repeat(batch_size, 1, 1)

In [14]:
class BERTEmbedding(pl.LightningModule):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size: int, embed_size: int, max_len: int, dropout: float = 0.1) -> None:
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        self.token = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)
        # self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence: torch.Tensor) -> torch.Tensor:
        x: torch.Tensor = self.token(sequence) + self.position(sequence)  # + self.segment(segment_label)
        return self.dropout(x)

In [19]:
def set_template(args):
    if args.template is None:
        return

    elif args.template.startswith('train_bert'):
        args.mode = 'train'

        args.dataset_code = 'ml-' + input('Input 1 for ml-1m, 20 for ml-20m: ') + 'm'
        args.min_rating = 0 if args.dataset_code == 'ml-1m' else 4
        args.min_uc = 5
        args.min_sc = 0
        args.split = 'leave_one_out'

        args.dataloader_code = 'bert'
        batch = 64
        args.train_batch_size = batch
        args.val_batch_size = batch
        args.test_batch_size = batch

        args.train_negative_sampler_code = 'random'
        args.train_negative_sample_size = 0
        args.train_negative_sampling_seed = 0
        args.test_negative_sampler_code = 'random'
        args.test_negative_sample_size = 100
        args.test_negative_sampling_seed = 98765

        args.trainer_code = 'bert'
        args.device = 'cuda'
        args.num_gpu = 1
        args.device_idx = '0'
        args.optimizer = 'Adam'
        args.lr = 0.001
        args.enable_lr_schedule = True
        args.decay_step = 25
        args.gamma = 1.0
        args.num_epochs = 100 if args.dataset_code == 'ml-1m' else 200
        args.metric_ks = [1, 5, 10, 20]
        args.best_metric = 'NDCG@10'

        args.model_code = 'bert'
        args.model_init_seed = 0

        args.bert_dropout = 0.1
        args.bert_hidden_units = 256
        args.bert_mask_prob = 0.15
        args.bert_max_len = 100
        args.bert_num_blocks = 2
        args.bert_num_heads = 4

In [15]:
from torch import NumberType


def recalls_and_ndcgs_for_ks(scores: torch.Tensor, labels: torch.Tensor, ks: list[int]):
    metrics: dict[str, NumberType] = {}

    # Why is this being done?
    # scores = scores
    # labels = labels
    answer_count = labels.sum(1)

    labels_float = labels.float()
    rank = (-scores).argsort(dim=1)
    cut = rank
    for k in sorted(ks, reverse=True):
        cut = cut[:, :k]
        hits = labels_float.gather(1, cut)
        """
            Recall at k is the proportion of relevant items found in the top-k recommendations.
            Recall@k = (# of recommended items @k that are relevant) / (total # of relevant items)
        """
        numerator = hits.sum(1)
        denominator = torch.min(torch.Tensor([k]).to(labels.device), labels.sum(1).float())
        metrics['Recall@%d' % k] = \
            (numerator / denominator).mean().cpu().item()

        """
            NDCG at k is the average of the normalized DCG scores of the top-k recommendations.
            NDCG@k = DCG@k / IDCG@k
            DCG@K = SUM( recc_i / log2(i + 1))
            IDCG@K = SUM( real_i / log2(i + 1))
        """
        position = torch.arange(2, 2+k)
        weights = 1 / torch.log2(position.float())
        # Discounted cumulative gain at k
        dcg = (hits * weights.to(hits.device)).sum(1)
        # Ideal discounted cumulative gain at k
        # idcg = (labels_float * weights.to(labels_float.device)).sum(1)
        idcg = torch.Tensor([weights[:min(int(n), k)].sum()
                            for n in answer_count]).to(dcg.device)
        # What is the above code doing
        ndcg = (dcg / idcg).mean()
        metrics['NDCG@%d' % k] = ndcg.cpu().item()

    return metrics


In [16]:
class BERT(pl.LightningModule):
    def __init__(self, args: dotdict):
        super().__init__()

        fix_random_seed_as(args.model_init_seed)
        # self.init_weights()

        self.args = args
        max_len: int = args.bert_max_len
        num_items: int = args.num_items
        n_layers: int = args.bert_num_blocks
        self.heads: int = args.bert_num_heads
        vocab_size: int = num_items + 2
        hidden: int = args.bert_hidden_units
        self.hidden = hidden
        self.metric_ks = args.metric_ks
        dropout: float = args.bert_dropout
        self.ce = nn.CrossEntropyLoss(ignore_index=0)
        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=self.hidden, max_len=max_len, dropout=dropout)

        # multi-layers transformer blocks, deep network
        self.encoder = nn.TransformerEncoderLayer(hidden, self.heads, hidden * 4, batch_first=True)
        self.transformer = nn.TransformerEncoder(self.encoder, n_layers)
        self.out = nn.Linear(self.hidden, num_items + 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mask = (x == 0).unsqueeze(1).repeat(self.heads, x.size(1), 1)
        x = self.embedding(x)
        output: torch.Tensor = self.transformer(x, mask)
        # output = output.masked_fill(torch.isnan(output), 0)
        return self.out(output)

    def init_weights(self):
        pass

    def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
        batch_size = batch[0].size(0)
        seq, labels = batch
        logits: torch.Tensor = self(seq)

        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)
        loss: torch.Tensor = self.ce(logits, labels)
        # log_data = {
        #     'state_dict': (self._create_state_dict()),
        #     "train/ce": loss.item()
        # }
        self.log(
            "train/ce", loss, on_step=True, on_epoch=False, prog_bar=True
        )
        return loss

    def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> dict[str, NumberType]:
        seq, candidates, labels = batch
        scores: torch.Tensor = self(seq)
        scores = scores[:, -1, :]
        scores = scores.gather(1, candidates)
        metrics = recalls_and_ndcgs_for_ks(scores, labels, self.metric_ks)

        self.log_dict(metrics, on_step=True, on_epoch=False, prog_bar=True)
        return metrics

    def test_step(self, batch: tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_idx: int) -> dict[str, NumberType]:
        seq, candidates, labels = batch
        scores: torch.Tensor = self(seq)
        scores = scores[:, -1, :]
        scores = scores.gather(1, candidates)
        metrics = recalls_and_ndcgs_for_ks(scores, labels, self.metric_ks)

        self.log_dict(metrics, on_step=True, on_epoch=False, prog_bar=True)
        return metrics

    def validation_epoch_end(self, outputs: list[dict[str, NumberType]]) -> None:
        # outputs is a list of dicts
        avg_metrics = {}
        for metric in outputs[0].keys():
            avg_metrics[metric] = torch.mean(torch.FloatTensor([x[metric] for x in outputs]))
        
        self.log_dict(avg_metrics, on_step=False, on_epoch=True, prog_bar=True)



        # for metric in outputs[0].keys():
        #     self.log(metric, torch.mean(torch.stack(
        #         [output[metric] for output in outputs])), on_step=False, on_epoch=True, prog_bar=False)
    
    def test_epoch_end(self, outputs: list[dict[str, NumberType]]) -> None:
        # outputs is a list of dicts
        metric_names = outputs[0].keys()
        avg_metrics = dict.fromkeys(metric_names, [])
        for output in outputs:
            for metric_name in metric_names:
                avg_metrics[metric_name].append(output[metric_name])
        
        for metric_name in metric_names:
            avg_metrics[metric_name] = torch.mean(torch.stack(avg_metrics[metric_name]))
        
        self.log_dict(avg_metrics, on_step=False, on_epoch=True, prog_bar=True)
        # for metric in outputs[0].keys():
        #     self.log(metric, torch.mean(torch.stack(
        #         [output[metric] for output in outputs])), on_step=False, on_epoch=True, prog_bar=False)

    def configure_optimizers(self):
        if self.args['optimizer'].lower() == 'adam':
            return torch.optim.Adam(self.parameters(), lr=self.args['lr'], weight_decay=self.args['weight_decay'])
        elif self.args['optimizer'].lower() == 'sgd':
            return torch.optim.SGD(self.parameters(), lr=self.args['lr'], weight_decay=self.args['weight_decay'], momentum=self.args['momentum'])
        else:
            raise ValueError('Optimizer not supported')

    def _create_state_dict(self):
        return {
            'model_state_dict': self.model.module.state_dict() if self.is_parallel else self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }
    
    def setup(self, stage=None):
        self.train_dataset = BertTrainDataset(train, 100, 0.15, item_count+1, item_count, random.Random(0))
        self.val_dataset = BertEvalDataset(train, val, 100, item_count+1, negative_test_samples)
        self.test_dataset = BertEvalDataset(train, test, 100, item_count+1, negative_test_samples)

    
    def train_dataloader(self):
        return data_utils.DataLoader(
            self.train_dataset,
            batch_size=128,
            shuffle=True,
            pin_memory=True,
        )

    def val_dataloader(self):
        return data_utils.DataLoader(
            self.val_dataset,
            batch_size=128,
            shuffle=False,
            pin_memory=True,
        )

    def test_dataloader(self):
        return data_utils.DataLoader(
            self.test_dataset,
            batch_size=128,
            shuffle=False,
            pin_memory=True,
        )


In [17]:
args = {
    'bert_max_len': 100,
    'num_items': item_count,
    'bert_num_blocks': 2,
    'bert_num_heads': 4,
    'bert_hidden_units': 256,
    'bert_dropout': 0.1,
    'model_init_seed': 42,
    'bert_mask_prob': 0.15,
    'metric_ks': [1, 5, 10, 20],
    'lr': 0.001,
    'weight_decay': 0.0,
    'optimizer': 'Adam',
}

args = dotdict(args)

In [18]:
model = BERT(args)
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=100)
trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                    | Params
--------------------------------------------------------
0 | ce          | CrossEntropyLoss        | 0     
1 | embedding   | BERTEmbedding           | 974 K 
2 | encoder     | TransformerEncoderLayer | 789 K 
3 | transformer | TransformerEncoder      | 1.6 M 
4 | out         | Linear                  | 952 K 
--------------------------------------------------------
4.3 M     Trainable params
0         Non-trainable params
4.3 M     Total params
17.187    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 12:  70%|██████▉   | 67/96 [00:03<00:01, 19.64it/s, loss=6.12, v_num=26, train/ce=5.910, Recall@20=0.986, NDCG@20=0.663, Recall@10=0.912, NDCG@10=0.645, Recall@5=0.789, NDCG@5=0.604, Recall@1=0.389, NDCG@1=0.389]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
