# Getting implicit ratings from movielens 1M dataset.

In [1]:
import os
from tqdm import tqdm
import random
from urllib.request import urlretrieve
from zipfile import ZipFile
import numpy as np
from absl import flags
import torch
import pytorch_lightning as pl
import torch.nn as nn
import torch.utils.data as data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
FLAGS = flags.FLAGS
flags.DEFINE_string("file_path", "ml-1m/ratings.dat", "file path.")
flags.DEFINE_string("train_path", "ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.")
flags.DEFINE_string("val_path", "ml-1m/ml_seq_val.txt", "val path.")
flags.DEFINE_string("test_path", "ml-1m/ml_seq_test.txt", "test path.")
flags.DEFINE_string("meta_path", "ml-1m/ml_seq_meta.txt", "meta path.")
flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.")
flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.")
flags.DEFINE_integer("gru_layers", 2, "The number of GRU Layers.")
flags.DEFINE_integer("gru_unit", 128, "The unit of GRU Layer.")
flags.DEFINE_string("gru_activation", "tanh", "Activation Name.")
flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. Dropout of user and item MLP layer.")
flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.")
flags.DEFINE_string("loss_name", "bpr_loss", "Loss Name.")
flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.")
flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.")
flags.DEFINE_integer("seq_len", 16, "The length of user's behavior sequence.")
flags.DEFINE_integer("epochs", 10, "train steps.")
flags.DEFINE_integer("batch_size", 128, "Batch Size.")
flags.DEFINE_integer("test_neg_num", 4, "The number of test negative samples.")
flags.DEFINE_integer("k", 10, "recall k items at test stage.")

<absl.flags._flagvalues.FlagHolder at 0x7f601434dcc0>

In [2]:
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
ZipFile("movielens.zip", "r").extractall("../data/")

In [4]:
# sequence recommendation
def split_seq_data(file_path):
    """split movielens for sequence recommendation
    Args:
        :param file_path: A string. The file path of 'ratings.dat'.
    :return: train_path, val_path, test_path, meta_path
    """
    dst_path = os.path.dirname(file_path)
    train_path = os.path.join(dst_path, "ml_seq_train.txt")
    val_path = os.path.join(dst_path, "ml_seq_val.txt")
    test_path = os.path.join(dst_path, "ml_seq_test.txt")
    meta_path = os.path.join(dst_path, "ml_seq_meta.txt")
    users, items = set(), set()
    history = {}
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            user, item, score, timestamp = line.strip().split("::")
            users.add(int(user))
            items.add(int(item))
            history.setdefault(int(user), [])
            history[int(user)].append([item, timestamp])
        random.shuffle(list(users))
    with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3:
        for user, hist in history.items():
            hist_u = history[int(user)]
            hist_u.sort(key=lambda x: x[1])
            hist = [x[0] for x in hist_u]
            time = [x[1] for x in hist_u]
            f1.write(str(user) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + '\n')
            f2.write(str(user) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + "\t" + hist[-2] + '\n')
            f3.write(str(user) + "\t" + ' '.join(hist[:-1]) + "\t" + ' '.join(time[:-1]) + "\t" + hist[-1] + '\n')
    with open(meta_path, 'w') as f:
        f.write(str(max(users)) + '\t' + str(max(items)))
    return train_path, val_path, test_path, meta_path

In [7]:
train_path, val_path, test_path, meta_path = split_seq_data(file_path="../data/ml-1m/ratings.dat")

100%|██████████| 1000209/1000209 [00:03<00:00, 282181.77it/s]


In [3]:
def load_seq_data(file_path, mode, seq_len, neg_num, max_item_num, contain_user=False, contain_time=False):
    """load sequence movielens dataset.
    Args:
        :param file_path: A string. The file path.
        :param mode: A string. "train", "val" or "test".
        :param seq_len: A scalar(int). The length of sequence.
        :param neg_num: A scalar(int). The negative num of one sample.
        :param max_item_num: A scalar(int). The max index of item.
        :param contain_user: A boolean. Whether including user'id input or not.
        :param contain_time: A boolean. Whether including time sequence input or not.
    :return: A dict. data.
    """
    users, click_seqs, time_seqs, pos_items, neg_items = [], [], [], [], []
    with open(file_path) as f:
        lines = f.readlines()
        for line in tqdm(lines):
            if mode == "train":
                user, click_seq, time_seq = line.split('\t')
                click_seq = click_seq.split(' ')
                click_seq = [int(x) for x in click_seq]
                time_seq = time_seq.split(' ')
                for i in range(len(click_seq)-1):
                    if i + 1 >= seq_len:
                        tmp = click_seq[i+1-seq_len:i+1]
                        tmp2 = time_seq[i + 1 - seq_len:i + 1]
                    else:
                        tmp = [0] * (seq_len-i-1) + click_seq[:i+1]
                        tmp2 = [0] * (seq_len - i - 1) + time_seq[:i + 1]
                    # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num)
                    # neg_item = [neg_item for neg_item in gen_neg]
                    neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)]
                    users.append(int(user))
                    click_seqs.append(tmp)
                    time_seqs.append(tmp2)
                    pos_items.append(click_seq[i + 1])
                    neg_items.append(neg_item)
            else:  # "val", "test"
                user, click_seq, time_seq, pos_item = line.split('\t')
                click_seq = click_seq.split(' ')
                click_seq = [int(x) for x in click_seq]
                time_seq = time_seq.split(' ')
                if len(click_seq) >= seq_len:
                    tmp = click_seq[len(click_seq) - seq_len:]
                    tmp2 = time_seq[len(time_seq) - seq_len:]
                else:
                    tmp = [0] * (seq_len - len(click_seq)) + click_seq[:]
                    tmp2 = [0] * (seq_len - len(time_seq)) + time_seq[:]
                # Currently getting negative samples at random.
                # Retrieve negative samples by checking ratings, if < 
                neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)]
                users.append(int(user))
                click_seqs.append(tmp)
                time_seqs.append(tmp2)
                pos_items.append(int(pos_item))
                neg_items.append(neg_item)
    data = list(zip(users, click_seqs, time_seqs, pos_items, neg_items))
    random.shuffle(data)
    users, click_seqs, time_seqs, pos_items, neg_items = zip(*data)
    data = {'click_seq': np.array(click_seqs), 'pos_item': np.array(pos_items), 'neg_item': np.array(neg_items)}
    if contain_user:
        data['user'] = np.array(users)
    if contain_time:
        data['time_seq'] = np.array(time_seqs)
    return data

In [6]:
with open(meta_path) as f:
        _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')]

In [5]:
train_path = "../data/ml_seq_train.txt"
val_path = "../data/ml_seq_val.txt"
test_path = "../data/ml_seq_test.txt"
meta_path = "../data/ml_seq_meta.txt"

In [7]:
train_data = load_seq_data(train_path, "train", seq_len=16, neg_num=4, max_item_num=max_item_num)
val_data = load_seq_data(val_path, "val", seq_len=16, neg_num=4, max_item_num=max_item_num)
test_data = load_seq_data(test_path, "test", seq_len=16, neg_num=4, max_item_num=max_item_num)

100%|██████████| 6040/6040 [00:08<00:00, 691.61it/s] 
100%|██████████| 6040/6040 [00:00<00:00, 22027.87it/s]
100%|██████████| 6040/6040 [00:00<00:00, 20644.35it/s]


In [59]:
print(train_data['pos_item'])

[ 512  903 1479 ... 1274 2268  923]


In [8]:
model_params = {
    'item_num': max_item_num + 1,
    'embed_dim': 64,
    'gru_layers': 2,
    'gru_unit': 128,
    'gru_activation': 'tanh',
    'dnn_dropout': 0,
    'use_l2norm': False,
    'loss_name': 'bpr_loss',
    'gamma': 0.5,
    'embed_reg': 0.0
}


In [9]:
class MovieDataset(data.Dataset):
    """Movie dataset."""

    def __init__(
        self, data,test=False
    ):
        """
        Args:
            csv_file (string): Path to the csv file with user,past,future.
        """
        self.data = data
        self.test = test

    def __len__(self):
        return len(self.data['click_seq'])

    def __getitem__(self, idx):
        # print(self.data)
        seq_clicks = self.data['click_seq'][idx]
        pos_item = self.data['pos_item'][idx]
        neg_item = self.data['neg_item'][idx]
        
        seq_clicks = torch.LongTensor(seq_clicks)
        # pos_item = torch.LongTensor(pos_item)
        neg_item = torch.LongTensor(neg_item)
        
        return seq_clicks, pos_item, neg_item

In [70]:
train_ds = MovieDataset(train_data)
train_ds.__getitem__(0)
# print(train_ds.__len__())

(tensor([ 339, 2012, 2961, 2052, 1911, 1690,  366,  426,  879,  332, 1999, 2107,
         1324, 2995, 2315,  196]),
 512,
 tensor([2580, 2894, 1414, 2024]))

In [6]:
def get_loss(pos_scores, neg_scores, loss_name, gamma=None):
    """Get loss scores.
    Args:
        :param pos_scores: A tensor with shape of [batch_size, 1].
        :param neg_scores: A tensor with shape of [batch_size, neg_num].
        :param loss_name: A string such as 'bpr_loss', 'hing_loss' and etc.
        :param gamma: A scalar(int). If loss_name == 'hinge_loss', the gamma must be valid.
    :return:
    """
    pos_scores = torch.tile(pos_scores, [1, neg_scores.shape[1]])
    if loss_name == 'bpr_loss':
        loss = bpr_loss(pos_scores, neg_scores)
    return loss

def bpr_loss(pos_scores, neg_scores):
    """bpr loss.
    Args:
        :param pos_scores: A tensor with shape of [batch_size, 1].
        :param neg_scores: A tensor with shape of [batch_size, neg_num].
    :return:
    """
    loss = torch.mean(-torch.log(torch.special.expit(pos_scores - neg_scores)))
    return loss

In [14]:
class GRU4Rec(pl.LightningModule):
    def __init__(self, item_num, embed_dim, gru_layers=1, gru_unit=64, gru_activation='tanh',
                 dnn_dropout=0., use_l2norm=False, loss_name='bpr_loss', gamma=0.5, embed_reg=0., seed=None):
        """GRU4Rec, Sequential Recommendation Model.
        Args:
            :param item_num: An integer type. The largest item index + 1.
            :param embed_dim: An integer type. Embedding dimension of item vector.
            :param gru_layers: An integer type. The number of GRU Layers.
            :param gru_unit:An integer type. The unit of GRU Layer.
            :param gru_activation: A string. The name of activation function. Default 'tanh'.
            :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer.
            :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not.
            :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or
            pair-loss function as 'bpr_loss'、'hinge_loss'.
            :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin.
            :param embed_reg: A float type. The regularizer of embedding.
            :param seed: A Python integer to use as random seed.
        :return:
        """
        super().__init__()
        super(GRU4Rec, self).__init__()
        self.save_hyperparameters()
        self._device = torch.device('cpu')
        self.item_embedding = nn.Embedding(num_embeddings=item_num,
                                           embedding_dim=embed_dim,
                                           norm_type=2 if use_l2norm else 0, max_norm=0.0)
        self.dropout = nn.Dropout(dnn_dropout)
        # self.gru_layers = [
        #     nn.GRU(input_size=gru_unit, )
        #     if i < gru_layers - 1 else
        #     GRU(units=gru_unit, activation=gru_activation, return_sequences=False)
        #     for i in range(gru_layers)
        # ]
        self.gru_layers = nn.GRU(input_size=embed_dim, hidden_size=gru_unit,
                                 num_layers=gru_layers, dropout=dnn_dropout, batch_first=True)
        self.dense = nn.Linear(embed_dim, embed_dim)
        # norm
        self.use_l2norm = use_l2norm
        # loss name
        self.loss_name = loss_name
        self.gamma = gamma
        self.hidden_size = gru_unit
        self.activation = self.final_activation("tanh")
        self.hidden = self.init_hidden()
        # seed
        # tf.random.set_seed(seed)

    def create_final_activation(self, final_act):
        if final_act == 'tanh':
            self.final_activation = nn.Tanh()
        elif final_act == 'relu':
            self.final_activation = nn.ReLU()
        elif final_act == 'softmax':
            self.final_activation = nn.Softmax()
        elif final_act == 'softmax_logit':
            self.final_activation = nn.LogSoftmax()
        elif final_act.startswith('elu-'):
            self.final_activation = nn.ELU(
                alpha=float(final_act.split('-')[1]))
        elif final_act.startswith('leaky-'):
            self.final_activation = nn.LeakyReLU(
                negative_slope=float(final_act.split('-')[1]))

    def embed_inputs(self, inputs):
        """Embedding inputs.
        Args:
            :param inputs: A list of tensors. The first tensor is seq_clicks tensor, the second tensor is pos_items tensor, the third tensor is neg_items tensor.
        :return:
        """
        seq_clicks, pos_item, neg_item = inputs
        seq_embed = self.item_embedding(seq_clicks)
        print(f"Seq Embedding Shape: {seq_embed.size()}")
        pos_info = self.item_embedding(pos_item)
        print(f"Pos Embedding Shape: {pos_info.size()}")
        neg_items = self.item_embedding(neg_item)
        print(f"Neg Embedding Shape: {neg_items.size()}")

        # Create mask for seq_clicks, 0 for padding, 1 for real value.
        print(f"Seq Clicks Shape: {seq_clicks.size()}")
        print(f"Seq Clicks: {seq_clicks}")
        seq_mask = [[1.0 if x != 0 else 0.0 for x in sequences] for sequences in seq_clicks]
        # Convert seq_mask to torch float tensor.
        print(f"Seq Mask: {seq_mask}")
        print(f"Seq Mask Shape: {len(seq_mask)}")
        seq_mask = torch.FloatTensor(seq_mask)
        print(f"Seq Mask Shape: {seq_mask.size()}")
        return seq_embed, pos_info, neg_items, seq_mask

    def forward(self, batch, hidden):

        seq_embed, pos_info, neg_info, seq_mask = self.embed_inputs(batch)
        # mask
        # mask = tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32)  # (None, seq_len)
        seq_embed = torch.mul(seq_embed, seq_mask.unsqueeze(-1))
        # dropout
        seq_info = self.dropout(seq_embed)
        # gru
        seq_info, hidden = self.gru_layers(seq_info, hidden)
        # Removing sequences from the output of the GRU layer
        seq_info = seq_info[:, -1, :]
        seq_info = self.dense(seq_info)
        seq_info = self.activation(seq_info)
        # norm
        # if self.use_l2norm:
        #     pos_info = tf.math.l2_normalize(pos_info, axis=-1)
        #     neg_info = tf.math.l2_normalize(neg_info, axis=-1)
        #     seq_info = tf.math.l2_normalize(seq_info, axis=-1)
        # calculate positive item scores and negative item scores
        pos_scores = torch.sum(torch.mul(seq_info, pos_info), 1, keepdim=True)
        neg_scores = torch.sum(torch.mul(seq_info.unsqueeze(1), neg_info), 2)
        # loss
        self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma))
        # logits = tf.concat([pos_scores, neg_scores], axis=-1)
        # return logits
        return pos_scores, neg_scores

    def training_step(self, batch, batch_idx):

        pos_scores, neg_scores = self(batch, self.hidden)
        loss = get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)
        # self.log(
        #     "train/mae", mae, on_step=True, on_epoch=False, prog_bar=False
        # )
        
        # self.log(
        #     "train/rmse", rmse, on_step=True, on_epoch=False, prog_bar=False
        # )
        
        # self.log("train/step_loss", loss, on_step=True, on_epoch=False, prog_bar=False)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.0005)

    def setup(self, stage=None):
        print("Loading datasets")
        self.train_dataset = MovieDataset(train_data)
        self.val_dataset = MovieDataset(val_data)
        self.test_dataset = MovieDataset(test_data)
        print("Done")

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=128,
            shuffle=False,
            num_workers=os.cpu_count(),
        )
    
    def init_hidden(self):
        '''
        Initialize the hidden state of the GRU
        '''
        try:
            h0 = torch.zeros(2, 128, self.hidden_size).to(self._device)
        except:
            self._device = 'cpu'
            h0 = torch.zeros(2, 128, self.hidden_size).to(self._device)
        return h0


In [15]:
model = GRU4Rec(**model_params)
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name           | Type      | Params
---------------------------------------------
0 | item_embedding | Embedding | 252 K 
1 | dropout        | Dropout   | 0     
2 | gru_layers     | GRU       | 173 K 
---------------------------------------------
426 K     Trainable params
0         Non-trainable params
426 K     Total params
1.706     Total estimated model params size (MB)


Loading datasets
Done
Epoch 0:   0%|          | 0/7673 [00:00<?, ?it/s] Seq Embedding Shape: torch.Size([128, 16, 64])
Pos Embedding Shape: torch.Size([128, 64])
Neg Embedding Shape: torch.Size([128, 4, 64])
Seq Clicks Shape: torch.Size([128, 16])
Seq Clicks: tensor([[ 231, 2836, 2694,  ..., 1821,  333,  719],
        [  19,   65,  102,  ..., 1179, 2291,  535],
        [ 927,  934, 1125,  ..., 2779, 3046, 2109],
        ...,
        [ 593,  223,  663,  ..., 1641, 1500, 2302],
        [2683, 2706, 2572,  ..., 2688, 2959, 2333],
        [2861, 1580, 2622,  ..., 2709,  586,    4]])


RuntimeError: Boolean value of Tensor with more than one value is ambiguous