In [None]:
# !mkdir /content/etc && git clone https://github.com/dao-v/Movie_Recommendation_System.git /content/etc

In [None]:
!mkdir -p /content/data
%cd /content/data
!curl -Lo yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
!7z x yoochoose-data.7z

# import zipfile
# def extract(filename):
#     print('Extracting {}...'.format(filename))
#     zip_ref = zipfile.ZipFile(filename, 'r')
#     zip_ref.extractall('data')
#     zip_ref.close()

In [6]:
import torch
import argparse
import logging
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

%reload_ext google.colab.data_table

In [None]:
clicks = pd.read_csv('yoochoose-clicks.dat', header=None, low_memory=False,
                     dtype={0:np.int32, 1:str, 2:np.int64, 3:str},
                     names=["session_id", "timestamp", "item_id", "category"])
clicks.head(20)

In [None]:
buys = pd.read_csv('yoochoose-buys.dat', header=None, low_memory=False,
                   dtype={0:np.int32, 1:str, 2:np.int64, 3:np.int64, 4:np.int64},
                   names=["session_id", "timestamp", "item_id", "price", "quantity"])
buys.head(20)

In [9]:
# query = "item_id==@item_id & session_id==@session_id"
query = "session_id==@session_id"

session_id = 11
print(clicks.query(query))
print(buys.query(query))

    session_id                 timestamp    item_id category
24          11  2014-04-03T10:44:35.672Z  214821275        0
25          11  2014-04-03T10:45:01.674Z  214821275        0
26          11  2014-04-03T10:45:29.873Z  214821371        0
27          11  2014-04-03T10:46:12.162Z  214821371        0
28          11  2014-04-03T10:46:57.355Z  214821371        0
29          11  2014-04-03T10:53:22.572Z  214717089        0
30          11  2014-04-03T10:53:49.875Z  214563337        0
31          11  2014-04-03T10:55:19.267Z  214706462        0
32          11  2014-04-03T10:55:47.327Z  214717436        0
33          11  2014-04-03T10:56:30.520Z  214743335        0
34          11  2014-04-03T10:57:19.331Z  214826837        0
35          11  2014-04-03T10:57:39.433Z  214819762        0
    session_id                 timestamp    item_id  price  quantity
10          11  2014-04-03T11:04:11.417Z  214821371   1046         1
11          11  2014-04-03T11:04:18.097Z  214821371   1046         1


## PyTorch Geometric

In [None]:
!pip install torch-geometric \
  torch-sparse==latest+cu101 \
  torch-scatter==latest+cu101 \
  torch-cluster==latest+cu101 \
  -f https://pytorch-geometric.com/whl/torch-1.7.0.html

In [49]:
import logging
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from torch_geometric.data import Data, DataLoader
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops

from sklearn.metrics import roc_auc_score
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_dim = 128
batch_size = 1024
num_embeds = 52739

c_file = '/content/data/yoochoose-clicks.dat'
b_file = '/content/data/yoochoose-buys.dat'

c_index = ["session_id", "timestamp", "item_id", "category"]
b_index = ["session_id", "timestamp", "item_id", "price", "quantity"]

test_data_file = '/content/data/yoochoose-test.dat'

# Training parameters
num_workers = 4  # for data-loading
grad_clip = 5.  # clip gradients at an absolute value of
print_freq = 10  # print training/validation stats  every __ batches
checkpoint = None  # path to checkpoint, None if none

In [3]:
def clip_gradient(optimizer, grad_clip):
    """
    Clips gradients computed during backpropagation to avoid explosion of gradients.
    :param optimizer: optimizer with the gradients to be clipped
    :param grad_clip: clip value
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)


def save_checkpoint(epoch, epochs_since_improvement, model, optimizer, acc, is_best):
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'acc': acc,
             'model': model,
             'optimizer': optimizer}
    # filename = 'checkpoint_' + str(epoch) + '_' + str(loss) + '.tar'
    filename = 'checkpoint.tar'
    torch.save(state, filename)
    # If this checkpoint is the best so far, store a copy so it doesn't get overwritten by a worse checkpoint
    if is_best:
        torch.save(state, 'BEST_checkpoint.tar')


class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class LossMeterBag(object):

    def __init__(self, name_list):
        self.meter_dict = dict()
        self.name_list = name_list
        for name in self.name_list:
            self.meter_dict[name] = AverageMeter()

    def update(self, val_list):
        for i, name in enumerate(self.name_list):
            val = val_list[i]
            self.meter_dict[name].update(val)

    def __str__(self):
        ret = ''
        for name in self.name_list:
            ret += '{0}:\t {1:.4f}({2:.4f})\t'.format(name, self.meter_dict[name].val, self.meter_dict[name].avg)

        return ret


def adjust_learning_rate(optimizer, shrink_factor):
    """
    Shrinks learning rate by a specified factor.
    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """

    print("\nDECAYING learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))


def get_learning_rate(optimizer):
    return optimizer.param_groups[0]['lr']


def accuracy(scores, targets, k=1):
    batch_size = targets.size(0)
    _, ind = scores.topk(k, 1, True, True)
    correct = ind.eq(targets.view(-1, 1).expand_as(ind))
    correct_total = correct.view(-1).float().sum()  # 0D tensor
    return correct_total.item() * (100.0 / batch_size)

In [4]:
clicks = pd.read_csv(c_file, header=None, names=c_index, low_memory=False)
clicks.head(20)

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0
5,2,2014-04-07T13:57:19.373Z,214662742,0
6,2,2014-04-07T13:58:37.446Z,214825110,0
7,2,2014-04-07T13:59:50.710Z,214757390,0
8,2,2014-04-07T14:00:38.247Z,214757407,0
9,2,2014-04-07T14:02:36.889Z,214551617,0


In [5]:
buys = pd.read_csv(b_file, header=None, names=b_index)
buys.head(20)

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1
5,140806,2014-04-07T09:22:28.132Z,214668193,523,1
6,140806,2014-04-07T09:22:28.176Z,214587399,1046,1
7,140806,2014-04-07T09:22:28.219Z,214586690,837,1
8,140806,2014-04-07T09:22:28.268Z,214774667,1151,1
9,140806,2014-04-07T09:22:28.280Z,214578823,1046,1


In [6]:
item_encoder = LabelEncoder()
clicks['item_id'] = item_encoder.fit_transform(clicks.item_id)
clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,2053,0
1,1,2014-04-07T10:54:09.868Z,2052,0
2,1,2014-04-07T10:54:46.998Z,2054,0
3,1,2014-04-07T10:57:00.306Z,9876,0
4,2,2014-04-07T13:56:37.614Z,19448,0


In [7]:
# randomly sample a couple of them
sampled_session_id = np.random.choice(clicks.session_id.unique(), 1000000, replace=False)
clicks = clicks.loc[clicks.session_id.isin(sampled_session_id)]
clicks.nunique()

session_id    1000000
timestamp     3565095
item_id         35621
category          235
dtype: int64

In [8]:
clicks['label'] = clicks.session_id.isin(buys.session_id)
clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category,label
38,13,2014-04-06T14:50:13.638Z,44324,0,False
39,13,2014-04-06T14:52:54.363Z,22423,0,False
40,13,2014-04-06T14:53:18.268Z,44324,0,False
172,54,2014-04-01T07:33:07.173Z,39369,0,False
173,54,2014-04-01T07:40:28.688Z,29009,0,False


In [9]:
processed_path = '/content/data/yoochoose_click_binary_1M_sess.dataset'
with open(processed_path, 'wb') as f:
    torch.save(clicks, f)

In [10]:
num_embeddings = clicks.item_id.max() + 1
print('num_embeddings: ' + str(num_embeddings))

num_embeddings: 52710


In [45]:
df = torch.load('/content/data/yoochoose_click_binary_1M_sess.dataset')

In [46]:
class YooChooseBinaryDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return ['yoochoose_click_binary_1M_sess.dataset']

    def download(self):
        pass

    def process(self):
        data_list = []

        # process by session_id
        grouped = df.groupby('session_id')
        for session_id, group in tqdm(grouped):
            sess_item_id = LabelEncoder().fit_transform(group.item_id)
            group = group.reset_index(drop=True)
            group['sess_item_id'] = sess_item_id
            node_features = group.loc[group.session_id == session_id, ['sess_item_id', 'item_id']].sort_values(
                'sess_item_id').item_id.drop_duplicates().values

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            target_nodes = group.sess_item_id.values[1:]
            source_nodes = group.sess_item_id.values[:-1]

            edge_index = torch.tensor([source_nodes,
                                       target_nodes], dtype=torch.long)
            x = node_features

            y = torch.FloatTensor([group.label.values[0]])

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [48]:
class SAGEConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(SAGEConv, self).__init__(aggr='max')  # "Max" aggregation.
        self.lin = torch.nn.Linear(in_channels, out_channels)
        self.act = torch.nn.ReLU()
        self.update_lin = torch.nn.Linear(in_channels + out_channels, in_channels, bias=False)
        self.update_act = torch.nn.ReLU()

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    def message(self, x_j):
        # x_j has shape [E, in_channels]

        x_j = self.lin(x_j)
        x_j = self.act(x_j)

        return x_j

    def update(self, aggr_out, x):
        # aggr_out has shape [N, out_channels]

        new_embedding = torch.cat([aggr_out, x], dim=1)

        new_embedding = self.update_lin(new_embedding)
        new_embedding = self.update_act(new_embedding)
        return new_embedding

In [50]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = SAGEConv(embed_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)
        self.item_embedding = torch.nn.Embedding(num_embeddings=num_embeds, embedding_dim=embed_dim)
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, 1)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_embedding(x)
        x = x.squeeze(1)

        x = F.relu(self.conv1(x, edge_index))

        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index))

        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index))

        x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = torch.sigmoid(self.lin3(x)).squeeze(1)

        return x

In [76]:
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        model = Net()
        model = nn.DataParallel(model)

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Loss function
    criterion = nn.BCELoss()

    # Custom dataloaders
    dataset = YooChooseBinaryDataset(root='/content/data/')
    dataset = dataset.shuffle()
    train_dataset = dataset[:800000]
    val_dataset = dataset[800000:900000]
    test_dataset = dataset[900000:]
    len(train_dataset), len(val_dataset), len(test_dataset)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           epoch=epoch)

        writer.add_scalar('model/train_loss', train_loss, epoch)

        # One epoch's validation
        train_acc = evaluate(train_loader, model)
        val_acc = evaluate(val_loader, model)
        test_acc = evaluate(test_loader, model)
        print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.
              format(epoch, train_loss, train_acc, val_acc, test_acc))

        writer.add_scalar('model/train_acc', train_acc, epoch)
        writer.add_scalar('model/val_acc', val_acc, epoch)
        writer.add_scalar('model/test_acc', test_acc, epoch)

        # Check if there was an improvement
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer, best_acc, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    model.train()  # train mode (dropout and batchnorm is used)

    losses = AverageMeter()

    # Batches
    for i, data in enumerate(train_loader):
        # Move to GPU, if available
        data = data.to(device)
        label = data.y.to(device)

        # Forward prop.
        out = model(data)

        # Calculate loss
        loss = criterion(out, label)

        # Back prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        clip_gradient(optimizer, grad_clip)

        # Update weights
        optimizer.step()

        # Keep track of metrics
        losses.update(loss.item())

        # Print status
        if i % print_freq == 0:
            status = 'Epoch: [{0}][{1}/{2}]\t' \
                     'Loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(epoch, i,
                                                                     len(train_loader),
                                                                     loss=losses,
                                                                     )

    return losses.avg


def evaluate(loader, model):
    model.eval()  # eval mode (dropout and batchnorm is NOT used)

    predictions = []
    labels = []

    # Batches
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            pred = model(data).detach().cpu().numpy()

            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)

    return roc_auc_score(labels, predictions)

In [74]:
class Args:
    lr = 0.005
    end_epoch = 1000
    weight_decay = 0.0
    batch_size = 32
    checkpoint = None

In [86]:
ag = Args()
train_net(ag)

Epoch: 000, Loss: 0.20574, Train Auc: 0.78789, Val Auc: 0.73940, Test Auc: 0.73935


PicklingError: ignored

In [84]:
# !sudo apt update -y
# !sudo apt install python3.7
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 2
# !sudo update-alternatives --config python3
# !python -V

There are 2 choices for the alternative python3 (providing /usr/bin/python3).

  Selection    Path                Priority   Status
------------------------------------------------------------
* 0            /usr/bin/python3.7   2         auto mode
  1            /usr/bin/python3.6   1         manual mode
  2            /usr/bin/python3.7   2         manual mode

Press <enter> to keep the current choice[*], or type selection number: 2
