In [14]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import MessagePassing, TopKPooling, global_max_pool, global_mean_pool
from torch_geometric.utils import remove_self_loops, add_self_loops


# loading Data

In [2]:
np.random.seed(42)
data_dir = Path('/home/scc/Downloads/yoochoose-data')

In [3]:
df = pd.read_csv(data_dir.joinpath('yoochoose-clicks.dat'), header=None)
df.columns=['session_id','timestamp','item_id','category']
display(df.head())

buy_df = pd.read_csv(data_dir.joinpath('yoochoose-buys.dat'), header=None)
buy_df.columns=['session_id','timestamp','item_id','price','quantity']
display(buy_df.head())

  df = pd.read_csv(data_dir.joinpath('yoochoose-clicks.dat'), header=None)


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


In [4]:
display(buy_df.nunique())

df.nunique()

session_id     509696
timestamp     1136477
item_id         19949
price             735
quantity           28
dtype: int64

session_id     9249729
timestamp     32937845
item_id          52739
category           340
dtype: int64

In [5]:
filter_ = df.groupby('session_id')['item_id'].size()
print(f'before filter df.shape={df.shape}')
df = df[df.session_id.isin(
    filter_[filter_>2].index
)].reset_index(drop=True)
print(f'after filter df.shape={df.shape}')

before filter df.shape=(33003944, 4)
after filter df.shape=(24628059, 4)


In [6]:
# randomly sample a couple of them
sampled_session_id = np.random.choice(df.session_id.unique(), 1000000, replace=False)
df = df.loc[df.session_id.isin(sampled_session_id)]
df.nunique()

session_id    1000000
timestamp     5546275
item_id         37353
category          260
dtype: int64

In [7]:
df.groupby('session_id')['item_id'].size().mean()

5.548274

# dataLoader

In [8]:
item_encoder  = LabelEncoder()
df['item_id'] = item_encoder.fit_transform(df.item_id)
df.head()

Unnamed: 0,session_id,timestamp,item_id,category
10,3,2014-04-02T13:17:46.940Z,21302,0
11,3,2014-04-02T13:26:02.515Z,25022,0
12,3,2014-04-02T13:30:12.318Z,29039,0
37,19,2014-04-01T20:52:12.357Z,5749,0
38,19,2014-04-01T20:52:13.758Z,5749,0


In [9]:
df['label'] = df.session_id.isin(buy_df.session_id) * 1
df.head()

Unnamed: 0,session_id,timestamp,item_id,category,label
10,3,2014-04-02T13:17:46.940Z,21302,0,0
11,3,2014-04-02T13:26:02.515Z,25022,0,0
12,3,2014-04-02T13:30:12.318Z,29039,0,0
37,19,2014-04-01T20:52:12.357Z,5749,0,0
38,19,2014-04-01T20:52:13.758Z,5749,0,0


In [10]:
df.drop_duplicates('session_id')['label'].mean()

0.085507

In [11]:
from typing import List, Tuple, Union


class YooChooseDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])
    
    @property
    def processed_file_names(self) -> str | List[str] | Tuple:
        return [data_dir.joinpath('yoochoose_click_binary_1M_sess.dataset')]

    def process(self):
        data_list = []
        # process by session_id
        # graph level task
        gp = df.groupby('session_id')
        tq_bar = tqdm(gp, total=df.session_id.nunique())
        for session_id, tmp_df in tq_bar:
            tq_bar.set_description(f'session_id={session_id}')
            sess_item_id = LabelEncoder().fit_transform(tmp_df.item_id)
            tmp_df['sess_item_id'] = sess_item_id
            tmp_df.sort_values(by='timestamp', ignore_index=True, inplace=True)
            node_features = torch.LongTensor(
                tmp_df.item_id.drop_duplicates().values
            ).unsqueeze(1)
            # 0->1 1->2
            source_nodes = tmp_df.sess_item_id.values[:-1]
            target_nodes = tmp_df.sess_item_id.values[1:]
            edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
            data = Data(x=node_features, edge_index=edge_index, 
                        y=torch.FloatTensor([tmp_df.label.values[0]]))
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


In [13]:
dataset = YooChooseDataset(root=data_dir)

Processing...
session_id=11562158: 100%|██████████| 1000000/1000000 [13:35<00:00, 1226.14it/s]
Done!


In [15]:
dataset = dataset.shuffle()
train_dataset = dataset[:800000]
val_dataset = dataset[800000:900000]
test_dataset = dataset[900000:]
len(train_dataset), len(val_dataset), len(test_dataset)

(800000, 100000, 100000)

In [16]:
batch_size=1024
tr_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
te_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [17]:
num_items = df.item_id.max() + 1
num_items

37353

# model Arch

In [18]:
class SAGEConv(MessagePassing):
    def __init__(self, in_c, out_c):
        super().__init__(aggr='max')
        self.lin = nn.Linear(in_c, out_c)
        self.act = nn.ReLU()
        self.update_lin = nn.Linear(in_c + out_c, in_c, bias=False)
        self.update_act = nn.ReLU()
    
    def forward(self, x, edge_index):
        # x: [N, In_c]
        # edge_index [2, E]
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)
    
    def message(self, x_j):
        return self.act(self.lin(x_j))

    def update(self, aggr_out, x):
        # aggr_out: [N, out_c]
        new_emb = torch.cat([aggr_out, x], dim=1)
        new_emb = self.update_act(self.update_lin(new_emb))
        return new_emb

In [31]:
class Net(nn.Module):
    def __init__(self, emb_num, emb_dim=128):
        super(Net, self).__init__()
        self.item_emb = nn.Embedding(num_embeddings=emb_num, embedding_dim=emb_dim)
        self.conv1 = SAGEConv(emb_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)

        self.head = nn.Sequential(OrderedDict([
            ('lin1', nn.Linear(256, 128)),
            ('act1', nn.ReLU()),
            ('lin2', nn.Linear(128, 64)),
            ('act2', nn.ReLU()),
            ('dropout', nn.Dropout(p=0.4)),
            ('lin3', nn.Linear(64, 1)),
        ]))
    
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_emb(x)
        x = x.squeeze(1)
        # conv1
        x = F.relu(self.conv1(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
        # conv2 after pool1(conv1)
        x = F.relu(self.conv2(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
        # conv3 after pool2(conv2)
        x = F.relu(self.conv3(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)

        x = x1 + x2 + x3
        x = self.head(x)
        return torch.sigmoid(x).squeeze(1)

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = Net(emb_num=num_items, emb_dim=128).to(device)


cuda


# Training

In [35]:
optimizer = Adam(model.parameters(), lr=0.005)
crit = nn.BCELoss()

def train(train_loader):
    model.train()
    loss_all = 0
    tr_tq = tqdm(train_loader, total=len(train_loader))
    for data in tr_tq:
        optimizer.zero_grad()
        data = data.to(device)
        label = data.y.to(device)
        out = model(data)
        loss = crit(out, label)
        loss_item = loss.cpu().detach().item()
        loss_all += data.num_graphs * loss_item
        loss.backward()
        optimizer.step()
        tr_tq.set_postfix({'loss': '{:.5f}'.format(loss_item)})
    return loss_all / len(train_dataset)


@torch.no_grad()
def evaluate(loader):
    model.eval()
    predictions, labels = [], []
    for data in loader:
        data = data.to(device)
        pred = model(data).detach().cpu().numpy()
        label = data.y.detach().cpu().numpy()
        predictions.append(pred)
        labels.append(label)
    
    return roc_auc_score(
        np.hstack(labels),
        np.hstack(predictions)
    )


In [36]:
for epoch in range(1):
    loss = train(tr_loader)
    train_acc = evaluate(tr_loader)
    val_acc = evaluate(val_loader)    
    test_acc = evaluate(te_loader)
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.
          format(epoch, loss, train_acc, val_acc, test_acc))

  0%|          | 0/782 [00:00<?, ?it/s]

100%|██████████| 782/782 [01:19<00:00,  9.89it/s, loss=0.27779]


Epoch: 000, Loss: 0.27991, Train Auc: 0.76778, Val Auc: 0.72351, Test Auc: 0.72211
