In [15]:
pip install -q recommenders

Note: you may need to restart the kernel to use updated packages.


In [16]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
import numpy as np


def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items. A predicted item is scored only at its first occurrence; repeated
    predictions of the same item are ignored.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 when
            `actual` is empty or `k` is not positive.

    """
    # Use len() rather than truthiness: `not actual` raises for multi-element
    # numpy arrays and misreads a one-element array holding a falsy id
    # (e.g. [0]) as empty. A non-positive k would otherwise divide by zero
    # (or by a negative number) below.
    if k <= 0 or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A hit counts only the first time the item shows up in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    # Normalise by the best achievable number of hits within the top k.
    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.

    Pairs each entry of `actual` with the entry of `predicted` at the same
    position, scores every pair with `apk`, and averages the scores. Pairing
    uses `zip`, so surplus entries in the longer input are ignored.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_pair_scores = []
    for truth, ranked in zip(actual, predicted):
        per_pair_scores.append(apk(truth, ranked, k))
    return np.mean(per_pair_scores)

In [17]:
import gc
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
# Raise the threshold of the logger returned by tf.get_logger() to ERROR.
# NOTE(review): "I tensorflow/..." lines still appear in the recorded output of the
# model-building cell, so this setting does not silence every TensorFlow message.
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
import datetime
from pathlib import Path

# Record interpreter and library versions alongside the results.
print(
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
    "Tensorflow version: {}".format(tf.__version__),
    sep="\n",
)

System version: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) 
[GCC 9.4.0]
Pandas version: 1.3.5
Tensorflow version: 2.8.0


In [18]:
# Choose input/output locations: the Kaggle layout is the default, and setting the
# LOCAL environment variable to any non-empty value switches to the local layout.
if not os.getenv('LOCAL'):
    print('kaggle')
    INPUT_DIR, OUTPUT_DIR = Path('../input/transformed'), Path('/kaggle/working')
else:
    print('local')
    INPUT_DIR, OUTPUT_DIR = Path('./input/transformed'), Path('./output')

local


In [19]:
# First day of the validation period; everything from this date on is held out.
valid_start_date = datetime.date(2020, 9, 16)

# Load the transaction log and keep only the columns used in this notebook.
transactions = pd.read_pickle(INPUT_DIR / 'transactions_train.pkl')
transactions = transactions[['user', 'item', 't_dat']]

In [20]:
def train_valid_stats(train, valid):
    """Print size and overlap statistics for a train/validation split.

    Parameters
    ----------
    train, valid : objects supporting ``len()`` and exposing ``.user`` and
        ``.item`` iterables (pandas DataFrames in this notebook).

    Prints the row, distinct-user and distinct-item counts of both inputs,
    then the share of validation users and of validation items that also
    occur in train. A coverage is printed as ``nan`` when the validation
    side has no users/items, instead of raising ZeroDivisionError.

    Returns
    -------
    None
    """
    train_users, train_items = set(train.user), set(train.item)
    valid_users, valid_items = set(valid.user), set(valid.item)

    def coverage(train_ids, valid_ids):
        # Fraction of validation ids that are also present in train.
        if not valid_ids:
            return float('nan')
        return len(train_ids & valid_ids) / len(valid_ids)

    print(f"train transaction: {len(train)}, train user: {len(train_users)}, train item: {len(train_items)}")
    print(f"valid transaction: {len(valid)}, valid user: {len(valid_users)}, valid item: {len(valid_items)}")
    print(f"valid user coverage: {coverage(train_users, valid_users)}")
    print(f"valid item coverage: {coverage(train_items, valid_items)}")

In [21]:
# Date-based hold-out split: `train` is the 7 days immediately before
# `valid_start_date`; `valid` is everything from `valid_start_date` onwards.
print("train vaild split")
train_start_date = valid_start_date - datetime.timedelta(days=7)
train = transactions.query("@train_start_date <= t_dat < @valid_start_date").reset_index(drop=True)
valid = transactions.query("@valid_start_date <= t_dat").reset_index(drop=True)
# NOTE(review): once `transactions` is deleted, re-running this cell on its own
# raises NameError; the cell that loads `transactions` must be re-run first.
del transactions
train_valid_stats(train, valid)

train vaild split
train transaction: 255241, train user: 72019, train item: 18611
valid transaction: 240311, valid user: 68984, valid item: 17986
valid user coverage: 0.18366577757161082
valid item coverage: 0.775992438563327


In [22]:
# Keep only train rows whose item was purchased in the 60 days before
# `valid_start_date`.
# NOTE(review): `recent_items` is built from `train` itself, so when `train` already
# lies entirely inside that 60-day window every item qualifies and nothing is
# removed. Confirm whether the item set was meant to come from a longer history.
print("filter recent items")
train_item_start_date = valid_start_date - datetime.timedelta(days=60)
recent_items = set(train.query("@train_item_start_date <= t_dat < @valid_start_date")['item'])
train = train.query("item in @recent_items").reset_index(drop=True)
train_valid_stats(train, valid)

filter recent items
train transaction: 255241, train user: 72019, train item: 18611
valid transaction: 240311, valid user: 68984, valid item: 17986
valid user coverage: 0.18366577757161082
valid item coverage: 0.775992438563327


In [23]:
# For every user, keep only the purchases made fewer than 21 days before that
# user's most recent purchase in `train`.
print("filter each user recent transactions (train)")
# The string alias 'max' selects pandas' own grouped max; passing the Python builtin
# `max` yields the same values but is deprecated in newer pandas releases.
train['last_t_dat'] = train.groupby('user').t_dat.transform('max')
# Whole days between each purchase and the user's latest purchase.
train['diff_t_dat'] = (train.last_t_dat - train.t_dat).dt.days
train = train.query("diff_t_dat < 21").reset_index(drop=True)
train_valid_stats(train, valid)

filter each user recent transactions (train)
train transaction: 255241, train user: 72019, train item: 18611
valid transaction: 240311, valid user: 68984, valid item: 17986
valid user coverage: 0.18366577757161082
valid item coverage: 0.775992438563327


In [24]:
# Reduce both frames to (user, item) pairs; repeated pairs are collapsed in train
# only, validation keeps every row.
print("drop duplicates (train)")
valid = valid[['user', 'item']]
train = train[['user', 'item']].drop_duplicates(ignore_index=True)
train_valid_stats(train, valid)

drop duplicates (train)
train transaction: 227910, train user: 72019, train item: 18611
valid transaction: 240311, valid user: 68984, valid item: 17986
valid user coverage: 0.18366577757161082
valid item coverage: 0.775992438563327


In [25]:
# Prune `train` in alternating steps: drop users that have only one row, then drop
# items that have only one row. Removing items can leave new single-row users (and
# vice versa), hence the repeated passes.
# NOTE(review): the number of passes is fixed at 3; the recorded output shows the
# last pass still removing rows, so single-row users/items may remain afterwards.
print("drop users and items which apper only once (train)")
for _ in range(3):
    # Users with more than one remaining row.
    users = train.groupby('user').size().reset_index(name='sz').query("sz > 1").user
    train = train.query("user in @users")
    print(train.shape, len(train.user.unique()), len(train.item.unique()))

    # Items with more than one remaining row.
    items = train.groupby('item').size().reset_index(name='sz').query("sz > 1").item
    train = train.query("item in @items")
    print(train.shape, len(train.user.unique()), len(train.item.unique()))

train_valid_stats(train, valid)

drop users and items which apper only once (train)
(205270, 2) 49379 17981
(199564, 2) 49304 12275
(198650, 2) 48390 12275
(198523, 2) 48390 12148
(198494, 2) 48361 12148
(198488, 2) 48361 12142
train transaction: 198488, train user: 48361, train item: 12142
valid transaction: 240311, valid user: 68984, valid item: 17986
valid user coverage: 0.12892844717615679
valid item coverage: 0.6126987657066607


In [26]:
# Put the data into the shape LightGCN can be trained on:
# - no user may appear only in the test (validation) set
# - column names are aligned to userID / itemID
# - a constant `rating` column is added
users = sorted(set(train.user))

valid = (
    valid.query("user in @users")
    .rename(columns={'user': 'userID', 'item': 'itemID'})
    .assign(rating=1)
)
train = (
    train.rename(columns={'user': 'userID', 'item': 'itemID'})
    .assign(rating=1)
)

train.head()

Unnamed: 0,userID,itemID,rating
0,107,2219,1
1,107,61303,1
2,107,13042,1
3,107,92029,1
4,107,61305,1


In [27]:
# Build the data object that the LightGCN model is constructed from; the validation
# rows are passed as its `test` split and the seed is fixed at 42.
data = ImplicitCF(train=train, test=valid, seed=42)

In [28]:
# Hyper-parameters for LightGCN, followed by a timed model construction.
# NOTE(review): the values below are hard-coded; their exact semantics are defined by
# recommenders' `prepare_hparams` — confirm against its documentation before tuning.
hparams = prepare_hparams(
    model_type='lightgcn',
    embed_size=128,
    n_layers=3,
    batch_size=8192,
    decay=0.0001,
    epochs=800,
    learning_rate=0.001,
    eval_epoch=10,
    top_k=12,  # same cutoff as the k=12 default of apk/mapk
    save_model=True,
    save_epoch=100,
    metrics=['recall', 'ndcg', 'precision', 'map'],
    MODEL_DIR=str(OUTPUT_DIR),  # passed as a plain string rather than a Path
)
with Timer() as prepare_time:
    model = LightGCN(hparams, data, seed=42)
# NOTE(review): dropping `data` means this cell cannot be re-run without first
# re-running the cell that creates it.
del data
print(f"{prepare_time.interval} sec")

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2022-03-15 11:19:47.630349: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-15 11:19:47.673058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-15 11:19:47.700703: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-15 11:19:47.700988: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

20.955905377999443 sec


2022-03-15 11:19:48.232946: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-15 11:19:48.233163: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-15 11:19:48.233330: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-15 11:19:48.233717: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8772 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


In [29]:
# Run a garbage-collection pass, then train the model and report the elapsed time.
# NOTE(review): the recorded log for this cell shows 5 epochs in about 5 s, while
# the hyper-parameters request epochs=800 — the saved output looks stale or
# truncated; re-run to confirm.
gc.collect()
with Timer() as train_time:
    model.fit()
print(f"{train_time.interval} sec")

Epoch 1 (train)1.2s: train loss = 0.69266 = (mf)0.69266 + (embed)0.00000
Epoch 2 (train)1.0s: train loss = 0.68939 = (mf)0.68938 + (embed)0.00001
Epoch 3 (train)1.0s: train loss = 0.66974 = (mf)0.66973 + (embed)0.00002
Epoch 4 (train)0.9s: train loss = 0.60849 = (mf)0.60844 + (embed)0.00005
Epoch 5 (train)1.0s: train loss = 0.50853 = (mf)0.50843 + (embed)0.00010
5.035875362000297 sec


In [30]:
# Score the validation users in roughly 10 slices to avoid running out of memory.
tmp = valid[['userID']].drop_duplicates()
# `len(tmp) // 10` is 0 when there are fewer than 10 users and range() rejects a
# zero step, so clamp the slice size to at least 1.
step = max(1, len(tmp) // 10)
# top_k=12 matches the k=12 default of apk/mapk.
# NOTE(review): remove_seen=False presumably keeps items the user already has in
# train eligible for recommendation — confirm against the recommenders docs.
res = [
    model.recommend_k_items(tmp.iloc[i:i+step], top_k=12, remove_seen=False)
    for i in range(0, len(tmp), step)
]
pred = pd.concat(res).reset_index(drop=True)
del tmp

In [31]:
# Collapse the per-row recommendations into one list of predicted items per user.
# The list keeps the row order of `pred` within each user.
# NOTE(review): assumes rows arrive already ranked best-first per user — confirm.
pred = (
    pred.rename(columns={'userID': 'user', 'itemID': 'item_pred'})
    .groupby('user')['item_pred']
    .apply(list)
    .reset_index()
)
pred

Unnamed: 0,user,item_pred
0,86,"[95217, 104072, 104553, 103108, 82628, 3091, 8..."
1,107,"[95217, 103108, 104072, 104553, 82628, 3091, 8..."
2,179,"[95217, 104072, 103108, 104553, 82628, 104527,..."
3,330,"[95217, 104553, 104072, 103108, 82628, 71107, ..."
4,475,"[95217, 103108, 82628, 104072, 82631, 104553, ..."
...,...,...
8889,1371634,"[82827, 88637, 82824, 82837, 74155, 89334, 906..."
8890,1371691,"[95217, 104072, 82628, 82631, 104553, 3091, 10..."
8891,1371747,"[95217, 104072, 104553, 82628, 103108, 82631, ..."
8892,1371813,"[104072, 104553, 95217, 103108, 104527, 82628,..."


In [32]:
# Rebuild the full validation set from disk: every purchase on or after
# `valid_start_date`, for all users.
valid = (
    pd.read_pickle(INPUT_DIR / 'transactions_train.pkl')[['user', 'item', 't_dat']]
    .query("t_dat >= @valid_start_date")[['user', 'item']]
    .rename(columns={'item': 'item_valid'})
    .reset_index(drop=True)
)
# One list of purchased items per user; repeated purchases are kept as-is.
valid = valid.groupby('user')['item_valid'].apply(list).reset_index()
valid

Unnamed: 0,user,item_valid
0,80,[28967]
1,86,[87371]
2,107,"[69711, 77256, 33872]"
3,117,[97391]
4,179,"[102397, 98409, 73, 95784, 103796, 105103, 103..."
...,...,...
68979,1371868,"[97531, 103424]"
68980,1371871,"[94310, 91533]"
68981,1371879,"[2118, 84994, 57078, 84991, 101099, 104036, 87..."
68982,1371937,"[67261, 70640]"


In [34]:
# Score the predictions against the validation purchases.
# `merge` is called without `how`, i.e. pandas' default inner join, so MAP@k is
# averaged only over users present in both `valid` and `pred`; `user_coverage`
# reports which share of validation users that is.
merged = valid.merge(pred, on='user')
mpk = mapk(merged['item_valid'], merged['item_pred'])  # k defaults to 12
users_valid = set(valid['user'])
users_merged = set(merged['user'])
user_coverage = len(users_valid & users_merged) / len(users_valid)
print(f"mapk: {mpk}, user coverage: {user_coverage}")

mapk: 0.01676288000521493, user coverage: 0.12892844717615679


In [35]:
# Write the predictions table to OUTPUT_DIR/pred.csv (row index omitted).
# NOTE(review): if `item_pred` holds Python lists at this point, the CSV stores their
# string representation and a reader has to parse it back.
pred.to_csv(OUTPUT_DIR / 'pred.csv', index=False)
# NOTE(review): presumably exports the learned user and item embeddings to the two
# given files — confirm against the recommenders LightGCN documentation.
model.infer_embedding(OUTPUT_DIR / 'user_emb.csv', OUTPUT_DIR / 'item_emb.csv')