In [None]:
pip install -q recommenders

In [None]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
import numpy as np


def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter).
             Lists, tuples, sets and NumPy arrays are accepted.
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            ``actual`` is empty or None.

    """
    if len(predicted) > k:
        predicted = predicted[:k]

    # Test emptiness with len() instead of truthiness: `not actual` raises
    # ValueError when `actual` is a NumPy array with more than one element.
    if actual is None or len(actual) == 0:
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A relevant item only counts the first time it is predicted.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.

    Pairs up the entries of ``actual`` and ``predicted`` position by
    position, scores every pair with ``apk`` and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_pair_scores = []
    for truth, ranked in zip(actual, predicted):
        per_pair_scores.append(apk(truth, ranked, k))
    return np.mean(per_pair_scores)

In [None]:
import gc
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
import datetime
from pathlib import Path

# Record the interpreter and library versions this run used.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

In [None]:
# Choose data locations: the Kaggle layout is the default, and any non-empty
# LOCAL environment variable switches to the local directory layout.
if not os.getenv('LOCAL'):
    print('kaggle')
    INPUT_DIR = Path('../input/transformed')
    OUTPUT_DIR = Path('/kaggle/working')
else:
    print('local')
    INPUT_DIR = Path('./input/transformed')
    OUTPUT_DIR = Path('./output')

In [None]:
# Load the transaction log and keep only the columns used below.
transactions = pd.read_pickle(INPUT_DIR / 'transactions_train.pkl')
transactions = transactions[['user', 'item', 't_dat']]
# First day of the validation period; everything before it is training data.
valid_start_date = datetime.date(2020, 9, 16)

In [None]:
def train_valid_stats(train, valid):
    """Print size and overlap statistics for a train/validation split.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Transaction frames that each have ``user`` and ``item`` columns.

    Prints the row, distinct-user and distinct-item counts of both frames,
    followed by the fraction of validation users (and items) that also occur
    in ``train``. A coverage is printed as ``nan`` when ``valid`` contains no
    users (items), instead of raising ``ZeroDivisionError``.
    """
    def coverage(train_ids, valid_ids):
        # Share of validation ids also present in train; undefined (nan) when
        # there are no validation ids at all.
        if not valid_ids:
            return float('nan')
        return len(train_ids & valid_ids) / len(valid_ids)

    # Bracket access cannot clash with DataFrame attributes of the same name.
    train_users = set(train['user'])
    train_items = set(train['item'])
    valid_users = set(valid['user'])
    valid_items = set(valid['item'])
    print(f"train transaction: {len(train)}, train user: {len(train_users)}, train item: {len(train_items)}")
    print(f"valid transaction: {len(valid)}, valid user: {len(valid_users)}, valid item: {len(valid_items)}")
    print(f"valid user coverage: {coverage(train_users, valid_users)}")
    print(f"valid item coverage: {coverage(train_items, valid_items)}")

In [None]:
# Length in days of the training window that ends just before valid_start_date.
train_days = 60
# Only items purchased within this many days before valid_start_date are kept in train.
recent_item_days = 7
# Per user, only transactions less than this many days older than that user's
# latest training transaction are kept.
recent_user_days = 21

In [None]:
print("train vaild split")
# Training window: the train_days days immediately before valid_start_date
# (start inclusive, end exclusive).
train_start_date = valid_start_date - datetime.timedelta(days=train_days)
train = transactions.query("@train_start_date <= t_dat < @valid_start_date").reset_index(drop=True)
# Validation set: every transaction dated valid_start_date or later.
valid = transactions.query("@valid_start_date <= t_dat").reset_index(drop=True)
# Drop the full frame; re-running this cell requires re-running the load cell first.
del transactions
train_valid_stats(train, valid)

In [None]:
print("filter recent items")
# Items with at least one purchase in the last recent_item_days days before valid_start_date.
train_item_start_date = valid_start_date - datetime.timedelta(days=recent_item_days)
recent_items = set(train.query("@train_item_start_date <= t_dat < @valid_start_date")['item'])
# Keep the full training-window history, but only for those recently purchased items.
train = train.query("item in @recent_items").reset_index(drop=True)
train_valid_stats(train, valid)

In [None]:
print("filter each user recent transactions (train)")
# Latest training purchase date of each user, broadcast to all of that user's rows.
# The 'max' string alias is used because passing the builtin `max` to
# groupby.transform is deprecated in pandas >= 2.1 (it emits a FutureWarning).
train['last_t_dat'] = train.groupby('user').t_dat.transform('max')
# Age of each transaction in whole days relative to the user's latest purchase.
train['diff_t_dat'] = (train.last_t_dat - train.t_dat).dt.days
train = train.query("diff_t_dat < @recent_user_days").reset_index(drop=True)
train_valid_stats(train, valid)

In [None]:
print("drop duplicates (train)")
# From here on only the (user, item) pairs matter. Validation keeps repeated
# purchases; training interactions are collapsed to unique pairs.
valid = valid[['user', 'item']]
train = train[['user', 'item']].drop_duplicates(ignore_index=True)
train_valid_stats(train, valid)

In [None]:
print("drop users and items which apper only once (train)")
# Removing single-interaction users can leave items with a single interaction
# (and vice versa), so the two filters are alternated. The number of passes is
# fixed at 3; nothing here checks that the pruning has converged.
for _ in range(3):
    # Keep only users that still have more than one training row.
    users = train.groupby('user').size().reset_index(name='sz').query("sz > 1").user
    train = train.query("user in @users")
    print(train.shape, len(train.user.unique()), len(train.item.unique()))

    # Keep only items that still have more than one training row.
    items = train.groupby('item').size().reset_index(name='sz').query("sz > 1").item
    train = train.query("item in @items")
    print(train.shape, len(train.user.unique()), len(train.item.unique()))

train_valid_stats(train, valid)

In [None]:
# Put the data into the format LightGCN can train on:
# - there must be no users that appear only in test
# - align the column names
# - add a rating column
users = sorted(set(train.user))
# Restrict validation rows to users that survived the training filters.
valid = valid.query("user in @users")

train = train.rename(columns={'user': 'userID', 'item': 'itemID'})
valid = valid.rename(columns={'user': 'userID', 'item': 'itemID'})
# Implicit feedback: every observed (user, item) pair gets the same rating of 1.
train['rating'] = 1
valid['rating'] = 1

train.head()

In [None]:
# Wrap the train and test frames in the recommenders ImplicitCF data object, with a fixed seed.
data = ImplicitCF(train=train, test=valid, seed=42)

In [None]:
# LightGCN hyper-parameters. top_k=12 matches the default k of apk/mapk above.
# MODEL_DIR points at OUTPUT_DIR; presumably checkpoints are written there every
# save_epoch epochs and the listed metrics are evaluated every eval_epoch epochs
# -- confirm against the recommenders documentation.
hparams = prepare_hparams(
    model_type='lightgcn',
    embed_size=128,
    n_layers=3,
    batch_size=8192,
    decay=0.0001,
    epochs=1000,
    learning_rate=0.001,
    eval_epoch=10,
    top_k=12,
    save_model=True,
    save_epoch=100,
    metrics=['recall', 'ndcg', 'precision', 'map'],
    MODEL_DIR=str(OUTPUT_DIR),
)
# Time the model construction.
with Timer() as prepare_time:
    model = LightGCN(hparams, data, seed=42)
# Drop this notebook's reference to the data object; re-running this cell
# requires re-running the ImplicitCF cell first.
del data
print(f"{prepare_time.interval} sec")

In [None]:
# Run a garbage-collection pass before the long training run.
gc.collect()
# Train the model and report the wall-clock time measured by Timer.
with Timer() as train_time:
    model.fit()
print(f"{train_time.interval} sec")

In [None]:
# Recommend for the validation users in roughly 10 batches to avoid OOM.
tmp = valid[['userID']].drop_duplicates()
# Clamp the batch size to at least 1: with fewer than 10 users `len(tmp) // 10`
# is 0 and range() raises ValueError for a zero step.
step = max(len(tmp) // 10, 1)
res = []
for i in range(0, len(tmp), step):
    # remove_seen=False: items a user already has in train stay eligible.
    res.append(model.recommend_k_items(tmp.iloc[i:i+step], top_k=12, remove_seen=False))
pred = pd.concat(res).reset_index(drop=True)
del tmp

In [None]:
# Collapse the long (user, item) recommendation rows into one list of items per
# user; the order of each list follows the row order of `pred`.
pred = (
    pred
    .rename(columns={'userID': 'user', 'itemID': 'item_pred'})
    .groupby('user')['item_pred']
    .apply(list)
    .reset_index()
)
pred

In [None]:
# Rebuild the validation ground truth from the raw file: the in-memory `valid`
# was restricted to users present in train, while the evaluation below needs
# every validation user.
valid = (
    pd.read_pickle(INPUT_DIR / 'transactions_train.pkl')[['user', 'item', 't_dat']]
    .query("t_dat >= @valid_start_date")[['user', 'item']]
    .rename(columns={'item': 'item_valid'})
    .reset_index(drop=True)
)
# One list of purchased items per validation user.
valid = valid.groupby('user')['item_valid'].apply(list).reset_index()
valid

In [None]:
# Inner join: only users that have both ground truth and predictions are scored,
# so the MAP below is computed over the covered users only.
merged = valid.merge(pred, on='user')
# mapk is called with its default k=12.
mpk = mapk(merged['item_valid'], merged['item_pred'])
users_valid = set(valid['user'])
users_merged = set(merged['user'])
# Fraction of validation users that received predictions; read the MAP together with this value.
user_coverage = len(users_valid & users_merged) / len(users_valid)
print(f"mapk: {mpk}, user coverage: {user_coverage}")

In [None]:
# Persist the per-user recommendation lists (one row per user, no index column).
pred.to_csv(OUTPUT_DIR / 'pred.csv', index=False)
# Presumably writes the learned user and item embeddings to the two given paths
# -- confirm against the recommenders LightGCN API.
model.infer_embedding(OUTPUT_DIR / 'user_emb.csv', OUTPUT_DIR / 'item_emb.csv')