# LightFM (index only, two step)
* lightfm_index_onlyでは簡易的に内積が大きいアイテム上位12件を求めていたが、実際はアイテムのバイアスがあるので、内積が大きいこととスコアが大きいことは等しくはないので、候補アイテムを多めにとっておき、それに対しては正確に計算する

In [1]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from lightfm import LightFM
from scipy.sparse import lil_matrix

import schema
from metric import mapk

In [2]:
transactions = pd.read_csv('input/transformed/transactions_train.csv', parse_dates=['t_dat'], usecols=list(schema.TRANSACTIONS.keys())+['t_dat'], dtype=schema.TRANSACTIONS)
n_user = transactions.customer_id_idx.max() + 1
n_item = transactions.article_id_idx.max() + 1
TOPK = 12

In [3]:
train_days = 21
no_components = 128
learning_schedule = 'adadelta'
loss = 'bpr'
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 10
epochs = 100

lightfm_params = {
    'no_components': no_components,
    'learning_schedule': learning_schedule,
    'loss': loss,
    'learning_rate': learning_rate,
    'item_alpha': item_alpha,
    'user_alpha': user_alpha,
    'max_sampled': max_sampled,
}

valid_start_date = datetime.date(2020, 9, 16)
valid_end_date = datetime.date(2020, 9, 22)
train_end_date = valid_start_date - datetime.timedelta(days=1)
train_start_date = valid_start_date - datetime.timedelta(days=train_days)

transactions_train = transactions.query("@train_start_date <= t_dat <= @train_end_date")
transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

train = lil_matrix((n_user, n_item))
train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

model = LightFM(**lightfm_params)
model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

Epoch: 100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


<lightfm.lightfm.LightFM at 0x7f693818a430>

普通に内積上位12件

In [4]:
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, candidates = index.search(model.user_embeddings, TOPK)
mapk(val.article_id_idx, candidates[val.customer_id_idx])

0.019324815197848552

上位12件 -> バイアス込み計算

In [5]:
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, candidates = index.search(model.user_embeddings, TOPK)

user_idxs = np.repeat(range(n_user), TOPK)

result = model.predict(user_idxs, candidates.flatten(), num_threads=psutil.cpu_count(logical=False))
result = result.reshape(n_user, TOPK)

idxs_each_user = np.argsort(result, axis=1)[:,::-1]
pred = np.array([candidates[i, x] for i, x in enumerate(idxs_each_user)])

mapk(val.article_id_idx, pred[val.customer_id_idx])

0.019069074568707454

上位100件 -> バイアス込み計算

In [6]:
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, candidates = index.search(model.user_embeddings, 100)

user_idxs = np.repeat(range(n_user), 100)

result = model.predict(user_idxs, candidates.flatten(), num_threads=psutil.cpu_count(logical=False))
result = result.reshape(n_user, 100)

idxs_each_user = np.argsort(result, axis=1)[:,::-1][:,:TOPK]
pred = np.array([candidates[i, x] for i, x in enumerate(idxs_each_user)])

mapk(val.article_id_idx, pred[val.customer_id_idx])

0.018261404718307145

上位12件 + アイテムバイアス上位100件 -> バイアス込み計算

In [7]:
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, candidates = index.search(model.user_embeddings, TOPK)

item_high_bias = np.argsort(model.item_biases)[::-1][:100]
item_high_bias = np.array([item_high_bias] * n_user)
candidates = np.hstack([candidates, item_high_bias])

user_idxs = np.repeat(range(n_user), TOPK + 100)

result = model.predict(user_idxs, candidates.flatten(), num_threads=psutil.cpu_count(logical=False))
result = result.reshape(n_user, TOPK + 100)

idxs_each_user = np.argsort(result, axis=1)[:,::-1][:,:TOPK]
pred = np.array([candidates[i, x] for i, x in enumerate(idxs_each_user)])

mapk(val.article_id_idx, pred[val.customer_id_idx])

0.017932175663766044

↓の理解に反しており謎
* 内積だけだと（アイテムの）バイアスを無視しているのでモデルのスコアを計算したほうがよい
* 候補数は多ければ多いほどよい

特に、アイテムのバイアスが高いのは直近の人気商品であり、それを取り込むとスコアがあがることはノートブックで主張されている :thinking: