# LightFM item features

アイテムの特徴はまあまあ充実しているので、むしろアイテムのほうを特徴のみにする

In [1]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from lightfm import LightFM

import schema
from metric import mapk
from scipy import sparse
from logzero import logger
from utils import train_valid_split

In [2]:
transactions = pd.read_pickle('input/transformed/transactions_train.pkl')[schema.TRANSACTIONS]
articles = pd.read_pickle('input/transformed/articles.pkl')[schema.ARTICLES]
TOPK = 12

In [3]:
tmp = datetime.date(2020, 9, 16) - datetime.timedelta(days=21)
transactions = transactions.query("t_dat >= @tmp")

users = sorted(transactions.customer_id_idx.unique())
items = sorted(transactions.article_id_idx.unique())
mp_user = {x: i for i, x in enumerate(users)}
mp_item = {x: i for i, x in enumerate(items)}
transactions.customer_id_idx = transactions.customer_id_idx.apply(lambda x: mp_user[x])
transactions.article_id_idx = transactions.article_id_idx.apply(lambda x: mp_item[x])

articles = articles.query("article_id_idx in @items").reset_index(drop=True)
articles.article_id_idx = articles.article_id_idx.apply(lambda x: mp_item[x])

n_user = len(users)
n_item = len(items)

In [6]:
def create_article_features(articles):
    df = articles.copy().drop('article_id_idx', axis=1)
    for c in df.columns:
        df = pd.concat([df, pd.get_dummies(df[c], prefix=c)], axis=1).drop(c, axis=1)

    # idxs = sparse.identity(len(df), dtype='f')
    # return sparse.hstack([idxs, df.values]).astype('float32')
    return sparse.csr_matrix(df.values.astype('float32'))

item_features = create_article_features(articles)

In [7]:
train_days = 21
no_components = 1024
learning_schedule = 'adadelta'
loss = 'bpr'
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 10

lightfm_params = {
    'no_components': no_components,
    'learning_schedule': learning_schedule,
    'loss': loss,
    'learning_rate': learning_rate,
    'item_alpha': item_alpha,
    'user_alpha': user_alpha,
    'max_sampled': max_sampled,
}
print(lightfm_params)

transactions_train, transactions_valid = train_valid_split(transactions, datetime.date(2020, 9, 16), 21)

val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

train = sparse.lil_matrix((n_user, n_item))
train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

model = LightFM(**lightfm_params)

for epoch in range(10000):
    model.fit_partial(train, item_features=item_features, epochs=5, num_threads=psutil.cpu_count(logical=False), verbose=True)

    user_biases, user_representations = model.get_user_representations()
    item_biases, item_representations = model.get_item_representations(item_features)

    index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    index.add(item_representations)
    _, idxs = index.search(user_representations, TOPK)

    mpk = mapk(val.article_id_idx, idxs[val.customer_id_idx])
    logger.info(f"epoch={epoch} mapk={mpk}")

[I 220307 14:36:50 utils:27] train: [2020-08-26, 2020-09-16)
[I 220307 14:36:50 utils:29] # of records: 803079
[I 220307 14:36:50 utils:14] valid: [2020-09-16, 2020-09-23)
[I 220307 14:36:50 utils:16] # of records: 240311


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.005, 'item_alpha': 1e-08, 'user_alpha': 1e-08, 'max_sampled': 10}


[I 220307 14:39:47 1626565973:42] epoch=0 mapk=0.0037638370612872152
[I 220307 14:42:41 1626565973:42] epoch=1 mapk=0.004153410311430187
[I 220307 14:45:36 1626565973:42] epoch=2 mapk=0.0043877802948470825
[I 220307 14:48:33 1626565973:42] epoch=3 mapk=0.004463086877472533
[I 220307 14:51:30 1626565973:42] epoch=4 mapk=0.0048273224444899615
[I 220307 14:54:26 1626565973:42] epoch=5 mapk=0.00485026339037145
[I 220307 14:57:21 1626565973:42] epoch=6 mapk=0.005007803279993207
[I 220307 15:00:15 1626565973:42] epoch=7 mapk=0.004965568016337554
[I 220307 15:03:10 1626565973:42] epoch=8 mapk=0.004943019242196364
[I 220307 15:06:07 1626565973:42] epoch=9 mapk=0.004831250854418344
[I 220307 15:09:02 1626565973:42] epoch=10 mapk=0.004975811977684479
[I 220307 15:11:59 1626565973:42] epoch=11 mapk=0.0050763638851060175
[I 220307 15:14:55 1626565973:42] epoch=12 mapk=0.005056837968565915
[I 220307 15:17:51 1626565973:42] epoch=13 mapk=0.005106182659166273
[I 220307 15:20:47 1626565973:42] epoch=1

KeyboardInterrupt: 

In [121]:
for epoch in range(10000):
    model.fit_partial(train, item_features=item_features, epochs=5, num_threads=psutil.cpu_count(logical=False), verbose=True)

    user_biases, user_representations = model.get_user_representations()
    item_biases, item_representations = model.get_item_representations(item_features)

    index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    index.add(item_representations)
    _, idxs = index.search(user_representations, TOPK)

    mpk = mapk(val.article_id_idx, idxs[val.customer_id_idx])
    logger.info(f"epoch={epoch} mapk={mpk}")

Epoch: 100%|██████████| 5/5 [02:54<00:00, 34.84s/it]
[I 220307 18:32:03 1376685835:13] epoch=0 mapk=0.00541820576909706
Epoch: 100%|██████████| 5/5 [02:52<00:00, 34.52s/it]
[I 220307 18:34:58 1376685835:13] epoch=1 mapk=0.005416876391854967
Epoch: 100%|██████████| 5/5 [02:54<00:00, 34.84s/it]
[I 220307 18:37:55 1376685835:13] epoch=2 mapk=0.00544618636525363
Epoch: 100%|██████████| 5/5 [02:50<00:00, 34.16s/it]
[I 220307 18:40:48 1376685835:13] epoch=3 mapk=0.00536193505852078
Epoch: 100%|██████████| 5/5 [02:52<00:00, 34.48s/it]
[I 220307 18:43:43 1376685835:13] epoch=4 mapk=0.005372772496977109
Epoch: 100%|██████████| 5/5 [02:50<00:00, 34.04s/it]
[I 220307 18:46:36 1376685835:13] epoch=5 mapk=0.005397180134798609
Epoch: 100%|██████████| 5/5 [02:52<00:00, 34.56s/it]
[I 220307 18:49:31 1376685835:13] epoch=6 mapk=0.005581176302516792
Epoch: 100%|██████████| 5/5 [02:51<00:00, 34.21s/it]
[I 220307 18:52:25 1376685835:13] epoch=7 mapk=0.005497551585756469
Epoch:   0%|          | 0/5 [00:34<

KeyboardInterrupt: 

In [122]:
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(item_representations)
_, candidates = index.search(user_representations, 100)

item_high_bias = np.argsort(model.item_biases)[::-1][:100]
item_high_bias = np.array([item_high_bias] * n_user)
candidates = np.hstack([candidates, item_high_bias])

user_idxs = np.repeat(range(n_user), 100 + 100)

result = model.predict(user_idxs, candidates.flatten(), item_features=item_features, num_threads=psutil.cpu_count(logical=False))
result = result.reshape(n_user, 100 + 100)

idxs_each_user = np.argsort(result, axis=1)[:,::-1][:,:12]
pred = np.array([candidates[i, x] for i, x in enumerate(idxs_each_user)])

mapk(val.article_id_idx, pred[val.customer_id_idx])

0.005923731623614608