# LightFM (fresh records)
* trainには、各ユーザーの最新購入日から3週間以内のデータのみを使う
* 基本的には直近3週間のデータを使いたいが、それだけだと直近に購入のない古いユーザーに関する情報が完全になくなってしまうので、それを防ぐ

In [1]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
from lightfm import LightFM
from scipy.sparse import lil_matrix

import schema
from metric import mapk

In [2]:
# Load the transformed transaction log. Only the columns declared in
# schema.TRANSACTIONS plus the purchase date are read, and t_dat is parsed
# as a datetime column.
transaction_columns = [*schema.TRANSACTIONS.keys(), 't_dat']
transactions = pd.read_csv(
    'input/transformed/transactions_train.csv',
    usecols=transaction_columns,
    dtype=schema.TRANSACTIONS,
    parse_dates=['t_dat'],
)

# The *_idx values are used directly as row/column positions of the
# interaction matrix, so each dimension is the largest index seen plus one.
n_user = transactions['customer_id_idx'].max() + 1
n_item = transactions['article_id_idx'].max() + 1

# Number of items retrieved per user.
TOPK = 12

In [9]:
# Baseline experiment: train LightFM on the fixed `train_days`-day window that
# ends the day before validation starts, then score the top-K retrieval on the
# validation week.

# --- Hyperparameters ---------------------------------------------------------
train_days = 21
no_components = 256
learning_schedule = 'adadelta'
loss = 'bpr'
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 20
epochs = 100

lightfm_params = dict(
    no_components=no_components,
    learning_schedule=learning_schedule,
    loss=loss,
    learning_rate=learning_rate,
    item_alpha=item_alpha,
    user_alpha=user_alpha,
    max_sampled=max_sampled,
)

# --- Train / validation split ------------------------------------------------
# Validation is 2020-09-16 .. 2020-09-22 (both inclusive); training is the
# `train_days` days immediately before it (both ends inclusive).
valid_start_date = datetime.date(2020, 9, 16)
valid_end_date = datetime.date(2020, 9, 22)
train_end_date = valid_start_date - datetime.timedelta(days=1)
train_start_date = valid_start_date - datetime.timedelta(days=train_days)

transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")
transactions_train = transactions.query("@train_start_date <= t_dat <= @train_end_date")
transactions_train = transactions_train.reset_index(drop=True)

# One row per validation customer holding the list of purchased article indices.
val = (
    transactions_valid
    .groupby('customer_id_idx')['article_id_idx']
    .apply(list)
    .reset_index()
)

# --- Interaction matrix ------------------------------------------------------
# Binary user x item matrix: a cell is 1 if the customer bought the article at
# least once inside the training window (repeat purchases collapse to 1).
user_rows = transactions_train.customer_id_idx
item_cols = transactions_train.article_id_idx
train = lil_matrix((n_user, n_item))
train[user_rows, item_cols] = 1

# --- Fit ---------------------------------------------------------------------
# Use one thread per physical core.
n_threads = psutil.cpu_count(logical=False)
model = LightFM(**lightfm_params)
model.fit(train, epochs=epochs, num_threads=n_threads, verbose=True)

# --- Top-K retrieval ---------------------------------------------------------
# Flat inner-product index over the item embeddings, moved to GPU 0 and
# queried with every user embedding.
# NOTE(review): only the embedding inner product is ranked here; LightFM's
# documented score also includes bias terms -- confirm omitting them is intended.
gpu_resources = faiss.StandardGpuResources()
flat_ip_index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(gpu_resources, 0, flat_ip_index)
index.add(model.item_embeddings)
_, idxs = index.search(model.user_embeddings, TOPK)

# --- Evaluate ----------------------------------------------------------------
# Score only the customers that appear in the validation week. `mapk` comes
# from the project's metric module (presumably MAP@K -- verify there).
actual_items = val.article_id_idx
predicted_items = idxs[val.customer_id_idx]
mapk(actual_items, predicted_items)

Epoch: 100%|██████████| 100/100 [01:22<00:00,  1.22it/s]


0.020466072378020644

In [8]:
# "Fresh records" experiment: instead of one global date window, keep for each
# customer only the transactions that lie within `train_days` days of that
# customer's own latest pre-validation purchase, so customers without recent
# purchases still contribute interactions.

# Hyperparameters.
train_days = 21
no_components = 256
learning_schedule = 'adadelta'
loss = 'bpr'
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 20
epochs = 100

lightfm_params = {
    'no_components': no_components,
    'learning_schedule': learning_schedule,
    'loss': loss,
    'learning_rate': learning_rate,
    'item_alpha': item_alpha,
    'user_alpha': user_alpha,
    'max_sampled': max_sampled,
}

# Validation week (both ends inclusive); everything strictly before it is a
# candidate for training.
valid_start_date = datetime.date(2020, 9, 16)
valid_end_date = datetime.date(2020, 9, 22)

transactions_train = transactions.query("t_dat < @valid_start_date").reset_index(drop=True)
transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

# Per-customer latest purchase date, broadcast back onto every row.
# The string alias 'max' is used instead of the builtin `max`: passing the
# builtin callable to groupby.transform is deprecated in recent pandas, and the
# alias keeps the same result.
transactions_train['last_t_dat'] = transactions_train.groupby('customer_id_idx').t_dat.transform('max')
# Age of each transaction in whole days relative to the customer's latest one.
transactions_train['diff_t_dat'] = (transactions_train.last_t_dat - transactions_train.t_dat).dt.days
print('before', len(transactions_train))
# NOTE(review): `<=` keeps day offsets 0..train_days, i.e. train_days + 1
# distinct days per customer, while a fixed window from
# valid_start_date - train_days to valid_start_date - 1 spans train_days days.
# Confirm the inclusive bound is intended.
transactions_train = transactions_train.query("diff_t_dat <= @train_days").reset_index(drop=True)
print('after', len(transactions_train))

# One row per validation customer holding the list of purchased article indices.
val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

# Binary user x item interaction matrix (repeat purchases collapse to 1).
train = lil_matrix((n_user, n_item))
train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

# Fit using one thread per physical core.
model = LightFM(**lightfm_params)
model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

# Flat inner-product index over the item embeddings on GPU 0, queried with
# every user embedding to get TOPK item indices per user.
# NOTE(review): only the embedding inner product is ranked here; LightFM's
# documented score also includes bias terms -- confirm omitting them is intended.
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, idxs = index.search(model.user_embeddings, TOPK)

# Score only customers present in the validation week. `mapk` comes from the
# project's metric module (presumably MAP@K -- verify there).
mapk(val.article_id_idx, idxs[val.customer_id_idx])

before 31548013
after 6028836


Epoch: 100%|██████████| 100/100 [07:16<00:00,  4.37s/it]


0.019526944110053766

学習対象のcustomerが増えたことで、100エポックでは学習が足りなくなったのでは？

In [10]:
# "Fresh records" experiment, long run: same per-customer window as the
# 100-epoch variant, but trained for 1000 epochs to check whether the lower
# score is caused by too few epochs.

# Hyperparameters.
train_days = 21
no_components = 256
learning_schedule = 'adadelta'
loss = 'bpr'
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 20
epochs = 1000

lightfm_params = {
    'no_components': no_components,
    'learning_schedule': learning_schedule,
    'loss': loss,
    'learning_rate': learning_rate,
    'item_alpha': item_alpha,
    'user_alpha': user_alpha,
    'max_sampled': max_sampled,
}

# Validation week (both ends inclusive); everything strictly before it is a
# candidate for training.
valid_start_date = datetime.date(2020, 9, 16)
valid_end_date = datetime.date(2020, 9, 22)

transactions_train = transactions.query("t_dat < @valid_start_date").reset_index(drop=True)
transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

# Per-customer latest purchase date, broadcast back onto every row.
# The string alias 'max' is used instead of the builtin `max`: passing the
# builtin callable to groupby.transform is deprecated in recent pandas, and the
# alias keeps the same result.
transactions_train['last_t_dat'] = transactions_train.groupby('customer_id_idx').t_dat.transform('max')
# Age of each transaction in whole days relative to the customer's latest one.
transactions_train['diff_t_dat'] = (transactions_train.last_t_dat - transactions_train.t_dat).dt.days
print('before', len(transactions_train))
# NOTE(review): `<=` keeps day offsets 0..train_days, i.e. train_days + 1
# distinct days per customer, while a fixed window from
# valid_start_date - train_days to valid_start_date - 1 spans train_days days.
# Confirm the inclusive bound is intended.
transactions_train = transactions_train.query("diff_t_dat <= @train_days").reset_index(drop=True)
print('after', len(transactions_train))

# One row per validation customer holding the list of purchased article indices.
val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

# Binary user x item interaction matrix (repeat purchases collapse to 1).
train = lil_matrix((n_user, n_item))
train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

# Fit using one thread per physical core.
model = LightFM(**lightfm_params)
model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

# Flat inner-product index over the item embeddings on GPU 0, queried with
# every user embedding to get TOPK item indices per user.
# NOTE(review): only the embedding inner product is ranked here; LightFM's
# documented score also includes bias terms -- confirm omitting them is intended.
index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(model.item_embeddings)
_, idxs = index.search(model.user_embeddings, TOPK)

# Score only customers present in the validation week. `mapk` comes from the
# project's metric module (presumably MAP@K -- verify there).
mapk(val.article_id_idx, idxs[val.customer_id_idx])

before 31548013
after 6028836


Epoch: 100%|██████████| 1000/1000 [1:11:45<00:00,  4.31s/it]


0.019937798355488014

そんなことはなかった:astonished: