# LightFM (index only)
* インデックスのみを使う
* そのまま各ユーザーに対して内積が近いアイテム上位12件をfaissで求める

In [1]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from lightfm import LightFM
from scipy.sparse import lil_matrix

import schema
from metric import mapk

In [2]:
transactions = pd.read_csv('input/transformed/transactions_train.csv', parse_dates=['t_dat'], usecols=list(schema.TRANSACTIONS.keys())+['t_dat'], dtype=schema.TRANSACTIONS)
n_user = transactions.customer_id_idx.max() + 1
n_item = transactions.article_id_idx.max() + 1
TOPK = 12

In [3]:
def objective(trial: optuna.Trial) -> float:
    train_days = trial.suggest_int('train_days', 7, 70, 7)
    no_components = trial.suggest_int('no_components', 16, 128, 16)
    learning_schedule = trial.suggest_categorical('learning_schedule', ['adagrad', 'adadelta'])
    loss = trial.suggest_categorical('loss', ['bpr', 'warp'])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    item_alpha = trial.suggest_loguniform('item_alpha', 1e-12, 1e-2)
    user_alpha = trial.suggest_loguniform('user_alpha', 1e-12, 1e-2)
    max_sampled = trial.suggest_int('max_sampled', 10, 20, 10)
    epochs = 100

    lightfm_params = {
        'no_components': no_components,
        'learning_schedule': learning_schedule,
        'loss': loss,
        'learning_rate': learning_rate,
        'item_alpha': item_alpha,
        'user_alpha': user_alpha,
        'max_sampled': max_sampled,
    }

    valid_start_date = datetime.date(2020, 9, 16)
    valid_end_date = datetime.date(2020, 9, 22)
    train_end_date = valid_start_date - datetime.timedelta(days=1)
    train_start_date = valid_start_date - datetime.timedelta(days=train_days)

    transactions_train = transactions.query("@train_start_date <= t_dat <= @train_end_date")
    transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

    val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

    train = lil_matrix((n_user, n_item))
    train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

    model = LightFM(**lightfm_params)
    model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

    index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    index.add(model.item_embeddings)
    _, idxs = index.search(model.user_embeddings, TOPK)

    return mapk(val.article_id_idx, idxs[val.customer_id_idx])

In [4]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=5400)

[32m[I 2022-03-04 12:57:20,535][0m A new study created in memory with name: no-name-aa474d6b-d295-48a0-9793-07af0a1c221e[0m
Epoch: 100%|██████████| 100/100 [01:13<00:00,  1.35it/s]
[32m[I 2022-03-04 12:58:46,730][0m Trial 0 finished with value: 3.504378286936561e-05 and parameters: {'train_days': 49, 'no_components': 96, 'learning_schedule': 'adagrad', 'loss': 'bpr', 'learning_rate': 1.1073228281982184e-05, 'item_alpha': 1.2101237633851377e-10, 'max_sampled': 20}. Best is trial 0 with value: 3.504378286936561e-05.[0m
Epoch: 100%|██████████| 100/100 [02:11<00:00,  1.31s/it]
[32m[I 2022-03-04 13:01:08,333][0m Trial 1 finished with value: 0.01370055437915843 and parameters: {'train_days': 49, 'no_components': 64, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.0033631506159568355, 'item_alpha': 3.253240421379212e-09, 'max_sampled': 20}. Best is trial 1 with value: 0.01370055437915843.[0m
Epoch: 100%|██████████| 100/100 [00:56<00:00,  1.78it/s]
[32m[I 2022-03-

In [7]:
study.trials_dataframe().sort_values(by='value', ascending=False).head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_item_alpha,params_learning_rate,params_learning_schedule,params_loss,params_max_sampled,params_no_components,params_train_days,state
62,62,0.019409,2022-03-04 14:20:11.572978,2022-03-04 14:21:16.226517,0 days 00:01:04.653539,1.120258e-08,0.007381,adadelta,bpr,20,128,21,COMPLETE
68,68,0.019378,2022-03-04 14:26:33.679565,2022-03-04 14:27:45.150316,0 days 00:01:11.470751,1.069907e-08,0.006116,adadelta,bpr,20,128,21,COMPLETE
64,64,0.019344,2022-03-04 14:22:23.313030,2022-03-04 14:23:32.267469,0 days 00:01:08.954439,1.218684e-08,0.007584,adadelta,bpr,20,128,21,COMPLETE
63,63,0.01923,2022-03-04 14:21:16.227561,2022-03-04 14:22:23.311838,0 days 00:01:07.084277,2.611423e-09,0.006533,adadelta,bpr,20,128,21,COMPLETE
60,60,0.019148,2022-03-04 14:18:08.849319,2022-03-04 14:19:17.199893,0 days 00:01:08.350574,9.041893e-11,0.004304,adadelta,bpr,20,128,21,COMPLETE
58,58,0.019133,2022-03-04 14:16:32.284142,2022-03-04 14:17:28.662687,0 days 00:00:56.378545,2.111436e-09,0.007204,adadelta,bpr,20,128,14,COMPLETE
44,44,0.019086,2022-03-04 14:02:19.350715,2022-03-04 14:03:14.262960,0 days 00:00:54.912245,2.333734e-08,0.002668,adadelta,bpr,20,128,14,COMPLETE
43,43,0.018963,2022-03-04 14:01:25.625021,2022-03-04 14:02:19.349144,0 days 00:00:53.724123,2.314335e-08,0.002332,adadelta,bpr,20,128,14,COMPLETE
66,66,0.018951,2022-03-04 14:24:15.909622,2022-03-04 14:25:17.620215,0 days 00:01:01.710593,5.419851e-09,0.007436,adadelta,bpr,20,112,21,COMPLETE
61,61,0.018921,2022-03-04 14:19:17.201000,2022-03-04 14:20:11.571831,0 days 00:00:54.370831,8.793074e-11,0.004898,adadelta,bpr,20,128,14,COMPLETE


In [11]:

def objective(trial: optuna.Trial) -> float:
    train_days = 21
    no_components = trial.suggest_int('no_components', 128, 1024, 128)
    learning_schedule = 'adadelta'
    loss = 'bpr'
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-2)
    item_alpha = trial.suggest_loguniform('item_alpha', 1e-12, 1e-6)
    user_alpha = trial.suggest_loguniform('user_alpha', 1e-12, 1e-6)
    max_sampled = trial.suggest_int('max_sampled', 20, 40, 10)
    epochs = 100

    lightfm_params = {
        'no_components': no_components,
        'learning_schedule': learning_schedule,
        'loss': loss,
        'learning_rate': learning_rate,
        'item_alpha': item_alpha,
        'user_alpha': user_alpha,
        'max_sampled': max_sampled,
    }

    valid_start_date = datetime.date(2020, 9, 16)
    valid_end_date = datetime.date(2020, 9, 22)
    train_end_date = valid_start_date - datetime.timedelta(days=1)
    train_start_date = valid_start_date - datetime.timedelta(days=train_days)

    transactions_train = transactions.query("@train_start_date <= t_dat <= @train_end_date")
    transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

    val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

    train = lil_matrix((n_user, n_item))
    train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

    model = LightFM(**lightfm_params)
    model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

    index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    index.add(model.item_embeddings)
    _, idxs = index.search(model.user_embeddings, TOPK)

    return mapk(val.article_id_idx, idxs[val.customer_id_idx])

study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=1800)

[32m[I 2022-03-04 14:37:23,991][0m A new study created in memory with name: no-name-38fa6f34-22af-4439-b1e7-b24ee8b84852[0m
Epoch: 100%|██████████| 100/100 [03:23<00:00,  2.04s/it]
[32m[I 2022-03-04 14:41:22,357][0m Trial 0 finished with value: 0.020292594405080116 and parameters: {'no_components': 640, 'learning_rate': 0.002307423800929569, 'item_alpha': 1.0473012470885833e-08, 'user_alpha': 2.2692613031009611e-07, 'max_sampled': 40}. Best is trial 0 with value: 0.020292594405080116.[0m
Epoch: 100%|██████████| 100/100 [03:14<00:00,  1.95s/it]
[32m[I 2022-03-04 14:45:11,927][0m Trial 1 finished with value: 0.020472369138518524 and parameters: {'no_components': 640, 'learning_rate': 0.0023573203145302295, 'item_alpha': 1.2144366292590317e-10, 'user_alpha': 1.6480193837622618e-07, 'max_sampled': 30}. Best is trial 1 with value: 0.020472369138518524.[0m
Epoch: 100%|██████████| 100/100 [02:04<00:00,  1.24s/it]
[32m[I 2022-03-04 14:47:38,554][0m Trial 2 finished with value: 0.020

In [12]:
study.trials_dataframe().sort_values(by='value', ascending=False).head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_item_alpha,params_learning_rate,params_max_sampled,params_no_components,params_user_alpha,state
5,5,0.020939,2022-03-04 14:53:11.115947,2022-03-04 14:58:53.216752,0 days 00:05:42.100805,4.33722e-08,0.005516,30,1024,1.02371e-10,COMPLETE
9,9,0.020812,2022-03-04 15:04:19.075657,2022-03-04 15:08:42.389754,0 days 00:04:23.314097,4.069161e-08,0.004589,30,768,5.098833e-09,COMPLETE
2,2,0.020647,2022-03-04 14:45:11.928808,2022-03-04 14:47:38.554058,0 days 00:02:26.625250,1.783784e-08,0.001024,30,384,7.679867e-09,COMPLETE
8,8,0.0206,2022-03-04 15:01:32.334437,2022-03-04 15:04:19.074501,0 days 00:02:46.740064,1.122649e-12,0.001328,30,512,1.283753e-10,COMPLETE
1,1,0.020472,2022-03-04 14:41:22.358368,2022-03-04 14:45:11.927540,0 days 00:03:49.569172,1.214437e-10,0.002357,30,640,1.648019e-07,COMPLETE
3,3,0.020445,2022-03-04 14:47:38.555243,2022-03-04 14:51:26.843832,0 days 00:03:48.288589,5.221504e-10,0.006376,40,640,9.872071e-12,COMPLETE
4,4,0.020293,2022-03-04 14:51:26.844993,2022-03-04 14:53:11.114830,0 days 00:01:44.269837,7.306686e-10,0.001831,40,256,4.389593e-09,COMPLETE
0,0,0.020293,2022-03-04 14:37:23.993131,2022-03-04 14:41:22.357158,0 days 00:03:58.364027,1.047301e-08,0.002307,40,640,2.269261e-07,COMPLETE
7,7,0.019229,2022-03-04 15:00:28.516188,2022-03-04 15:01:32.333259,0 days 00:01:03.817071,9.656791e-10,0.00169,40,128,3.259961e-12,COMPLETE
6,6,0.018562,2022-03-04 14:58:53.217898,2022-03-04 15:00:28.515223,0 days 00:01:35.297325,8.388026e-07,0.001016,40,256,2.668508e-07,COMPLETE


暫定良さそうパラメーター
* no_components: 1024 (結構多くても良い)
* learning_rate: 0.005 (epoch: 100の時)
* user_alpha: 10^-10 (0と変わらないかも)
* item_alpha: 10^-8 (0と変わらないかも)
* max_sampled: 30 (40のほうが良い可能性はまだあるが、サチっていそう)