# LightFM feature only
UUIDを特徴に入れると次元が大きくなりすぎて学習しにくい可能性があるので、テーブル特徴のみを使う。

In [1]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from lightfm import LightFM

import schema
from metric import mapk
from scipy import sparse

In [2]:
# Load the article and customer tables, restricted to the columns/dtypes declared in `schema`.
articles = pd.read_csv(
    'input/transformed/articles.csv',
    dtype=schema.ARTICLES,
    usecols=schema.ARTICLES.keys(),
)
customers = pd.read_csv(
    'input/transformed/customers.csv',
    dtype=schema.CUSTOMERS,
    usecols=schema.CUSTOMERS.keys(),
)
# Row counts of the two tables.
n_user, n_item = len(customers), len(articles)

In [3]:
# Transaction log: the schema columns plus `t_dat`, which is parsed as a datetime.
transactions = pd.read_csv(
    'input/transformed/transactions_train.csv',
    usecols=[*schema.TRANSACTIONS.keys(), 't_dat'],
    dtype=schema.TRANSACTIONS,
    parse_dates=['t_dat'],
)
# Number of nearest items requested per query in the retrieval step.
TOPK = 12

In [37]:
def create_customer_features(customers):
    """Build the sparse user-feature matrix from the customer table.

    The result is ``[I | X]``: an identity block (one indicator column per
    customer row) followed by the tabular columns ``X``. ``X`` is made of every
    column of ``customers`` except ``customer_id_idx``, with:

    * ``age`` mean-imputed, plus an ``age_is_null`` 0/1 flag appended;
    * ``club_member_status_idx`` and ``fashion_news_frequency_idx`` one-hot
      encoded (their dummy columns are appended after the remaining columns).

    Args:
        customers: DataFrame containing at least ``customer_id_idx``, ``age``,
            ``club_member_status_idx`` and ``fashion_news_frequency_idx``.
            It is not modified.

    Returns:
        float32 CSR matrix of shape ``(len(customers), len(customers) + n_tabular)``.
    """
    one_hot_columns = ['club_member_status_idx', 'fashion_news_frequency_idx']

    df = customers.drop(columns='customer_id_idx').assign(
        # The flag is computed before `age` is imputed (kwargs are applied in order).
        age_is_null=lambda d: d['age'].isnull().astype(int),
        age=lambda d: d['age'].fillna(d['age'].mean()),
    )
    # One call instead of concat-in-a-loop. The explicit float dtype keeps the frame
    # purely numeric: newer pandas emits bool dummies, which would turn the mixed
    # frame into an object array when converted to numpy.
    df = pd.get_dummies(df, columns=one_hot_columns, dtype='float32')

    tabular = sparse.csr_matrix(df.to_numpy(dtype='float32'))
    identity = sparse.identity(len(df), dtype='float32', format='csr')
    return sparse.hstack([identity, tabular], format='csr', dtype='float32')

def create_article_features(articles):
    """Build the sparse item-feature matrix from the article table.

    Every column of ``articles`` except ``article_id_idx`` is one-hot encoded
    (regardless of dtype); the dummy columns are laid out in the original
    column order, each prefixed with its source column name.

    Args:
        articles: DataFrame containing ``article_id_idx`` plus the columns to
            encode. It is not modified.

    Returns:
        float32 CSR matrix of shape ``(len(articles), total number of dummies)``.
    """
    df = articles.drop(columns='article_id_idx')
    # A single get_dummies call replaces the per-column concat loop, which re-copied
    # the ever-growing frame on each iteration.
    encoded = pd.get_dummies(df, columns=list(df.columns), dtype='float32')
    return sparse.csr_matrix(encoded.to_numpy(dtype='float32'))

# Materialise both feature matrices once and report their dimensions.
item_features = create_article_features(articles)
user_features = create_customer_features(customers)
print(user_features.shape, item_features.shape)

(1371980, 1371993) (105542, 651)


In [39]:
from utils import extract_transactions_train, extract_transactions_valid

def objective(trial):
    """Optuna objective: sample one hyperparameter set and score it with ``evaluate``.

    Search space:
        train_days     7..28 in steps of 7
        no_components  128..1024 in steps of 128
        learning_rate  log-uniform in [5e-3, 5e-2]
        item_alpha     log-uniform in [1e-12, 1e-6]
        user_alpha     log-uniform in [1e-12, 1e-6]
        max_sampled    integer in [10, 20]

    Args:
        trial: the ``optuna`` trial used to draw the parameters.

    Returns:
        Whatever ``evaluate`` returns for the sampled configuration.

    NOTE(review): the logged trial parameters show ``learning_schedule='adadelta'``
    and ``loss='bpr'``. Per the LightFM docs ``learning_rate`` belongs to the
    adagrad schedule and ``max_sampled`` to WARP fitting, so those two dimensions
    probably have no effect on the score — confirm before spending trials on them.
    """
    # `step=` is passed by keyword and `suggest_float(..., log=True)` replaces the
    # deprecated `suggest_loguniform`; the sampled ranges are unchanged.
    train_days = trial.suggest_int('train_days', 7, 28, step=7)
    no_components = trial.suggest_int('no_components', 128, 1024, step=128)
    learning_rate = trial.suggest_float('learning_rate', 5*1e-3, 5*1e-2, log=True)
    item_alpha = trial.suggest_float('item_alpha', 1e-12, 1e-6, log=True)
    user_alpha = trial.suggest_float('user_alpha', 1e-12, 1e-6, log=True)
    max_sampled = trial.suggest_int('max_sampled', 10, 20)
    # The trailing 100 is presumably the epoch count — verify against `evaluate`'s signature.
    return evaluate(train_days, no_components, learning_rate, item_alpha, user_alpha, max_sampled, 100)


# --- Hyperparameters for a single manual run ---------------------------------
no_components = 512
learning_rate = 0.005
item_alpha = 1e-8
user_alpha = 1e-8
max_sampled = 10
train_days = 21
epochs = 100

lightfm_params = {
    'no_components': no_components,
    'learning_schedule': 'adadelta',
    'loss': 'bpr',
    # NOTE(review): per the LightFM docs `learning_rate` is the adagrad step size and
    # `max_sampled` bounds WARP negative sampling; with adadelta + bpr both are most
    # likely ignored — confirm before tuning them.
    'learning_rate': learning_rate,
    'item_alpha': item_alpha,
    'user_alpha': user_alpha,
    'max_sampled': max_sampled,
}
print(lightfm_params)

# --- Train / validation split -------------------------------------------------
valid_start_date = datetime.date(2020, 9, 16)
transactions_valid = extract_transactions_valid(transactions, valid_start_date)
transactions_train = extract_transactions_train(transactions, valid_start_date, train_days)

# One row per validation customer with the list of articles they bought.
val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

# --- Binary interaction matrix ------------------------------------------------
# Built directly in COO form instead of fancy-assigning into a LIL matrix (slow for
# ~1e6 rows). Pairs are de-duplicated first so each (customer, article) cell is a
# single entry with value 1, exactly as the assignment produced.
train_pairs = transactions_train[['customer_id_idx', 'article_id_idx']].drop_duplicates()
train = sparse.coo_matrix(
    (
        np.ones(len(train_pairs), dtype='float32'),
        (train_pairs['customer_id_idx'].to_numpy(), train_pairs['article_id_idx'].to_numpy()),
    ),
    shape=(n_user, n_item),
)

# --- Fit ----------------------------------------------------------------------
model = LightFM(**lightfm_params)
model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=epochs,
    # psutil returns None when the physical core count cannot be determined.
    num_threads=psutil.cpu_count(logical=False) or 1,
    verbose=True,
)

# --- Top-K retrieval with faiss -----------------------------------------------
# LightFM scores a pair as <user_emb, item_emb> + user_bias + item_bias. The user bias
# is constant per query and cannot change the ranking, but the item bias can. It is
# folded into the inner product by appending it as an extra item dimension and
# appending a constant 1 to every user vector.
item_biases, item_embeddings = model.get_item_representations(item_features)
item_vectors = np.ascontiguousarray(
    np.hstack([item_embeddings, item_biases.reshape(-1, 1)]), dtype='float32'
)

_, user_embeddings = model.get_user_representations(user_features)
user_vectors = np.ascontiguousarray(
    np.hstack([user_embeddings, np.ones((user_embeddings.shape[0], 1), dtype='float32')]),
    dtype='float32',
)
del user_embeddings  # the augmented copy is all that is needed; free the original

index = faiss.index_factory(item_vectors.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)
index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
index.add(item_vectors)
_, idxs = index.search(user_vectors, TOPK)

# MAP@K over the customers that appear in the validation window.
mapk(val.article_id_idx, idxs[val.customer_id_idx])

[I 220306 15:28:38 utils:14] valid: [2020-09-16, 2020-09-23)
[I 220306 15:28:38 utils:16] # of records: 240311
[I 220306 15:28:38 utils:27] train: [2020-08-26, 2020-09-16)


{'no_components': 512, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.005, 'item_alpha': 1e-08, 'user_alpha': 1e-08, 'max_sampled': 10}


[I 220306 15:28:38 utils:29] # of records: 803079
Epoch:  10%|█         | 5/50 [02:38<23:47, 31.73s/it]


KeyboardInterrupt: 

In [33]:
# Sanity check: (number of article rows, number of one-hot article feature columns).
item_features.shape

(105542, 651)

In [34]:
# Sanity check on the user-feature matrix dimensions.
# NOTE(review): the recorded output here has 13 columns, whereas the cell that builds the
# features printed 1371993 columns — this output looks stale (from a run without the
# identity block); re-run to confirm.
user_features.shape

(1371980, 13)

In [16]:
# Dimensions of the fitted model's user-feature embedding table.
# NOTE(review): relies on the top-level `model` left behind by the training cell.
model.user_embeddings.shape

(13, 128)

In [8]:
# NOTE(review): `evaluate` is not defined anywhere in the visible notebook, and `objective`
# calls it with seven positional arguments while this call passes none — presumably it was
# defined in a since-edited cell; confirm before relying on this cell.
evaluate()

[I 220306 14:33:49 utils:14] valid: [2020-09-16, 2020-09-23)
[I 220306 14:33:49 utils:16] # of records: 240311
[I 220306 14:33:49 utils:27] train: [2020-08-26, 2020-09-16)


{'no_components': 128, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.005, 'item_alpha': 1e-08, 'user_alpha': 1e-08, 'max_sampled': 10}


[I 220306 14:33:49 utils:29] # of records: 803079
Epoch: 100%|██████████| 100/100 [24:30<00:00, 14.70s/it]


IndexError: index 80 is out of bounds for axis 0 with size 13

In [21]:
# Maximise the value returned by `objective`, stopping after a three-hour time budget.
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3 * 60 * 60)

[32m[I 2022-03-04 18:45:37,374][0m A new study created in memory with name: no-name-4ed06eab-d012-4f11-83d8-47aa904aaad4[0m


{'no_components': 384, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.04438449855265177, 'item_alpha': 1.424971069814322e-12, 'user_alpha': 7.225127522877115e-09, 'max_sampled': 12}


Epoch: 100%|██████████| 20/20 [08:35<00:00, 25.80s/it]
[32m[I 2022-03-04 18:54:39,796][0m Trial 0 finished with value: 0.01143350442683491 and parameters: {'train_days': 28, 'no_components': 384, 'learning_rate': 0.04438449855265177, 'item_alpha': 1.424971069814322e-12, 'user_alpha': 7.225127522877115e-09, 'max_sampled': 12}. Best is trial 0 with value: 0.01143350442683491.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.013067997329139045, 'item_alpha': 1.5731211546179455e-12, 'user_alpha': 2.9335066353527017e-08, 'max_sampled': 19}


Epoch: 100%|██████████| 20/20 [14:07<00:00, 42.40s/it]
[32m[I 2022-03-04 19:09:32,392][0m Trial 1 finished with value: 0.012845120156728552 and parameters: {'train_days': 28, 'no_components': 768, 'learning_rate': 0.013067997329139045, 'item_alpha': 1.5731211546179455e-12, 'user_alpha': 2.9335066353527017e-08, 'max_sampled': 19}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 128, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.006012048787846059, 'item_alpha': 7.447907575388095e-10, 'user_alpha': 2.190846133101763e-08, 'max_sampled': 19}


Epoch: 100%|██████████| 20/20 [04:12<00:00, 12.65s/it]
[32m[I 2022-03-04 19:13:57,085][0m Trial 2 finished with value: 0.005104265175821466 and parameters: {'train_days': 21, 'no_components': 128, 'learning_rate': 0.006012048787846059, 'item_alpha': 7.447907575388095e-10, 'user_alpha': 2.190846133101763e-08, 'max_sampled': 19}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 384, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.015392848889039575, 'item_alpha': 1.632272085856521e-11, 'user_alpha': 1.5206117205978786e-10, 'max_sampled': 10}


Epoch: 100%|██████████| 20/20 [09:34<00:00, 28.70s/it]
[32m[I 2022-03-04 19:23:54,963][0m Trial 3 finished with value: 0.01141169605534723 and parameters: {'train_days': 28, 'no_components': 384, 'learning_rate': 0.015392848889039575, 'item_alpha': 1.632272085856521e-11, 'user_alpha': 1.5206117205978786e-10, 'max_sampled': 10}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 384, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.006156766378622971, 'item_alpha': 6.704053722354312e-09, 'user_alpha': 8.410487972766976e-07, 'max_sampled': 12}


Epoch: 100%|██████████| 20/20 [04:48<00:00, 14.41s/it]
[32m[I 2022-03-04 19:29:06,614][0m Trial 4 finished with value: 0.010542250607274508 and parameters: {'train_days': 14, 'no_components': 384, 'learning_rate': 0.006156766378622971, 'item_alpha': 6.704053722354312e-09, 'user_alpha': 8.410487972766976e-07, 'max_sampled': 12}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 256, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.010111489318630543, 'item_alpha': 1.3504875673307761e-09, 'user_alpha': 5.116894469211582e-08, 'max_sampled': 19}


Epoch: 100%|██████████| 20/20 [05:49<00:00, 17.49s/it]
[32m[I 2022-03-04 19:35:13,786][0m Trial 5 finished with value: 0.00991481941036904 and parameters: {'train_days': 21, 'no_components': 256, 'learning_rate': 0.010111489318630543, 'item_alpha': 1.3504875673307761e-09, 'user_alpha': 5.116894469211582e-08, 'max_sampled': 19}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 256, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.019485043246482137, 'item_alpha': 1.0327695820485805e-07, 'user_alpha': 7.316156647786821e-08, 'max_sampled': 20}


Epoch: 100%|██████████| 20/20 [03:52<00:00, 11.62s/it]
[32m[I 2022-03-04 19:39:23,652][0m Trial 6 finished with value: 0.009378920169782907 and parameters: {'train_days': 14, 'no_components': 256, 'learning_rate': 0.019485043246482137, 'item_alpha': 1.0327695820485805e-07, 'user_alpha': 7.316156647786821e-08, 'max_sampled': 20}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 256, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.01184934934803008, 'item_alpha': 7.863766201428401e-07, 'user_alpha': 6.1070127698484305e-12, 'max_sampled': 15}


Epoch: 100%|██████████| 20/20 [07:20<00:00, 22.04s/it]
[32m[I 2022-03-04 19:47:02,455][0m Trial 7 finished with value: 0.009858210937684736 and parameters: {'train_days': 28, 'no_components': 256, 'learning_rate': 0.01184934934803008, 'item_alpha': 7.863766201428401e-07, 'user_alpha': 6.1070127698484305e-12, 'max_sampled': 15}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 384, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.014710749790755014, 'item_alpha': 3.0488649378678514e-12, 'user_alpha': 7.881353203163838e-08, 'max_sampled': 18}


Epoch: 100%|██████████| 20/20 [04:52<00:00, 14.62s/it]
[32m[I 2022-03-04 19:52:18,230][0m Trial 8 finished with value: 0.010301398141893844 and parameters: {'train_days': 14, 'no_components': 384, 'learning_rate': 0.014710749790755014, 'item_alpha': 3.0488649378678514e-12, 'user_alpha': 7.881353203163838e-08, 'max_sampled': 18}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 640, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.022380960573479054, 'item_alpha': 1.4330600129826203e-08, 'user_alpha': 4.305587967272264e-11, 'max_sampled': 14}


Epoch: 100%|██████████| 20/20 [13:14<00:00, 39.73s/it]
[32m[I 2022-03-04 20:06:09,758][0m Trial 9 finished with value: 0.01239332984037478 and parameters: {'train_days': 28, 'no_components': 640, 'learning_rate': 0.022380960573479054, 'item_alpha': 1.4330600129826203e-08, 'user_alpha': 4.305587967272264e-11, 'max_sampled': 14}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.03026938904874928, 'item_alpha': 9.465436641697277e-11, 'user_alpha': 1.4728255087211056e-09, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [04:19<00:00, 12.99s/it]
[32m[I 2022-03-04 20:11:20,498][0m Trial 10 finished with value: 0.00874551296053323 and parameters: {'train_days': 7, 'no_components': 1024, 'learning_rate': 0.03026938904874928, 'item_alpha': 9.465436641697277e-11, 'user_alpha': 1.4728255087211056e-09, 'max_sampled': 17}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.025371203848486665, 'item_alpha': 2.446531892486056e-08, 'user_alpha': 2.9093087061889047e-11, 'max_sampled': 15}


Epoch: 100%|██████████| 20/20 [14:33<00:00, 43.67s/it]
[32m[I 2022-03-04 20:26:36,528][0m Trial 11 finished with value: 0.012665883990250315 and parameters: {'train_days': 28, 'no_components': 768, 'learning_rate': 0.025371203848486665, 'item_alpha': 2.446531892486056e-08, 'user_alpha': 2.9093087061889047e-11, 'max_sampled': 15}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.031417354326112604, 'item_alpha': 9.381584358370356e-11, 'user_alpha': 1.168503412473116e-12, 'max_sampled': 16}


Epoch: 100%|██████████| 20/20 [11:12<00:00, 33.62s/it]
[32m[I 2022-03-04 20:38:31,685][0m Trial 12 finished with value: 0.012441844650280398 and parameters: {'train_days': 21, 'no_components': 768, 'learning_rate': 0.031417354326112604, 'item_alpha': 9.381584358370356e-11, 'user_alpha': 1.168503412473116e-12, 'max_sampled': 16}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.008950894525341778, 'item_alpha': 3.863379304227777e-08, 'user_alpha': 3.4622935981607887e-10, 'max_sampled': 14}


Epoch: 100%|██████████| 20/20 [16:27<00:00, 49.39s/it]
[32m[I 2022-03-04 20:55:49,120][0m Trial 13 finished with value: 0.012571848343131362 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.008950894525341778, 'item_alpha': 3.863379304227777e-08, 'user_alpha': 3.4622935981607887e-10, 'max_sampled': 14}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 640, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.02433795229130115, 'item_alpha': 2.793393243768145e-07, 'user_alpha': 2.6459189427259217e-09, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [10:03<00:00, 30.18s/it]
[32m[I 2022-03-04 21:06:29,439][0m Trial 14 finished with value: 0.011890880422486368 and parameters: {'train_days': 21, 'no_components': 640, 'learning_rate': 0.02433795229130115, 'item_alpha': 2.793393243768145e-07, 'user_alpha': 2.6459189427259217e-09, 'max_sampled': 17}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.049468742688575075, 'item_alpha': 2.0315588185148622e-09, 'user_alpha': 3.019533427743043e-11, 'max_sampled': 13}


Epoch: 100%|██████████| 20/20 [03:53<00:00, 11.68s/it]
[32m[I 2022-03-04 21:11:05,597][0m Trial 15 finished with value: 0.008787746303128932 and parameters: {'train_days': 7, 'no_components': 768, 'learning_rate': 0.049468742688575075, 'item_alpha': 2.0315588185148622e-09, 'user_alpha': 3.019533427743043e-11, 'max_sampled': 13}. Best is trial 1 with value: 0.012845120156728552.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.03509733554285025, 'item_alpha': 2.548398644343316e-10, 'user_alpha': 8.409130976927917e-07, 'max_sampled': 16}


Epoch: 100%|██████████| 20/20 [16:18<00:00, 48.92s/it]
[32m[I 2022-03-04 21:28:15,150][0m Trial 16 finished with value: 0.013132215308796704 and parameters: {'train_days': 28, 'no_components': 1024, 'learning_rate': 0.03509733554285025, 'item_alpha': 2.548398644343316e-10, 'user_alpha': 8.409130976927917e-07, 'max_sampled': 16}. Best is trial 16 with value: 0.013132215308796704.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.038332971013971893, 'item_alpha': 1.883294566140812e-11, 'user_alpha': 9.250226238062989e-07, 'max_sampled': 20}


Epoch: 100%|██████████| 20/20 [13:26<00:00, 40.33s/it]
[32m[I 2022-03-04 21:42:33,243][0m Trial 17 finished with value: 0.01245508104203842 and parameters: {'train_days': 21, 'no_components': 1024, 'learning_rate': 0.038332971013971893, 'item_alpha': 1.883294566140812e-11, 'user_alpha': 9.250226238062989e-07, 'max_sampled': 20}. Best is trial 16 with value: 0.013132215308796704.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.008053964283859993, 'item_alpha': 2.1425655653289647e-10, 'user_alpha': 2.483083625435052e-07, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [16:36<00:00, 49.81s/it]
[32m[I 2022-03-04 21:59:59,621][0m Trial 18 finished with value: 0.012710893740753917 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.008053964283859993, 'item_alpha': 2.1425655653289647e-10, 'user_alpha': 2.483083625435052e-07, 'max_sampled': 17}. Best is trial 16 with value: 0.013132215308796704.[0m


In [22]:
# Trials ordered from highest to lowest objective value, capped at 20 rows.
(
    study.trials_dataframe()
    .sort_values('value', ascending=False)
    .head(20)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_item_alpha,params_learning_rate,params_max_sampled,params_no_components,params_train_days,params_user_alpha,state
16,16,0.013132,2022-03-04 21:11:05.598831,2022-03-04 21:28:15.149825,0 days 00:17:09.550994,2.548399e-10,0.035097,16,1024,28,8.409131e-07,COMPLETE
1,1,0.012845,2022-03-04 18:54:39.797119,2022-03-04 19:09:32.392005,0 days 00:14:52.594886,1.573121e-12,0.013068,19,768,28,2.933507e-08,COMPLETE
18,18,0.012711,2022-03-04 21:42:33.245008,2022-03-04 21:59:59.621193,0 days 00:17:26.376185,2.142566e-10,0.008054,17,896,28,2.483084e-07,COMPLETE
11,11,0.012666,2022-03-04 20:11:20.499422,2022-03-04 20:26:36.528212,0 days 00:15:16.028790,2.446532e-08,0.025371,15,768,28,2.909309e-11,COMPLETE
13,13,0.012572,2022-03-04 20:38:31.685981,2022-03-04 20:55:49.120267,0 days 00:17:17.434286,3.863379e-08,0.008951,14,896,28,3.462294e-10,COMPLETE
17,17,0.012455,2022-03-04 21:28:15.150985,2022-03-04 21:42:33.243713,0 days 00:14:18.092728,1.883295e-11,0.038333,20,1024,21,9.250226e-07,COMPLETE
12,12,0.012442,2022-03-04 20:26:36.529609,2022-03-04 20:38:31.684754,0 days 00:11:55.155145,9.381584e-11,0.031417,16,768,21,1.168503e-12,COMPLETE
9,9,0.012393,2022-03-04 19:52:18.231092,2022-03-04 20:06:09.758676,0 days 00:13:51.527584,1.43306e-08,0.022381,14,640,28,4.305588e-11,COMPLETE
14,14,0.011891,2022-03-04 20:55:49.121503,2022-03-04 21:06:29.438814,0 days 00:10:40.317311,2.793393e-07,0.024338,17,640,21,2.645919e-09,COMPLETE
0,0,0.011434,2022-03-04 18:45:37.377221,2022-03-04 18:54:39.795831,0 days 00:09:02.418610,1.424971e-12,0.044384,12,384,28,7.225128e-09,COMPLETE


この実験では、特徴を入れないほうが良い結果になった。