# LightFM user features
全部入れると良くなかったので、一旦ユーザーの特徴のみ入れてみる

In [7]:
import datetime

import faiss
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from lightfm import LightFM

import schema
from metric import mapk
from scipy import sparse

In [8]:
articles = pd.read_csv('input/transformed/articles.csv', usecols=schema.ARTICLES.keys(), dtype=schema.ARTICLES)
customers = pd.read_csv('input/transformed/customers.csv', usecols=schema.CUSTOMERS.keys(), dtype=schema.CUSTOMERS)
n_user = len(customers)
n_item = len(articles)

In [9]:
transactions = pd.read_csv('input/transformed/transactions_train.csv', parse_dates=['t_dat'], usecols=list(schema.TRANSACTIONS.keys())+['t_dat'], dtype=schema.TRANSACTIONS)
TOPK = 12

In [10]:
def create_customer_features(customers):
    df = customers.copy().drop('customer_id_idx', axis=1)
    df['age_is_null'] = df['age'].isnull().astype(int)
    df['age'] = df['age'].fillna(df['age'].mean())
    for c in ['club_member_status_idx', 'fashion_news_frequency_idx']:
        df = pd.concat([df, pd.get_dummies(df[c], prefix=c)], axis=1).drop(c, axis=1)

    idxs = sparse.identity(len(df), dtype='f')
    return sparse.hstack([idxs, df.values]).astype('float32')


def create_article_features(articles):
    df = articles.copy().drop('article_id_idx', axis=1)
    for c in df.columns:
        df = pd.concat([df, pd.get_dummies(df[c], prefix=c)], axis=1).drop(c, axis=1)

    idxs = sparse.identity(len(df), dtype='f')
    return sparse.hstack([idxs, df.values]).astype('float32')

user_features = create_customer_features(customers)
# item_features = create_article_features(articles)

In [11]:
def objective(trial: optuna.Trial) -> float:
    train_days = trial.suggest_int('train_days', 7, 28, 7)
    no_components = trial.suggest_int('no_components', 128, 1024, 128)
    learning_schedule = 'adadelta'
    loss = 'bpr'
    learning_rate = trial.suggest_loguniform('learning_rate', 5*1e-3, 5*1e-2)
    item_alpha = trial.suggest_loguniform('item_alpha', 1e-12, 1e-6)
    user_alpha = trial.suggest_loguniform('user_alpha', 1e-12, 1e-6)
    max_sampled = trial.suggest_int('max_sampled', 10, 20)
    epochs = 20

    lightfm_params = {
        'no_components': no_components,
        'learning_schedule': learning_schedule,
        'loss': loss,
        'learning_rate': learning_rate,
        'item_alpha': item_alpha,
        'user_alpha': user_alpha,
        'max_sampled': max_sampled,
    }
    print(lightfm_params)

    valid_start_date = datetime.date(2020, 9, 16)
    valid_end_date = datetime.date(2020, 9, 22)
    train_end_date = valid_start_date - datetime.timedelta(days=1)
    train_start_date = valid_start_date - datetime.timedelta(days=train_days)

    transactions_train = transactions.query("@train_start_date <= t_dat <= @train_end_date")
    transactions_valid = transactions.query("@valid_start_date <= t_dat <= @valid_end_date")

    val = transactions_valid.groupby('customer_id_idx')['article_id_idx'].apply(list).reset_index()

    train = sparse.lil_matrix((n_user, n_item))
    train[transactions_train.customer_id_idx, transactions_train.article_id_idx] = 1

    model = LightFM(**lightfm_params)
    model.fit(train, user_features=user_features, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)
    model.fit(train, epochs=epochs, num_threads=psutil.cpu_count(logical=False), verbose=True)

    index = faiss.index_factory(no_components, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    index.add(model.item_embeddings)
    _, idxs = index.search(model.user_embeddings, TOPK)

    return mapk(val.article_id_idx, idxs[val.customer_id_idx])

In [12]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=7200)

[32m[I 2022-03-05 10:09:03,056][0m A new study created in memory with name: no-name-98108614-6651-42ad-9786-292c935b772f[0m


{'no_components': 384, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.04097390085924968, 'item_alpha': 3.457096305467413e-07, 'user_alpha': 3.7218283401567005e-09, 'max_sampled': 20}


Epoch: 100%|██████████| 20/20 [03:08<00:00,  9.41s/it]
Epoch: 100%|██████████| 20/20 [00:30<00:00,  1.54s/it]
[32m[I 2022-03-05 10:13:11,749][0m Trial 0 finished with value: 0.01866700699740334 and parameters: {'train_days': 21, 'no_components': 384, 'learning_rate': 0.04097390085924968, 'item_alpha': 3.457096305467413e-07, 'user_alpha': 3.7218283401567005e-09, 'max_sampled': 20}. Best is trial 0 with value: 0.01866700699740334.[0m


{'no_components': 512, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.03349866242414398, 'item_alpha': 9.443054761010596e-11, 'user_alpha': 5.324836756187512e-09, 'max_sampled': 10}


Epoch: 100%|██████████| 20/20 [04:48<00:00, 14.40s/it]
Epoch: 100%|██████████| 20/20 [00:47<00:00,  2.37s/it]
[32m[I 2022-03-05 10:19:24,492][0m Trial 1 finished with value: 0.019112590991056957 and parameters: {'train_days': 28, 'no_components': 512, 'learning_rate': 0.03349866242414398, 'item_alpha': 9.443054761010596e-11, 'user_alpha': 5.324836756187512e-09, 'max_sampled': 10}. Best is trial 1 with value: 0.019112590991056957.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.02215588795793549, 'item_alpha': 4.393876149870726e-12, 'user_alpha': 2.0890890426492952e-07, 'max_sampled': 12}


Epoch: 100%|██████████| 20/20 [03:03<00:00,  9.15s/it]
Epoch: 100%|██████████| 20/20 [00:44<00:00,  2.22s/it]
[32m[I 2022-03-05 10:24:06,848][0m Trial 2 finished with value: 0.01878062385702717 and parameters: {'train_days': 14, 'no_components': 768, 'learning_rate': 0.02215588795793549, 'item_alpha': 4.393876149870726e-12, 'user_alpha': 2.0890890426492952e-07, 'max_sampled': 12}. Best is trial 1 with value: 0.019112590991056957.[0m


{'no_components': 640, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.006786215206503564, 'item_alpha': 1.1405198151938895e-10, 'user_alpha': 5.9985559985592826e-09, 'max_sampled': 20}


Epoch: 100%|██████████| 20/20 [04:03<00:00, 12.20s/it]
Epoch: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]
[32m[I 2022-03-05 10:29:41,262][0m Trial 3 finished with value: 0.01954261704970779 and parameters: {'train_days': 21, 'no_components': 640, 'learning_rate': 0.006786215206503564, 'item_alpha': 1.1405198151938895e-10, 'user_alpha': 5.9985559985592826e-09, 'max_sampled': 20}. Best is trial 3 with value: 0.01954261704970779.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.012562750320194117, 'item_alpha': 2.4685180287738105e-12, 'user_alpha': 6.329080394794304e-12, 'max_sampled': 15}


Epoch: 100%|██████████| 20/20 [04:44<00:00, 14.22s/it]
Epoch: 100%|██████████| 20/20 [01:02<00:00,  3.13s/it]
[32m[I 2022-03-05 10:36:31,429][0m Trial 4 finished with value: 0.019645245547232452 and parameters: {'train_days': 21, 'no_components': 896, 'learning_rate': 0.012562750320194117, 'item_alpha': 2.4685180287738105e-12, 'user_alpha': 6.329080394794304e-12, 'max_sampled': 15}. Best is trial 4 with value: 0.019645245547232452.[0m


{'no_components': 512, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.03372751708504411, 'item_alpha': 1.876812377286852e-12, 'user_alpha': 4.6777373146226224e-11, 'max_sampled': 19}


Epoch: 100%|██████████| 20/20 [02:27<00:00,  7.39s/it]
Epoch: 100%|██████████| 20/20 [00:33<00:00,  1.67s/it]
[32m[I 2022-03-05 10:40:11,230][0m Trial 5 finished with value: 0.01855471358307219 and parameters: {'train_days': 14, 'no_components': 512, 'learning_rate': 0.03372751708504411, 'item_alpha': 1.876812377286852e-12, 'user_alpha': 4.6777373146226224e-11, 'max_sampled': 19}. Best is trial 4 with value: 0.019645245547232452.[0m


{'no_components': 640, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.010334890922121184, 'item_alpha': 5.600160636657007e-10, 'user_alpha': 5.531822941828911e-08, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [04:05<00:00, 12.26s/it]
Epoch: 100%|██████████| 20/20 [00:49<00:00,  2.49s/it]
[32m[I 2022-03-05 10:45:53,521][0m Trial 6 finished with value: 0.019242349613253936 and parameters: {'train_days': 21, 'no_components': 640, 'learning_rate': 0.010334890922121184, 'item_alpha': 5.600160636657007e-10, 'user_alpha': 5.531822941828911e-08, 'max_sampled': 17}. Best is trial 4 with value: 0.019645245547232452.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.008821138472805483, 'item_alpha': 4.61995309293024e-08, 'user_alpha': 1.2251974498761606e-11, 'max_sampled': 11}


Epoch: 100%|██████████| 20/20 [06:02<00:00, 18.10s/it]
Epoch: 100%|██████████| 20/20 [01:11<00:00,  3.55s/it]
[32m[I 2022-03-05 10:54:14,861][0m Trial 7 finished with value: 0.01961699605254354 and parameters: {'train_days': 21, 'no_components': 1024, 'learning_rate': 0.008821138472805483, 'item_alpha': 4.61995309293024e-08, 'user_alpha': 1.2251974498761606e-11, 'max_sampled': 11}. Best is trial 4 with value: 0.019645245547232452.[0m


{'no_components': 128, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.028407333009453285, 'item_alpha': 8.377426252346118e-11, 'user_alpha': 1.3706583642795385e-09, 'max_sampled': 20}


Epoch: 100%|██████████| 20/20 [01:16<00:00,  3.80s/it]
Epoch: 100%|██████████| 20/20 [00:09<00:00,  2.03it/s]
[32m[I 2022-03-05 10:55:54,919][0m Trial 8 finished with value: 0.016073686489254457 and parameters: {'train_days': 14, 'no_components': 128, 'learning_rate': 0.028407333009453285, 'item_alpha': 8.377426252346118e-11, 'user_alpha': 1.3706583642795385e-09, 'max_sampled': 20}. Best is trial 4 with value: 0.019645245547232452.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.0071700919314241045, 'item_alpha': 5.238870421303572e-10, 'user_alpha': 2.3860600037108236e-09, 'max_sampled': 11}


Epoch: 100%|██████████| 20/20 [07:17<00:00, 21.89s/it]
Epoch: 100%|██████████| 20/20 [01:27<00:00,  4.39s/it]
[32m[I 2022-03-05 11:05:49,844][0m Trial 9 finished with value: 0.01972308477408533 and parameters: {'train_days': 28, 'no_components': 1024, 'learning_rate': 0.0071700919314241045, 'item_alpha': 5.238870421303572e-10, 'user_alpha': 2.3860600037108236e-09, 'max_sampled': 11}. Best is trial 9 with value: 0.01972308477408533.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.005081556750408691, 'item_alpha': 7.345892309950369e-09, 'user_alpha': 1.3001950792888574e-10, 'max_sampled': 13}


Epoch: 100%|██████████| 20/20 [02:13<00:00,  6.65s/it]
Epoch: 100%|██████████| 20/20 [00:43<00:00,  2.16s/it]
[32m[I 2022-03-05 11:09:54,025][0m Trial 10 finished with value: 0.016587311755713586 and parameters: {'train_days': 7, 'no_components': 1024, 'learning_rate': 0.005081556750408691, 'item_alpha': 7.345892309950369e-09, 'user_alpha': 1.3001950792888574e-10, 'max_sampled': 13}. Best is trial 9 with value: 0.01972308477408533.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.014820677845855388, 'item_alpha': 8.588392171155115e-12, 'user_alpha': 1.5579436479740479e-12, 'max_sampled': 15}


Epoch: 100%|██████████| 20/20 [06:04<00:00, 18.20s/it]
Epoch: 100%|██████████| 20/20 [01:13<00:00,  3.66s/it]
[32m[I 2022-03-05 11:18:15,542][0m Trial 11 finished with value: 0.019526696655836572 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.014820677845855388, 'item_alpha': 8.588392171155115e-12, 'user_alpha': 1.5579436479740479e-12, 'max_sampled': 15}. Best is trial 9 with value: 0.01972308477408533.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.013483828927392626, 'item_alpha': 3.204081779561042e-09, 'user_alpha': 1.651820227518676e-10, 'max_sampled': 14}


Epoch: 100%|██████████| 20/20 [05:52<00:00, 17.62s/it]
Epoch: 100%|██████████| 20/20 [01:12<00:00,  3.63s/it]
[32m[I 2022-03-05 11:26:24,381][0m Trial 12 finished with value: 0.019504378364876923 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.013483828927392626, 'item_alpha': 3.204081779561042e-09, 'user_alpha': 1.651820227518676e-10, 'max_sampled': 14}. Best is trial 9 with value: 0.01972308477408533.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.008767961167060141, 'item_alpha': 2.060122195301092e-11, 'user_alpha': 1.5689424814202964e-12, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [05:57<00:00, 17.86s/it]
Epoch: 100%|██████████| 20/20 [01:12<00:00,  3.62s/it]
[32m[I 2022-03-05 11:34:37,594][0m Trial 13 finished with value: 0.019941548788653816 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.008767961167060141, 'item_alpha': 2.060122195301092e-11, 'user_alpha': 1.5689424814202964e-12, 'max_sampled': 17}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.007362127303056436, 'item_alpha': 2.4458970601831836e-11, 'user_alpha': 4.5589850701362635e-08, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [07:31<00:00, 22.57s/it]
Epoch: 100%|██████████| 20/20 [01:25<00:00,  4.26s/it]
[32m[I 2022-03-05 11:44:42,249][0m Trial 14 finished with value: 0.01946436011544924 and parameters: {'train_days': 28, 'no_components': 1024, 'learning_rate': 0.007362127303056436, 'item_alpha': 2.4458970601831836e-11, 'user_alpha': 4.5589850701362635e-08, 'max_sampled': 17}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.005503152110961714, 'item_alpha': 9.791159646704305e-10, 'user_alpha': 3.924055159264976e-10, 'max_sampled': 17}


Epoch: 100%|██████████| 20/20 [05:46<00:00, 17.33s/it]
Epoch: 100%|██████████| 20/20 [01:00<00:00,  3.05s/it]
[32m[I 2022-03-05 11:52:24,085][0m Trial 15 finished with value: 0.01954365594057469 and parameters: {'train_days': 28, 'no_components': 768, 'learning_rate': 0.005503152110961714, 'item_alpha': 9.791159646704305e-10, 'user_alpha': 3.924055159264976e-10, 'max_sampled': 17}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 768, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.019144642197203254, 'item_alpha': 4.5530352427294625e-10, 'user_alpha': 1.166175383441355e-12, 'max_sampled': 18}


Epoch: 100%|██████████| 20/20 [01:34<00:00,  4.73s/it]
Epoch: 100%|██████████| 20/20 [00:29<00:00,  1.48s/it]
[32m[I 2022-03-05 11:55:20,626][0m Trial 16 finished with value: 0.01638452515568219 and parameters: {'train_days': 7, 'no_components': 768, 'learning_rate': 0.019144642197203254, 'item_alpha': 4.5530352427294625e-10, 'user_alpha': 1.166175383441355e-12, 'max_sampled': 18}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 128, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.009645209238999959, 'item_alpha': 1.3087702307745533e-08, 'user_alpha': 2.5840362557649772e-11, 'max_sampled': 13}


Epoch: 100%|██████████| 20/20 [02:40<00:00,  8.04s/it]
Epoch: 100%|██████████| 20/20 [00:14<00:00,  1.34it/s]
[32m[I 2022-03-05 11:58:30,211][0m Trial 17 finished with value: 0.016535167081349358 and parameters: {'train_days': 28, 'no_components': 128, 'learning_rate': 0.009645209238999959, 'item_alpha': 1.3087702307745533e-08, 'user_alpha': 2.5840362557649772e-11, 'max_sampled': 13}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 896, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.006646892788497398, 'item_alpha': 1.9214964137113685e-11, 'user_alpha': 6.167919643815403e-07, 'max_sampled': 16}


Epoch: 100%|██████████| 20/20 [05:41<00:00, 17.07s/it]
Epoch: 100%|██████████| 20/20 [01:08<00:00,  3.41s/it]
[32m[I 2022-03-05 12:06:21,226][0m Trial 18 finished with value: 0.018183367842720826 and parameters: {'train_days': 28, 'no_components': 896, 'learning_rate': 0.006646892788497398, 'item_alpha': 1.9214964137113685e-11, 'user_alpha': 6.167919643815403e-07, 'max_sampled': 16}. Best is trial 13 with value: 0.019941548788653816.[0m


{'no_components': 1024, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.008320283184731108, 'item_alpha': 2.1235303822142049e-10, 'user_alpha': 2.5716342100158757e-08, 'max_sampled': 10}


Epoch: 100%|██████████| 20/20 [05:44<00:00, 17.22s/it]
Epoch: 100%|██████████| 20/20 [01:04<00:00,  3.22s/it]
[32m[I 2022-03-05 12:14:14,694][0m Trial 19 finished with value: 0.019661758952200868 and parameters: {'train_days': 21, 'no_components': 1024, 'learning_rate': 0.008320283184731108, 'item_alpha': 2.1235303822142049e-10, 'user_alpha': 2.5716342100158757e-08, 'max_sampled': 10}. Best is trial 13 with value: 0.019941548788653816.[0m


In [13]:
study.trials_dataframe().sort_values(by='value', ascending=False).head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_item_alpha,params_learning_rate,params_max_sampled,params_no_components,params_train_days,params_user_alpha,state
13,13,0.019942,2022-03-05 11:26:24.382729,2022-03-05 11:34:37.594407,0 days 00:08:13.211678,2.060122e-11,0.008768,17,896,28,1.568942e-12,COMPLETE
9,9,0.019723,2022-03-05 10:55:54.920487,2022-03-05 11:05:49.844129,0 days 00:09:54.923642,5.23887e-10,0.00717,11,1024,28,2.38606e-09,COMPLETE
19,19,0.019662,2022-03-05 12:06:21.227686,2022-03-05 12:14:14.694604,0 days 00:07:53.466918,2.12353e-10,0.00832,10,1024,21,2.571634e-08,COMPLETE
4,4,0.019645,2022-03-05 10:29:41.263655,2022-03-05 10:36:31.428736,0 days 00:06:50.165081,2.468518e-12,0.012563,15,896,21,6.32908e-12,COMPLETE
7,7,0.019617,2022-03-05 10:45:53.522395,2022-03-05 10:54:14.861605,0 days 00:08:21.339210,4.619953e-08,0.008821,11,1024,21,1.225197e-11,COMPLETE
15,15,0.019544,2022-03-05 11:44:42.250394,2022-03-05 11:52:24.084984,0 days 00:07:41.834590,9.79116e-10,0.005503,17,768,28,3.924055e-10,COMPLETE
3,3,0.019543,2022-03-05 10:24:06.849207,2022-03-05 10:29:41.262584,0 days 00:05:34.413377,1.14052e-10,0.006786,20,640,21,5.998556e-09,COMPLETE
11,11,0.019527,2022-03-05 11:09:54.026009,2022-03-05 11:18:15.541748,0 days 00:08:21.515739,8.588392e-12,0.014821,15,896,28,1.557944e-12,COMPLETE
12,12,0.019504,2022-03-05 11:18:15.543045,2022-03-05 11:26:24.381409,0 days 00:08:08.838364,3.204082e-09,0.013484,14,896,28,1.65182e-10,COMPLETE
14,14,0.019464,2022-03-05 11:34:37.595490,2022-03-05 11:44:42.249076,0 days 00:10:04.653586,2.445897e-11,0.007362,17,1024,28,4.558985e-08,COMPLETE


この実験では特徴いれないほうがよい