In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix

import lightfm

In [6]:
from otto_utils import *

In [7]:
# Read the three parquet files of the validation split from VALIDATION_PATH
# (via pyarrow), in the order: train, valid (inputs), test_labels (targets).
val_df_train, val_df_valid_input, val_df_valid_targets = (
    pl.read_parquet(VALIDATION_PATH / parquet_name, use_pyarrow=True)
    for parquet_name in ("train.parquet", "valid.parquet", "test_labels.parquet")
)

In [8]:
# Stack the training frame and the validation-input frame into a single
# interaction table; the validation targets frame is not included.
df = pl.concat([val_df_train, val_df_valid_input])

In [9]:
def dataset_action_weights(df):
    """Aggregate events into one weighted row per (session, aid) pair.

    Every event receives a weight looked up from its ``type`` code
    (0 -> 10, 1 -> 30, 2 -> 60); the weights are then summed within each
    (session, aid) group. Because the lookup is an inner join, events whose
    ``type`` is not 0, 1 or 2 are dropped.

    Parameters
    ----------
    df : pl.DataFrame
        Event table providing ``session``, ``aid`` and ``type`` columns.

    Returns
    -------
    pl.DataFrame
        Columns ``session``, ``aid`` and ``weight`` (the summed weight).
    """
    type_to_weight = pl.DataFrame({
        "type": [0, 1, 2],
        "weight": [10, 30, 60],
    })

    weighted_events = df.join(type_to_weight, on="type", how="inner")
    per_pair = weighted_events.groupby(["session", "aid"])
    return per_pair.agg([pl.sum("weight")])

def dataset_clicks_only(df):
    """Build an unweighted (session, aid) table from ``type == 0`` events.

    Only rows with ``type`` equal to 0 are kept; they are grouped by
    (session, aid) and each group is given a literal ``weight`` of 1, so
    repeated events on the same pair do not increase the weight.

    Parameters
    ----------
    df : pl.DataFrame
        Event table providing ``session``, ``aid`` and ``type`` columns.

    Returns
    -------
    pl.DataFrame
        Columns ``session``, ``aid`` and ``weight``.
    """
    is_click = pl.col("type") == 0
    unit_weight = pl.lit(1).alias("weight")

    click_events = df.filter(is_click)
    return click_events.groupby(["session", "aid"]).agg([unit_weight])

In [10]:
# Build both candidate interaction tables from the combined frame:
# summed per-type weights (df_act_w) and click-only unit weights (df_clicks).
df_act_w = dataset_action_weights(df)
df_clicks = dataset_clicks_only(df)

In [14]:
# LightFM model with 32 latent components, trained with the WARP loss.
# No random_state is passed to the constructor here.
model = lightfm.LightFM(loss="warp", no_components=32)

In [15]:
# Convert the click-only interaction table into the matrix the model is
# fitted on (make_sparse_matrix is presumably provided by the otto_utils
# star import — it is not defined in this notebook).
train_data = make_sparse_matrix(df_clicks)
# 30 epochs on 40 threads. fit() is the last expression of the cell, so the
# fitted model's repr is shown as the cell output.
model.fit(train_data, epochs=30, num_threads=40, verbose=True)

Epoch: 100%|██████████| 30/30 [17:32<00:00, 35.07s/it]


<lightfm.lightfm.LightFM at 0x7fcd0a9da2b0>

In [20]:
# Sanity check: shapes of the learned user and item embedding matrices.
model.user_embeddings.shape, model.item_embeddings.shape

((12899778, 32), (1855603, 32))

In [21]:
# Short aliases for the learned embedding matrices (users first, then items).
u_embs, i_embs = model.user_embeddings, model.item_embeddings

In [29]:
# Score every item for two hand-picked user rows (12899776 and 12899777) as
# the dot product of their embeddings with all item embeddings; the result
# has one row per selected user and one column per item.
# NOTE(review): only the embedding matrices are used here — if the model also
# carries bias terms they are not part of these scores; confirm intended.
scores = np.dot(u_embs[[12899776, 12899777]], i_embs.T)

In [34]:
# Spot-check the first selected user's score for three specific item columns.
scores[0, 28829], scores[0, 1743163], scores[0, 210052],

(-0.08147804, -0.07942231, 0.09472581)

In [35]:
# argsort is ascending, so the last 20 columns hold the 20 highest-scoring
# item indices of each row, with the best item in the final position.
idx = np.argsort(scores)[:, -20:]

In [36]:
# Display the top-20 item indices for both score rows.
idx

array([[ 571792,  187588,  464792, 1416727, 1494147,  374323,   97936,
         139184, 1440628, 1072286, 1619382,  567461, 1614298,  543442,
          62004, 1173927, 1076578, 1500428,  525798,  210052],
       [ 774039,  983402, 1508133,  843093,  514684,  332566,  884684,
        1177895, 1308131,  452371, 1465235,  463074,  184827,  472515,
        1556767,  786895, 1678437,  149415,   83979,  713404]])

In [41]:
# Select both score rows with an explicit list of row indices.
# (The comprehension previously lacked its element expression, which is a
# SyntaxError; `i for i in range(2)` yields the row indices [0, 1].)
scores[[i for i in range(2)]]

array([[-0.00911062, -0.0347143 , -0.01366474, ..., -0.01477376,
        -0.03116323,  0.01482107],
       [-0.4833191 , -0.833352  ,  0.46834445, ..., -0.75247806,
        -1.1062695 , -0.78308463]], dtype=float32)

In [44]:
# Evaluate on at most 100,000 distinct validation sessions.
# unique() gives no ordering guarantee, so the ids are sorted before slicing
# to make the selected subset identical on every run.
test_users = sorted(val_df_valid_input["session"].unique().to_list())[:100_000]

In [None]:
# Batched top-k recommendation for every session in `test_users`.
#
# Outputs (one entry per session, in `test_users` order):
#   recs       -- 1-D array of recommended item indices, best item first
#   rec_scores -- 1-D array of the matching scores, aligned with `recs`
b_sz = 1000  # sessions scored per batch
topk = 20    # recommendations kept per session

recs = []
rec_scores = []

u_embs = model.user_embeddings
i_embs = model.item_embeddings
u_biases = model.user_biases
i_biases = model.item_biases

# Never request more items than the model knows about.
k = min(topk, i_embs.shape[0])

for batch_start in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[batch_start : batch_start + b_sz]

    # LightFM scores are embedding dot products plus item and user biases.
    # The item bias changes the per-user ranking, so it must be included;
    # the user bias is added as well so the stored scores are complete.
    # In-place adds avoid allocating another (batch x n_items) matrix.
    scores = np.dot(u_embs[test_sessions], i_embs.T)
    scores += i_biases
    scores += u_biases[test_sessions][:, None]

    # argpartition finds the k best columns per row without fully sorting
    # every row; only those k candidates are then sorted, best first.
    top_unsorted = np.argpartition(scores, -k, axis=1)[:, -k:]
    top_scores = np.take_along_axis(scores, top_unsorted, axis=1)
    best_first = np.argsort(-top_scores, axis=1)

    # Store the item indices (not their scores) as the recommendations.
    idx = np.take_along_axis(top_unsorted, best_first, axis=1)
    recs.extend(idx)
    rec_scores.extend(np.take_along_axis(top_scores, best_first, axis=1))

  0%|          | 0/100 [00:00<?, ?it/s]