In [86]:
import polars as pl
from scipy import sparse
from sklearn import preprocessing
import lightfm
import tqdm
import numpy as np
import pickle

In [2]:
# Shell escape: list the entries of ../data whose names contain "hh_".
!ls ../data | grep hh_

hh_recsys_sample.pq
hh_recsys_test_hh.pq
hh_recsys_train_hh.pq
hh_recsys_vacancies.pq


In [15]:
# Stack the train and test logs, flatten the `vacancy_id` lists into one row per
# (user, vacancy) pair and drop duplicate pairs.
log = (
    pl.concat([
        pl.read_parquet('../data/hh_recsys_train_hh.pq'),
        pl.read_parquet('../data/hh_recsys_test_hh.pq'),
    ])
    .select('user_id', 'vacancy_id')
    .explode('vacancy_id')
    .unique()
    # String ids carry a two-character prefix ("u_" / "v_"); strip it and keep
    # the numeric remainder as integer ids.
    .with_columns(
        uid=pl.col('user_id').str.slice(2).cast(pl.Int64),
        iid=pl.col('vacancy_id').str.slice(2).cast(pl.Int64),
    )
)

log.head(3)

user_id,vacancy_id,uid,iid
str,str,i64,i64
"""u_332060""","""v_1246726""",332060,1246726
"""u_786220""","""v_154052""",786220,154052
"""u_786220""","""v_2727929""",786220,2727929


In [24]:
# Interaction-matrix dimensions: rows are indexed by `uid`, columns by `iid`,
# so each value must be larger than the biggest index used.
N_USERS = 1_177_422
N_ITEMS = 2_734_129

# Number of training epochs.
N_EPOCH = 10

In [58]:
# Use the complete interaction log. For a quick experiment, swap in the
# hash-based subsample below (keeps users and vacancies whose hash mod 10 is 1):
#
#     sample = log.filter(
#         (pl.col('user_id').hash().mod(10).eq(1)) & (pl.col('vacancy_id').hash().mod(10).eq(1))
#     )
sample = log

# Row counts of the full log and of the data actually used.
(log.height, sample.height)

(15413244, 15413244)

In [59]:
# Build the user x item interaction matrix: one implicit-feedback entry of 1.0
# for every (uid, iid) pair in `sample`.
#
# The index columns are handed over as numpy arrays (`to_numpy()`) instead of
# Python lists, which avoids materialising millions of Python int objects only
# for scipy to convert them straight back into arrays. Values are created as
# float32, the dtype LightFM documents for its interaction matrices.
train = sparse.coo_matrix(
    (
        np.ones(len(sample), dtype=np.float32),
        (
            sample['uid'].to_numpy(),
            sample['iid'].to_numpy(),
        ),
    ),
    shape=(N_USERS, N_ITEMS),
)

In [60]:
# WARP-loss matrix factorisation with 16 latent components per user/item.
# `random_state` pins the initialisation and sampling so that a re-run of the
# (single-threaded) fit reproduces the same embeddings.
model = lightfm.LightFM(
    loss='warp',
    no_components=16,
    random_state=42,
)
# The epoch count comes from the N_EPOCH config constant rather than a
# duplicated literal, so changing the constant actually changes training.
# `fit` returns the model itself, which is what the cell displays.
model.fit(
    train,
    epochs=N_EPOCH,
    verbose=2,
)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [09:29<00:00, 56.99s/it]


<lightfm.lightfm.LightFM at 0x7f118f8bfa30>

In [87]:
# Serialise the fitted model to disk using the newest pickle protocol.
with open('../data/lightfm.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [89]:
# Read the pickled model back under a separate name, leaving `model` untouched.
# NOTE: only unpickle files you produced yourself; pickle can execute code on load.
with open('../data/lightfm.pickle', mode='rb') as model_file:
    model_c = pickle.load(model_file)

In [90]:
# Inspect the reloaded model: display the shape of its user-embedding matrix.
model_c.user_embeddings.shape

(1177422, 16)

In [70]:
# Convert the index columns to numpy once; fancy indexing accepts the arrays
# directly, so no intermediate Python lists are needed.
user_idx = sample['uid'].to_numpy()
item_idx = sample['iid'].to_numpy()

# Embedding rows for every observed (user, vacancy) pair, aligned row by row.
u = model.user_embeddings[user_idx]
i = model.item_embeddings[item_idx]

# Randomly re-ordered item indices (presumably to pair each user with an
# arbitrary item as a negative example; confirm against downstream use).
# A seeded Generator makes the permutation reproducible, and `permutation`
# returns a fresh array, so the array backing `sample['iid']` is never mutated.
rng = np.random.default_rng(42)
arr = rng.permutation(item_idx)
i_shuffled = model.item_embeddings[arr]


In [80]:
# Tiny scratch frame: a single integer column `x` holding 1..4.
x = pl.DataFrame({'x': [1, 2, 3, 4]})
x

x
i64
1
2
3
4


In [85]:
# Rebuild "v_<number>" string ids from the integer column by prepending the
# literal prefix to the stringified value.
x.select(
    pl.concat_str([
        pl.lit('v_'),
        pl.col('x').cast(pl.String),
    ])
)

literal
str
"""v_1"""
"""v_2"""
"""v_3"""
"""v_4"""
