In [1]:
# !pip install polars numpy implicit tqdm

In [1]:
import polars as pl
import numpy as np
from tqdm import tqdm
from scipy.sparse import csr_matrix
import implicit
from datetime import timedelta

RANDOM_STATE = 42
N_PREDICTIONS = 100


In [20]:
!ls ../data

dssm_train.pq		history_baseline.pq	train_dataset.fth
hh_recsys_sample.pq	test.fth		train_dataset.sample.fth
hh_recsys_test_hh.pq	train.fth		vac.no_desc.fth
hh_recsys_train_hh.pq	train.sample.fth
hh_recsys_vacancies.pq	train.small_sample.fth


In [21]:
dssm = pl.read_parquet('../data/dssm_train.pq')
dssm.head(2)

user_id,n_sessions,vacancy_id,action_type,is_test,session_id,session_end,target
str,u32,list[u64],list[i64],bool,str,datetime[ns],u64
"""u_237129""",2,"[1481580, 208271, … 1606798]","[2, 2, … 2]",False,"""s_361359""",2023-11-13 12:08:36.976,716724
"""u_237129""",2,"[1481580, 208271, … 1606798]","[2, 2, … 2]",False,"""s_361359""",2023-11-13 12:08:36.976,690666


In [24]:
dssm['vacancy_id'].explode().is_null().mean()

0.0

In [23]:
dssm['target'].is_null().mean()

0.0

In [8]:
sub = pl.read_parquet('../data/history_baseline.pq')
sub.head(2)

user_id,predictions
str,list[str]
"""u_828064""","[""v_21361"", ""v_1642325"", … ""v_1770705""]"
"""u_850674""","[""v_1602597"", ""v_1802097"", … ""v_420479""]"


In [9]:
vacancies = pl.read_parquet('../data/hh_recsys_vacancies.pq')
vacancies.shape

(2734129, 13)

In [10]:
vac = vacancies.select(pl.exclude('description', 'keySkills.keySkill'))
vac.head(2)

vacancy_id,name,company.id,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,i64,i64,str,str,str,str,str,str
"""v_862116""","""Смотритель муз…","""c_162972""",16500,,"""RUR""","""a_4761""","""ar_33""","""full""","""fullDay""","""noExperience"""
"""v_288642""","""Ведущий менедж…","""c_208672""",50000,,"""RUR""","""a_744""","""ar_2""","""full""","""fullDay""","""noExperience"""


In [68]:
len(name_mapping)

18058

In [67]:
name_mapping = {key: value + 2 for value, key in enumerate(vac.group_by('name').count().filter(
    pl.col('count') > 10
)['name'].to_list())}

vac.select(
    pl.col('vacancy_id').str.slice(2).cast(pl.UInt64).add(1),
    pl.col('area.regionId').str.slice(3).cast(pl.UInt64).add(1),
    pl.col('area.id').str.slice(2).cast(pl.UInt64).add(1),
    pl.col('company.id').str.slice(2).cast(pl.UInt64).add(1),
    vac['workSchedule'].fill_null('fullDay').replace({
        "flexible": 1,
        "flyInFlyOut": 2,
        "shift": 3,
        "fullDay": 4,
        "remote": 5,
    }, return_dtype=pl.UInt64),
    vac['employment'].fill_null('full').replace({
        "full": 1,
        "project": 2,
        "volunteer": 3,
        "probation": 4,
        "part": 5,
    }, return_dtype=pl.UInt64),
    vac['workExperience'].fill_null('between1And3').replace({
        "between1And3": 1,
        "moreThan6": 2,
        "between3And6": 3,
        "noExperience": 4,
    }, return_dtype=pl.UInt64),
    vac['compensation.currencyCode'].fill_null('RUR').replace({
        "KGS": 1,
        "EUR": 2,
        "USD": 3,
        "AZN": 4,
        "BYR": 5,
        "UZS": 6,
        "UAH": 7,
        "GEL": 8,
        "KZT": 9,
        "RUR": 10,
    }, return_dtype=pl.UInt64),
    vac['name'].replace(name_mapping, return_dtype=pl.UInt64).fill_null(1),
)

vacancy_id,area.regionId,area.id,company.id,workSchedule,employment,workExperience,compensation.currencyCode,name
u64,u64,u64,u64,u64,u64,u64,u64,u64
862117,34,4762,162973,4,1,4,10,1
288643,3,745,208673,4,1,4,10,14604
1840055,79,6224,198110,4,1,3,10,1
2346233,52,4796,6138,4,1,4,10,1
312508,5,6838,206700,4,1,1,10,1
2323707,65,5921,244142,4,1,1,10,7020
704267,72,5399,192039,4,1,1,10,1888
525734,97,5044,5102,4,1,3,10,1
2091087,37,501,230891,2,1,1,10,4677
839658,45,2650,192371,4,1,4,10,1


In [3]:
train = pl.read_parquet('../data/hh_recsys_train_hh.pq')
test = pl.read_parquet('../data/hh_recsys_test_hh.pq')
# vacancies = pl.read_parquet('../data/hh_recsys_vacancies.pq')

In [4]:
'u_786220' in test['user_id'], 'u_786220' in train['user_id']

(True, True)

In [5]:
train = pl.read_parquet('../data/hh_recsys_train_hh.pq')
train = train.with_columns(
    session_end=pl.col('action_dt').list.max(),
)
train.head(2)

user_id,session_id,vacancy_id,action_type,action_dt,session_end
str,str,list[str],list[i64],list[datetime[ns]],datetime[ns]
"""u_332060""","""s_28301374""","[""v_2571684"", ""v_488179"", … ""v_2633899""]","[2, 2, … 2]","[2023-11-01 00:40:58.105, 2023-11-01 00:58:13.091, … 2023-11-01 01:35:54.456]",2023-11-01 01:50:26.158
"""u_1057881""","""s_33868982""","[""v_665861""]",[2],[2023-11-01 00:23:51.452],2023-11-01 00:23:51.452


In [6]:
%%time

targets = train.select(
    pl.col('session_id'),
    pl.col('vacancy_id'),
    pl.col('action_type'),
    pl.col('user_id'),
    pl.col('session_end'),
    pl.col('session_end').shift().over('user_id').alias('session_end_prev'),
).explode(
    ['vacancy_id', 'action_type'],
).filter(
    pl.col('action_type') == 1,
).filter(
    ~pl.col('session_end_prev').is_null()
).select(pl.exclude('action_type'))

targets.head(2)

CPU times: user 5.12 s, sys: 2.67 s, total: 7.78 s
Wall time: 3.46 s


session_id,vacancy_id,user_id,session_end,session_end_prev
str,str,str,datetime[ns],datetime[ns]
"""s_7884954""","""v_646272""","""u_639152""",2023-11-01 11:26:12.954,2023-11-01 01:24:42.259
"""s_7884954""","""v_1970993""","""u_639152""",2023-11-01 11:26:12.954,2023-11-01 01:24:42.259


In [73]:
%%time
features = train.filter(
    pl.col('user_id') == 'u_159379',
).sort('session_end').select(
    [
        pl.col('user_id').alias('user_id'),
        pl.col('session_id').alias('session_id'),
    ] + [pl.struct(
        pl.col('action_type'),
        pl.col('vacancy_id'),
        # pl.col('action_dt'),
        pl.col('action_dt').list.len().map_elements(
            lambda l: l,
            return_dtype=pl.List(pl.UInt32),
        ).alias('lag'),
    ).alias(f"lag_{i}").shift(i).over('user_id') for i in range(1, 6)],
).select(
    pl.exclude([f"lag_{i}" for i in range(1, 6)]),
    pl.concat_list([pl.col(f"lag_{i}").struct.field('lag').fill_null([]) for i in range(1, 6)[::-1]]).map_elements(
        lambda l: [i + 1 for i, j in enumerate(l) for _ in range(j)],
    ).alias("lag_hist"),
    pl.concat_list([pl.col(f"lag_{i}").struct.field('action_type').fill_null([]) for i in range(1, 6)[::-1]]).alias("action_type_hist"),
    pl.concat_list([pl.col(f"lag_{i}").struct.field('vacancy_id').fill_null([]) for i in range(1, 6)[::-1]]).alias("vacancy_id_hist"),
    # pl.concat_list([pl.col(f"lag_{i}").struct.field('action_dt').fill_null([]) for i in range(1, 6)]).alias("action_dt_hist"),
)
features.head(8)

CPU times: user 51 ms, sys: 4.13 ms, total: 55.1 ms
Wall time: 31.8 ms


user_id,session_id,lag_hist,action_type_hist,vacancy_id_hist
str,str,list[i64],list[i64],list[str]
"""u_159379""","""s_4190515""",[],[],[]
"""u_159379""","""s_27912075""","[1, 1, … 1]","[2, 2, … 2]","[""v_929376"", ""v_1346688"", … ""v_308738""]"
"""u_159379""","""s_20061977""","[1, 1, … 2]","[2, 2, … 2]","[""v_929376"", ""v_1346688"", … ""v_929376""]"
"""u_159379""","""s_28816753""","[1, 1, … 3]","[2, 2, … 2]","[""v_929376"", ""v_1346688"", … ""v_1045162""]"
"""u_159379""","""s_25981829""","[1, 1, … 4]","[2, 2, … 2]","[""v_929376"", ""v_1346688"", … ""v_1970484""]"
"""u_159379""","""s_3504209""","[1, 1, … 5]","[2, 2, … 2]","[""v_929376"", ""v_1346688"", … ""v_2337447""]"
"""u_159379""","""s_26799506""","[1, 1, … 5]","[2, 2, … 2]","[""v_1596417"", ""v_308738"", … ""v_308738""]"
"""u_159379""","""s_20850382""","[1, 1, … 5]","[2, 2, … 2]","[""v_929376"", ""v_1045162"", … ""v_2337447""]"


In [75]:
user_x_session_x_vac = features.explode(
    'lag_hist',
    'action_type_hist',
    'vacancy_id_hist',
).filter(~pl.col('lag_hist').is_null()).group_by(
    'user_id',
    'session_id',
    'vacancy_id_hist',
).agg(
    pl.col('lag_hist').min(),
    (pl.col('action_type_hist') == 1).sum().alias('action_1'),
    (pl.col('action_type_hist') == 2).sum().alias('action_2'),
    (pl.col('action_type_hist') == 3).sum().alias('action_3'),
)
session_x_vac.head(2)

user_id,session_id,vacancy_id_hist,lag_hist,action_1,action_2,action_3
str,str,str,i64,u32,u32,u32
"""u_159379""","""s_27912075""","""v_1596417""",1,0,1,0
"""u_159379""","""s_27912075""","""v_2457518""",1,0,1,0


In [98]:
user_x_session = features.explode(
    'lag_hist',
    'action_type_hist',
    'vacancy_id_hist',
).filter(
    ~pl.col('lag_hist').is_null()
).join(
    vac,
    left_on='vacancy_id_hist',
    right_on='vacancy_id',
).group_by(
    'user_id',
    'session_id',
).agg(
    pl.col('area.id').mode().first(),
    pl.col('area.regionId').mode().first(),
    pl.col('employment').mode().first(),
    pl.col('workSchedule').mode().first(),
    pl.col('workExperience').mode().first(),
    pl.col('name').mode().first(),
    pl.col('company.id').mode().first(),
    pl.col('compensation.currencyCode').mode().first(),
    pl.col('compensation.from').min().alias('compensation.from.min'),
    pl.col('compensation.from').max().alias('compensation.from.max'),
    pl.col('compensation.to').max().alias('compensation.to.max'),
    pl.col('compensation.to').min().alias('compensation.to.min'),
)

user_x_session.head()

user_id,session_id,area.id,area.regionId,employment,workSchedule,workExperience,name,company.id,compensation.currencyCode,compensation.from.min,compensation.from.max,compensation.to.max,compensation.to.min
str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64
"""u_159379""","""s_27662744""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3""","""Специалист слу…","""c_96070""","""RUR""",30000,55000,70000,45000
"""u_159379""","""s_26799506""","""a_1756""","""ar_41""","""full""","""fullDay""","""noExperience""","""Младший менедж…","""c_197879""","""RUR""",50000,90000,105000,50000
"""u_159379""","""s_14669609""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3""","""Специалист слу…","""c_96070""","""RUR""",30000,90000,105000,45000
"""u_159379""","""s_20061977""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3""","""Специалист слу…","""c_255843""","""RUR""",50000,110000,70000,50000
"""u_159379""","""s_520442""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3""","""Специалист слу…","""c_96070""","""RUR""",30000,90000,105000,45000


In [10]:
vacancies.drop(['description', 'keySkills.keySkill']).to_pandas().to_feather('../data/vac.no_desc.fth')

In [4]:
train = train.to_pandas()
test = test.to_pandas()
# vacancies = vacancies.to_pandas()

In [7]:
train[train['user_id'].apply(lambda i: hash(i) % 100 == 0)].to_feather('../data/train.fth')

In [6]:
train.head(2)

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,"[v_2571684, v_488179, v_2389179, v_1393783, v_...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, ...","[2023-11-01T00:40:58.105000000, 2023-11-01T00:..."
1,u_1057881,s_33868982,[v_665861],[2],[2023-11-01T00:23:51.452000000]


In [7]:
test.head(2)

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_482520,s_25018731,"[v_2597196, v_1223061, v_1223061]","[2, 2, 1]","[2023-11-19T12:03:13.089000000, 2023-11-19T12:..."
1,u_582132,s_481216,"[v_470400, v_470400, v_1530783]","[2, 1, 2]","[2023-11-21T15:39:47.981000000, 2023-11-21T15:..."


In [8]:
# vacancies.head(2)

In [None]:
predictions = test.select(pl.col('vacancy_id').list.unique(maintain_order=True).list.tail(2))['vacancy_id'].to_list()
test = test.with_columns(pl.lit(predictions).alias('predictions'))

test.select(['user_id', 'session_id', 'predictions']).write_parquet('../data/submission.pq')

In [24]:
test

user_id,session_id,vacancy_id,action_type,action_dt,predictions
str,str,list[str],list[i64],list[datetime[ns]],list[str]
"""u_482520""","""s_25018731""","[""v_2597196"", ""v_1223061"", ""v_1223061""]","[2, 2, 1]","[2023-11-19 12:03:13.089, 2023-11-19 12:03:30.396, 2023-11-19 12:05:03.473]","[""v_2597196"", ""v_1223061""]"
"""u_582132""","""s_481216""","[""v_470400"", ""v_470400"", ""v_1530783""]","[2, 1, 2]","[2023-11-21 15:39:47.981, 2023-11-21 15:43:57.620, 2023-11-21 15:44:37.982]","[""v_470400"", ""v_1530783""]"
"""u_212584""","""s_16918781""","[""v_1572055"", ""v_1572055"", … ""v_1329494""]","[2, 3, … 1]","[2023-11-16 08:41:47.031, 2023-11-16 08:45:21.403, … 2023-11-16 09:05:27.379]","[""v_1592343"", ""v_1329494""]"
"""u_425177""","""s_17505104""","[""v_1375331"", ""v_1922852""]","[2, 2]","[2023-11-17 12:42:18.513, 2023-11-17 12:43:52.744]","[""v_1375331"", ""v_1922852""]"
"""u_700997""","""s_15528830""","[""v_2152997"", ""v_2152997"", ""v_1217630""]","[2, 1, 2]","[2023-11-16 17:22:53.530, 2023-11-16 17:23:19.885, 2023-11-16 17:23:33.472]","[""v_2152997"", ""v_1217630""]"
"""u_687201""","""s_11187326""","[""v_2602959"", ""v_549337"", … ""v_212060""]","[2, 2, … 1]","[2023-11-21 14:41:30.546, 2023-11-21 15:00:25.506, … 2023-11-21 15:24:56.271]","[""v_698513"", ""v_212060""]"
"""u_573219""","""s_27394215""","[""v_1836184"", ""v_2733253"", … ""v_2297339""]","[1, 2, … 2]","[2023-11-21 18:48:43.424, 2023-11-21 18:49:12.552, … 2023-11-21 19:48:55.704]","[""v_1828011"", ""v_2297339""]"
"""u_1082782""","""s_30341164""","[""v_1026036"", ""v_1026036"", … ""v_206700""]","[2, 1, … 2]","[2023-11-17 02:54:19.561, 2023-11-17 02:54:50.486, … 2023-11-17 03:21:22.008]","[""v_206700"", ""v_528671""]"
"""u_81444""","""s_1909964""","[""v_2497384"", ""v_1480870"", … ""v_1500697""]","[2, 2, … 1]","[2023-11-15 12:38:34.579, 2023-11-15 13:03:41.360, … 2023-11-15 13:05:15.916]","[""v_36093"", ""v_1500697""]"
"""u_858357""","""s_12463318""","[""v_1085881"", ""v_522338"", … ""v_1776027""]","[2, 2, … 2]","[2023-11-18 11:58:33.953, 2023-11-18 12:01:09.562, … 2023-11-18 14:39:24.255]","[""v_2170682"", ""v_93799""]"


# Simple ALS

In [25]:
unique_users = train['user_id'].unique().to_list()
unique_vacancies = train['vacancy_id'].explode().unique().to_list()

user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
vac2idx = {vac_id: idx for idx, vac_id in enumerate(unique_vacancies)}

idx2vac = {idx: vac_id for vac_id, idx in vac2idx.items()}
action_weights = {
    1: 4.0,
    2: 1.0,
    3: 2.0
}

pairs = train.select(['user_id', 'vacancy_id', 'action_type']).explode(('vacancy_id', 'action_type'))
users = pairs['user_id'].map_dict(user2idx).to_numpy()
vacancies = pairs['vacancy_id'].map_dict(vac2idx).to_numpy()
preferences = pairs['action_type'].map_dict(action_weights).to_numpy()

  users = pairs['user_id'].map_dict(user2idx).to_numpy()
  vacancies = pairs['vacancy_id'].map_dict(vac2idx).to_numpy()
  preferences = pairs['action_type'].map_dict(action_weights).to_numpy()


In [33]:
pairs

user_id,vacancy_id,action_type
str,str,i64
"""u_332060""","""v_2571684""",2
"""u_332060""","""v_488179""",2
"""u_332060""","""v_2389179""",2
"""u_332060""","""v_1393783""",2
"""u_332060""","""v_2608935""",2
"""u_332060""","""v_1119127""",2
"""u_332060""","""v_323088""",1
"""u_332060""","""v_794390""",1
"""u_332060""","""v_1020162""",2
"""u_332060""","""v_476655""",1


In [26]:
uv_mat = csr_matrix((preferences, (users, vacancies)))

als_model = implicit.als.AlternatingLeastSquares(
    factors=150,
    random_state=RANDOM_STATE,
    iterations=20,
    calculate_training_loss=True,
    regularization=0.001
)
als_model.fit(uv_mat)

100%|██████████| 20/20 [00:24<00:00,  1.21s/it, loss=2.14e-5]


In [27]:
als_model.user_factors.shape, als_model.item_factors.shape

((882409, 150), (1458353, 150))

In [28]:
test_users = test['user_id'].to_list()
test_vacancies = test.select(pl.col('vacancy_id').list.unique(maintain_order=True))['vacancy_id'].to_list()

In [30]:
predictions = []

for user, vacs in tqdm(zip(test_users, test_vacancies), total=len(test_users)):
    if user not in user2idx:
        predictions.append(vacs)
        continue
    cuser = user2idx[user]

    recommendations = als_model.recommend(cuser, uv_mat[cuser], N=N_PREDICTIONS)[0]
    recommendations = [idx2vac[cv] for cv in recommendations]
    predictions.append(recommendations)

100%|██████████| 83189/83189 [02:21<00:00, 588.66it/s]


In [31]:
test = test.with_columns(pl.lit(predictions).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('als_submission.pq')

  test = test.with_columns(pl.lit(predictions).alias('predictions'))


In [32]:
test

user_id,session_id,vacancy_id,action_type,action_dt,predictions
str,str,list[str],list[i64],list[datetime[ns]],list[str]
"""u_482520""","""s_25018731""","[""v_2597196"", ""v_1223061"", ""v_1223061""]","[2, 2, 1]","[2023-11-19 12:03:13.089, 2023-11-19 12:03:30.396, 2023-11-19 12:05:03.473]","[""v_2650515"", ""v_367301"", … ""v_2337447""]"
"""u_582132""","""s_481216""","[""v_470400"", ""v_470400"", ""v_1530783""]","[2, 1, 2]","[2023-11-21 15:39:47.981, 2023-11-21 15:43:57.620, 2023-11-21 15:44:37.982]","[""v_1347362"", ""v_1798396"", … ""v_726002""]"
"""u_212584""","""s_16918781""","[""v_1572055"", ""v_1572055"", … ""v_1329494""]","[2, 3, … 1]","[2023-11-16 08:41:47.031, 2023-11-16 08:45:21.403, … 2023-11-16 09:05:27.379]","[""v_1572055"", ""v_953153"", … ""v_1329494""]"
"""u_425177""","""s_17505104""","[""v_1375331"", ""v_1922852""]","[2, 2]","[2023-11-17 12:42:18.513, 2023-11-17 12:43:52.744]","[""v_2650515"", ""v_367301"", … ""v_676801""]"
"""u_700997""","""s_15528830""","[""v_2152997"", ""v_2152997"", ""v_1217630""]","[2, 1, 2]","[2023-11-16 17:22:53.530, 2023-11-16 17:23:19.885, 2023-11-16 17:23:33.472]","[""v_1431178"", ""v_2395102"", … ""v_1352210""]"
"""u_687201""","""s_11187326""","[""v_2602959"", ""v_549337"", … ""v_212060""]","[2, 2, … 1]","[2023-11-21 14:41:30.546, 2023-11-21 15:00:25.506, … 2023-11-21 15:24:56.271]","[""v_1465910"", ""v_1586766"", … ""v_668735""]"
"""u_573219""","""s_27394215""","[""v_1836184"", ""v_2733253"", … ""v_2297339""]","[1, 2, … 2]","[2023-11-21 18:48:43.424, 2023-11-21 18:49:12.552, … 2023-11-21 19:48:55.704]","[""v_701974"", ""v_1081658"", … ""v_27909""]"
"""u_1082782""","""s_30341164""","[""v_1026036"", ""v_1026036"", … ""v_206700""]","[2, 1, … 2]","[2023-11-17 02:54:19.561, 2023-11-17 02:54:50.486, … 2023-11-17 03:21:22.008]","[""v_964765"", ""v_460169"", … ""v_9584""]"
"""u_81444""","""s_1909964""","[""v_2497384"", ""v_1480870"", … ""v_1500697""]","[2, 2, … 1]","[2023-11-15 12:38:34.579, 2023-11-15 13:03:41.360, … 2023-11-15 13:05:15.916]","[""v_1431178"", ""v_2395102"", … ""v_2192945""]"
"""u_858357""","""s_12463318""","[""v_1085881"", ""v_522338"", … ""v_1776027""]","[2, 2, … 2]","[2023-11-18 11:58:33.953, 2023-11-18 12:01:09.562, … 2023-11-18 14:39:24.255]","[""v_1840884"", ""v_1507795"", … ""v_1876516""]"
