In [1]:
import polars as pl

In [2]:
# Shell escape: list the entries of ../data whose names contain "hh_".
!ls ../data | grep hh_

hh_recsys_sample.pq
hh_recsys_test_hh.pq
hh_recsys_train_hh.pq
hh_recsys_vacancies.pq


In [4]:
# Stack the train and test logs (train first) into a single frame.
# As the preview shows, each row still holds a whole session with list-typed columns.
log_paths = (
    '../data/hh_recsys_train_hh.pq',
    '../data/hh_recsys_test_hh.pq',
)
compact_log = pl.concat([pl.read_parquet(path) for path in log_paths])
compact_log.head(2)

user_id,session_id,vacancy_id,action_type,action_dt
str,str,list[str],list[i64],list[datetime[ns]]
"""u_332060""","""s_28301374""","[""v_2571684"", ""v_488179"", … ""v_2633899""]","[2, 2, … 2]","[2023-11-01 00:40:58.105, 2023-11-01 00:58:13.091, … 2023-11-01 01:35:54.456]"
"""u_1057881""","""s_33868982""","[""v_665861""]",[2],[2023-11-01 00:23:51.452]


In [5]:
# Keep only the vacancy and timestamp lists and unnest them together,
# producing one row per (vacancy_id, action_dt) pair.
exploded_cols = ['vacancy_id', 'action_dt']
log = compact_log.select(exploded_cols).explode(exploded_cols)
log.head(2)

vacancy_id,action_dt
str,datetime[ns]
"""v_2571684""",2023-11-01 00:40:58.105
"""v_488179""",2023-11-01 00:58:13.091


In [42]:
# Shape of the series of distinct calendar days present in the log.
log.get_column('action_dt').cast(pl.Date).unique().shape

(21,)

In [65]:
# (20/21) raised to the 62nd power.
# NOTE(review): 21 matches the number of distinct log days shown by the previous cell.
# This looks like the chance that 62 actions spread uniformly over those days all miss
# one particular day — TODO confirm the intent and where 62 comes from.
(20/21)**62

0.048558298182761084

In [17]:
# Most recent calendar day that appears anywhere in the log.
last_log_dt = log.select(pl.col('action_dt').cast(pl.Date).max()).item()
last_log_dt

datetime.date(2023, 11, 21)

In [27]:
# Per-vacancy summary: the date of its latest logged action (column keeps the
# name `action_dt`) and its total number of logged actions, busiest first.
latest_action_day = pl.col('action_dt').cast(pl.Date).max()
action_total = pl.count().alias('n_action')

n_action = (
    log
    .group_by('vacancy_id')
    .agg(latest_action_day, action_total)
    .sort('n_action', descending=True)
)

n_action.head(10)

vacancy_id,action_dt,n_action
str,date,u32
"""v_460169""",2023-11-21,13746
"""v_964765""",2023-11-21,6798
"""v_1840884""",2023-11-21,6687
"""v_2691293""",2023-11-16,6622
"""v_1481785""",2023-11-10,5779
"""v_1507795""",2023-11-21,5259
"""v_2337447""",2023-11-21,4540
"""v_745592""",2023-11-08,4537
"""v_690752""",2023-11-13,4441
"""v_2730052""",2023-11-21,4201


In [58]:
# Vacancies with more than 95 logged actions whose latest action
# falls on the final day of the log.
(
    n_action
    .filter(pl.col('n_action') > 95)
    .filter(pl.col('action_dt') == last_log_dt)
)

vacancy_id,action_dt,n_action
str,date,u32
"""v_460169""",2023-11-21,13746
"""v_964765""",2023-11-21,6798
"""v_1840884""",2023-11-21,6687
"""v_1507795""",2023-11-21,5259
"""v_2337447""",2023-11-21,4540
"""v_2730052""",2023-11-21,4201
"""v_402043""",2023-11-21,4058
"""v_1075856""",2023-11-21,4034
"""v_2507405""",2023-11-21,3959
"""v_2557303""",2023-11-21,3506


In [73]:
# Vacancies with more than 95 logged actions but none on the final log day.
# NOTE(review): the name suggests these are treated as removed/closed vacancies —
# presumably a heuristic; confirm the 95 threshold.
has_many_actions = pl.col('n_action') > 95
silent_on_last_day = pl.col('action_dt') != last_log_dt

deleted = n_action.filter(has_many_actions & silent_on_last_day)
deleted.head(5)

vacancy_id,action_dt,n_action
str,date,u32
"""v_2691293""",2023-11-16,6622
"""v_1481785""",2023-11-10,5779
"""v_745592""",2023-11-08,4537
"""v_690752""",2023-11-13,4441
"""v_1455671""",2023-11-19,3946


In [74]:
# Flatten the v3 prediction lists into one row per (user, predicted vacancy),
# with `idx` giving the position of the vacancy inside the list.
# NOTE(review): the literal 0..99 list assumes every `predictions` list holds exactly
# 100 items, so both list columns explode in lockstep — TODO confirm for this file.
list_position = pl.lit(list(range(100)), dtype=pl.List(pl.Int64)).alias('idx')

cp = (
    pl.read_parquet('../data/catboost_predictions_v3.pq')
    .select(
        pl.col('predictions').alias('vacancy_id'),
        list_position,
        'user_id',
    )
    .explode('vacancy_id', 'idx')
)
cp.head()

vacancy_id,idx,user_id
str,i64,str
"""v_622811""",0,"""u_775363"""
"""v_2668345""",1,"""u_775363"""
"""v_390512""",2,"""u_775363"""
"""v_2530865""",3,"""u_775363"""
"""v_1783814""",4,"""u_775363"""


In [80]:
# Among users whose predictions contain at least one vacancy from `deleted`,
# take the smallest list position of such a vacancy per user, then average over users.
(
    cp
    .join(deleted, on='vacancy_id')
    .group_by('user_id')
    .min()
    .get_column('idx')
    .mean()
)

9.56227820045076

In [69]:
# (rows, columns) of the flattened prediction frame.
# NOTE(review): the recorded output reports 2 columns, while the cell that defines `cp`
# selects 3 (vacancy_id, idx, user_id) — the output looks stale; re-run after a restart.
cp.shape

(8318900, 2)

In [56]:
# NOTE(review): the denominator equals the row count recorded for `cp.shape`; the origin
# of the numerator 1389875 is not visible in this notebook — TODO replace both literals
# with values computed from the frames so the ratio survives a re-run.
1389875 / 8318900

0.16707437281371335

In [139]:
# Load two prediction files side by side: `o` holds the v3 file, `n` the v4 file.
o, n = (
    pl.read_parquet('../data/catboost_predictions_v3.pq'),
    pl.read_parquet('../data/catboost_predictions_v4.pq'),
)


In [140]:
# Share of joined rows (matched on user_id) where the first predicted vacancy is
# the same in both files; after the join, `o`'s list column is `predictions_right`.
first_in_n = pl.col('predictions').list.first()
first_in_o = pl.col('predictions_right').list.first()

n.join(o, on='user_id').select(first_in_n == first_in_o).mean()

predictions
f64
0.453113


In [141]:
# Length of the shortest `predictions` list in `n`.
prediction_list_len = pl.col('predictions').list.len()
n.select(prediction_list_len).min()

predictions
u32
100


In [121]:
%%time
stats = log.select(
    'vacancy_id',
    pl.col('action_dt').cast(pl.Date).alias('dt')
).group_by(
    'vacancy_id',
    'dt',
).count().sort('dt').select(
    pl.col('vacancy_id'),
    pl.col('dt'),
    pl.col('count').cum_sum().over('vacancy_id').alias('vacancy_views'),
    pl.col('count').alias('vacancy_views_last_day'),
    (pl.col('count') / pl.col('count').cum_sum().over('vacancy_id')).alias('vacancy_views_last_day_share'),
).sort('vacancy_id', 'dt')

stats.head()

CPU times: user 29.6 s, sys: 3.04 s, total: 32.7 s
Wall time: 5.12 s


vacancy_id,dt,vacancy_views,vacancy_views_last_day,vacancy_views_last_day_share
str,date,u32,u32,f64
"""v_0""",2023-11-06,3,3,1.0
"""v_0""",2023-11-07,5,2,0.4
"""v_0""",2023-11-09,6,1,0.166667
"""v_0""",2023-11-10,8,2,0.25
"""v_0""",2023-11-13,9,1,0.111111


In [111]:
# Re-display the last log date (assigned in an earlier cell).
last_log_dt

datetime.date(2023, 11, 21)

In [135]:
import datetime
# Load the final training dataset; later cells refer to it by the short name `d`.
d = pl.read_parquet('../data/final_train_dataset.pq')

In [138]:
# Integer timestamps for the `dt` column, using the method's default time unit
# (the recorded values have the magnitude of microseconds since the epoch).
d.get_column('dt').dt.timestamp()

dt
i64
1700438400000000
1699401600000000
1699401600000000
1700438400000000
1699920000000000
1699833600000000
1700006400000000
1699833600000000
1699833600000000
1700438400000000


In [131]:
# Fraction of null values in every column of the training dataset.
null_flags = d.select(pl.all().is_null())
null_flags.mean()

user_id,vacancy_id,likes_count,applies_count,views_count,likes_back,applies_back,views_back,score,score_pos,name,company.id,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience,dt,vacancy_since_action,vacancy_actions,vacancy_actions_last_day,vacancy_actions_last_day_share,eq_name,eq_company.id,eq_area.id,eq_employment,eq_workSchedule,eq_workExperience,sessions_back,is_test,target
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321364,0.544293,0.246873,0.0,0.089349,0.0,0.0,0.0,0.0,0.182216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
# Preview the DSSM training frame with an extra `dt` column holding the
# calendar date of `session_end`.
session_day = pl.col('session_end').cast(pl.Date).alias('dt')
dssm_with_day = pl.read_parquet('../data/dssm_train.pq').select(pl.all(), session_day)
dssm_with_day.head()

user_id,n_sessions,vacancy_id,action_type,is_test,session_id,session_end,target,dt
str,u32,list[u64],list[i64],bool,str,datetime[ns],u64,date
"""u_237129""",2,"[1481580, 208271, … 1606798]","[2, 2, … 2]",False,"""s_361359""",2023-11-13 12:08:36.976,716724,2023-11-13
"""u_237129""",2,"[1481580, 208271, … 1606798]","[2, 2, … 2]",False,"""s_361359""",2023-11-13 12:08:36.976,690666,2023-11-13
"""u_864547""",4,"[1026937, 937429, … 11453]","[2, 1, … 2]",False,"""s_706889""",2023-11-10 12:24:02.680,1402024,2023-11-10
"""u_864547""",4,"[1026937, 937429, … 11453]","[2, 1, … 2]",False,"""s_706889""",2023-11-10 12:24:02.680,444323,2023-11-10
"""u_67865""",12,"[1902157, 1389016, … 525654]","[2, 2, … 2]",True,"""s_10270673""",2023-11-21 10:33:14.025,2705314,2023-11-21
