In [20]:
import dill
import warnings
import numpy as np
import pandas as pd
from rectools.columns import Columns
from rectools.dataset import Dataset
from rectools.models import PopularModel
from rectools.models.lightfm import LightFMWrapperModel
warnings.filterwarnings('ignore')

In [34]:
# Load the raw interaction log and parse the watch date into a datetime column.
# pd.to_datetime is used instead of `.astype(np.datetime64)`: casting to a
# unit-less datetime64 dtype is rejected by recent pandas versions.
interactions = pd.read_csv("interactions.csv")
interactions["last_watch_dt"] = pd.to_datetime(interactions["last_watch_dt"])
interactions

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


In [35]:
# Fallback values for interaction columns of rows that have no real interaction:
# the median watch date of the raw log, unit weight, and zero watch
# percentage / duration.
interactions_default_values = dict(
    [
        ("last_watch_dt", interactions["last_watch_dt"].median()),
        (Columns.Weight, 1),
        ("watched_pct", 0),
        ("total_dur", 0),
    ]
)
interactions_default_values

{'last_watch_dt': Timestamp('2021-07-01 00:00:00'),
 'weight': 1,
 'watched_pct': 0,
 'total_dur': 0}

In [2]:
# Restore the serialized Dataset and expose its interactions table.
# NOTE: dill.load (like pickle) can execute arbitrary code while
# deserializing; only load files that come from a trusted source.
with open("dataset.dill", "rb") as file:
    dataset: Dataset = dill.load(file)
train_interactions = dataset.interactions.df
train_interactions

Unnamed: 0,user_id,item_id,weight,last_watch_dt
0,0,0,3.0,2021-05-11
1,1,1,3.0,2021-05-29
3,2,2,3.0,2021-07-05
4,3,0,3.0,2021-04-30
5,4,3,3.0,2021-05-13
...,...,...,...,...
5476242,233267,46,3.0,2021-04-21
5476244,54448,166,3.0,2021-08-02
5476245,173679,100,1.0,2021-05-12
5476247,165659,2141,3.0,2021-04-13


In [3]:
# Restore the held-out test interactions (a pandas DataFrame).
# NOTE: dill.load (like pickle) can execute arbitrary code while
# deserializing; only load files that come from a trusted source.
with open("test_interactions.dill", "rb") as file:
    test_interactions: pd.DataFrame = dill.load(file)
test_interactions

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
9,203219,13582,2021-08-22,6975,100.0,3
54,200197,9335,2021-08-16,83,2.0,1
64,73446,14488,2021-08-19,6011,100.0,3
84,10010,512,2021-08-15,14,0.0,1
94,890735,14200,2021-08-16,1179,28.0,3
...,...,...,...,...,...,...
5476169,589589,983,2021-08-21,2403,43.0,3
5476188,590892,8618,2021-08-21,1335,23.0,3
5476191,857162,12360,2021-08-16,11,0.0,1
5476201,273558,10605,2021-08-21,34030,100.0,3


In [4]:
# Distinct user ids present in the test interactions; recommendations are
# generated for exactly these users.
TEST_USERS = test_interactions.loc[:, Columns.User].unique()
TEST_USERS

array([203219, 200197,  73446, ..., 623792, 442859, 857162], dtype=int64)

In [5]:
# Restore the previously serialized LightFM wrapper model.
# NOTE: dill.load (like pickle) can execute arbitrary code while
# deserializing; only load files that come from a trusted source.
with open("LightFM_warp_8_0784.dill", "rb") as file:
    lightfm_model: LightFMWrapperModel = dill.load(file)
print(lightfm_model)

<rectools.models.lightfm.LightFMWrapperModel object at 0x00000245CF72BCA0>


In [6]:
# Number of candidate items requested per user from each recommender.
n_candidates: int = 100

In [40]:
# Candidate generation, model 1: top-`n_candidates` LightFM recommendations
# for every test user, requested with filter_viewed=True.
lightfm_recos = lightfm_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=n_candidates,
    filter_viewed=True,
)
lightfm_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,203219,15297,3.580714,1
1,203219,10440,3.536684,2
2,203219,13865,3.146872,3
3,203219,4151,3.101431,4
4,203219,2657,3.067135,5


In [8]:
# Candidate generation, model 2: a popularity baseline fitted on the same
# dataset. The trailing semicolon suppresses the cell's output.
popular_model = PopularModel()
popular_model.fit(dataset);

In [9]:
# Top-`n_candidates` popular items for every test user, requested with
# filter_viewed=True (same settings as the LightFM candidates).
popular_recos = popular_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=n_candidates,
    filter_viewed=True,
)
popular_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,203219,10440,159230.0,1
1,203219,15297,158315.0,2
2,203219,13865,96838.0,3
3,203219,9728,92359.0,4
4,203219,4151,70650.0,5


In [53]:
# Union of both candidate lists, one row per (user, item) pair.
# After the merge, `score_x` / `rank_x` come from LightFM and
# `score_y` / `rank_y` from the popular model; a pair proposed by only one
# model has NaN in the other model's columns.
combined_recos = (
    lightfm_recos
    .merge(popular_recos, on=["user_id", "item_id"], how="outer")
    .sort_values(["user_id", "item_id"])
)
combined_recos

Unnamed: 0,user_id,item_id,score_x,rank_x,score_y,rank_y
5827259,1,14,-227.010579,60.0,5667.0,73.0
13091046,1,24,,,4861.0,92.0
5827252,1,101,-226.932782,53.0,6661.0,58.0
5827206,1,142,-225.937359,7.0,35213.0,8.0
5827261,1,657,-227.025621,62.0,6490.0,59.0
...,...,...,...,...,...,...
12102109,1097544,16270,,,5780.0,69.0
2592690,1097544,16291,-228.083697,91.0,,
2592657,1097544,16361,-227.835225,58.0,5336.0,80.0
2592689,1097544,16447,-228.083070,90.0,,


In [69]:
# NOTE(review): disabled experiment. It fills the missing score/rank values
# directly on `combined_recos`, per user: score -> (user's min score - 0.01),
# rank -> (user's max rank + 1). The saved output of this cell presumably
# comes from an earlier run with the code enabled. Consider deleting the cell
# if the fill is only needed after merging with the test interactions.
# combined_recos_filled = combined_recos.copy(deep=True)
# for prefix in ["x", "y"]:
#     combined_recos_filled[f"score_{prefix}"] = combined_recos.groupby("user_id")[f"score_{prefix}"].apply(lambda x: x.fillna(x.min() - 0.01))
#     combined_recos_filled[f"rank_{prefix}"] = combined_recos.groupby("user_id")[f"rank_{prefix}"].apply(lambda x: x.fillna(x.max() + 1))
# combined_recos_filled

Unnamed: 0,user_id,item_id,score_x,rank_x,score_y,rank_y
5827259,1,14,-227.010579,60.0,5667.00,73.0
13091046,1,24,-227.304226,101.0,4861.00,92.0
5827252,1,101,-226.932782,53.0,6661.00,58.0
5827206,1,142,-225.937359,7.0,35213.00,8.0
5827261,1,657,-227.025621,62.0,6490.00,59.0
...,...,...,...,...,...,...
12102109,1097544,16270,-228.128515,101.0,5780.00,69.0
2592690,1097544,16291,-228.083697,91.0,4503.99,101.0
2592657,1097544,16361,-227.835225,58.0,5336.00,80.0
2592689,1097544,16447,-228.083070,90.0,4503.99,101.0


In [72]:
# Join the test interactions with the candidates of both models, one row per
# (user, item) pair. The outer join keeps test pairs that no model proposed
# as well as candidates the user never interacted with.
# TODO(review): the author was unsure whether this should be a right join.
df = pd.merge(
    test_interactions,
    combined_recos,
    how='outer',
    on=['user_id', 'item_id']
)

# Fill missing model outputs per user: a missing score becomes slightly worse
# than that user's worst known score, a missing rank becomes one past the
# user's worst known rank. `transform` returns values aligned to df's index,
# so the fill is vectorized and does not depend on how `groupby.apply`
# shapes its result index.
for prefix in ["x", "y"]:
    score_col = f"score_{prefix}"
    rank_col = f"rank_{prefix}"
    df[score_col] = df[score_col].fillna(
        df.groupby("user_id")[score_col].transform("min") - 0.01
    )
    df[rank_col] = df[rank_col].fillna(
        df.groupby("user_id")[rank_col].transform("max") + 1
    )

default_values = {
    # It is important to reuse the same default values for the interaction
    # columns here, so that no leakage is introduced.
    **interactions_default_values,
}
# Rows that came only from the candidate lists have no interaction data;
# give them the defaults (non-inplace so the assignment is explicit).
df = df.fillna(default_values)
df

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight,score_x,rank_x,score_y,rank_y
0,203219,13582,2021-08-22,6975.0,100.0,3.0,1.721243,101.0,4503.99,101.0
1,200197,9335,2021-08-16,83.0,2.0,1.0,-226.284075,101.0,4503.99,101.0
2,73446,14488,2021-08-19,6011.0,100.0,3.0,-206.203359,101.0,4088.99,101.0
3,10010,512,2021-08-15,14.0,0.0,1.0,-205.246856,101.0,4432.99,101.0
4,890735,14200,2021-08-16,1179.0,28.0,3.0,-160.690570,101.0,4503.99,101.0
...,...,...,...,...,...,...,...,...,...,...
14942813,1097544,16270,2021-07-01,0.0,0.0,1.0,-228.128515,101.0,5780.00,69.0
14942814,1097544,16291,2021-07-01,0.0,0.0,1.0,-228.083697,91.0,4503.99,101.0
14942815,1097544,16361,2021-07-01,0.0,0.0,1.0,-227.835225,58.0,5336.00,80.0
14942816,1097544,16447,2021-07-01,0.0,0.0,1.0,-228.083070,90.0,4503.99,101.0


In [38]:
# Spot-check: all rows (test interactions and candidates) of one test user.
df.loc[df[Columns.User] == 203219]

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight,score,rank
0,203219,13582,2021-08-22,6975.0,100.0,3.0,-229.412933,101.0
2410,203219,13865,2021-08-20,7333.0,100.0,3.0,3.146872,3.0
15225,203219,9996,2021-08-21,6.0,0.0,1.0,2.673163,11.0
92317,203219,14814,2021-08-20,7.0,0.0,1.0,-229.412933,101.0
119054,203219,4976,2021-08-19,8768.0,100.0,3.0,-229.412933,101.0
...,...,...,...,...,...,...,...,...
333119,203219,8618,2021-07-01,0.0,0.0,1.0,1.749395,96.0
333120,203219,5803,2021-07-01,0.0,0.0,1.0,1.749230,97.0
333121,203219,13955,2021-07-01,0.0,0.0,1.0,1.748001,98.0
333122,203219,341,2021-07-01,0.0,0.0,1.0,1.745352,99.0
