In [1]:
import numpy as np
import pandas as pd
from rectools import Columns

from rectools.dataset import Dataset

## Loading data

In [2]:
# NOTE(review): this reassigns an attribute on the imported rectools `Columns`
# object so that every later `Columns.Datetime` lookup resolves to the
# 'last_watch_dt' column of the interactions file. It is a process-wide change
# to the library object; renaming the column instead would be less invasive —
# confirm before changing, since later cells rely on this name.
Columns.Datetime = 'last_watch_dt'

# Read the three raw tables in one pass; paths are relative to the notebook.
interactions, users, items = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/interactions.csv', '../data/users.csv', '../data/items.csv')
)

# Preview the first rows of the interactions table.
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [3]:
# Remove rows whose date string is not exactly 10 characters long (the length
# of a "YYYY-MM-DD" value) so that the strict parse below cannot fail on them.
malformed_date_rows = interactions[interactions[Columns.Datetime].str.len() != 10].index
interactions = interactions.drop(index=malformed_date_rows)

# Parse the remaining date strings with an explicit format.
parsed_dates = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
interactions[Columns.Datetime] = parsed_dates

# Interaction weight: 3 when more than 10% of the item was watched, otherwise 1.
watched_enough = interactions["watched_pct"] > 10
interactions[Columns.Weight] = np.where(watched_enough, 3, 1)

In [4]:
def get_user_features(users, data, features=("sex", "age", "income")):
    """Build a flattened (long-format) user feature table.

    Only users whose id (column ``Columns.User``) occurs in ``data`` are kept.
    For every requested feature one sub-frame with the columns
    ``id`` / ``value`` / ``feature`` is produced, and the sub-frames are
    stacked in the order the features were given.

    Parameters
    ----------
    users : pd.DataFrame
        User table containing ``Columns.User`` and, ideally, one column per
        requested feature. A feature column that is absent from ``users`` is
        not an error: ``reindex`` creates it filled with missing values.
    data : pd.DataFrame
        Frame with a ``Columns.User`` column; it defines which users to keep.
    features : iterable of str, optional
        Names of the user columns to flatten. Defaults to
        ``("sex", "age", "income")``.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``value``, ``feature``. The original row labels of
        ``users`` are preserved (and therefore repeated once per feature).
        An empty frame with these columns is returned when ``features`` is
        empty.
    """
    is_known_user = users[Columns.User].isin(data[Columns.User])
    selected_users = users.loc[is_known_user]

    feature_frames = []
    for feature in features:
        # reindex returns a fresh two-column frame, so the edits below never
        # touch the caller's `users` table.
        feature_frame = selected_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        feature_frames.append(feature_frame)

    if not feature_frames:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=["id", "value", "feature"])
    return pd.concat(feature_frames)

In [5]:
def get_item_features(items, data):
    """Build a flattened (long-format) item feature table.

    Only items whose id (column ``Columns.Item``) occurs in ``data`` are kept.
    Two features are produced and stacked, genres first:

    * ``genre`` - the comma-separated ``genres`` string is lower-cased and
      split, giving one row per (item, genre) pair. Whitespace around each
      genre token is stripped, so ``"a, b"``, ``"a,b"`` and ``"a ,  b"`` all
      give the same values.
    * ``content_type`` - the item's ``content_type`` column, one row per item
      (filled with missing values if the column is absent).

    Parameters
    ----------
    items : pd.DataFrame
        Item table with ``Columns.Item`` and ``genres`` columns. It is not
        modified.
    data : pd.DataFrame
        Frame with a ``Columns.Item`` column; it defines which items to keep.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``value``, ``feature``. Row labels of ``items`` are
        preserved (repeated for multi-genre items). An item with a missing
        ``genres`` value keeps a single ``genre`` row whose value is missing.
    """
    is_known_item = items[Columns.Item].isin(data[Columns.Item])
    items = items.loc[is_known_item]

    # One list of genre tokens per item; explode turns it into one row per token.
    genre_lists = items["genres"].str.lower().str.split(",")
    genre_feature = items[[Columns.Item]].assign(value=genre_lists).explode("value")
    genre_feature.columns = ["id", "value"]
    # Strip after exploding so any spacing around the commas is normalized.
    genre_feature["value"] = genre_feature["value"].str.strip()
    genre_feature["feature"] = "genre"

    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"

    return pd.concat((genre_feature, content_feature))

In [6]:
# Earliest and latest interaction dates; `max_date` is reused by the
# hold-out filter in the next cell.
interaction_dates = interactions[Columns.Datetime]
min_date, max_date = interaction_dates.min(), interaction_dates.max()

# Report the covered period (labels are kept in the notebook's original language).
print(
    f"min дата в interactions: {min_date}",
    f"max дата в interactions: {max_date}",
    f"Продолжительность: {max_date - min_date}",
    sep="\n",
)

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [7]:
ranker_days_count = 30

# Keep only interactions strictly older than the last `ranker_days_count`
# days of the log; the most recent window is excluded from this notebook's
# training data (presumably reserved for a downstream ranker — TODO confirm).
holdout_start = max_date - pd.Timedelta(days=ranker_days_count)
is_before_holdout = interactions[Columns.Datetime] < holdout_start
interactions = interactions.loc[is_before_holdout]

In [8]:
# Flattened feature tables, restricted to the users/items that still appear
# in the filtered interactions.
user_features = get_user_features(users=users, data=interactions)
item_features = get_item_features(items=items, data=interactions)

In [9]:
# Every feature produced by the helper functions is declared categorical.
categorical_user_features = ["sex", "age", "income"]
categorical_item_features = ["genre", "content_type"]

# Bundle interactions and both feature tables into a single RecTools dataset.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    item_features_df=item_features,
    cat_user_features=categorical_user_features,
    cat_item_features=categorical_item_features,
)

## LightFM training

In [None]:
from lightfm import LightFM
from rectools.models import LightFMWrapperModel

# Underlying LightFM estimator: 30 latent components trained with the WARP loss.
# NOTE(review): the alpha values look like the result of a hyperparameter
# search — record their provenance if known.
lightfm_estimator = LightFM(
    no_components=30,
    loss='warp',
    learning_rate=0.005,
    user_alpha=0.12482318873553576,
    item_alpha=0.10577830153990038,
    random_state=42,
)

# RecTools wrapper that carries the training settings (7 epochs, 12 threads).
LFM_model = LightFMWrapperModel(lightfm_estimator, epochs=7, num_threads=12)

In [12]:
# Train the wrapped LightFM model on the dataset built above; the call's
# return value (the model object) is what the cell displays.
LFM_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f346948ca60>

In [13]:
top_N = 50

# Request `top_N` recommendations for every user known to the dataset,
# with already-viewed items filtered out.
all_dataset_users = dataset.user_id_map.external_ids
candidates = LFM_model.recommend(
    users=all_dataset_users,
    dataset=dataset,
    k=top_N,
    filter_viewed=True,
)

In [14]:
# Prefix the generic rank/score columns with the model name.
lfm_column_names = {"rank": "lfm_rank", "score": "lfm_score"}
candidates = candidates.rename(columns=lfm_column_names)

# Display only: the sorted view is not assigned back to `candidates`.
candidates.sort_values(by='lfm_score', ascending=False)

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank
27605400,92080,15297,0.000009,1
30033650,851632,15297,0.000009,1
14855450,729572,15297,0.000009,1
21617400,557144,15297,0.000009,1
18450650,664721,15297,0.000009,1
...,...,...,...,...
12455649,537179,6626,-0.000043,50
26333499,404605,12981,-0.000043,50
13177599,613447,12981,-0.000043,50
17090599,333598,1449,-0.000043,50


In [15]:
# Persist the candidate list next to the notebook.
# NOTE(review): with default arguments `to_csv` also writes the frame index as
# an unnamed first column; pass index=False only after confirming that
# downstream readers do not rely on that column.
candidates_path = 'lfm_candidates.csv'
candidates.to_csv(candidates_path)