In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes (handy while editing the library).
%load_ext autoreload
%autoreload 2

In [2]:
from replay.session_handler import State

# Take the session object exposed by replay's State (presumably a
# SparkSession, given the name — confirm). The bare name on the last line
# makes the notebook display it.
spark = State().session
spark

In [3]:
from rs_datasets import MovieLens

# Load the MovieLens "1m" dataset through rs_datasets and print a preview of
# its tables (ratings, users and items, as shown in the cell output).
data = MovieLens("1m")
data.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance





In [4]:
from replay.data_preparator import DataPreparator

# Build the interaction log from the MovieLens ratings table. Each key is the
# column name used downstream and each value is the source column it is read
# from, so the MovieLens "rating" column becomes "relevance".
log = DataPreparator().transform(
    data=data.ratings,
    columns_names=dict(
        user_id="user_id",
        item_id="item_id",
        relevance="rating",
        timestamp="timestamp",
    ),
)

In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer

# One-hot encode the pipe-separated genre strings: one "genre_<name>" indicator
# column per genre class found by the binarizer, plus item_id as a regular
# column (restored from the index by reset_index).
mlb = MultiLabelBinarizer()
# NOTE(review): `lb` is not used in any cell visible here; kept in case a
# later cell relies on it.
lb = LabelBinarizer()
item_features = pd.DataFrame(
    data=mlb.fit_transform(data.items.genres.apply(lambda genres: genres.split("|"))),
    index=data.items.item_id,
    # mlb.classes_ is only available after the fit_transform call above.
    columns=[f"genre_{genre}" for genre in mlb.classes_],
).reset_index()

In [6]:
# Pass the genre indicator table through DataPreparator, mapping only item_id,
# then drop the "timestamp" column from the result (presumably the preparator
# adds a default one when no timestamp is mapped — confirm).
item_features_spark = (
    DataPreparator()
    .transform(data=item_features, columns_names=dict(item_id="item_id"))
    .drop("timestamp")
)

In [7]:
from replay.splitters import UserSplitter

# Splitter handed to the two-stage scenario and reused later for the
# first-stage-only baseline. Judging by the logged test stats (1000 lines,
# 1000 users), these sizes hold out one interaction for each of 1000 users.
# The fixed seed keeps the shuffled split repeatable.
second_stage_splitter = UserSplitter(
    user_test_size=1000,
    item_test_size=1,
    shuffle=True,
    seed=1234,
    drop_cold_users=True,
    drop_cold_items=True,
)

In [8]:
from replay.models import ALSWrap

# ALS wrapper with rank=96; passed to TwoStagesScenario as `first_model` and
# also fitted on its own later as a baseline.
first_model = ALSWrap(rank=96)

In [17]:
from replay.models import ClassifierRec

# Classifier-based recommender passed to TwoStagesScenario as `second_model`.
# NOTE(review): use_recs_value=True presumably feeds the first-stage score in
# as a feature, and numTrees presumably sizes a tree ensemble — confirm
# against the ClassifierRec documentation.
second_model = ClassifierRec(use_recs_value=True,numTrees=100)

In [18]:
from replay.scenarios import TwoStagesScenario

# Assemble the two-stage pipeline from the ALS first-stage model, the
# classifier second-stage model and the splitter configured above.
two_stages = TwoStagesScenario(
    first_model=first_model,
    second_model=second_model,
    second_stage_splitter=second_stage_splitter,
)

In [19]:
# Run the whole two-stage scenario on the full log, supplying the genre
# indicators as item features. The positional 10 is presumably k (top-10 per
# user), matching the HitRate@10 metric reported at the end — confirm.
recs = two_stages.get_recs(log, 10, item_features=item_features_spark)

01-Jun-20 11:07:36, sponge_bob_magic, DEBUG: mixed_train stat: total lines: 999209, total users: 6040, total items: 3706
01-Jun-20 11:07:36, sponge_bob_magic, DEBUG: test stat: total lines: 1000, total users: 1000, total items: 663
01-Jun-20 11:07:37, sponge_bob_magic, DEBUG: first_train stat: total lines: 501083, total users: 6040, total items: 3618
01-Jun-20 11:07:37, sponge_bob_magic, DEBUG: first_test stat: total lines: 498126, total users: 6040, total items: 3602
01-Jun-20 11:07:37, sponge_bob_magic, DEBUG: Предварительная стадия обучения (pre-fit)
01-Jun-20 11:07:37, sponge_bob_magic, DEBUG: Основная стадия обучения (fit)
01-Jun-20 11:07:38, sponge_bob_magic, DEBUG: Индексирование данных
01-Jun-20 11:07:38, sponge_bob_magic, DEBUG: Обучение модели
01-Jun-20 11:08:16, sponge_bob_magic, DEBUG: баланс классов: положительных 173914 из 604000
01-Jun-20 11:08:18, sponge_bob_magic, DEBUG: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не 

In [20]:
# Split the full log again with the same splitter to get train/test for a
# first-stage-only baseline.
# NOTE(review): assumes the seeded splitter reproduces the split used inside
# two_stages.get_recs — confirm, otherwise the comparison is not like-for-like.
train, test = second_stage_splitter.split(log)

In [21]:
# Baseline: fit the ALS model alone on the train part and produce top-10
# recommendations for the distinct test users, choosing among the distinct
# items seen in train. Both id frames are cached before being passed in.
first_recs = first_model.fit_predict(
    users=test.select("user_id").distinct().cache(),
    items=train.select("item_id").distinct().cache(),
    log=train,
    k=10,
)

01-Jun-20 11:09:00, sponge_bob_magic, DEBUG: Предварительная стадия обучения (pre-fit)
01-Jun-20 11:09:00, sponge_bob_magic, DEBUG: Основная стадия обучения (fit)
01-Jun-20 11:09:02, sponge_bob_magic, DEBUG: Индексирование данных
01-Jun-20 11:09:02, sponge_bob_magic, DEBUG: Обучение модели


In [22]:
# Register the ALS-only recommendations in the scenario's experiment under the
# label "first_stage", so they are scored next to the two-stage result.
two_stages.experiment.add_result("first_stage", first_recs)

In [23]:
# Display the metric table for both runs. In the saved output the
# first-stage-only baseline has a higher HitRate@10 (0.229) than the
# two-stage scenario (0.14).
two_stages.experiment.results

Unnamed: 0,HitRate@10
two_stages_scenario,0.14
first_stage,0.229
