In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [2]:
from replay.session_handler import State

# Obtain the session object exposed by replay's State and keep it as `spark`.
state = State()
spark = state.session
# Last expression of the cell: display the session.
spark

In [10]:
import pandas as pd

# Load the ratings log and the item metadata from tab-separated files.
# Column names are supplied explicitly through `names`.
df = pd.read_csv(
    "data/ml1m_ratings.dat",
    sep="\t",
    names=["user_id", "item_id", "relevance", "timestamp"],
)
# Column name fixed from the misspelled "titile" to "title"; no cell shown here
# referenced the misspelled name.
items = pd.read_csv(
    "data/ml1m_items.dat",
    sep="\t",
    names=["item_id", "title", "genres"],
)
# Preview the first rows of the ratings log.
df.head()

Unnamed: 0,user_id,item_id,relevance,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
from replay.data_preparator import DataPreparator

# Convert the pandas ratings frame with replay's DataPreparator.
# Every column keeps its own name, so the mapping is an identity mapping.
log_columns = ["user_id", "item_id", "relevance", "timestamp"]
log = DataPreparator().transform(
    data=df,
    columns_names={column: column for column in log_columns},
)

In [12]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer

# Build a multi-hot genre feature table: one `genre_<name>` column per genre,
# one row per item, with `item_id` restored as a regular column at the end.
mlb = MultiLabelBinarizer()
# NOTE(review): `lb` is not referenced by any other cell shown here.
lb = LabelBinarizer()

# Genres are stored as a single "|"-separated string per item.
genre_lists = [genres.split("|") for genres in items.genres]
genre_matrix = mlb.fit_transform(genre_lists)
# `classes_` is only available after fitting, so the column names are built afterwards.
genre_columns = [f"genre_{genre}" for genre in mlb.classes_]
item_features = pd.DataFrame(
    genre_matrix,
    columns=genre_columns,
    index=items.item_id,
).reset_index()

In [13]:
# Convert the item feature table with DataPreparator, mapping only `item_id`,
# then remove the `timestamp` column from the result
# (presumably added by the preparator -- confirm against its documentation).
item_id_mapping = {"item_id": "item_id"}
item_features_prepared = DataPreparator().transform(
    data=item_features,
    columns_names=item_id_mapping,
)
item_features_spark = item_features_prepared.drop("timestamp")

In [14]:
from replay.splitters import UserSplitter

# Settings for the train/test split; the fixed seed keeps the shuffled split repeatable.
splitter_settings = dict(
    item_test_size=1,
    user_test_size=1000,
    shuffle=True,
    seed=1234,
    drop_cold_users=True,
    drop_cold_items=True,
)
second_stage_splitter = UserSplitter(**splitter_settings)

In [15]:
from replay.models import ALSWrap

# First-stage model: ALS wrapper configured with rank 96.
ALS_RANK = 96
first_model = ALSWrap(rank=ALS_RANK)



In [19]:
from replay.models import ClassifierRec
from pyspark.ml.classification import RandomForestClassifier

# Second-stage model: a random-forest classifier wrapped in ClassifierRec.
# The forest is seeded (same value as the splitter's seed) so that repeated
# runs of the notebook build the same trees and produce comparable metrics.
second_stage_classifier = RandomForestClassifier(numTrees=100, seed=1234)
second_model = ClassifierRec(second_stage_classifier, use_recs_value=True)

In [20]:
from replay.scenarios import TwoStagesScenario

# Assemble the two-stage scenario from the components created in the cells above.
scenario_components = {
    "first_model": first_model,
    "second_stage_splitter": second_stage_splitter,
    "second_model": second_model,
}
two_stages = TwoStagesScenario(**scenario_components)

In [21]:
# Run the whole two-stage scenario on the full log, passing the genre
# features as item features.
recs = two_stages.get_recs(
    log,
    10,  # presumably the number of recommendations per user -- confirm against get_recs
    item_features=item_features_spark,
)

02-Mar-21 18:00:15, replay, DEBUG: mixed_train stat: total lines: 999209, total users: 6040, total items: 3705
DEBUG:replay:mixed_train stat: total lines: 999209, total users: 6040, total items: 3705
02-Mar-21 18:00:18, replay, DEBUG: test stat: total lines: 999, total users: 999, total items: 656
DEBUG:replay:test stat: total lines: 999, total users: 999, total items: 656
02-Mar-21 18:00:20, replay, DEBUG: first_train stat: total lines: 501110, total users: 6040, total items: 3614
DEBUG:replay:first_train stat: total lines: 501110, total users: 6040, total items: 3614
02-Mar-21 18:00:22, replay, DEBUG: first_test stat: total lines: 498099, total users: 6040, total items: 3623
DEBUG:replay:first_test stat: total lines: 498099, total users: 6040, total items: 3623
02-Mar-21 18:00:22, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:00:22, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit

In [22]:
# Split the full log with the same splitter object that was passed to the
# two-stage scenario, to get a train/test pair for the first-stage-only baseline.
train, test = second_stage_splitter.split(log)

In [23]:
# First-stage-only baseline: fit the ALS model on `train` and produce k=10
# recommendations for the distinct test users over the distinct train items.
# Note: `first_model` is the same object that was handed to the two-stage
# scenario, so this call refits it.
test_users = test.select("user_id").distinct().cache()
train_items = train.select("item_id").distinct().cache()
first_recs = first_model.fit_predict(
    log=train,
    k=10,
    users=test_users,
    items=train_items,
)

02-Mar-21 18:07:55, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:07:55, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 18:07:55, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 18:08:27, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


In [24]:
# Register the first-stage-only recommendations in the scenario's experiment
# under the name "first_stage", so they appear next to the two-stage result.
two_stages.experiment.add_result("first_stage", first_recs)

In [25]:
# Display the experiment's results table (one row per registered result).
two_stages.experiment.results

Unnamed: 0,HitRate@10
two_stages_scenario,0.172172
first_stage,0.235235
