In [1]:
# Enable IPython's autoreload extension; mode 2 is used so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from replay.session_handler import State

# Take the session object exposed by replay's State (presumably the
# SparkSession used by the library) and echo it as the cell output.
spark = State().session
spark

In [4]:
from rs_datasets import MovieLens

# Load the "1m" variant of MovieLens; info() prints a preview of the
# ratings, users and items tables (see the cell output).
data = MovieLens("1m")
data.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance





In [5]:
from replay.data_preparator import DataPreparator

# Column mapping handed to DataPreparator: the "rating" column of the ratings
# table is used as "relevance", the remaining columns keep their names.
ratings_column_map = {
    "user_id": "user_id",
    "item_id": "item_id",
    "relevance": "rating",
    "timestamp": "timestamp",
}

# Interaction log used by every model and scenario in this notebook.
log = DataPreparator().transform(
    data=data.ratings,
    columns_names=ratings_column_map,
)

In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer

mlb = MultiLabelBinarizer()
lb = LabelBinarizer()  # NOTE(review): not referenced by any visible cell

# Split each "|"-separated genre string into a list of genre labels ...
genre_lists = data.items.genres.apply(lambda genres: genres.split("|"))
# ... and encode the lists as an indicator matrix with one column per genre.
genre_matrix = mlb.fit_transform(genre_lists)
genre_columns = [f"genre_{genre}" for genre in mlb.classes_]

# Index by item_id, then reset_index() so item_id becomes a regular column.
item_features = pd.DataFrame(
    genre_matrix,
    columns=genre_columns,
    index=data.items.item_id,
).reset_index()

In [7]:
# Pass the genre feature table through DataPreparator (only item_id is mapped),
# then remove the "timestamp" column from the result.
# NOTE(review): presumably the preparator adds a default timestamp column that
# is meaningless for item features — confirm.
item_features_prepared = DataPreparator().transform(
    data=item_features,
    columns_names={"item_id": "item_id"},
)
item_features_spark = item_features_prepared.drop("timestamp")

In [8]:
from replay.splitters import UserSplitter

# Splitter that separates the final test set from the log.
# The logged split sizes (about 10 test rows per user) suggest that
# item_test_size=10 means 10 items per user — TODO confirm.
second_stage_splitter = UserSplitter(
    item_test_size=10,
    shuffle=True,
    seed=1234,
    drop_cold_items=True,
    drop_cold_users=True,
)

# Splitter applied to train again; the logged first_train / first_test sizes
# are roughly equal, consistent with item_test_size=0.5 being a per-user share.
first_stage_splitter = UserSplitter(
    item_test_size=0.5,
    shuffle=True,
    seed=42,
    drop_cold_items=False,
)


In [9]:
from replay.models import ALSWrap
# with 98 everything crashes with a Java heap space error
# NOTE(review): presumably "98" refers to the rank value — confirm.
# NOTE(review): no seed is passed here; check whether ALSWrap accepts one so
# that the reported metrics are reproducible.
first_model = ALSWrap(rank=40)



In [10]:
from replay.models import ClassifierRec
from pyspark.ml.classification import RandomForestClassifier
# Second-level model: a Spark random forest wrapped in ClassifierRec.
# The classifier is passed by keyword, matching the later "stronger" cell.
base_forest = RandomForestClassifier(seed=47)
second_model = ClassifierRec(spark_classifier=base_forest, use_recs_value=True)

## Двухуровневый сценарий со статистическими фичами

In [11]:
from replay.scenarios import TwoStagesScenario
from replay.metrics import NDCG, HitRate, Precision, Recall, RocAuc

# Two-stage scenario with statistical features enabled.
# NDCG and HitRate are each evaluated at cut-offs 1, 5 and 10.
two_stages_with_stat = TwoStagesScenario(
    first_model=first_model,
    second_model=second_model,
    second_stage_splitter=second_stage_splitter,
    calculate_statistical_features=True,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
)

In [12]:
%%time 
# Run the whole scenario on the log, passing the genre features as item
# features. The positional 10 is presumably k, the number of recommendations
# per user — TODO confirm.
recs_with_stat = two_stages_with_stat.get_recs(log, 10, item_features=item_features_spark)
# Display the metrics table collected by the scenario's experiment.
two_stages_with_stat.experiment.results


08-Dec-20 18:11:49, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
08-Dec-20 18:11:53, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
08-Dec-20 18:11:58, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
08-Dec-20 18:11:59, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
08-Dec-20 18:11:59, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:11:59, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 4.14 s, sys: 1.09 s, total: 5.23 s
Wall time: 4min 54s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.265232,0.645861,0.805795,0.265232,0.216265,0.18537


## Двухуровневый сценарий без статистических фич

In [13]:
%%time
two_stages_without_stat = TwoStagesScenario(
    second_stage_splitter=second_stage_splitter,
    second_model=second_model,
    first_model=first_model,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
    calculate_statistical_features=False
)
recs_without_stat = two_stages_without_stat.get_recs(log, 10, item_features=item_features_spark)
two_stages_without_stat.experiment.results


08-Dec-20 18:16:38, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
08-Dec-20 18:16:39, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
08-Dec-20 18:16:40, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
08-Dec-20 18:16:40, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
08-Dec-20 18:16:40, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:16:40, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 4.32 s, sys: 1.08 s, total: 5.4 s
Wall time: 5min 50s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.255629,0.600497,0.763742,0.255629,0.198136,0.168892


In [14]:
# Score the no-statistics recommendations inside the first scenario's
# experiment so that both runs appear in a single results table.
two_stages_with_stat.experiment.add_result("two_stages_without_stat", recs_without_stat)
two_stages_with_stat.experiment.results



Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.265232,0.645861,0.805795,0.265232,0.216265,0.18537
two_stages_without_stat,0.255629,0.600497,0.763742,0.255629,0.198136,0.168892


## Модель первого уровня, обученная на всем train

In [15]:
# Reproduce the splits by hand: second_stage_splitter separates train/test
# from the full log, then first_stage_splitter divides train once more.
# NOTE(review): assumes the seeded splitters return the same split the
# scenarios used internally — confirm.
train, test = second_stage_splitter.split(log)
first_train, first_test = first_stage_splitter.split(train)


In [16]:
%%time
first_recs_all = first_model.fit_predict(
    log=train,
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=train.select("item_id").distinct().cache(),
)

08-Dec-20 18:23:12, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:23:12, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
08-Dec-20 18:23:14, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
08-Dec-20 18:23:38, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


CPU times: user 1.55 s, sys: 285 ms, total: 1.84 s
Wall time: 33.7 s


In [17]:
# Add the first-level model fitted on all of train to the shared results
# table and display the table.
two_stages_with_stat.experiment.add_result("first_stage_all", first_recs_all)
two_stages_with_stat.experiment.results



Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.265232,0.645861,0.805795,0.265232,0.216265,0.18537
two_stages_without_stat,0.255629,0.600497,0.763742,0.255629,0.198136,0.168892
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414


## Модель первого уровня, обученная на половине train (как в двухуровневом сценарии)

In [18]:
%%time
first_model.fit(log=first_train)
first_model_half = first_model.predict(
    log=train,
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=train.select("item_id").distinct().cache(),
)

two_stages_with_stat.experiment.add_result("first_stage_half", first_model_half)

08-Dec-20 18:27:08, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:27:08, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
08-Dec-20 18:27:08, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
08-Dec-20 18:27:21, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


CPU times: user 1.3 s, sys: 368 ms, total: 1.67 s
Wall time: 3min 5s


In [19]:
# Display the shared results table with all rows added so far.
two_stages_with_stat.experiment.results

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.265232,0.645861,0.805795,0.265232,0.216265,0.18537
two_stages_without_stat,0.255629,0.600497,0.763742,0.255629,0.198136,0.168892
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414
first_stage_half,0.275828,0.652483,0.810927,0.275828,0.220098,0.18783


## Двухуровневый сценарий с усиленным классификатором

In [20]:
# Rebind second_model to a random forest with numTrees=100 (same seed).
# NOTE(review): this reuses the name of the earlier second-level model, so
# every cell executed after this one that references second_model gets this
# instance.
second_model = ClassifierRec(spark_classifier=RandomForestClassifier(numTrees=100, seed=47), use_recs_value=True)

### Двухуровневый сценарий со статистическими фичами

In [21]:
# Two-stage scenario with statistical features, built from whatever
# second_model is bound to at execution time (the 100-tree forest when the
# notebook is run top to bottom).
two_stages_with_stat_strong = TwoStagesScenario(
    second_stage_splitter=second_stage_splitter,
    second_model=second_model,
    first_model=first_model,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
    calculate_statistical_features=True
)

In [22]:
%%time 
recs_with_stat = two_stages_with_stat_strong.get_recs(log, 10, item_features=item_features_spark)
two_stages_with_stat.experiment.add_result("two_stages_with_stat_strong", recs_with_stat)

08-Dec-20 18:30:15, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
08-Dec-20 18:30:16, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
08-Dec-20 18:30:17, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
08-Dec-20 18:30:17, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
08-Dec-20 18:30:17, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:30:17, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 4.74 s, sys: 1.22 s, total: 5.97 s
Wall time: 5min 15s


<replay.scenarios.two_stages_scenario.TwoStagesScenario at 0x13640af90>

### Двухуровневый сценарий без статистических фич

In [23]:
%%time
# Two-stage scenario without statistical features, built from whatever
# second_model is bound to at execution time.
two_stages_without_stat_strong = TwoStagesScenario(
    second_stage_splitter=second_stage_splitter,
    second_model=second_model,
    first_model=first_model,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
    calculate_statistical_features=False
)
# NOTE(review): this rebinds recs_without_stat, replacing the value produced
# by the earlier no-statistics scenario.
recs_without_stat = two_stages_without_stat_strong.get_recs(log, 10, item_features=item_features_spark)
# Register the run in the shared results table of the first scenario ...
two_stages_with_stat.experiment.add_result("two_stages_without_stat_strong", recs_without_stat)
# ... and display this scenario's own experiment table.
two_stages_without_stat_strong.experiment.results

08-Dec-20 18:35:31, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
08-Dec-20 18:35:32, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
08-Dec-20 18:35:32, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
08-Dec-20 18:35:33, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
08-Dec-20 18:35:33, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
08-Dec-20 18:35:33, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 4.22 s, sys: 959 ms, total: 5.17 s
Wall time: 4min 54s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.241391,0.601159,0.764238,0.241391,0.197375,0.16827


In [None]:
# NOTE(review): this call repeats, with identical arguments, the add_result
# at the end of the previous cell, and the cell has no execution count —
# it looks like a leftover that can be removed.
two_stages_with_stat.experiment.add_result("two_stages_without_stat_strong", recs_without_stat)

In [26]:
# Final comparison: all collected runs ordered from best to worst NDCG@10.
two_stages_with_stat.experiment.results.sort_values('NDCG@10', ascending=False)

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414
first_stage_half,0.275828,0.652483,0.810927,0.275828,0.220098,0.18783
two_stages_scenario,0.265232,0.645861,0.805795,0.265232,0.216265,0.18537
two_stages_with_stat_strong,0.258444,0.625,0.77798,0.258444,0.20883,0.176807
two_stages_without_stat,0.255629,0.600497,0.763742,0.255629,0.198136,0.168892
two_stages_without_stat_strong,0.241391,0.601159,0.764238,0.241391,0.197375,0.16827


Модель первого уровня работает лучше, чем двухуровневый сценарий. Двухуровневый сценарий, использующий статистические признаки, работает лучше, чем без них.