In [1]:
# Load IPython's autoreload extension so edited project modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules (except those excluded via %aimport)
# before each cell is executed.
%autoreload 2

In [2]:
from replay.session_handler import State

# Instantiate RePlay's State holder and keep its `session` attribute as `spark`;
# the bare expression on the last line displays the session in the cell output.
replay_state = State()
spark = replay_state.session
spark

In [5]:
import pandas as pd

# Read the tab-separated ratings file. It has no header row, so the column
# names are supplied here (passing `names` already implies `header=None`;
# it is spelled out for the reader).
df = pd.read_csv(
    "ml1m.dat",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "relevance", "timestamp"],
)
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,user_id,item_id,relevance,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
from replay.data_preparator import DataPreparator

# The pandas columns already carry the names DataPreparator is asked for, so
# the mapping is an identity: every key maps to a column of the same name.
# NOTE(review): `log` is presumably a Spark DataFrame (later cells call
# `.count()` / `.select()` on its splits) -- confirm against DataPreparator.
log = DataPreparator().transform(
    data=df,
    columns_names={
        column: column
        for column in ("user_id", "item_id", "relevance", "timestamp")
    },
)

In [7]:
from replay.splitters import UserSplitter

# Split the interaction log into train and test parts with a fixed seed so the
# split is repeatable.
# NOTE(review): presumably this holds out 1 interaction for each of 1000
# sampled users and drops cold users/items from the test part -- the recorded
# output (999 test rows) is consistent with that; confirm in UserSplitter docs.
splitter = UserSplitter(
    user_test_size=1000,
    item_test_size=1,
    shuffle=True,
    seed=1234,
    drop_cold_users=True,
    drop_cold_items=True,
)
train, test = splitter.split(log)

# Show the row counts of both parts as a quick sanity check.
(train.count(), test.count())

(999209, 999)

In [15]:
from replay.metrics import HitRate, NDCG, Coverage, Unexpectedness
from replay.experiment import Experiment

# Collect the results of every model in one Experiment evaluated against `test`.
# Each metric is paired with the same cutoff of 10 (shown as "@10" in the
# results table); Coverage and Unexpectedness are constructed from the full
# interaction log.
metrics = Experiment(
    test,
    {
        metric: 10
        for metric in (NDCG(), HitRate(), Coverage(log), Unexpectedness(log))
    },
)

In [19]:
%%time
from replay.models import PopRec

metrics.add_result(
    "PopRec",
    PopRec().fit_predict(
        log=train,
        k=10,
        users=test.select("user_id").distinct()
    )
)

02-Mar-21 16:50:29, replay, DEBUG: Начало обучения PopRec
DEBUG:replay:Начало обучения PopRec
02-Mar-21 16:50:29, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 16:50:30, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 16:50:32, replay, DEBUG: Начало предикта PopRec
DEBUG:replay:Начало предикта PopRec
02-Mar-21 16:50:32, replay, DEBUG: Выделение дефолтных пользователей
DEBUG:replay:Выделение дефолтных пользователей


CPU times: user 1.2 s, sys: 182 ms, total: 1.39 s
Wall time: 19.7 s


In [17]:
%%time
from replay.models import RandomRec

for alpha in [-0.5, 0.0, 0.5, 1.0, 10.0, 100.0]:
    metrics.add_result(
        f"RandomRec(alpha={alpha})",
        RandomRec(alpha=alpha).fit_predict(
            log=train,
            k=10,
            users=test.select("user_id").distinct()
        )
    )

02-Mar-21 16:45:21, replay, DEBUG: Начало обучения RandomRec
DEBUG:replay:Начало обучения RandomRec
02-Mar-21 16:45:21, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 16:45:21, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 16:45:23, replay, DEBUG: Начало предикта RandomRec
DEBUG:replay:Начало предикта RandomRec
02-Mar-21 16:45:23, replay, DEBUG: Выделение дефолтных пользователей
DEBUG:replay:Выделение дефолтных пользователей
02-Mar-21 16:45:40, replay, DEBUG: Начало обучения RandomRec
DEBUG:replay:Начало обучения RandomRec
02-Mar-21 16:45:40, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 16:45:40, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 16:45:41, replay, DEBUG: Начало предикта RandomRec
DEBUG:replay:Начало предикта RandomRec


CPU times: user 7.66 s, sys: 1.15 s, total: 8.81 s
Wall time: 1min 47s


In [20]:
# Display the collected metrics, best HitRate@10 first.
metrics.results.sort_values(by="HitRate@10", ascending=False)

Unnamed: 0,Coverage@10,HitRate@10,NDCG@10,Unexpectedness@10
PopRec,0.033729,0.077077,0.043189,0.165033
RandomRec(alpha=-0.5),0.898003,0.004004,0.000988,0.165364
RandomRec(alpha=0.5),0.91932,0.004004,0.000965,0.165397
RandomRec(alpha=100.0),0.928494,0.004004,0.001047,0.165397
RandomRec(alpha=1.0),0.923368,0.003003,0.003867,0.165397
RandomRec(alpha=10.0),0.924177,0.003003,0.001821,0.165397
RandomRec(alpha=0.0),0.910416,0.001001,0.001922,0.165397
