In [1]:
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
from pyspark.sql.session import SparkSession
import os
import logging

# --- Spark session -----------------------------------------------------------
spark_memory = "16g"  # value passed to spark.driver.memory
spark_cores = "*"     # interpolated into the local[...] master URL
# expanduser("~") returns $HOME when it is set (same value as before) and
# falls back to the platform's home-directory lookup instead of raising
# KeyError when the variable is missing.
user_home = os.path.expanduser("~")

spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', spark_memory)
    .config('spark.local.dir', os.path.join(user_home, "tmp"))
    .master(f'local[{spark_cores}]')
    .enableHiveSupport()
    .getOrCreate()
)

# --- Logging -----------------------------------------------------------------
# Keep py4j quiet; everything else is logged at DEBUG through the root logger.
spark_logger = logging.getLogger('py4j')
spark_logger.setLevel(logging.WARN)
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s, %(name)s, %(levelname)s: %(message)s',
                              datefmt='%d-%b-%y %H:%M:%S')
# Re-running this cell must not stack another StreamHandler on the root logger
# (each extra handler would print every message one more time), so the handler
# is tagged with a name and reused when it is already attached.
notebook_handler_name = "notebook_stream"
hdlr = next(
    (h for h in logger.handlers if h.get_name() == notebook_handler_name),
    None,
)
if hdlr is None:
    hdlr = logging.StreamHandler()
    hdlr.set_name(notebook_handler_name)
    logger.addHandler(hdlr)
hdlr.setFormatter(formatter)
logger.setLevel(logging.DEBUG)
# Last expression of the cell: display the session summary.
spark

In [3]:
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator

# Preparator bound to the Spark session; used below to read the raw
# interaction log via transform_log().
data_preparator = DataPreparator(spark)

In [4]:
# MovieLens 100K
ml100k_path = os.path.join(user_home, "data/ml-100k/u.data")
# The file is read without a header, so its columns are addressed by the
# positional names _c0.._c3 and mapped onto the log's field names.
ml100k_columns = dict(
    user_id="_c0",
    item_id="_c1",
    relevance="_c2",
    timestamp="_c3",
)
log = data_preparator.transform_log(
    ml100k_path,
    format_type="csv",
    columns_names=ml100k_columns,
    date_format=None,
    sep="\t",
    header=False,
)

In [5]:
# Import the Spark functions module under an alias instead of
# `from pyspark.sql.functions import min, max`: the latter rebinds the names
# of Python's built-in min()/max() in the notebook namespace for every cell
# that runs afterwards.
from pyspark.sql import functions as F

# Earliest and latest values of the "timestamp" column in the log.
log.agg(F.min("timestamp"), F.max("timestamp")).head()

Row(min(timestamp)=datetime.datetime(1997, 9, 20, 7, 5, 10), max(timestamp)=datetime.datetime(1998, 4, 23, 3, 10, 38))

In [6]:
%%time
from sponge_bob_magic.validation_schemes import ValidationSchemes

splitter = ValidationSchemes(spark)

# Random split with 20% of the log held out for testing.
# NOTE(review): the drop_cold_* flags presumably remove test users/items that
# do not occur in train -- confirm against ValidationSchemes.
split_frames = splitter.log_split_randomly(
    log,
    test_size=0.2,
    drop_cold_users=True,
    drop_cold_items=True,
)
train, test_input, test = split_frames

# Row counts of the three resulting frames, in order: train, test_input, test.
print(*(frame.count() for frame in (train, test_input, test)))

79977 79977 19982
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.5 s


In [7]:
from sponge_bob_magic.metrics.metrics import Metric

# Shared metric calculator; the model cells below call its hit_rate_at_k,
# recall_at_k, precision_at_k and ndcg_at_k methods.
metrics = Metric()

In [8]:
%%time
from sponge_bob_magic.models.popular_recomennder import PopularRecommender

popular_recomennder = PopularRecommender(spark, 0, 0)

# Arguments for fit_predict: recommend 10 items per test user, choosing among
# the items that occur in the test set, training on the train log.
fit_predict_kwargs = dict(
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=test.select("item_id").distinct().cache(),
    log=train.cache(),
    context=None,
    user_features=None,
    item_features=None,
    path=os.path.join(user_home, "models/popular.model"),
)
recs = popular_recomennder.fit_predict(**fit_predict_kwargs).cache()

# Print the metrics at k=10 in a fixed order:
# hit rate, recall, precision, NDCG.
for metric_at_k in (
    metrics.hit_rate_at_k,
    metrics.recall_at_k,
    metrics.precision_at_k,
    metrics.ndcg_at_k,
):
    print(metric_at_k(recs, test.cache(), 10))

09-Dec-19 17:52:32, root, DEBUG: Проверка датафреймов


09-Dec-19 17:52:33, root, DEBUG: Предварительная стадия обучения (pre-fit)


09-Dec-19 17:52:33, root, DEBUG: Среднее количество items у каждого user: 85


09-Dec-19 17:52:36, root, DEBUG: Основная стадия обучения (fit)


09-Dec-19 17:52:36, root, DEBUG: Проверка датафреймов


09-Dec-19 17:52:40, root, DEBUG: Количество items после фильтрации: 95


0.7571884984025559


0.13333333333333333


0.18956336528221515


0.21710429490555574
CPU times: user 248 ms, sys: 56 ms, total: 304 ms
Wall time: 32 s


In [9]:
%%time
from sponge_bob_magic.models.knn_recommender import KNNRecommender

knn_recommender = KNNRecommender(spark, 30)

# Arguments for fit_predict: recommend 10 items per test user, choosing among
# the items that occur in the test set, training on the train log.
fit_predict_kwargs = dict(
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=test.select("item_id").distinct().cache(),
    log=train.cache(),
    context=None,
    user_features=None,
    item_features=None,
    path=os.path.join(user_home, "models/knn.model"),
)
recs = knn_recommender.fit_predict(**fit_predict_kwargs).cache()

# Print the metrics at k=10 in a fixed order:
# hit rate, recall, precision, NDCG.
for metric_at_k in (
    metrics.hit_rate_at_k,
    metrics.recall_at_k,
    metrics.precision_at_k,
    metrics.ndcg_at_k,
):
    print(metric_at_k(recs, test.cache(), 10))

09-Dec-19 17:53:04, root, DEBUG: Проверка датафреймов


09-Dec-19 17:53:04, root, DEBUG: Предварительная стадия обучения (pre-fit)


09-Dec-19 17:53:17, root, DEBUG: Основная стадия обучения (fit)


09-Dec-19 17:53:23, root, DEBUG: Проверка датафреймов


0.8753993610223643


0.1


0.30457933972310974


0.35236354503608003
CPU times: user 132 ms, sys: 36 ms, total: 168 ms
Wall time: 42.6 s


In [10]:
%%time
from sponge_bob_magic.models.als_recommender import ALSRecommender

als_recommender = ALSRecommender(spark, rank=20)

# Arguments for fit_predict: recommend 10 items per test user, choosing among
# the items that occur in the test set, training on the train log.
fit_predict_kwargs = dict(
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=test.select("item_id").distinct().cache(),
    log=train.cache(),
    context=None,
    user_features=None,
    item_features=None,
    path=os.path.join(user_home, "models/als.model"),
)
recs = als_recommender.fit_predict(**fit_predict_kwargs).cache()

# Print the metrics at k=10 in a fixed order:
# hit rate, recall, precision, NDCG.
for metric_at_k in (
    metrics.hit_rate_at_k,
    metrics.recall_at_k,
    metrics.precision_at_k,
    metrics.ndcg_at_k,
):
    print(metric_at_k(recs, test.cache(), 10))

09-Dec-19 17:53:47, root, DEBUG: Проверка датафреймов


09-Dec-19 17:53:47, root, DEBUG: Предварительная стадия обучения (pre-fit)


09-Dec-19 17:53:47, root, DEBUG: Основная стадия обучения (fit)


09-Dec-19 17:53:47, root, DEBUG: Индексирование данных


09-Dec-19 17:53:47, root, DEBUG: Обучение модели


09-Dec-19 17:53:51, root, DEBUG: Проверка датафреймов


0.7092651757188498


0.06666666666666667


0.11778487752928649


0.13429284179619064
CPU times: user 152 ms, sys: 44 ms, total: 196 ms
Wall time: 39.3 s
