In [1]:
# Enable the IPython autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and library edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
%%time
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .config("spark.driver.memory", "128g")
    .config("spark.sql.shuffle.partitions", "64")
    .config("spark.local.dir", "tmp")
    .getOrCreate()
)
spark

CPU times: user 656 ms, sys: 1.85 s, total: 2.51 s
Wall time: 7.78 s


In [3]:
# This folder was prepared in advance so that it contains the whole of September
# and one day of October.
!ls /datascc/share/small_data

user_item_data2019-09-01.csv  user_item_data2019-09-17.csv
user_item_data2019-09-02.csv  user_item_data2019-09-18.csv
user_item_data2019-09-03.csv  user_item_data2019-09-19.csv
user_item_data2019-09-04.csv  user_item_data2019-09-20.csv
user_item_data2019-09-05.csv  user_item_data2019-09-21.csv
user_item_data2019-09-06.csv  user_item_data2019-09-22.csv
user_item_data2019-09-07.csv  user_item_data2019-09-23.csv
user_item_data2019-09-08.csv  user_item_data2019-09-24.csv
user_item_data2019-09-09.csv  user_item_data2019-09-25.csv
user_item_data2019-09-10.csv  user_item_data2019-09-26.csv
user_item_data2019-09-11.csv  user_item_data2019-09-27.csv
user_item_data2019-09-12.csv  user_item_data2019-09-28.csv
user_item_data2019-09-13.csv  user_item_data2019-09-29.csv
user_item_data2019-09-14.csv  user_item_data2019-09-30.csv
user_item_data2019-09-15.csv  user_item_data2019-10-01.csv
user_item_data2019-09-16.csv


In [4]:
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator

# Constructed with the Spark session; reused below to load the interaction log
# and the user/item feature tables.
data_preparator = DataPreparator(spark)

In [5]:
%%time
log = data_preparator.transform_log(
    "/datascc/share/small_data",
    format_type="csv",
    columns_names={"timestamp": "_c0", "user_id": "_c1", "item_id": "_c2", "relevance": "_c3"}
)

CPU times: user 8 ms, sys: 16 ms, total: 24 ms
Wall time: 18.1 s


In [6]:
%%time
from sponge_bob_magic.validation_schemes import ValidationSchemes

train, test_input, test = ValidationSchemes(spark).log_split_by_date(
    log,
    test_start="2019-10-01",
    drop_cold_items=False,
    drop_cold_users=False
)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 33.5 ms


In [7]:
%%time
# sanity check
from pyspark.sql.functions import min, max

print(train.agg(max("timestamp")).head()[0])
print(test.agg(min("timestamp")).head()[0])
print(train.count() + test.count(), log.count())

2019-09-30 00:00:00
2019-10-01 00:00:00
48522117 48522117
CPU times: user 8 ms, sys: 8 ms, total: 16 ms
Wall time: 25.5 s


In [8]:
# The field names match the data export made by Ivan Kireev:
# "v" followed by a zero-padded three-digit index, v000 .. v255.
user_feature_cols = ["v" + str(column_index).zfill(3) for column_index in range(256)]

In [9]:
# Mapping from the names expected by the preparator to the columns of the
# user-embedding parquet files.
user_feature_mapping = {
    "timestamp": "application_date",
    "user_id": "client_dk",
    "features": user_feature_cols,
}

user_features = data_preparator.transform_features(
    "/datascc/share/parquet_embeddings/",
    format_type="parquet",
    columns_names=user_feature_mapping,
)

In [10]:
# Item feature columns are named t0 .. t255 (no zero padding).
item_feature_cols = ["t{}".format(column_index) for column_index in range(256)]

In [11]:
# These features can be generated with the code from this notebook:
# https://stash.ca.sbrf.ru/projects/AI-LAB/repos/stories_recsys/browse/experiments/prepare_text_embeddings.ipynb
# Only an item id column and the feature columns are mapped here (no timestamp).
item_features = data_preparator.transform_features(
    "/datascc/share/stories_texts_embeddings",
    format_type="parquet",
    columns_names={"item_id": "campaign_id", "features": item_feature_cols}
)

In [12]:
# NOTE(review): the module path is spelled "recomennder"; this import ran
# successfully as written, so do not "correct" the spelling here.
from sponge_bob_magic.models.linear_recomennder import LinearRecommender

# The model under evaluation, constructed with the Spark session.
recommender = LinearRecommender(spark)

In [13]:
%%time
# Fit the linear model on the train part of the log together with the user and
# item features. This is the slowest step of the notebook (about 28 minutes of wall time).
# NOTE(review): the last argument ("linear.model") looks like a path or name under
# which the fitted model is stored - confirm against LinearRecommender.fit.

recommender.fit(train, user_features, item_features, "linear.model")

CPU times: user 644 ms, sys: 132 ms, total: 776 ms
Wall time: 28min 16s


In [22]:
%%time
# Here we peek into the future a little, because people view new stories more often
# simply because new stories are shown more often.
# NOTE(review): the look-ahead is presumably the `items` argument below - the
# candidate items are taken from the test period rather than from train.

recs = recommender.predict(
    user_features=user_features,
    item_features=item_features,
    k=10,
    context=None,
    users=test.select("user_id").distinct(),
    items=test.select("item_id").distinct(),
    log=train,
    to_filter_seen_items=True,
    path="recs"
).cache()

CPU times: user 828 ms, sys: 188 ms, total: 1.02 s
Wall time: 24min 57s


In [23]:
# Number of rows in `recs`.
recs.count()

1451020

In [24]:
%%time
from sponge_bob_magic.metrics.metrics import Metrics

test_positive = test.filter("relevance == 1").cache()
metrics = Metrics()
print(metrics.hit_rate_at_k(recs, test_positive, 10))

0.05112955024741216
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 15 s


In [16]:
# Just in case, compare against a baseline.
from sponge_bob_magic.models.popular_recomennder import PopularRecommender

popular = PopularRecommender(spark)

In [17]:
%%time
# Set Spark's checkpoint directory, then fit the baseline.
# NOTE(review): presumably PopularRecommender checkpoints intermediate DataFrames,
# which is why the directory is set here - confirm.
# The baseline is fitted only on interactions with relevance == 1 and without features.

spark.sparkContext.setCheckpointDir("checkpoints")
popular.fit(
    log=train.filter("relevance == 1"),
    user_features=None,
    item_features=None
)

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 37.1 s


In [18]:
%%time
# Top-10 baseline recommendations for the same users and candidate items
# (both taken from test) as the linear model.
# NOTE(review): unlike the LinearRecommender.predict call, `to_filter_seen_items`
# is not passed here and `context` is "no_context" rather than None - confirm that
# both models are evaluated under the same settings before comparing the metrics.

popular_recs = popular.predict(
    k=10,
    log=train,
    users=test.select("user_id").distinct(),
    items=test.select("item_id").distinct(),
    context="no_context",
    user_features=None,
    item_features=None
)

CPU times: user 32 ms, sys: 12 ms, total: 44 ms
Wall time: 1min 21s


In [19]:
%%time
# Unfortunately, the model's quality compared with the baseline still leaves
# much to be desired.

print(metrics.hit_rate_at_k(popular_recs, test_positive, 10))

0.10433188673634923
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 52.1 s
