In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%time
import pandas as pd

# Yearly exports to load; every file is a ';'-separated CSV with the same columns.
SOURCE_FILES = [
    "/mnt/wind/ideas_data/2017.csv",
    "/mnt/wind/ideas_data/2018.csv",
]

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, concat keeps each frame's own index by
# default; the index is not written out below (index=False), so the resulting
# data.csv is the same as before.
user_item_data = (
    pd.concat(
        [
            pd.read_csv(
                path,
                sep=";",
                usecols=["id_project", "user_id", "type"],
                dtype=str,  # keep ids as strings, no numeric inference
            )
            for path in SOURCE_FILES
        ]
    )
    # Keep only the three vote event types, then drop the helper column.
    .query("type in ('Pim::IdeaBestPracticeVote', 'Pim::IdeaResourceVote', 'Pim::IdeaVote')")
    .drop(columns=["type"])
    # "user_id" already has its final name; only the project id is renamed.
    .rename(columns={"id_project": "item_id"})
)

# Persist the (user_id, item_id) pairs, with a header row and without the index.
user_item_data.to_csv("data.csv", index=False)

CPU times: user 7.97 s, sys: 1.63 s, total: 9.6 s
Wall time: 8.16 s


In [3]:
%%time
from pyspark.sql import SparkSession

import os

# No shell is involved when Spark reads spark.local.dir, so a leading "~" is
# not expanded to the home directory and the literal path "~/tmp" would be
# resolved relative to the current working directory. Resolve it to an
# absolute path before handing it to Spark.
spark_scratch_dir = os.path.expanduser("~/tmp")

# Create the Spark session used by the rest of the notebook, or reuse one
# that already exists.
spark = (
    SparkSession
    .builder
    .config("spark.driver.memory", "30g")
    .config("spark.sql.shuffle.partitions", "20")
    .config("spark.local.dir", spark_scratch_dir)
    .getOrCreate()
)

CPU times: user 44 ms, sys: 4 ms, total: 48 ms
Wall time: 2.34 s


In [4]:
%%time
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator

# Build the project's DataPreparator on top of the Spark session created above.
data_preparator = DataPreparator(spark)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 6.15 ms


In [5]:
%%time
from pyspark.sql.types import DoubleType, StringType, StructType, StructField, TimestampType

# Schema used to read data.csv: both columns are plain strings.
log_schema = StructType([
    StructField("user_id", StringType()),
    StructField("item_id", StringType()),
])

# Identity mapping handed to transform_log as `columns_names`.
log_columns = {"user_id": "user_id", "item_id": "item_id"}

# NOTE(review): data.csv is written by pandas with a header row; confirm that
# transform_log skips it rather than reading it as a data row.
log = data_preparator.transform_log(
    path="data.csv",
    format_type="csv",
    schema=log_schema,
    columns_names=log_columns,
).cache()

CPU times: user 12 ms, sys: 8 ms, total: 20 ms
Wall time: 3.8 s


In [6]:
%%time
from sponge_bob_magic.validation_schemes import ValidationSchemes

# Random split of the log with test_size=0.2, dropping cold items and cold users.
splitter = ValidationSchemes(spark)
train_split, test_input, test_split = splitter.log_split_randomly(
    log, test_size=0.2, drop_cold_items=True, drop_cold_users=True
)

# train and test are reused by later cells, so both are marked for caching;
# test_input is left uncached.
train, test = train_split.cache(), test_split.cache()

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 176 ms


In [7]:
%%time
from sponge_bob_magic.models.knn_recommender import KNNRecommender

# NOTE(review): 34 is passed positionally and its meaning is not visible in
# this notebook (presumably the number of neighbours) — confirm against
# KNNRecommender and consider passing it by keyword.
knn_recommender = KNNRecommender(spark, 34)

CPU times: user 204 ms, sys: 24 ms, total: 228 ms
Wall time: 240 ms


In [8]:
%%time

# Fit the recommender on the training split only; no user or item features are supplied.
model = knn_recommender.fit(train, user_features=None, item_features=None)

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 4.78 s


In [9]:
%%time

# Recommendations are requested for every distinct user present in the test split.
test_users = test.select("user_id").distinct()

# k=10 with the training log as history; items, features and context are all None.
recs = knn_recommender.predict(
    k=10,
    log=train,
    users=test_users,
    items=None,
    user_features=None,
    item_features=None,
    context=None,
).cache()

CPU times: user 4 ms, sys: 12 ms, total: 16 ms
Wall time: 1.21 s


In [10]:
%%time
from sponge_bob_magic.metrics.metrics import Metrics

# Hit rate of the recommendations against the test split; the third argument
# (10) equals the k passed to predict.
hit_rate = Metrics.hit_rate_at_k(recs, test, 10)
print(hit_rate)

0.3269561603989196
CPU times: user 64 ms, sys: 28 ms, total: 92 ms
Wall time: 8min 10s
