# Core modules of Sim4Rec

This notebook contains the core modules of Sim4Rec:
1. Generator

RealDataGenerator to sample from the real data and SDVDataGenerator to generate synthetic data.

2. Response Pipeline

The Response module allows you to model user responses to recommended items.

3. Simulator

The Simulator class stores and updates the simulation log and provides an API to invoke the core simulation pipeline models.

4. Evaluation

Sim4Rec provides functionality to evaluate synthetic data, a response model and a recommender system.

In [1]:
import os
import sys
# JAVA_HOME: keep a value that is already configured on this machine and fall
# back to the notebook author's environment-specific JVM only when none is set.
os.environ.setdefault("JAVA_HOME", "/home/jovyan/ivanova/env39_sim4rec_check/lib/jvm")
# Make the Spark driver and workers use the interpreter that runs this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Caps for the native thread pools.
os.environ["OMP_NUM_THREADS"] = "32"
os.environ["NUMBA_NUM_THREADS"] = "4"

In [2]:
import random
import time
from typing import Tuple

import numpy as np
import pandas as pd
from rs_datasets import MovieLens

%matplotlib inline
import warnings

from pyspark import SparkConf
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

from sim4rec.modules import (
    evaluate_synthetic, 
    CompositeGenerator, 
    EvaluateMetrics, 
    RealDataGenerator, 
    SDVDataGenerator, 
    Simulator,
)
from sim4rec.recommenders.ucb import UCB
from sim4rec.response import BernoulliResponse
from sim4rec.utils import VectorElementExtractor, pandas_to_spark

# Suppress every Python warning for the rest of the notebook to keep cell output
# compact. This also hides deprecation and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

In [3]:
# Passed to Spark as `spark.local.dir` when the session is built.
SPARK_LOCAL_DIR = "./tmp/task_3"
# Root directory under which each Simulator gets its own `data_dir` sub-directory.
CHECKPOINT_DIR = "./tmp/task_3_checkpoints"

In [4]:
%%bash -s "$CHECKPOINT_DIR" "$SPARK_LOCAL_DIR"
# Simulator keeps history of interactions between users and recommender system
# To re-run the simulation cycle or start a new one, clear the directory or use a different CHECKPOINT_DIR
# Arguments are quoted so paths containing spaces or glob characters stay a single
# operand, and `--` stops a path starting with "-" from being parsed as an option.
rm -rf -- "$1" "$2"

In [5]:
# Local Spark session with 4 worker threads; shuffle partitions and default
# parallelism are set to the same number. Scratch files go to SPARK_LOCAL_DIR.
spark = (
    SparkSession.builder
    .appName("simulator")
    .master("local[4]")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.default.parallelism", "4")
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.driver.memory", "4g")
    .config("spark.local.dir", SPARK_LOCAL_DIR)
    .getOrCreate()
)

# Only errors from Spark are logged from here on.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/12 17:51:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/12 17:51:15 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/03/12 17:51:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/12 17:51:16 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/03/12 17:51:16 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/03/12 17:51:16 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [6]:
# One seed for the whole notebook: it seeds NumPy's global RNG here and is passed
# as `seed=` to the generators and models created in later cells.
SEED = 1234
np.random.seed(SEED)

## Download Data

In [7]:
# MovieLens comes from rs_datasets; its `ratings` attribute is the interactions
# table (user_id, item_id, rating, timestamp) used throughout the notebook.
data = MovieLens("20m")
ratings_df = data.ratings
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [8]:
# The median rating is the cut-off used in the next cell to binarise ratings.
rating_threshold = ratings_df["rating"].median()
rating_threshold

3.5

In [9]:
# Binary target: 1 when the rating is at or above the threshold, otherwise 0.
ratings_df["relevance"] = (ratings_df["rating"] >= rating_threshold).astype(int)

## Train-test-split

We assume that the recommender model is trained on a small amount of data. Thus, we use the earliest 20% of the interactions to train the recommender system and hold out the latest 20% of the interactions for testing the simulator. The simulator is trained on all interactions that precede the test period, i.e. the earliest 80%, which include the interactions used to train the recommender system.

In [10]:
def train_test_split(
    df: pd.DataFrame,
    train_rs_quantile: float = 0.2,
    test_sim_quantile: float = 0.8,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Splits data by time for recommender model training, simulator training
    and simulator evaluation.

    Rows are selected with boolean masks on the ``timestamp`` column, so the
    result does not depend on the index labels of ``df`` being unique.

    :param df:
        Ratings dataframe with users and items identifiers, rating and timestamp
    :param train_rs_quantile:
        timestamp quantile; rows strictly earlier than this time point are used
        for recommender model training
    :param test_sim_quantile:
        timestamp quantile; rows at or after this time point are used
        for simulator evaluation
    :returns: Tuple of Pandas DataFrames:
        train_sim_df is used to train simulator: every row earlier than the
            test time point (it therefore contains the rows of train_rs_df)
        train_rs_df is used to train recommender model on real data
        test_sim_df is used to evaluate simulator
    """
    timestamps = df["timestamp"]
    train_rs_timepoint = timestamps.quantile(
        q=train_rs_quantile, interpolation="nearest"
    )
    test_sim_timepoint = timestamps.quantile(
        q=test_sim_quantile, interpolation="nearest"
    )

    # A mask and its negation partition the rows exactly once each. Dropping by
    # index label would also remove training rows that share a label with a test row.
    is_test = timestamps >= test_sim_timepoint
    test_sim_df = df[is_test]
    train_sim_df = df[~is_test]
    train_rs_df = train_sim_df[train_sim_df["timestamp"] < train_rs_timepoint]
    return train_sim_df, train_rs_df, test_sim_df

In [11]:
# Default quantiles are used: 0.2 for the recommender split and 0.8 for the test split.
train_sim_df, train_rs_df, test_sim_df = train_test_split(ratings_df)

## Data Preprocessing

We encode the user and item IDs as consecutive integers so that the new indexes contain no gaps.

In [12]:
# Count the train_rs_df rows whose user does not occur in train_sim_df.
# The ID dictionaries below are built from train_sim_df, so this should be 0.
(~train_rs_df.user_id.isin(train_sim_df.user_id)).sum()

0

In [13]:
# Count the train_rs_df rows whose item does not occur in train_sim_df.
# The ID dictionaries below are built from train_sim_df, so this should be 0.
(~train_rs_df.item_id.isin(train_sim_df.item_id)).sum()

0

In [14]:
# Converting to a categorical assigns each distinct ID a consecutive integer code.
user_categorical = train_sim_df["user_id"].astype("category")
item_categorical = train_sim_df["item_id"].astype("category")

all_users = user_categorical.cat.codes
all_items = item_categorical.cat.codes

# A code is the position of its ID in `cat.categories`, so enumerating the unique
# IDs yields the same id -> code mapping as zipping every interaction row with its
# code, without a Python-level pass over the whole interaction table.
user_id2idx = {
    user_id: idx for idx, user_id in enumerate(user_categorical.cat.categories)
}
item_id2idx = {
    item_id: idx for idx, item_id in enumerate(item_categorical.cat.categories)
}

In [15]:
def encode_ids(df: pd.DataFrame, user_id2idx: dict, item_id2idx: dict) -> pd.DataFrame:
    """
    Encodes users and items identifiers

    The input dataframe is left untouched; a new dataframe with the two extra
    columns is returned. This avoids writing into a frame that may be a slice
    of another dataframe.

    :param df:
        dataframe with user_id and item_id columns
    :param user_id2idx:
        dict to encode users identifiers to indexes
    :param item_id2idx:
        dict to encode items identifiers to indexes
    :returns: Pandas DataFrame with the added user_idx and item_idx columns;
        identifiers missing from a dict are encoded as NaN
    """
    return df.assign(
        user_idx=df["user_id"].map(user_id2idx),
        item_idx=df["item_id"].map(item_id2idx),
    )

In [16]:
# Apply the same ID dictionaries (built from train_sim_df) to all three splits.
# IDs that are absent from the dictionaries are mapped to NaN by `Series.map`.
train_sim_df = encode_ids(train_sim_df, user_id2idx, item_id2idx)
train_rs_df = encode_ids(train_rs_df, user_id2idx, item_id2idx) 
test_sim_df = encode_ids(test_sim_df, user_id2idx, item_id2idx)

In [17]:
# Spark version of the recommender-training split; used to fit the UCB recommender.
train_rs = pandas_to_spark(train_rs_df)

In [18]:
# Keep only the positive interactions (relevance == 1) of the simulator-training split.
train_sim_positive_df = train_sim_df.query("relevance == 1")
train_sim_positive = pandas_to_spark(train_sim_positive_df)

We filter cold items and users from test_sim dataframe. 

In [19]:
# Users and items that have at least one positive interaction in the simulator
# training data; everything else is "cold" for the response model.
users_train_sim = train_sim_positive_df["user_id"].unique()
items_train_sim = train_sim_positive_df["item_id"].unique()

is_warm_user = test_sim_df["user_id"].isin(users_train_sim)
is_warm_item = test_sim_df["item_id"].isin(items_train_sim)
test_sim_df = test_sim_df[is_warm_user & is_warm_item]

# Before filtering, IDs unseen in training were encoded as NaN, which turned the
# index columns into floats. Every remaining row has a known ID, so restore the
# integer dtype used by the training frames.
test_sim_df = test_sim_df.astype({"user_idx": "int64", "item_idx": "int64"})

test_sim = pandas_to_spark(test_sim_df)

We will train the synthetic data generator on the entire dataframe. We encode the user and item IDs with consecutive values.

In [20]:
# Consecutive integer codes computed over ALL of ratings_df. This encoding is
# independent of the user_id2idx / item_id2idx dictionaries, which were built
# from train_sim_df only.
ratings_df["user_idx"] = ratings_df["user_id"].astype("category").cat.codes
ratings_df["item_idx"] = ratings_df["item_id"].astype("category").cat.codes
train = pandas_to_spark(ratings_df)

## Generator

The generator module implements functionality to store and sample data for an iteration and to generate synthetic data. 

#### RealDataGenerator

RealDataGenerator works with the real data and returns only ids.

In [21]:
# One row per distinct user index of the full dataset.
users_gen_df = ratings_df[["user_idx"]].drop_duplicates()
users_gen = pandas_to_spark(users_gen_df)

In [22]:
# Fit the generator on the real user ids, build a pool with generate(10000),
# then sample from that pool with fraction 0.1.
# NOTE(review): the exact semantics of generate/sample are defined by Sim4Rec — confirm there.
real_data_generator = RealDataGenerator(
    label="users_real", 
    seed=SEED
)
real_data_generator.fit(users_gen)
real_data_generator.generate(10000)
real_users = real_data_generator.sample(0.1)
print(f"n_samples = {real_users.count()}")
real_users.limit(5).toPandas()

                                                                                

n_samples = 1006


Unnamed: 0,user_idx
0,22079
1,49583
2,51156
3,11094
4,46008


#### SDVDataGenerator

SDVDataGenerator generates synthetic data and returns both ids and user features.

In [23]:
# Rank-4 ALS with non-negative factors, fitted on the full dataset (`train`).
# Its user factors are used below as the real user feature vectors.
als = ALS(
    rank=4,
    maxIter=5,
    userCol="user_idx",
    itemCol="item_idx",
    ratingCol="relevance",
    seed=SEED,
    nonnegative=True,
)
als_model = als.fit(train)

                                                                                

Obtain vector representations of real users.

In [24]:
# Take the number of factor columns from the fitted model instead of repeating
# the ALS rank as a literal, so this cell stays correct if the rank changes.
n_user_factors = als_model.rank

# One row per user: its id plus one column per latent factor
# (`user_feature[0]`, `user_feature[1]`, ...).
user_features = (
    als_model.userFactors
    .orderBy("id")
    .withColumn("user_feature", sf.col("features"))
    .select(["id"] + [sf.col("user_feature")[i] for i in range(n_user_factors)])
)
user_features.limit(5).toPandas()

Unnamed: 0,id,user_feature[0],user_feature[1],user_feature[2],user_feature[3]
0,0,0.600763,0.386934,0.554788,0.442143
1,1,0.527399,0.336708,0.494451,0.395085
2,2,0.53732,0.359514,0.510023,0.404002
3,3,0.451957,0.267677,0.403091,0.333005
4,4,0.526905,0.322884,0.482649,0.398983


Fit CopulaGAN to non-negative ALS embeddings of real users and generate synthetic user feature vectors.

In [40]:
# The variable keeps its original name `svd_data_generator` because the
# evaluation cell further down refers to it.
svd_data_generator = SDVDataGenerator(
    label="synth",
    id_column_name="user_id",
    model_name="copulagan",
    parallelization_level=4,
    device_name="cpu",
    seed=SEED,
)

# Seed the 10% sample so the fit is reproducible, and use that same sample both
# for fitting and for sizing the synthetic population (previously these were two
# independent unseeded draws).
users_for_fit = user_features.drop("id").sample(fraction=0.1, seed=SEED)
svd_data_generator.fit(users_for_fit)
synthetic_users = svd_data_generator.generate(users_for_fit.count())
synthetic_users.limit(5).toPandas()

                                                                                

Unnamed: 0,user_id,user_feature[0],user_feature[1],user_feature[2],user_feature[3]
0,synth_0,0.39479,0.282485,0.484643,0.369866
1,synth_1,0.582869,0.371751,0.528684,0.345175
2,synth_2,0.508617,0.297746,0.380002,0.357792
3,synth_3,0.68806,0.349757,0.562963,0.421951
4,synth_4,0.515549,0.356184,0.452109,0.359094


## Response

The response pipeline consists of the following core components: an ALS model for predicting scores, LogisticRegression for transforming scores into probabilities, and BernoulliResponse for sampling responses with the given probability for each user-item pair.

In [26]:
# Stage 1: ALS scorer fitted on the positive simulator-training interactions
# (train_sim_positive contains only relevance == 1 rows).
# Note: this rebinds `als` / `als_model`, which previously held the rank-4
# model used for the user embeddings.
als = ALS(
    rank=10,
    maxIter=5,
    userCol="user_idx",
    itemCol="item_idx",
    ratingCol="relevance",
    seed=SEED,
)
als_model = als.fit(train_sim_positive)

# Stage 2: wrap the scalar ALS `prediction` into the vector column that
# LogisticRegression reads as its features.
va = VectorAssembler(inputCols=["prediction"], outputCol="features")

# Stage 3: logistic regression that turns the ALS score into a probability.
calibration = LogisticRegression(
    featuresCol="features",
    labelCol="relevance",
    predictionCol="lr_pred",
    probabilityCol="lr_prob",
    maxIter=500,
    tol=1e-2
)
# NOTE(review): the calibration model is fitted on test_sim, the same data the
# pipeline is applied to and evaluated on later — confirm this is intended.
calibration_model = calibration.fit(va.transform(als_model.transform(test_sim)))

# Stage 4: take element 1 of the `lr_prob` vector as `response_proba`
# (presumably the probability of the positive class — verify).
vee = VectorElementExtractor(inputCol="lr_prob", outputCol="response_proba", index=1)
# Stage 5: sample a response using `response_proba` as the probability.
br = BernoulliResponse(inputCol="response_proba", outputCol="response", seed=SEED)

# Assemble the fitted stages and preview the pipeline output on test_sim.
response_pipeline = PipelineModel(stages=[als_model, va, calibration_model, vee, br])
predictions = response_pipeline.transform(test_sim).select(
    "user_idx",
    "item_idx",
    "relevance",
    "response_proba",
    "response"
)
predictions.limit(5).toPandas()

                                                                                

Unnamed: 0,user_idx,item_idx,relevance,response_proba,response
0,1357.0,2.0,1,0.204251,0
1,22360.0,2.0,0,0.221479,0
2,24012.0,2.0,1,0.238128,0
3,27728.0,2.0,1,0.267854,0
4,35923.0,2.0,1,0.266895,0


## Simulator

The Simulator class stores and checks the consistency of the simulation log with the Update Log function, and provides an API for invoking the core simulation pipeline models. 

#### Simulator Initialization

In [27]:
# Distinct users that have at least one positive interaction in the simulator training data.
train_users_gen_df = train_sim_positive_df[["user_idx"]].drop_duplicates()
train_users_gen = pandas_to_spark(train_users_gen_df)

In [28]:
# Distinct items that have at least one positive interaction in the simulator training data.
train_items_gen_df = train_sim_positive_df[["item_idx"]].drop_duplicates()
train_items_gen = pandas_to_spark(train_items_gen_df)

In [29]:
# User generator for the simulator: fitted on the training users, then asked
# to generate 500 users. The returned DataFrame is the cell's displayed value.
users_generator = RealDataGenerator(label="users_real", seed=SEED)
users_generator.fit(train_users_gen)
users_generator.generate(500)

DataFrame[user_idx: bigint]

In [41]:
# Item generator for the simulator, fitted on the training items.
# NOTE(review): unlike users_generator, `generate` is never called on this
# generator; later cells pass train_items_gen directly — confirm this is intended.
items_generator = RealDataGenerator(label="items_real", seed=SEED)
items_generator.fit(train_items_gen)

In [47]:
%%bash -s "$CHECKPOINT_DIR"
# Remove the whole checkpoint directory before a Simulator is created.
# The argument is quoted so a path with spaces or glob characters stays a single
# operand, and `--` stops a path starting with "-" from being parsed as an option.
rm -rf -- "$1"

In [44]:
# Simulator wired to the two generators; its data is kept in the `pipeline`
# sub-directory of CHECKPOINT_DIR.
sim = Simulator(
    user_gen=users_generator,
    item_gen=items_generator,
    data_dir=f"{CHECKPOINT_DIR}/pipeline",
    spark_session=spark
)

#### Users and responses sampling

In [33]:
# UCB recommender (with sample=True) fitted on the recommender-training split.
rs_model = UCB(sample=True, seed=SEED)
rs_model.fit(train_rs)

                                                                                

In [34]:
# Users taking part in this iteration (sampling fraction 0.1). They are cached
# because both the recommender and the response sampler read them.
current_users = sim.sample_users(0.1).cache()
# Simulation log as it is before this iteration is written.
log = sim.get_log(train_users_gen)

# Top-50 recommendations per sampled user; seen items are not filtered out.
recs = rs_model.predict(
    log=log,
    k=50,
    users=current_users,
    items=train_items_gen,
    filter_seen_items=False,
)

# Run the response pipeline on the recommendations to obtain simulated responses.
true_resp = sim.sample_responses(
    recs_df=recs,
    user_features=current_users,
    item_features=train_items_gen,
    action_models=response_pipeline,
)

# Store the responses as iteration 0 of the simulation log.
sim.update_log(true_resp, iteration=0)

                                                                                

## Evaluation

The evaluation module provides functionality to evaluate the quality of synthetic data generation, of the response model, and of the recommender system. Custom metrics are supported.

#### Evaluation of synthetic data

Graphs of generator quality with ALS embeddings are available in the [Sim4Rec repo](https://github.com/sb-ai-lab/Sim4Rec/blob/main/experiments/amazon_embeddings.ipynb).

In [35]:
# Seeded 10% sample of the real user embeddings, so the comparison is
# reproducible between runs.
real_users = user_features.sample(fraction=0.1, seed=SEED)
# Generate as many synthetic users as there are real users in the sample.
synthetic_users = svd_data_generator.generate(real_users.count())

# Compare feature columns only: the identifier columns are dropped on both sides.
gen_score = evaluate_synthetic(
    synthetic_users.drop("user_id"),
    real_users.drop("id"),
)
# Show the score; previously it was computed but never displayed.
gen_score

                                                                                

#### Evaluation of a response model

In [36]:
# Evaluator that scores `response_proba` against the true `relevance` label.
pipeline_eval = EvaluateMetrics(
    userKeyCol="user_idx",
    itemKeyCol="item_idx",
    predictionCol="response_proba",
    labelCol="relevance",
    mllib_metrics=["areaUnderROC"],
)

predictions = response_pipeline.transform(test_sim)
# Cast the probability column to double before it is passed to the evaluator.
predictions = predictions.withColumn(
    "response_proba", predictions["response_proba"].astype("double")
)

# Evaluate once and reuse the result. The evaluation used to run twice, with
# the first result thrown away.
response_metrics = pipeline_eval(predictions)
print(f"ROC-AUC = {response_metrics['areaUnderROC']}")

                                                                                

ROC-AUC = 0.6132717980328471


#### Evaluation of a recommender system with custom metric

In [None]:
%%bash -s "$CHECKPOINT_DIR"
# Remove the whole checkpoint directory before the evaluation Simulator is created.
# The argument is quoted so a path with spaces or glob characters stays a single
# operand, and `--` stops a path starting with "-" from being parsed as an option.
rm -rf -- "$1"

In [48]:
# Fresh Simulator for the recommender evaluation; it rebinds `sim` and keeps its
# data in the `pipeline_eval` sub-directory of CHECKPOINT_DIR.
sim = Simulator(
    user_gen=users_generator,
    item_gen=items_generator,
    data_dir=f"{CHECKPOINT_DIR}/pipeline_eval",
    spark_session=spark
)

In [49]:
def n_clicks_per_user(response):
    """
    Custom metric: average per-user total of the ``response`` column.

    :param response:
        Spark DataFrame with ``user_idx`` and ``response`` columns
    :returns: mean over users of each user's summed ``response``
    """
    # Total response per user.
    clicks_by_user = response.groupBy("user_idx").agg(
        sf.sum("response").alias("num_positive")
    )
    # The global aggregate collapses to a single row with a single value.
    mean_row = clicks_by_user.select(sf.mean("num_positive")).collect()[0]
    return mean_row[0]

In [50]:
# Metric values collected per simulation iteration.
metrics = []

# Recommendations for the users sampled earlier. `log` and `current_users` are
# the objects created in the "Users and responses sampling" section.
recs = rs_model.predict(
    log=log,
    k=50,
    users=current_users,
    items=train_items_gen,
    filter_seen_items=False,
).cache()

# Simulated responses, reduced to the columns that are written to the log.
sampled_responses = sim.sample_responses(
    recs_df=recs,
    user_features=current_users,
    item_features=train_items_gen,
    action_models=response_pipeline,
)
true_resp = sampled_responses.select(
    "user_idx", "item_idx", "relevance", "response"
).cache()

sim.update_log(true_resp, iteration=0)

# Record the custom metric for this iteration.
metrics.append(n_clicks_per_user(true_resp))

                                                                                