# Synthetic data generation

This tutorial shows how to use Sim4Rec to generate synthetic data based on real data. First, we fit non-negative ALS to real data containing user interactions and obtain vector representations of real users. Then, we fit CopulaGAN to non-negative ALS embeddings of real users and generate synthetic user feature vectors with CopulaGAN. We evaluate the quality of the generated synthetic data using the metrics LogisticDetection, SVCDetection, KSTest, and ContinuousKLDivergence.

In [None]:
import os
import sys

# Process-wide environment configuration for the rest of the notebook.
# JAVA_HOME: keep a value that is already set in the environment, so the
# notebook is not tied to one machine; otherwise fall back to the path of the
# environment this notebook was originally run in.
os.environ.setdefault(
    "JAVA_HOME", "/home/jovyan/ivanova/env39_sim4rec_demo_new/lib/jvm"
)
# Point both the PySpark workers and the driver at the interpreter running
# this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Thread limits for OpenMP- and Numba-based code.
os.environ["OMP_NUM_THREADS"] = "32"
os.environ["NUMBA_NUM_THREADS"] = "4"

In [3]:
import random
import time

import numpy as np
import pandas as pd
from rs_datasets import MovieLens
import warnings

from pyspark import SparkConf
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from sim4rec.modules import evaluate_synthetic, SDVDataGenerator
from sim4rec.utils import pandas_to_spark

# Suppress every Python warning for the rest of the session so that cell
# outputs stay readable. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore") 

In [4]:
# Scratch locations used by Spark.
SPARK_LOCAL_DIR = "./tmp"
CHECKPOINT_DIR = "./tmp/checkpoints"

# Local Spark session with 4 worker threads; shuffle partitions and default
# parallelism are set to the same number.
spark = (
    SparkSession.builder
    .appName("simulator")
    .master("local[4]")
    # parallelism
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.default.parallelism", "4")
    # JVM garbage collector for driver and executors
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    # -1 disables automatic broadcast joins
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.driver.memory", "4g")
    .config("spark.local.dir", SPARK_LOCAL_DIR)
    .getOrCreate()
)

# Only report errors from Spark.
spark.sparkContext.setLogLevel("ERROR")

24/08/12 13:19:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 13:19:47 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [5]:
# Single seed shared by every stochastic step of the notebook.
SEED = 1234
# Seed both global generators: the standard-library one (`random` is imported
# above) and NumPy's legacy global state.
random.seed(SEED)
np.random.seed(SEED)

## Download Data

In [6]:
# MovieLens "20m" dataset; the ratings table is the interaction log used below.
data = MovieLens("20m")
ratings_df = data.ratings
# Preview the first five interactions.
ratings_df.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [7]:
# The median rating is the cut-off between relevant and non-relevant items.
rating_threshold = ratings_df.loc[:, "rating"].median()
rating_threshold

3.5

In [8]:
# Binary relevance label: 1 when the rating reaches the threshold, else 0.
ratings_df["relevance"] = ratings_df["rating"].ge(rating_threshold).astype(int)

## Data Preprocessing

We re-encode the user and item IDs as consecutive integers starting from 0, so that the new indices have no gaps.

In [9]:
# Categorical codes give each distinct ID a consecutive integer index.
user_categories = ratings_df["user_id"].astype("category")
item_categories = ratings_df["item_id"].astype("category")
ratings_df["user_idx"] = user_categories.cat.codes
ratings_df["item_idx"] = item_categories.cat.codes

In [10]:
# Convert the full pandas interaction log (all columns, including the added
# `relevance`, `user_idx` and `item_idx`) into the Spark DataFrame ALS is fit on.
train = pandas_to_spark(ratings_df)

## Non-negative ALS for User Embeddings

Fit non-negative ALS to historical data to produce the real user vectors that represent their profiles.

In [11]:
# Non-negative ALS on the binary relevance signal: 64 latent factors,
# 5 iterations, seeded for reproducibility.
als = ALS(
    userCol="user_idx",
    itemCol="item_idx",
    ratingCol="relevance",
    rank=64,
    maxIter=5,
    nonnegative=True,
    seed=SEED,
)
# Train the factorization on the interaction log.
als_model = als.fit(train)

                                                                                

Obtain vector representations of real users.

In [12]:
# User factors come back as one array column (`features`) per user `id`.
# Expand the array into one scalar column per latent dimension, named
# `user_feature[i]`.
user_features = als_model.userFactors.orderBy("id")
user_features = (user_features.withColumn("user_feature", col("features"))).select(
    # Take the number of dimensions from the fitted model instead of repeating
    # the ALS `rank` literal, so the two cannot drift apart.
    ["id"] + [col("user_feature")[i] for i in range(als_model.rank)]
)
user_features.limit(5).toPandas()

Unnamed: 0,id,user_feature[0],user_feature[1],user_feature[2],user_feature[3],user_feature[4],user_feature[5],user_feature[6],user_feature[7],user_feature[8],...,user_feature[54],user_feature[55],user_feature[56],user_feature[57],user_feature[58],user_feature[59],user_feature[60],user_feature[61],user_feature[62],user_feature[63]
0,0,0.098465,0.052241,0.125119,0.202401,0.178767,0.148965,0.118857,0.080273,0.152939,...,0.077556,0.187669,0.081136,0.129824,0.07812,0.145179,0.09678,0.099917,0.154622,0.113657
1,1,0.083297,0.047674,0.103928,0.181666,0.161923,0.12828,0.096112,0.068772,0.143337,...,0.059835,0.174903,0.070022,0.12274,0.070366,0.123795,0.083381,0.087477,0.137442,0.099442
2,2,0.08975,0.044578,0.118326,0.188298,0.157684,0.135538,0.109513,0.071072,0.142481,...,0.074062,0.166076,0.075291,0.117528,0.06825,0.135233,0.085594,0.092989,0.142653,0.100441
3,3,0.065662,0.048028,0.068513,0.146848,0.143943,0.101966,0.062271,0.059757,0.12878,...,0.031322,0.164175,0.050798,0.111714,0.065364,0.094431,0.07275,0.065016,0.117407,0.087755
4,4,0.076065,0.054559,0.085825,0.180649,0.170346,0.122832,0.076311,0.067553,0.156395,...,0.041212,0.192011,0.060475,0.13315,0.074588,0.115542,0.083266,0.077581,0.139359,0.101348


## CopulaGAN User Embeddings Generator

Fit the SDVDataGenerator with the CopulaGAN synthetic data model to non-negative ALS embeddings of real users and generate synthetic user feature vectors.

In [14]:
# initialization of data generator
sdv_data_generator = SDVDataGenerator(
    label="synth",
    id_column_name="user_id",
    model_name="copulagan",
    parallelization_level=4,
    device_name="cpu",
    seed=SEED,
)
# Draw ONE seeded ~10% sample of real users and reuse it. An unseeded sample
# changes on every run, and drawing a second sample only to count its rows
# makes the requested size unrelated to the data the generator was fit on.
generator_fit_users = user_features.sample(fraction=0.1, seed=SEED)
# fit data generator (the identifier column is not a feature)
sdv_data_generator.fit(generator_fit_users.drop("id"))
# generate as many synthetic user embeddings as there were real users in the fit sample
synthetic_users = sdv_data_generator.generate(generator_fit_users.count())
synthetic_users.limit(5).toPandas()

                                                                                

Unnamed: 0,user_id,user_feature[0],user_feature[1],user_feature[2],user_feature[3],user_feature[4],user_feature[5],user_feature[6],user_feature[7],user_feature[8],...,user_feature[54],user_feature[55],user_feature[56],user_feature[57],user_feature[58],user_feature[59],user_feature[60],user_feature[61],user_feature[62],user_feature[63]
0,synth_0,0.091851,0.03296,0.095349,0.133471,0.118124,0.113068,0.083493,0.058168,0.111959,...,0.075365,0.130613,0.062801,0.143937,0.040654,0.115569,0.080157,0.085276,0.147917,0.093266
1,synth_1,0.092978,0.012748,0.124107,0.186155,0.0699,0.149438,0.134901,0.051662,0.149086,...,0.103417,0.095883,0.071916,0.096389,0.062542,0.136404,0.06532,0.102663,0.137588,0.064048
2,synth_2,0.110147,0.052691,0.128284,0.213336,0.113021,0.164293,0.119523,0.07921,0.175648,...,0.103819,0.159714,0.074061,0.120538,0.089832,0.132798,0.090667,0.109788,0.147975,0.147257
3,synth_3,0.070364,0.015948,0.122228,0.146514,0.019321,0.088826,0.079565,0.025113,0.104235,...,0.135366,0.003427,0.021899,0.018142,0.006339,0.090419,0.00906,0.058113,0.074619,0.001528
4,synth_4,0.025496,0.007917,0.074783,0.069922,0.041347,0.024782,0.013793,0.036255,0.067663,...,0.064014,0.004586,0.003224,0.04784,0.036774,0.005925,0.016935,0.025793,0.034176,0.025598


## Generator Evaluation

We evaluate the quality of synthetic user profiles using the following metrics:

$LogisticDetection = 1 - ROC\text{-}AUC_{LR}$ 

Evaluates how difficult it is to separate the real and synthetic data using the Logistic Regression Model. 

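Higher is better: a score close to 0 means the classifier separates real and synthetic data almost perfectly (ROC-AUC close to 1).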

$SVCDetection = 1 - ROC\text{-}AUC_{SVC}$ 

Evaluates how difficult it is to separate the real and synthetic data using the Support Vector Classifier. 

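Higher is better: a score close to 0 means the classifier separates real and synthetic data almost perfectly (ROC-AUC close to 1).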

$KSTest = 1 - \sup_x |CDF_{real}(x) - CDF_{synth}(x)|$

Analyzes the similarity of distributions of continuous-valued attributes in real and synthetic data using the Kolmogorov-Smirnov two-sample test. 

The higher, the better.

$ContinuousKLD = \frac{1}{1 + KLD}$ 

Evaluates the distance of probability distributions of continuous-valued attributes in real and synthetic data. 

The higher, the better.

In [15]:
# Sample ~10% of the real users with a fixed seed so the reported metrics are
# reproducible. The seed is offset from SEED so this is its own draw rather
# than a repeat of any other SEED-seeded sample.
real_users = user_features.sample(fraction=0.1, seed=SEED + 1)
# generate the same number of synthetic users
synthetic_users = sdv_data_generator.generate(real_users.count())
# evaluate the quality of synthetic data; identifier columns are dropped so
# that only the feature columns are compared
gen_score = evaluate_synthetic(
    synthetic_users.drop("user_id"),
    real_users.drop("id")
)
gen_score

                                                                                

{'LogisticDetection': 0.017155137360620132,
 'SVCDetection': 0.000693956093678505,
 'KSTest': 0.9043275057028799,
 'ContinuousKLDivergence': 0.5256557092611626}

## Sample Users

In [16]:
# Show five of the synthetic users generated in the evaluation step.
synthetic_users.limit(5).toPandas()

Unnamed: 0,user_id,user_feature[0],user_feature[1],user_feature[2],user_feature[3],user_feature[4],user_feature[5],user_feature[6],user_feature[7],user_feature[8],...,user_feature[54],user_feature[55],user_feature[56],user_feature[57],user_feature[58],user_feature[59],user_feature[60],user_feature[61],user_feature[62],user_feature[63]
0,synth_0,0.068271,0.030453,0.10731,0.187089,0.111303,0.12673,0.085476,0.047734,0.141227,...,0.022608,0.151056,0.041063,0.09836,0.039581,0.093707,0.069244,0.079853,0.123905,0.076813
1,synth_1,0.069742,0.014864,0.119846,0.118705,0.111567,0.111034,0.113755,0.046971,0.128664,...,0.087698,0.048108,0.047665,0.064227,0.055969,0.08933,0.061879,0.076437,0.07648,0.065742
2,synth_2,0.061838,0.050451,0.068097,0.092286,0.103057,0.087552,0.010885,0.038328,0.128539,...,0.061101,0.113733,0.03599,0.049317,0.051593,0.076033,0.066701,0.067808,0.108011,0.057752
3,synth_3,0.065518,0.031554,0.087072,0.149439,0.094151,0.108444,0.122548,0.039658,0.119494,...,0.075721,0.046217,0.044091,0.05837,0.040536,0.083683,0.045573,0.0684,0.119543,0.056258
4,synth_4,0.084342,0.02321,0.120369,0.160859,0.146979,0.136696,0.118686,0.076258,0.126789,...,0.070004,0.105374,0.061081,0.149775,0.058745,0.120349,0.065245,0.068379,0.103655,0.074493
