
# CS 230 Deep Learning: Final Project  
### Exploration of User Privacy Preservation via CTGAN Data Synthesis for Deep Recommenders

---
**Contributors:** Savannah McCoy

**In this notebook we generate synthetic user review data and train several deep recommenders using various splits of real and synthetic data.**

_(The outputs of this notebook have been cleared prior to upload for easy reading. Full outputs for all cells of the following code can be found in the notebooks in the **dev-notebooks** directory)_

In [None]:
import csv
import datetime
import json
import os
import tempfile
import time
from typing import Dict, Text

import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
%matplotlib inline

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tabgan.sampler import GANGenerator, OriginalGenerator 

### Reformatting json files

In [None]:
# Load the line-delimited review dump: one JSON object per non-blank line,
# stored as a single string column (label 0). The file is read directly
# because newer pandas releases raise ValueError for read_csv(sep="\n")
# (the line terminator may not double as the field separator).
with open("Video_Games_5.json", encoding="utf-8") as infile:
    df = pd.DataFrame([line.rstrip("\n") for line in infile if line.strip()])

d = {}


def split_data(row):
    """Decode the JSON text in column 0 of ``row`` and store it in ``d``.

    The decoded record is keyed by the row's index label (``row.name``).
    Returns None; the function is called only for its side effect on ``d``.
    """
    d[row.name] = json.loads(row[0])


# Trailing ';' keeps the notebook from rendering the throwaway Series of None.
df.apply(split_data, axis=1);

In [None]:
# Persist the parsed review records (the dict `d`) as a single JSON document.
with open("sample.json", "w") as outfile:
    outfile.write(json.dumps(d))

In [None]:
# Load the line-delimited product-metadata dump as one raw JSON string per row
# (single column labelled 0). Read directly instead of read_csv(sep="\n"),
# which newer pandas releases reject with a ValueError.
with open("meta_Video_Games.json", encoding="utf-8") as infile:
    df3 = pd.DataFrame([line.rstrip("\n") for line in infile if line.strip()])

In [None]:
dp = {}


def split_data(row):
    """Parse the JSON string held in column 0 of ``row`` into ``dp``.

    The parsed record is stored under the row's index label (``row.name``).
    NOTE: this re-binds the name ``split_data``, which an earlier cell also defines.
    """
    dp[row.name] = json.loads(row[0])


df3.apply(split_data, axis=1)

In [None]:
# Persist the parsed metadata records (the dict `dp`) as a single JSON document.
with open("meta.json", "w") as outfile:
    outfile.write(json.dumps(dp))

### Read reformatted json files

In [None]:
# Load the reformatted metadata and reviews; the review frame is transposed
# right after loading.
# NOTE(review): the cells above write sample.json / meta.json, while this cell
# reads reviews.json / meta_sub.json — confirm how those files are produced.
meta_df = pd.read_json('meta_sub.json')
reviews_df = pd.read_json('reviews.json').T

In [None]:
# Examine missing data: draw missingno's nullity matrix for the metadata frame.
msno.matrix(meta_df)

In [None]:
# Same nullity matrix, this time for the review frame.
msno.matrix(reviews_df)

In [None]:
# Preview the first five review rows.
reviews_df.head(5)

### Clean data and encode categorical variables

In [None]:
# Reload the reviews from disk (transposed after loading) so the cleaning steps
# below start from a fresh copy of the frame.
reviews_df = pd.read_json('reviews.json').T

In [None]:
# Remove the reviewTime column in place.
del reviews_df["reviewTime"] 

In [None]:
# Preview the frame after removing reviewTime.
reviews_df.head()

In [None]:
# Rename the raw review fields to the names used in the rest of the notebook.
# errors="raise" makes a missing source column fail instead of being ignored.
column_renames = {
    "overall": "rating",
    "reviewerID": "userID",
    "asin": "productID",
}
reviews_df = reviews_df.rename(columns=column_renames, errors="raise")

In [None]:
# Checkpoint the renamed reviews to CSV (index not written).
reviews_df.to_csv("reviews_filtered.csv", index=False)

In [None]:
# Reload the checkpoint; read_csv re-infers column dtypes from the CSV text.
reviews_df = pd.read_csv('reviews_filtered.csv')

In [None]:
# Remove the style column in place.
del reviews_df["style"]

In [None]:
# Cast vote to str before label encoding (presumably so the encoder sees a
# single type even where votes are missing — confirm against the data).
reviews_df['vote'] = reviews_df['vote'].astype(str)

In [None]:
# Integer-encode the vote strings: every distinct string gets one integer code.
# NOTE(review): the codes follow the encoder's ordering of the *strings*, which
# may not match numeric vote order — confirm this is intended.
vote_strings = reviews_df["vote"]
le = preprocessing.LabelEncoder().fit(vote_strings)
new_vote = le.transform(vote_strings)
reviews_df["cat_vote"] = new_vote

In [None]:
# Preview the frame with the new cat_vote column.
reviews_df.head(5)

In [None]:
def bin_rating(x):
    """Binarise a rating: return 1 when ``x`` is at least 4, otherwise 0."""
    return 1 if x >= 4 else 0

# Add the binarised rating column (bin_rating applied to every rating).
reviews_df["bin_rating"] = reviews_df["rating"].apply(bin_rating)

In [None]:
def bin_verified(x):
    """Map a truthy ``x`` to 1 and a falsy ``x`` to 0."""
    return 1 if x else 0

# Add the 0/1 verified flag (bin_verified applied to every value).
reviews_df["bin_verified"] = reviews_df["verified"].apply(bin_verified)

In [None]:
# Remove the raw columns in place, in the same order as before.
for raw_column in ("rating", "verified", "vote"):
    del reviews_df[raw_column]

In [None]:
# Checkpoint the encoded reviews to CSV (index not written).
reviews_df.to_csv("reviews_categorical.csv", index=False)

In [None]:
# Reload the encoded reviews and overwrite the column labels positionally.
# NOTE(review): this assumes the CSV has exactly five columns in the order
# userID, productID, <vote code>, <binary rating>, <binary verified>; a
# different order would be mislabelled silently — confirm against the file.
reviews_df = pd.read_csv('reviews_categorical.csv')
reviews_df.columns = ["userID", "productID", "vote", "rating", "verified"]
# Fixed-seed subsample of 100,000 rows, drawn without replacement.
reviews_df = reviews_df.sample(n=100000, replace=False, random_state=3)

### Generating synthetic data

In [None]:
# !pip install tabgan==1.1.0
# !pip install -U scikit-learn

In [None]:
# Empty accumulators for the synthetic data: `rdf` collects generated feature
# rows and `tsr` collects generated targets; both are extended by the loop below.
rdf = pd.DataFrame(columns =["userID", "productID", "vote", "verified"])
# NOTE(review): pd.Series() with no dtype may emit a warning, and its default
# dtype depends on the pandas version — confirm.
tsr = pd.Series()

In [None]:
# Reload the full encoded review set (replacing any earlier subsample) and
# overwrite the column labels positionally.
# NOTE(review): relies on the CSV column order being userID, productID,
# <vote code>, <binary rating>, <binary verified> — confirm against the file.
reviews_df = pd.read_csv('reviews_categorical.csv')
reviews_df.columns = ["userID", "productID", "vote", "rating", "verified"]

In [None]:
# Generate synthetic reviews in 125 rounds. Each round draws a 3% sample of the
# real reviews, splits it 80/20, runs tabgan's GANGenerator on the split, and
# appends the generated features to `rdf` and the generated targets to `tsr`.
# NOTE(review): `rdf`/`tsr` are not written to disk in this cell, although a
# later cell reads 'synthetic_review_data.csv' — confirm where that is saved.
for i in range(125):
    reviews_df_sub = reviews_df.sample(frac=0.03, replace=False, random_state=i)
    # Seed the split with the round number as well: the sample above is seeded,
    # but an unseeded split made the generator's input differ on every rerun.
    df_train, df_test = train_test_split(reviews_df_sub, test_size=0.2, random_state=i)
    df_train2 = df_train[["userID", "productID", "vote", "verified"]]
    df_test2 = df_test[["userID", "productID", "vote", "verified"]]
    df_target = df_train[["rating"]]
    # NOTE(review): cat_cols lists "rating", which is passed as the target and
    # is not a column of train_df — confirm tabgan expects it here.
    new_train, new_target = GANGenerator(cat_cols=["userID", "productID", "vote", 
                                                    "verified", "rating"], 
                                          is_post_process=True, epochs=3
                                          ).generate_data_pipe(train_df=df_train2, 
                                                                test_df=df_test2, 
                                                                target=df_target, 
                                                                only_adversarial=False,
                                                                use_adversarial=True,
                                                                )
    # Accumulate after every round so partial results survive an interruption.
    rdf = pd.concat([rdf, new_train])  
    tsr = pd.concat([tsr, new_target])
    print("DONE: Iteration", i+1)

### Merging meta data to Review data

In [None]:
# Load the reformatted product metadata written earlier as meta.json.
meta_df = pd.read_json('meta.json')

In [None]:
# Transpose the loaded frame, then preview the first five rows.
meta_df = meta_df.T
meta_df.head(5)

In [None]:
# Remove, in place and in this order, the metadata columns that are not used below.
unused_meta_columns = (
    "tech1", "fit", "also_buy", "tech2", "rank", "also_view",
    "main_cat", "similar_item", "imageURL", "imageURLHighRes", "details",
)
for unused_column in unused_meta_columns:
    del meta_df[unused_column]

In [None]:
# Cast each remaining metadata field to str, one column at a time.
for text_column in ('price', 'brand', 'category', 'description', 'date', 'title', 'feature'):
    meta_df[text_column] = meta_df[text_column].astype(str)

In [None]:
def filter_prices(x):
    """Return ``x`` unchanged when it starts with '$'; otherwise return ''."""
    return x if x.startswith('$') else ''
    
# Blank out every price string that does not start with '$'.
meta_df["price"] = meta_df["price"].apply(filter_prices)

In [None]:
# Integer-encode the price strings: one code per distinct string.
# NOTE(review): codes follow the encoder's ordering of the strings, not the
# numeric price — confirm this is intended.
price_strings = meta_df["price"]
le = preprocessing.LabelEncoder().fit(price_strings)
new_price = le.transform(price_strings)
meta_df["price"] = new_price

In [None]:
# Integer-encode the date strings: one code per distinct string.
# NOTE(review): codes follow the encoder's ordering of the strings, not
# calendar order — confirm this is intended.
date_strings = meta_df["date"]
le = preprocessing.LabelEncoder().fit(date_strings)
new_date = le.transform(date_strings)
meta_df["date"] = new_date

In [None]:
# Strip leading/trailing '[' and ']' from each string, then split it on commas.
for list_column in ('description', 'feature', 'category'):
    meta_df[list_column] = meta_df[list_column].str.strip('[]').str.split(',')

In [None]:
def _item_without_quotes(parts, position):
    """Return ``parts[position]`` with single quotes removed, or '' if too short."""
    if len(parts) <= position:
        return ''
    return parts[position].replace("'", '')


def get_cat2(x):
    """Second element of ``x`` with single quotes removed ('' if absent)."""
    return _item_without_quotes(x, 1)


def get_cat3(x):
    """Third element of ``x`` with single quotes removed ('' if absent)."""
    return _item_without_quotes(x, 2)


def get_description(x):
    """First element of ``x`` with single quotes removed ('' if ``x`` is empty)."""
    return _item_without_quotes(x, 0)


def get_feature(x):
    """First element of ``x`` with single quotes removed ('' if ``x`` is empty)."""
    return _item_without_quotes(x, 0)
    

In [None]:
# Keep only the first list entry (single quotes removed) for each text field.
meta_df["description"] = meta_df["description"].apply(get_description)
meta_df["feature"] = meta_df["feature"].apply(get_feature)

In [None]:
# Remove the category column in place.
del meta_df["category"]

In [None]:
# Rename asin to productID, the key used for the merge with the reviews.
meta_df = meta_df.rename(columns={"asin": "productID"})

In [None]:
# Preview the cleaned metadata.
meta_df.head(3)

In [None]:
# Join metadata to reviews on productID (pd.merge's default inner join), then
# preview the first ten merged rows.
result = pd.merge(meta_df, reviews_df, on='productID')
result.head(10)

In [None]:
# Draw a fixed-seed sample of 100,000 merged rows (without replacement) and
# save it as the all-real dataset.
result_df = result.sample(n=100000, replace=False, random_state=63)
result_df.to_csv("100_real_data.csv", index=False)

### Create dataset splits

In [None]:
## Read full metadata data set
meta_df = pd.read_json('meta.json').T


# process metadata to be merged with split data
def filter_prices(x):
    """Return ``x`` unchanged when it starts with '$'; otherwise return ''."""
    return x if x.startswith('$') else ''


# Remove, in place and in this order, every column not needed for the splits.
columns_to_remove = (
    "tech1", "fit", "also_buy", "tech2", "rank", "also_view", "main_cat",
    "similar_item", "imageURL", "imageURLHighRes", "details", "description",
    "feature", "title", "brand", "category",
)
for column_to_remove in columns_to_remove:
    del meta_df[column_to_remove]

# Cast price and date to str, then blank out prices that do not start with '$'.
for text_column in ('price', 'date'):
    meta_df[text_column] = meta_df[text_column].astype(str)
meta_df["price"] = meta_df["price"].apply(filter_prices)

# Integer-encode price, then date, each with its own freshly fitted encoder.
price_strings = meta_df["price"]
le = preprocessing.LabelEncoder().fit(price_strings)
new_price = le.transform(price_strings)
meta_df["price"] = new_price

date_strings = meta_df["date"]
le = preprocessing.LabelEncoder().fit(date_strings)
new_date = le.transform(date_strings)
meta_df["date"] = new_date

# asin becomes productID, the merge key used below.
meta_df = meta_df.rename(columns={"asin": "productID"})
meta_df.head(5)

In [None]:
# read in real and synthetic datasets
real_df = pd.read_csv('reviews_categorical.csv')
# Column labels are overwritten positionally.
# NOTE(review): relies on the CSV column order being userID, productID,
# <vote code>, <binary rating>, <binary verified> — confirm against the file.
real_df.columns = ["userID", "productID", "vote", "rating", "verified"]
# NOTE(review): 'synthetic_review_data.csv' is not written by any cell visible
# in this notebook — confirm where it is produced.
syn_df = pd.read_csv('synthetic_review_data.csv')

In [None]:
# Preview the real reviews.
real_df.head(5)

In [None]:
# 100 Synth
# The synthetic CSV's column labelled "0" is renamed to "rating"
# (presumably the generated target — confirm against the file).
syn_df = syn_df.rename(columns={"0": "rating"})
syn_df.head(5)

In [None]:
# merge metadata and full synth (pd.merge's default inner join on productID)
result1 = pd.merge(meta_df, syn_df, on='productID')

In [None]:
# Preview the merged synthetic rows.
result1.head()

In [None]:
# Row count after the merge; the sample below needs at least 100,000 rows.
len(result1)

In [None]:
# save synthetic dataset: fixed-seed sample of 100,000 merged rows, drawn
# without replacement, written without the index
synth_100 = result1.sample(n=100000, replace=False, random_state=35)
synth_100.to_csv("100_synth_data.csv", index=False)

In [None]:
# create 75/25 split: 75,000 real and 25,000 synthetic reviews, each drawn
# without replacement with a fixed seed
real_df1 = real_df.sample(n=75000, replace=False, random_state=3)
syn_df1 = syn_df.sample(n=25000, replace=False, random_state=35)

In [None]:
# Stack the real rows on top of the synthetic rows, then display the result.
df_75_25 = pd.concat([real_df1, syn_df1])
df_75_25

In [None]:
# Attach product metadata to the mixed set (default inner join on productID).
# NOTE(review): the join can change the row count, so the real/synthetic ratio
# after the later 100,000-row sample may not be exactly 75/25 — confirm.
result2 = pd.merge(meta_df, df_75_25, on='productID')

In [None]:
# save 75/25 dataset: fixed-seed sample of 100,000 merged rows, written
# without the index
real_75 = result2.sample(n=100000, replace=False, random_state=35)
real_75.to_csv("75_real_25_synth_data.csv", index=False)

In [None]:
# create 50/50 split: 50,000 real + 50,000 synthetic reviews, stacked, joined
# with the metadata on productID, then sampled down to 100,000 rows and saved.
real_df2 = real_df.sample(n=50000, replace=False, random_state=43)
syn_df2 = syn_df.sample(n=50000, replace=False, random_state=35)
df_50_50 = pd.concat([real_df2, syn_df2])
result3 = pd.merge(meta_df, df_50_50, on='productID')
# The merged length is printed because the join can change the row count.
# NOTE(review): the real/synthetic ratio after sampling may therefore not be
# exactly 50/50 — confirm.
print(len(result3))
synth_50 = result3.sample(n=100000, replace=False, random_state=35)
synth_50.to_csv("50_real_50_synth_data.csv", index=False)

In [None]:
# create 90/10 split: 90,000 real + 10,000 synthetic reviews, stacked, joined
# with the metadata on productID, then sampled down to 100,000 rows and saved.
real_df3 = real_df.sample(n=90000, replace=False, random_state=43)
syn_df3 = syn_df.sample(n=10000, replace=False, random_state=35)
df_90_10 = pd.concat([real_df3, syn_df3])
result4 = pd.merge(meta_df, df_90_10, on='productID')
# The merged length is printed because the join can change the row count.
# NOTE(review): the real/synthetic ratio after sampling may therefore not be
# exactly 90/10 — confirm.
print(len(result4))
synth_10 = result4.sample(n=100000, replace=False, random_state=35)
synth_10.to_csv("90_real_10_synth_data.csv", index=False)

### Train Recommender Models on dataset splits

In [None]:
# ! pip install -q tensorflow-recommenders
# ! pip install -q --upgrade tensorflow-datasets
# ! pip install latex
# ! sudo apt-get install texlive-latex-recommended 
# ! sudo apt install texlive-latex-extra
# ! sudo apt install dvipng
# ! sudo apt-get install texlive-latex-extra texlive-fonts-recommended dvipng cm-super

In [None]:
# define a user model
class UserModel(tf.keras.Model):
    """Query tower: embeds the user ID together with four numeric features.

    Construction reads notebook-level globals that must be defined first:
    ``u_uids`` (user-ID vocabulary), the raw value arrays ``votes``,
    ``verified``, ``dates`` and ``prices`` (used to adapt the normalisation
    layers), and the matching ``vote_buckets``, ``verified_buckets``,
    ``date_buckets`` and ``price_buckets`` (discretisation boundaries).

    Each numeric feature contributes a 32-d bucket embedding plus one
    normalised scalar; the user ID contributes a 64-d embedding.
    """

    def __init__(self):
        super().__init__()

        self.embedding_dimension = 32

        # +1 row for the out-of-vocabulary index produced by StringLookup
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=u_uids, mask_token=None),
            tf.keras.layers.Embedding(len(u_uids) + 1, 64)])

        self.vote_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(vote_buckets.tolist()),
            tf.keras.layers.Embedding(len(vote_buckets) + 2, 32)])
        self.normalized_vote = tf.keras.layers.Normalization(axis=None)
        self.normalized_vote.adapt(votes)

        self.verified_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(verified_buckets.tolist()),
            tf.keras.layers.Embedding(len(verified_buckets) + 2, 32)])
        self.normalized_verified = tf.keras.layers.Normalization(axis=None)
        self.normalized_verified.adapt(verified)

        self.date_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(date_buckets.tolist()),
            tf.keras.layers.Embedding(len(date_buckets) + 2, 32)])
        self.normalized_date = tf.keras.layers.Normalization(axis=None)
        # Fix: adapt on the date values (this was previously adapted on `votes`).
        self.normalized_date.adapt(dates)

        self.price_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(price_buckets.tolist()),
            tf.keras.layers.Embedding(len(price_buckets) + 2, 32)])
        self.normalized_price = tf.keras.layers.Normalization(axis=None)
        self.normalized_price.adapt(prices)


    def call(self, inputs):
        """Concatenate all feature representations along axis 1.

        ``inputs`` is a dict with the keys "userID", "vote", "verified",
        "date" and "price". Each numeric feature is passed through its own
        bucket embedding and its own normaliser.
        """
        return tf.concat([
            self.user_embedding(inputs["userID"]),
            self.vote_embedding(inputs["vote"]),
            tf.reshape(self.normalized_vote(inputs["vote"]), (-1, 1)),
            self.verified_embedding(inputs["verified"]),
            tf.reshape(self.normalized_verified(inputs["verified"]), (-1, 1)),
            self.date_embedding(inputs["date"]),
            # Fix: date used to go through the *verified* normaliser.
            tf.reshape(self.normalized_date(inputs["date"]), (-1, 1)),
            self.price_embedding(inputs["price"]),
            # Fix: price used to go through the *vote* normaliser.
            tf.reshape(self.normalized_price(inputs["price"]), (-1, 1)),
        ], axis=1)

In [None]:
# define a product model
class ProductModel(tf.keras.Model):
    """Candidate tower: maps a product ID string to a 64-d embedding.

    Construction reads the notebook-level global ``u_pids`` (product-ID
    vocabulary), which must be defined before the model is instantiated.
    """

    def __init__(self):
        super().__init__()

        # Same StringLookup path as UserModel (the experimental.preprocessing
        # alias is deprecated); +1 embedding row for the out-of-vocabulary index.
        self.product_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=u_pids, mask_token=None),
            tf.keras.layers.Embedding(len(u_pids) + 1, 64)])


    def call(self, inputs):
        """Return the embedding for ``inputs["productID"]``.

        The debug ``print(inputs)`` and the single-element ``tf.concat``
        (a no-op) were removed; the returned tensor is unchanged.
        """
        return self.product_embedding(inputs["productID"])

In [None]:
# define a recommender model
class ProductRecommendationModel(tfrs.models.Model):
    """Multi-task recommender combining a rating (ranking) and a retrieval task.

    Args:
        rating_weight: multiplier applied to the rating-task loss.
        retrieval_weight: multiplier applied to the retrieval-task loss.

    Construction reads the notebook-level global ``products`` (a tf.data
    dataset of {"productID": ...} records) to build the retrieval candidates,
    and instantiates ``UserModel`` / ``ProductModel``, which read further globals.
    """

    def __init__(self, rating_weight, retrieval_weight):
        super().__init__()

        # user and product representations, each projected to 64 dimensions
        self.user_model = tf.keras.Sequential([
                          UserModel(),
                          tf.keras.layers.Dense(64)])

        ## candidate model is the item model
        self.product_model = tf.keras.Sequential([
                              ProductModel(),
                              tf.keras.layers.Dense(64)])


        # model using user and product embeddings to predict ratings
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        # loss weights
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

        # rating task: mean squared error loss, reported as RMSE
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        # retrieval task: factorized top-K metrics over the product candidates
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=products.batch(128).map(self.product_model)
            )
        )



    def compute_loss(self, features, training=False):
        """Return the weighted sum of the rating loss and the retrieval loss.

        ``features`` is a dict with the keys "rating", "userID", "vote",
        "verified", "date", "price" and "productID". ``training`` is accepted
        for the Keras/TFRS interface and is not used here.
        """
        # Read the label without pop() so the caller's dict is left untouched.
        ratings = features["rating"]

        user_embeddings = self.user_model({
            "userID": features["userID"],
            "vote": features["vote"],
            "verified": features["verified"],
            "date": features["date"],
            "price": features["price"], 
        })

        product_embeddings = self.product_model({
            "productID": features["productID"],  
        })

        rating_predictions = self.rating_model(
            tf.concat([user_embeddings, product_embeddings], axis=1)
        )  

        # compute loss for each task
        rating_loss = self.rating_task(labels=ratings, predictions=rating_predictions)
        retrieval_loss = self.retrieval_task(user_embeddings, product_embeddings)

        # combine them using the loss weights
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)


In [None]:
# read datasets to pandas dfs (one frame per real/synthetic mix, in this order)
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '100_synth_data.csv',
        '90_real_10_synth_data.csv',
        '75_real_25_synth_data.csv',
        '50_real_50_synth_data.csv',
    )
)

In [None]:
# create list of dfs to iterate over
dfs = [df1, df2, df3, df4]
# Output-file prefixes, position-matched to `dfs` (used to name the saved plots).
strps = ["100_synth", "90_real", "75_real", "50_real"]

In [None]:
# Train, evaluate and plot one recommender per dataset split.
#
# Plot styling is the same for every run, so it is configured once up front.
# 'text.latex.preamble' is a plain string and 'text.latex.unicode' is omitted:
# the list form and that key are deprecated/removed in newer matplotlib.
mpl.rcParams.update(mpl.rcParamsDefault)
plt.rcParams.update({
    'text.usetex': True,
    'text.latex.preamble': r"\usepackage{lmodern}",
    'font.size': 11,
    'font.family': 'lmodern',
})

n_epochs = 10

# The names assigned at module level inside this loop (products, votes, prices,
# verified, dates, the *_buckets arrays, u_pids, u_uids) are read as globals by
# UserModel, ProductModel and ProductRecommendationModel, so they must keep
# these exact names.
for strp, df in zip(strps, dfs):
    # read data from csv files
    reviews_df = df

    # convert dataframe to tensor dataset format
    reviews = tf.data.Dataset.from_tensor_slices(dict(reviews_df))

    # select the features to map
    reviews = reviews.map(lambda x: {
        "productID": x["productID"],      # embedding
        "userID": x["userID"],            # embedding
        "rating": x["rating"],            # target
        "vote": x["vote"],                # numeric
        "date": x["date"],                # numeric
        "price": x["price"],              # numeric
        "verified": x["verified"]         # numeric
        })

    # NOTE(review): one record per review, so product IDs may repeat among the
    # retrieval candidates — confirm this is intended for the top-K metrics.
    products = reviews.map(lambda x: {
        "productID": x["productID"]})
    
    start1 = time.time()

    # process numeric inputs: collect the raw values and build 1000 evenly
    # spaced bucket boundaries between each feature's min and max
    votes = np.concatenate(list(reviews.map(lambda x: x["vote"]).batch(100)))
    vote_buckets = np.linspace(votes.min(), votes.max(), num=1000)

    prices = np.concatenate(list(reviews.map(lambda x: x["price"]).batch(100)))
    price_buckets = np.linspace(prices.min(), prices.max(), num=1000)

    verified = np.concatenate(list(reviews.map(lambda x: x["verified"]).batch(100)))
    verified_buckets = np.linspace(verified.min(), verified.max(), num=1000)

    dates = np.concatenate(list(reviews.map(lambda x: x["date"]).batch(100)))
    date_buckets = np.linspace(dates.min(), dates.max(), num=1000)

    # process string inputs for embeddings
    u_pids = np.unique(np.concatenate(list(products.batch(1000).map(lambda x: x["productID"]))))
    u_uids = np.unique(np.concatenate(list(reviews.batch(1000).map(lambda x: x["userID"]))))

    # Seed before the model is built so weight initialisation is covered too.
    tf.random.set_seed(42)

    # create retrieval and ranking model
    model = ProductRecommendationModel(rating_weight=0.5, retrieval_weight=0.5)
    model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

    # split the data into a training set and a testing set.
    shuffled = reviews.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

    train = shuffled.take(80_000)
    test = shuffled.skip(80_000).take(20_000)

    cached_train = train.shuffle(100_000).batch(2048)
    cached_test = test.batch(4096).cache()

    # train model
    start = time.time()
    history = model.fit(cached_train, epochs=n_epochs)
    end = time.time()
    print("TIME TO TRAIN MODEL:", str(datetime.timedelta(seconds=end-start)))

    # print result metrics (the returned dict used to be discarded in the loop)
    start = time.time()
    eval_metrics = model.evaluate(cached_test, return_dict=True)
    end = time.time()
    print("EVALUATION METRICS:", eval_metrics)
    print("TIME TO EVALUATE MODEL:", str(datetime.timedelta(seconds=end-start)))
    end1 = time.time()
    print("\n\n\n ----------------------------------------------------- \n\nTOTAL TIME TO TRAIN MODEL:", str(datetime.timedelta(seconds=end1-start1)))

    # plot train accuracy over epochs 
    fig = plt.figure()
    epochs = list(range(n_epochs))
    plt.plot(epochs, history.history["factorized_top_k/top_100_categorical_accuracy"], label="accuracy")
    plt.title("Factorized Top-100 Categorical Over Epochs")
    plt.xlabel("epoch")
    plt.ylabel(" accuracy");
    plt.legend()
    plt.savefig(strp+"graph.pdf")

In [None]:
# Print layer summaries for the three sub-models of `model`
# (the model left over from the last iteration of the training loop).
for submodel in (model.rating_model, model.user_model, model.product_model):
    submodel.summary()