In this notebook, we will learn to implement a simple differentially private matrix factorization from scratch, with three different strategies:
* input perturbation with the Laplace mechanism
* gradient perturbation with the Laplace mechanism
* (unbounded) gradient perturbation with Gaussian mechanism

#### Import packages

In [None]:
!conda install pandas tqdm

import sys
sys.path.append('..')

import requests
import os
import io
import zipfile
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import create_maps, splitting

np.random.seed(42)

#### Load Data

First, we download the latest version of the MovieLens Small dataset.

In [None]:
# Download the MovieLens Small archive and extract only ratings.csv to disk.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
print(f"Getting Movielens Small from : {url} ...")
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as an obscure
# BadZipFile when an error page is handed to ZipFile below.
response = requests.get(url, timeout=60)
response.raise_for_status()

print("Extracting ratings...")
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    ml_ratings = zip_ref.read("ml-latest-small/ratings.csv")

print("Printing ratings to data/movielens/ ...")
os.makedirs("data/movielens", exist_ok=True)
# Write the raw bytes: text mode would re-encode with the platform's locale
# encoding and translate newlines. The handle is not called `f` because that
# name is reused later in the notebook for the number of latent factors.
with open("data/movielens/dataset.csv", "wb") as csv_file:
    csv_file.write(ml_ratings)

#### Process the dataset

In [None]:
# Load the extracted ratings, split them, and keep only the
# (user, item, rating) columns of each split, in that order.
dataframe_ml_small = pd.read_csv('data/movielens/dataset.csv')

train_set, test_set = (
    split.loc[:, ['userId', 'movieId', 'rating']]
    for split in splitting(dataframe_ml_small)
)

# The ID maps are built from the training split only.
maps = create_maps(train_set)

### Define the model

Create MF class

In [None]:
class MF:
    """Biased matrix factorization recommender trained with per-interaction SGD.

    A rating is predicted as ``b + b_u[u] + b_i[i] + p[u] . q[i]``. The model can
    be trained with plain SGD (:meth:`train`) or with Laplace noise added to every
    prediction error (:meth:`train_laplace_dp`).
    """

    def __init__(self, dataset, maps, n_factors, relevance=3.5, i_avg=None, u_avg=None):
        """
        :param dataset: interaction dataset should be a Pandas dataframe whose first three columns are user, item,
            and rating (columns are accessed by position)
        :param maps: 4-tuple ``(ext2int_user_map, int2ext_user_map, ext2int_item_map, int2ext_item_map)``
        :param n_factors: number of latent factors of the user and item embeddings
        :param relevance: minimum rating for a training interaction to be recorded in ``rated_items``
            (those items are excluded from the recommendations in :meth:`evaluate`)
        :param i_avg: optional per-item offsets exposing ``to_numpy()``; added to the predictions in :meth:`evaluate`.
            Assumed to be ordered by internal item index -- TODO confirm against caller
        :param u_avg: optional per-user offsets exposing ``to_numpy()``; only used when ``i_avg`` is given as well.
            Assumed to be ordered by internal user index -- TODO confirm against caller
        """
        print("Building model...")
        self.ext2int_user_map, self.int2ext_user_map, self.ext2int_item_map, self.int2ext_item_map = maps
        self.dataset = self.format_dataset(dataset)

        users_col = dataset.iloc[:, 0]
        items_col = dataset.iloc[:, 1]
        ratings_col = dataset.iloc[:, 2]

        # Internal user id -> internal ids of the items that user rated >= relevance.
        is_relevant = ratings_col >= relevance
        self.rated_items = {
            internal_u: items_col[(users_col == external_u) & is_relevant]
            .map(self.ext2int_item_map).astype(int).to_list()
            for external_u, internal_u in self.ext2int_user_map.items()
        }

        n_users = len(self.ext2int_user_map)
        n_items = len(self.ext2int_item_map)
        self.n_interactions = len(dataset)
        # Width of the observed rating range; used as sensitivity by train_laplace_dp.
        self.delta_ratings = ratings_col.max() - ratings_col.min()
        self.p = np.random.normal(size=(n_users, n_factors), scale=1. / n_factors, loc=0)
        self.q = np.random.normal(size=(n_items, n_factors), scale=1. / n_factors, loc=0)

        self.b_u = np.zeros(n_users)
        self.b_i = np.zeros(n_items)
        # Positional access, consistent with the rest of the class (the rating
        # column does not have to be literally named 'rating').
        self.b = ratings_col.mean()

        self.i_avg = None
        self.u_avg = None
        if i_avg is not None and u_avg is not None:
            self.i_avg = i_avg.to_numpy()
            self.u_avg = u_avg.to_numpy()

    def format_dataset(self, df):
        """Convert the interaction dataframe into three ``{row position: value}`` dicts.

        User and item ids are translated to internal indices. The index is reset
        so that the keys are always ``0 .. len(df) - 1``, which is how the
        training loops address interactions, regardless of the index ``df``
        carries (e.g. after a train/test split).

        :param df: dataframe whose first three columns are user, item, and rating
        :return: dict with keys ``'userId'``, ``'itemId'`` and ``'rating'``
        """
        return {
            'userId': df.iloc[:, 0].map(self.ext2int_user_map).reset_index(drop=True).to_dict(),
            'itemId': df.iloc[:, 1].map(self.ext2int_item_map).reset_index(drop=True).to_dict(),
            'rating': df.iloc[:, 2].reset_index(drop=True).to_dict(),
        }

    def _sgd_update(self, u, i, err, lr, bias_reg, factor_reg):
        """Apply one SGD step for user ``u`` and item ``i`` given the prediction error ``err``."""
        # Copies, not views: assigning into self.p[u] would otherwise change
        # p_u in place and the item update would see the already-updated user
        # factors instead of the ones the error was computed with.
        p_u = self.p[u].copy()
        q_i = self.q[i].copy()

        self.b_u[u] += lr * (err - bias_reg * self.b_u[u])
        self.b_i[i] += lr * (err - bias_reg * self.b_i[i])

        self.p[u] = p_u + lr * (err * q_i - factor_reg * p_u)
        self.q[i] = q_i + lr * (err * p_u - factor_reg * q_i)

    def train(self, lr, beta, epochs):
        """Train with plain SGD, visiting every interaction once per epoch.

        :param lr: learning rate
        :param beta: L2 regularization coefficient applied to biases and factors
        :param epochs: number of passes over the training interactions
        """
        print("Starting training...")
        users, items, ratings = self.dataset['userId'], self.dataset['itemId'], self.dataset['rating']
        for e in range(epochs):
            print(f"*** Epoch {e + 1}/{epochs} ***")
            for idx in tqdm(range(self.n_interactions)):
                u, i = users[idx], items[idx]
                pred = self.b + self.b_u[u] + self.b_i[i] + self.p[u].dot(self.q[i])
                err = ratings[idx] - pred
                self._sgd_update(u, i, err, lr, bias_reg=beta, factor_reg=beta)

    def train_laplace_dp(self, lr, beta, epochs, eps, err_max=None):
        """Train with SGD where every prediction error is perturbed with Laplace noise.

        The noise scale is ``epochs * delta_ratings / eps``. ``beta`` regularizes
        the biases only; the factor updates are not regularized.

        :param lr: learning rate
        :param beta: L2 regularization coefficient applied to the biases
        :param epochs: number of passes over the training interactions
        :param eps: privacy parameter dividing the noise scale
        :param err_max: if truthy, the noisy error is clipped to ``[-err_max, err_max]``
        """
        users, items, ratings = self.dataset['userId'], self.dataset['itemId'], self.dataset['rating']
        noise_scale = epochs * self.delta_ratings / eps
        for e in range(epochs):
            print(f"*** Epoch {e + 1}/{epochs} ***")
            for idx in tqdm(range(self.n_interactions)):
                u, i = users[idx], items[idx]
                pred = self.b + self.b_u[u] + self.b_i[i] + self.p[u].dot(self.q[i])
                err = ratings[idx] - pred + np.random.laplace(scale=noise_scale)
                if err_max:
                    err = np.clip(err, -err_max, err_max)
                self._sgd_update(u, i, err, lr, bias_reg=beta, factor_reg=0)

    def train_gaussian_unbounded_dp(self, lr, epochs, eps, delta, err_max=None):
        """Gradient perturbation with the Gaussian mechanism -- not implemented yet.

        The draft of this method ran bias-free SGD epochs with a clipped error and
        was then meant to perturb the user factors with Gaussian noise of standard
        deviation ``2 * s_p * epochs * sqrt(2 * ln(2 / delta)) / eps``. The
        sensitivity bound ``s_p`` was never defined and the noise was never
        applied, so the draft always crashed after performing non-private updates.

        :raises NotImplementedError: always, before any parameter is modified
        """
        # TODO: finish the implementation (define s_p and add the Gaussian noise).
        raise NotImplementedError(
            "train_gaussian_unbounded_dp is not implemented yet: the sensitivity bound s_p "
            "and the Gaussian noise step are missing"
        )

    def evaluate(self, test=None, cutoff=10, relevance=0.5):
        """Compute Precision@cutoff and Recall@cutoff averaged over all training users.

        Items listed in ``rated_items`` are excluded from each user's
        recommendation list. The metrics are printed and also returned.

        :param test: test dataframe whose first three columns are user, item, and rating
        :param cutoff: length of the recommendation lists
        :param relevance: minimum test rating for an item to count as relevant
        :return: tuple ``(precision, recall)``
        :raises ValueError: if ``test`` is missing or ``cutoff`` is smaller than 1
        """
        if test is None:
            raise ValueError("evaluate() requires a test dataframe")
        if cutoff < 1:
            raise ValueError("cutoff must be at least 1")

        print("Starting evaluation...")
        prediction = self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + np.dot(self.p, self.q.T)
        if self.i_avg is not None and self.u_avg is not None:
            # Add the per-item offsets along the columns and the per-user offsets along the rows.
            prediction = prediction + self.i_avg[np.newaxis, :] + self.u_avg[:, np.newaxis]

        precisions = []
        recalls = []
        print("Reading test set...")
        test_users = test.iloc[:, 0]
        test_items = test.iloc[:, 1]
        is_relevant = test.iloc[:, 2] >= relevance
        # Test items that are unknown to the training maps map to NaN and are dropped.
        relevant_items_test = {
            internal_u: set(test_items[(test_users == external_u) & is_relevant]
                            .map(self.ext2int_item_map).dropna().astype(int).to_list())
            for external_u, internal_u in self.ext2int_user_map.items()
        }

        print("Computing metrics...")
        # argpartition fails when asked for more elements than there are items.
        k = min(cutoff, prediction.shape[1])
        for u in self.int2ext_user_map:
            scores = prediction[u]
            scores[self.rated_items[u]] = - np.inf
            unordered_top_k = np.argpartition(scores, -k)[-k:]
            top_k = unordered_top_k[np.argsort(scores[unordered_top_k])][::-1]
            relevant = relevant_items_test[u]
            n_rel_and_rec_k = sum(i in relevant for i in top_k)
            precisions.append(n_rel_and_rec_k / cutoff)
            # Users without relevant test items contribute a recall of 0.
            recalls.append(n_rel_and_rec_k / len(relevant) if relevant else 0)
        precision = sum(precisions) / len(precisions)
        recall = sum(recalls) / len(recalls)

        print(f"Precision@{cutoff}: {precision}")
        print(f"Recall@{cutoff}: {recall}")
        return precision, recall

#### Initialize and Train The Model
Now, we are ready to initialize and train the model.

In [None]:
# Hyperparameters; the later cells of the notebook reuse these names.
f = 100        # number of latent factors
lr = 0.001     # SGD learning rate
beta = 0.1     # regularization coefficient
epochs = 20    # passes over the training interactions

# Non-private baseline model.
mf = MF(dataset=train_set, maps=maps, n_factors=f, relevance=4)
mf.train(lr=lr, beta=beta, epochs=epochs)

#### Evaluate The Model

The evaluation is computed on Top-K recommendation lists (default K = 10).

In [None]:
# Precision/Recall of the baseline on the held-out split (default cutoff).
mf.evaluate(test=test_set)

#### Build the DP schema on the dataset

Before feeding our recommender, we preprocess the dataset by computing noisy versions of some global effects. In detail, we compute:
* a differentially private version of the global average
* a differentially private version of the item averages
* a differentially private version of the user averages

Finally, we subtract the noisy item and user averages from the ratings and clamp the resulting values.

This preprocessing has been shown to yield more accurate predictions when using the MF approach.

In [None]:
def privatize_global_effects(ratings, b_m, b_u, eps_global_avg, eps_item_avg, eps_user_avg, clamping):
    """Remove noisy (Laplace-perturbed) item and user averages from the ratings.

    The steps are:

    1. a noisy global average is computed from the ratings;
    2. a noisy, stabilised average is computed per item, clipped to the observed
       rating range, and subtracted from the ratings;
    3. the same is done per user on the resulting residuals, with the user
       averages clipped to ``[-2, 2]``;
    4. the final residuals are clamped to ``[-clamping, clamping]``.

    Every noise draw uses the width of the observed rating range as sensitivity.

    :param ratings: dataframe with ``userId``, ``movieId`` and ``rating`` columns
    :param b_m: number of pseudo-ratings (at the noisy global average) added to every item average
    :param b_u: number of pseudo-ratings (at the noisy global residual average) added to every user average
    :param eps_global_avg: privacy parameter for the two global averages
    :param eps_item_avg: privacy parameter for the item averages
    :param eps_user_avg: privacy parameter for the user averages
    :param clamping: bound applied to the final residual ratings
    :return: tuple ``(preprocessed_ratings, i_avg, u_avg)``; the averages are
        Series indexed by ``movieId`` and ``userId`` respectively
    """
    min_rating = ratings['rating'].min()
    max_rating = ratings['rating'].max()
    delta_r = max_rating - min_rating

    # Noisy global average: perturbed sum divided by the number of ratings.
    global_average_item = (ratings['rating'].sum() + np.random.laplace(scale=(delta_r / eps_global_avg))) / len(ratings)

    # Noisy item averages, one independent noise draw per item.
    item_sets = ratings.groupby('movieId')['rating']
    item_sums = item_sets.sum()
    item_noise = np.random.laplace(scale=(delta_r / eps_item_avg), size=len(item_sums))
    i_avg = (item_sums + b_m * global_average_item + item_noise) / (item_sets.count() + b_m)
    i_avg = np.clip(i_avg, min_rating, max_rating)

    # Subtract each item's noisy average from its ratings.
    merged = ratings.join(i_avg, on=['movieId'], lsuffix='_x', rsuffix='_y')
    merged['rating'] = merged['rating_x'] - merged['rating_y']
    merged = merged.drop(columns=['rating_x', 'rating_y'])

    global_average_user = (merged['rating'].sum() + np.random.laplace(scale=(delta_r / eps_global_avg))) / len(merged)

    # Noisy user averages of the residuals. The noise must be drawn once per
    # user (size=...), as for the items: a single scalar draw would add the
    # very same noise value to every user's sum.
    user_sets = merged.groupby('userId')['rating']
    user_sums = user_sets.sum()
    user_noise = np.random.laplace(scale=(delta_r / eps_user_avg), size=len(user_sums))
    u_avg = (user_sums + b_u * global_average_user + user_noise) / (user_sets.count() + b_u)
    u_avg = np.clip(u_avg, -2, 2)  # Value taken from the paper

    # Subtract each user's noisy average and clamp what is left.
    preprocessed_ratings = merged.join(u_avg, on=['userId'], lsuffix='_x', rsuffix='_y')
    preprocessed_ratings['rating'] = preprocessed_ratings['rating_x'] - preprocessed_ratings['rating_y']
    preprocessed_ratings = preprocessed_ratings.drop(columns=['rating_x', 'rating_y'])
    preprocessed_ratings['rating'] = np.clip(preprocessed_ratings['rating'], -clamping, clamping)

    return preprocessed_ratings, i_avg, u_avg

#### Train the model with data preprocessed by the DP schema on the training set

In [None]:
# Stabilisation terms and privacy parameters for the global-effects preprocessing.
b_m = 1
b_u = 1
eps_global_avg = 1
eps_item_avg = 1
eps_user_avg = 1
clamping = 1

preproc_train_set, i_avg, u_avg = privatize_global_effects(
    ratings=train_set,
    b_m=b_m,
    b_u=b_u,
    eps_global_avg=eps_global_avg,
    eps_item_avg=eps_item_avg,
    eps_user_avg=eps_user_avg,
    clamping=clamping,
)

# NOTE(review): the model is built from the raw train_set; preproc_train_set is
# not used by any cell visible in this notebook -- confirm whether this is intended.
mf_dp_data = MF(dataset=train_set, maps=maps, n_factors=f, relevance=4, i_avg=i_avg, u_avg=u_avg)

#### Evaluate the performance with this DP schema

In [None]:
# Plain SGD training of the model that carries the noisy averages, then evaluation.
mf_dp_data.train(lr=lr, beta=beta, epochs=epochs)
mf_dp_data.evaluate(test=test_set)

#### Train model with DP schema during training phase

In [None]:
mf_dp_train = MF(train_set, maps, f, relevance=4, i_avg=i_avg, u_avg=u_avg)

mf_dp_data.train_laplace_dp(lr, beta, epochs, 10)

#### Evaluate the performance

In [None]:
mf_dp_data.evaluate(test_set)