[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ramonzaca/MLSecOPs/blob/main/TP_07/07_membership_inference.ipynb)

**How black is a black box? - Membership inference - Practice 7**

*So far we've been treating an ML model as a black box.*

*We did, in fact, check that we can estimate how a prediction is made (Practice 5).*

*And we did so on the assumption that the outputs the black box gives come from its encoded knowledge.*

*So, is it safe to assume that this knowledge is non-retrievable? Let's check.*


In [None]:
# Install required packages
# !pip install adversarial-robustness-toolbox

In [None]:
import os
import sys

sys.path.insert(0, os.path.abspath(".."))

In [None]:
# Membership Inference Attacks Tutorial

# Imports
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from tqdm import tqdm


from art.utils import load_nursery, load_diabetes, to_categorical
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier
from art.estimators.classification.pytorch import PyTorchClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
from art.attacks.inference.membership_inference import (
    MembershipInferenceBlackBoxRuleBased,
    MembershipInferenceBlackBox,
    ShadowModels,
)
from art.metrics.privacy.worst_case_mia_score import get_roc_for_fpr

In [None]:
# Utility Functions
def calc_precision_recall(predicted, actual, positive_value=1):
    """Compute precision and recall of ``predicted`` against ``actual``.

    Args:
        predicted: Indexable sequence of predicted labels.
        actual: Indexable sequence of ground-truth labels, read at the same
            positions as ``predicted``.
        positive_value: Label treated as the positive class.

    Returns:
        A ``(precision, recall)`` tuple. Precision is reported as ``1`` when
        nothing was predicted positive, and recall as ``1`` when there are no
        actual positives, so neither ratio ever divides by zero.
    """
    true_positives = 0
    predicted_positives = 0
    actual_positives = 0

    for idx, guess in enumerate(predicted):
        truth = actual[idx]
        guess_is_positive = guess == positive_value
        if guess_is_positive:
            predicted_positives += 1
        if truth == positive_value:
            actual_positives += 1
        # A hit requires the prediction to be positive AND to match the truth.
        if guess_is_positive and guess == truth:
            true_positives += 1

    # Fraction of positive predictions that are correct.
    precision = true_positives / predicted_positives if predicted_positives else 1
    # Fraction of actual positives that were found.
    recall = true_positives / actual_positives if actual_positives else 1

    return precision, recall


def print_section(title):
    """Print ``title`` framed above and below by a 50-character ``=`` rule."""
    rule = "=" * 50
    print(f"\n{rule}\n{title}\n{rule}")

In [None]:
# Section 1 banner: random forest target model on the Nursery dataset
print_section(title="1. Random Forest Model on Nursery Dataset")

In [None]:
# Load and prepare data.
# load_nursery returns a train (features, labels) pair, a test pair, and two
# further values that are discarded here.
# NOTE(review): test_set=0.5 presumably holds out half of the records as the
# test split - confirm against the ART documentation.
(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.5)

In [None]:
# Fit the target random forest, then wrap it in ART's scikit-learn adapter
# so the attack classes below can work with it.
model = RandomForestClassifier().fit(x_train, y_train)
art_classifier = ScikitlearnRandomForestClassifier(model=model)
print("Base model accuracy:", model.score(X=x_test, y=y_test))

In [None]:
# Rule-based attack: built directly on the wrapped target model and used
# here without any fitting step.
attack = MembershipInferenceBlackBoxRuleBased(art_classifier)
# Query the attack on the training records (true members) and on the test
# records (true non-members).
inferred_train, inferred_test = (
    attack.infer(x=x_train, y=y_train),
    attack.infer(x=x_test, y=y_test),
)

In [None]:
# Attack accuracy, split by population.
# Training records are the true members, so the members' accuracy is the
# fraction flagged 1; test records are the true non-members, so theirs is
# the fraction flagged 0.
train_acc = inferred_train.sum() / len(inferred_train)
test_acc = 1 - inferred_test.sum() / len(inferred_test)
# Overall accuracy is the size-weighted mean of the two accuracies.
acc = (len(inferred_train) * train_acc + len(inferred_test) * test_acc) / (
    len(inferred_train) + len(inferred_test)
)
print(f"Members Accuracy: {train_acc:.4f}")
print(f"Non Members Accuracy {test_acc:.4f}")
print(f"Attack Accuracy {acc:.4f}")

In [None]:
# Precision / recall of the rule-based attack.
# Ground truth: 1 for every training record (member), 0 for every test
# record (non-member), in the same order as the concatenated predictions.
print(
    calc_precision_recall(
        predicted=np.concatenate([inferred_train, inferred_test]),
        actual=np.concatenate(
            [np.ones(len(inferred_train)), np.zeros(len(inferred_test))]
        ),
    )
)

In [None]:
# Black-box attack: fitted on labelled examples of members and non-members.
# The first `attack_train_ratio` share of each split is used for fitting;
# the remainder is kept aside for evaluating the attack.
attack_train_ratio = 0.5
attack_train_size = int(attack_train_ratio * len(x_train))
attack_test_size = int(attack_train_ratio * len(x_test))

bb_attack = MembershipInferenceBlackBox(art_classifier)
bb_attack.fit(
    x=x_train[:attack_train_size],
    y=y_train[:attack_train_size],
    test_x=x_test[:attack_test_size],
    test_y=y_test[:attack_test_size],
)

In [None]:
# Evaluate the black-box attack on the halves that were NOT used to fit it
inferred_train_bb = bb_attack.infer(
    x=x_train[attack_train_size:], y=y_train[attack_train_size:]
)
inferred_test_bb = bb_attack.infer(
    x=x_test[attack_test_size:], y=y_test[attack_test_size:]
)
# Members are counted correct when flagged 1, non-members when flagged 0.
train_acc_bb = inferred_train_bb.sum() / len(inferred_train_bb)
test_acc_bb = 1 - inferred_test_bb.sum() / len(inferred_test_bb)
# Size-weighted mean of the two per-population accuracies.
acc_bb = (
    len(inferred_train_bb) * train_acc_bb + len(inferred_test_bb) * test_acc_bb
) / (len(inferred_train_bb) + len(inferred_test_bb))
print(f"Members Accuracy: {train_acc_bb:.4f}")
print(f"Non Members Accuracy {test_acc_bb:.4f}")
print(f"Attack Accuracy {acc_bb:.4f}")

In [None]:
# Precision / recall of the black-box attack on the held-out records
# (ground truth: 1 for held-out training records, 0 for held-out test records).
print(
    calc_precision_recall(
        predicted=np.concatenate([inferred_train_bb, inferred_test_bb]),
        actual=np.concatenate(
            [np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))]
        ),
    )
)

In [None]:
# Section 2 banner: neural network target model on the Nursery dataset
print_section(title="2. Neural Network Model on Nursery Dataset")

In [None]:
# Keep only the first `train_set_size` records of each split, then recompute
# how many of them are used for fitting the attack.
train_set_size = 500
x_train = x_train[:train_set_size]
y_train = y_train[:train_set_size]
x_test = x_test[:train_set_size]
y_test = y_test[:train_set_size]
attack_train_size = int(attack_train_ratio * len(x_train))
attack_test_size = int(attack_train_ratio * len(x_test))

In [None]:
# Define and train neural network model
class ModelToAttack(nn.Module):
    """Fully connected classifier used as the attack target.

    Four Linear+Tanh stages of width 1024, 512, 256 and 128 feed a final
    linear layer with one output per class.

    Args:
        num_classes: Size of the output layer.
        num_features: Size of each input vector.
    """

    def __init__(self, num_classes, num_features):
        super().__init__()

        # Attribute names and creation order are kept stable so that
        # state_dict keys and seeded initialisation do not change.
        self.fc1 = self._tanh_layer(num_features, 1024)
        self.fc2 = self._tanh_layer(1024, 512)
        self.fc3 = self._tanh_layer(512, 256)
        self.fc4 = self._tanh_layer(256, 128)

        self.classifier = nn.Linear(128, num_classes)

    @staticmethod
    def _tanh_layer(in_features, out_features):
        """Build one hidden stage: a linear layer followed by Tanh."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.Tanh(),
        )

    def forward(self, x):
        """Run ``x`` through the four hidden stages and the output layer.

        No activation is applied after the final linear layer.
        """
        hidden = x
        for stage in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = stage(hidden)
        return self.classifier(hidden)

In [None]:
# Build the target network (4 output classes, 24 input features), wrap it in
# DataParallel, and set up the loss and optimizer used to train it.
mlp_model = nn.DataParallel(ModelToAttack(num_classes=4, num_features=24))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=mlp_model.parameters(), lr=0.01)

In [None]:
class NurseryDataset(Dataset):
    """In-memory torch dataset over numpy feature and label arrays.

    Args:
        x: Feature array; stored as a float32 tensor.
        y: Optional label array; stored as an int64 tensor. When omitted,
            a zero tensor with one entry per row of ``x`` is used instead.
    """

    def __init__(self, x, y=None):
        self.x = torch.from_numpy(x.astype(np.float64)).float()

        if y is None:
            # No labels supplied: fall back to an all-zero placeholder.
            self.y = torch.zeros(x.shape[0])
        else:
            self.y = torch.from_numpy(y.astype(np.int8)).long()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``.

        Raises:
            IndexError: If ``idx`` is at or past the end of the dataset.
        """
        if not idx < len(self.x):
            raise IndexError("Invalid Index")

        return self.x[idx], self.y[idx]

In [None]:
# Wrap the reduced training split and serve it in shuffled batches of 100,
# loading in the main process (num_workers=0).
train_set = NurseryDataset(x=x_train, y=y_train)
train_loader = DataLoader(
    dataset=train_set,
    batch_size=100,
    shuffle=True,
    num_workers=0,
)

In [None]:
# Train the MLP for 20 passes over train_loader.
# The batch variable is named `inputs` so the builtin `input()` is not
# shadowed for the rest of the notebook. Batches are used directly: the
# `torch.autograd.Variable` wrapper is deprecated and tensors already
# support autograd.
for epoch in tqdm(range(20), desc="Training Epochs"):
    for inputs, targets in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        outputs = mlp_model(inputs)
        loss = criterion(outputs, targets)

        # Backpropagate and apply the parameter update.
        loss.backward()
        optimizer.step()

In [None]:
# Wrap the trained network in ART's PyTorch adapter so the attacks can use it
mlp_art_model = PyTorchClassifier(
    model=mlp_model,
    loss=criterion,
    optimizer=optimizer,
    nb_classes=4,
    input_shape=(24,),
)

# Predicted class = index of the highest output in each row
train_pred = np.argmax(mlp_art_model.predict(x_train.astype(np.float32)), axis=1)
print("Base model Train accuracy: ", (train_pred == y_train).sum() / len(y_train))

In [None]:
# Same accuracy check on the test split (predicted class = row-wise argmax)
test_pred = np.argmax(mlp_art_model.predict(x_test.astype(np.float32)), axis=1)
print("Base model Test accuracy: ", (test_pred == y_test).sum() / len(y_test))

In [None]:
# Rule-based attack against the neural network (no fitting step)
mlp_attack = MembershipInferenceBlackBoxRuleBased(mlp_art_model)

mlp_inferred_train = mlp_attack.infer(x=x_train.astype(np.float32), y=y_train)
mlp_inferred_test = mlp_attack.infer(x=x_test.astype(np.float32), y=y_test)

# Members are counted correct when flagged 1, non-members when flagged 0;
# the overall figure is the size-weighted mean of the two.
mlp_train_acc = mlp_inferred_train.sum() / len(mlp_inferred_train)
mlp_test_acc = 1 - mlp_inferred_test.sum() / len(mlp_inferred_test)
mlp_acc = (
    len(mlp_inferred_train) * mlp_train_acc + len(mlp_inferred_test) * mlp_test_acc
) / (len(mlp_inferred_train) + len(mlp_inferred_test))
# Printed in order: members accuracy, non-members accuracy, overall accuracy
print(mlp_train_acc)
print(mlp_test_acc)
print(mlp_acc)

In [None]:
# Precision / recall of the rule-based attack on the neural network
# (ground truth: 1 for training records, 0 for test records).
print(
    calc_precision_recall(
        predicted=np.concatenate([mlp_inferred_train, mlp_inferred_test]),
        actual=np.concatenate(
            [np.ones(len(mlp_inferred_train)), np.zeros(len(mlp_inferred_test))]
        ),
    )
)

In [None]:
# Black-box attack against the neural network, using attack_model_type="rf"
mlp_attack_bb = MembershipInferenceBlackBox(mlp_art_model, attack_model_type="rf")

# Fit the attack on the first part of each split...
mlp_attack_bb.fit(
    x=x_train[:attack_train_size].astype(np.float32),
    y=y_train[:attack_train_size],
    test_x=x_test[:attack_test_size].astype(np.float32),
    test_y=y_test[:attack_test_size],
)

# ...and evaluate it on the remaining, unseen part.
mlp_inferred_train_bb = mlp_attack_bb.infer(
    x=x_train[attack_train_size:].astype(np.float32),
    y=y_train[attack_train_size:],
)
mlp_inferred_test_bb = mlp_attack_bb.infer(
    x=x_test[attack_test_size:].astype(np.float32),
    y=y_test[attack_test_size:],
)

# Members are counted correct when flagged 1, non-members when flagged 0;
# the overall figure is the size-weighted mean of the two.
mlp_train_acc_bb = mlp_inferred_train_bb.sum() / len(mlp_inferred_train_bb)
mlp_test_acc_bb = 1 - mlp_inferred_test_bb.sum() / len(mlp_inferred_test_bb)
mlp_acc_bb = (
    len(mlp_inferred_train_bb) * mlp_train_acc_bb
    + len(mlp_inferred_test_bb) * mlp_test_acc_bb
) / (len(mlp_inferred_train_bb) + len(mlp_inferred_test_bb))

print(f"Members Accuracy: {mlp_train_acc_bb:.4f}")
print(f"Non Members Accuracy {mlp_test_acc_bb:.4f}")
print(f"Attack Accuracy {mlp_acc_bb:.4f}")

In [None]:
# Precision / recall of the black-box attack on the held-out records
# (ground truth: 1 for held-out training records, 0 for held-out test records).
print(
    calc_precision_recall(
        predicted=np.concatenate([mlp_inferred_train_bb, mlp_inferred_test_bb]),
        actual=np.concatenate(
            [np.ones(len(mlp_inferred_train_bb)), np.zeros(len(mlp_inferred_test_bb))]
        ),
    )
)

In [None]:
# Worst Case vs Average Case evaluation for Black-Box Attack
# Ask the attack for membership probabilities (instead of hard 0/1
# decisions) on the held-out members and non-members.
bb_members_test_prob = mlp_attack_bb.infer(
    x_train[attack_train_size:].astype(np.float32),
    y_train[attack_train_size:],
    probabilities=True,
)
bb_nonmembers_test_prob = mlp_attack_bb.infer(
    x_test[attack_test_size:].astype(np.float32),
    y_test[attack_test_size:],
    probabilities=True,
)

# Drop the trailing axis and stack members first, non-members second.
bb_mia_test_probs = np.concatenate(
    (
        np.squeeze(bb_members_test_prob, axis=-1),
        np.squeeze(bb_nonmembers_test_prob, axis=-1),
    )
)

# Ground truth (1 = member, 0 = non-member) must be built from the SAME
# held-out slices that were scored above. Using the `[:attack_*_size]`
# slices only gives the right length when each split divides evenly.
bb_mia_test_labels = np.concatenate(
    (
        np.ones_like(y_train[attack_train_size:]),
        np.zeros_like(y_test[attack_test_size:]),
    )
)

# Take the first (fpr, tpr, threshold) entry returned for a targeted
# false-positive rate of 1%.
fpr, tpr, threshold = get_roc_for_fpr(
    attack_proba=bb_mia_test_probs, attack_true=bb_mia_test_labels, targeted_fpr=0.01
)[0]
print(f"{tpr=}: {fpr=}: {threshold=}")

In [None]:
# ROC curve of the black-box attack over all thresholds, drawn against the
# diagonal "no skills" reference line.
fpr, tpr, _ = roc_curve(y_true=bb_mia_test_labels, y_score=bb_mia_test_probs)
plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve")
plt.plot((0, 1), (0, 1), color="navy", lw=2, ls="--", label="No skills")
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Section 3 banner: attack fitted on shadow-model data
print_section(title="3. Shadow Models")

In [None]:
# Load and prepare data.
# The first returned pair is used for the target model, the second as the
# data handed to the shadow models; the last two return values are discarded.
# NOTE(review): test_set=0.75 presumably puts three quarters of the records
# in the second (shadow) pair - confirm against the ART documentation.
(x_target, y_target), (x_shadow, y_shadow), _, _ = load_nursery(test_set=0.75)

In [None]:
# Split the target data in two: the first half trains the target model
# (members), the rest is held out (non-members).
target_train_size = len(x_target) // 2
x_target_train = x_target[:target_train_size]
y_target_train = y_target[:target_train_size]
x_target_test = x_target[target_train_size:]
y_target_test = y_target[target_train_size:]

In [None]:
# Fit the target random forest on its training half and wrap it for ART
model = RandomForestClassifier().fit(x_target_train, y_target_train)
art_classifier = ScikitlearnRandomForestClassifier(model=model)
print("Base model accuracy:", model.score(X=x_target_test, y=y_target_test))

In [None]:
# Build 3 shadow models from the target classifier and generate the shadow
# dataset from the shadow data, with labels one-hot encoded over 4 classes.
shadow_models = ShadowModels(art_classifier, num_shadow_models=3)
shadow_dataset = shadow_models.generate_shadow_dataset(
    x_shadow,
    to_categorical(y_shadow, 4),
)
# The result is a (member, non-member) pair of (x, y, predictions) triples.
member_part, nonmember_part = shadow_dataset
member_x, member_y, member_predictions = member_part
nonmember_x, nonmember_y, nonmember_predictions = nonmember_part

In [None]:
# Score each shadow model's underlying estimator on the target's held-out
# data and print the scores as a list.
shadow_scores = []
for sm in shadow_models.get_shadow_models():
    shadow_scores.append(sm.model.score(x_target_test, y_target_test))
print(shadow_scores)

In [None]:
# Black-box attack fitted only on shadow-model data, then evaluated against
# the real target model's members and non-members.
attack = MembershipInferenceBlackBox(art_classifier, attack_model_type="rf")
attack.fit(
    x=member_x,
    y=member_y,
    test_x=nonmember_x,
    test_y=nonmember_y,
    pred=member_predictions,
    test_pred=nonmember_predictions,
)
member_infer = attack.infer(x=x_target_train, y=y_target_train)
nonmember_infer = attack.infer(x=x_target_test, y=y_target_test)
# Members are counted correct when flagged 1, non-members when flagged 0;
# the overall figure is the size-weighted mean of the two.
member_acc = member_infer.sum() / len(x_target_train)
nonmember_acc = 1 - nonmember_infer.sum() / len(x_target_test)
acc = (len(x_target_train) * member_acc + len(x_target_test) * nonmember_acc) / (
    len(x_target_train) + len(x_target_test)
)
print("Attack Member Acc:", member_acc)
print("Attack Non-Member Acc:", nonmember_acc)
print("Attack Accuracy:", acc)

In [None]:
# Precision / recall of the shadow-model attack
# (ground truth: 1 for target-training records, 0 for held-out records).
print(
    calc_precision_recall(
        predicted=np.concatenate([member_infer, nonmember_infer]),
        actual=np.concatenate(
            [np.ones(len(member_infer)), np.zeros(len(nonmember_infer))]
        ),
    )
)

In [None]:
# Rule-based attack on the same target model, as a baseline for comparison
baseline = MembershipInferenceBlackBoxRuleBased(art_classifier)

bl_inferred_train = baseline.infer(x=x_target_train, y=y_target_train)
bl_inferred_test = baseline.infer(x=x_target_test, y=y_target_test)

# Members are counted correct when flagged 1, non-members when flagged 0;
# the overall figure is the size-weighted mean of the two.
bl_member_acc = bl_inferred_train.sum() / len(bl_inferred_train)
bl_nonmember_acc = 1 - bl_inferred_test.sum() / len(bl_inferred_test)
bl_acc = (
    len(bl_inferred_train) * bl_member_acc + len(bl_inferred_test) * bl_nonmember_acc
) / (len(bl_inferred_train) + len(bl_inferred_test))
# Printed in order: members accuracy, non-members accuracy, overall accuracy
print(bl_member_acc)
print(bl_nonmember_acc)
print("Baseline Accuracy:", bl_acc)

In [None]:
# Precision / recall of the rule-based baseline
# (ground truth: 1 for target-training records, 0 for held-out records).
print(
    calc_precision_recall(
        predicted=np.concatenate([bl_inferred_train, bl_inferred_test]),
        actual=np.concatenate(
            [np.ones(len(bl_inferred_train)), np.zeros(len(bl_inferred_test))]
        ),
    )
)

In [None]:
# Section 4 banner: membership inference against a regression model
print_section(title="4. Regression Models")

In [None]:
# Load data.
# load_diabetes returns a train (features, targets) pair, a test pair, and
# two further values that are discarded here. This rebinds the x_train /
# y_train / x_test / y_test names used in the earlier sections.
# NOTE(review): test_set=0.5 presumably holds out half of the records as the
# test split - confirm against the ART documentation.
(x_train, y_train), (x_test, y_test), _, _ = load_diabetes(test_set=0.5)

In [None]:
# Train a linear regression model (scikit-learn LinearRegression, not an MLP)
# and wrap it in ART's scikit-learn regressor adapter for the attack below.
model = LinearRegression()
model.fit(x_train, y_train)
art_regressor = ScikitlearnRegressor(model)
print("Base model score:", model.score(x_test, y_test))

In [None]:
# Black-box attack on regression model, configured with input_type="loss"
# and attack_model_type="rf".
bb_attack = MembershipInferenceBlackBox(
    art_regressor, attack_model_type="rf", input_type="loss"
)

# The first `attack_train_ratio` share of each split fits the attack; the
# remainder is reserved for evaluating it.
attack_train_ratio = 0.5
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_test_size = int(len(x_test) * attack_train_ratio)

bb_attack.fit(
    x_train[:attack_train_size],
    y_train[:attack_train_size],
    x_test[:attack_test_size],
    y_test[:attack_test_size],
)

# Evaluate only on records the attack was NOT fitted on. Scoring the full
# splits would reuse the attack's own training examples and overstate its
# accuracy; the classification sections hold out the same way.
inferred_train_bb = bb_attack.infer(
    x_train[attack_train_size:].astype(np.float32), y_train[attack_train_size:]
)
inferred_test_bb = bb_attack.infer(
    x_test[attack_test_size:].astype(np.float32), y_test[attack_test_size:]
)

# Members are counted correct when flagged 1, non-members when flagged 0;
# the overall figure is the size-weighted mean of the two.
train_acc_bb = np.sum(inferred_train_bb) / len(inferred_train_bb)
test_acc_bb = 1 - (np.sum(inferred_test_bb) / len(inferred_test_bb))
acc_bb = (
    train_acc_bb * len(inferred_train_bb) + test_acc_bb * len(inferred_test_bb)
) / (len(inferred_train_bb) + len(inferred_test_bb))
print("Member accuracy:", train_acc_bb)
print("Non-Member accuracy:", test_acc_bb)
print("Accuracy:", acc_bb)

In [None]:
# Precision / recall of the regression black-box attack
# (ground truth: 1 for each record in inferred_train_bb, 0 for each in inferred_test_bb).
print(
    calc_precision_recall(
        predicted=np.concatenate([inferred_train_bb, inferred_test_bb]),
        actual=np.concatenate(
            [np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))]
        ),
    )
)