# Split Learning and Label Leakage

In [None]:
import secretflow as sf
import matplotlib.pyplot as plt
import os

# Set CUDA_VISIBLE_DEVICES to "-1" before SecretFlow is initialised
# (presumably to hide GPUs so everything runs on CPU — confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Initialise SecretFlow with two parties, "client" and "server", at address "local".
sf.init(["client", "server"], address="local")
# PYU device handles for the two parties; used below as keys for data
# partitions, base models and DP strategies.
client, server = sf.PYU("client"), sf.PYU("server")

In [None]:
import sys

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

from attack.labelleakage import NormAttackSplitNNManager
from collaborative.splitnn import SplitNNAPI, SplitNNClient
from utils.utils import NumpyDataset

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf


def zero_activation(x):
    """Return ``tf.zeros_like(x)``: an all-zero tensor shaped and typed like ``x``."""
    zeros = tf.zeros_like(x)
    return zeros

In [None]:
def create_first_net(input_dim, hidden_dim, name="first_net"):
    """Return a zero-argument builder for the two-layer base network.

    The model built is ``Input(input_dim) -> Dense(hidden_dim // 2, relu)
    -> Dense(hidden_dim, relu)``, compiled with Adam (learning rate 1e-3),
    binary cross-entropy loss and accuracy/AUC metrics.

    Args:
        input_dim: Value passed as ``shape`` to ``keras.Input``.
        hidden_dim: Units of the last Dense layer; the first Dense layer
            uses ``hidden_dim // 2`` units.
        name: Accepted but not used by this function.

    Returns:
        A callable taking no arguments that builds the model, prints its
        summary, compiles it and returns it.
    """

    def create_model():
        # TensorFlow is imported inside the builder so the closure carries
        # its own imports (presumably because it runs on a party's device
        # rather than in this process — confirm).
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers

        half_width = hidden_dim // 2
        stack = [
            keras.Input(shape=input_dim),
            layers.Dense(half_width, activation="relu"),
            layers.Dense(hidden_dim, activation="relu"),
        ]
        model = keras.Sequential(stack)
        model.summary()

        model.compile(
            loss="binary_crossentropy",
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            metrics=["accuracy", tf.keras.metrics.AUC()],
        )
        return model

    return create_model


def create_zero_net(input_dim, hidden_dim, name="zero_net"):
    """Return a zero-argument builder for the single-layer base network.

    The model built is ``Input(input_dim) -> Dense(hidden_dim, relu)``,
    compiled with Adam (learning rate 1e-3), binary cross-entropy loss and
    accuracy/AUC metrics.

    Args:
        input_dim: Value passed as ``shape`` to ``keras.Input``.
        hidden_dim: Units of the single Dense layer.
        name: Accepted but not used by this function.

    Returns:
        A callable taking no arguments that builds the model, prints its
        summary, compiles it and returns it.
    """

    def create_model():
        # TensorFlow is imported inside the builder so the closure carries
        # its own imports (presumably because it runs on a party's device
        # rather than in this process — confirm).
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers

        stack = [
            keras.Input(shape=input_dim),
            layers.Dense(hidden_dim, activation="relu"),
        ]
        model = keras.Sequential(stack)
        model.summary()

        model.compile(
            loss="binary_crossentropy",
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            metrics=["accuracy", tf.keras.metrics.AUC()],
        )
        return model

    return create_model

In [None]:
def create_fuse_model(
    input_dim_1, input_dim_2, output_dim, party_nums, name="fuse_model"
):
    """Return a zero-argument builder for the two-input fuse model.

    The model built takes two inputs (shapes ``input_dim_1`` and
    ``input_dim_2``), concatenates them and applies one sigmoid Dense layer
    with ``output_dim`` units. It is compiled with Adam (learning rate
    1e-3), binary cross-entropy loss and accuracy/AUC metrics.

    Args:
        input_dim_1: Shape argument of the first ``keras.Input``.
        input_dim_2: Shape argument of the second ``keras.Input``.
        output_dim: Units of the sigmoid output layer.
        party_nums: Accepted but not used; exactly two inputs are created.
        name: Accepted but not used by this function.

    Returns:
        A callable taking no arguments that builds the model, prints its
        summary, compiles it and returns it.
    """

    def create_model():
        # TensorFlow is imported inside the builder so the closure carries
        # its own imports (presumably because it runs on a party's device
        # rather than in this process — confirm).
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers

        # One input tensor per base-model output, in a fixed order.
        fused_inputs = [keras.Input(width) for width in (input_dim_1, input_dim_2)]

        merged = layers.concatenate(fused_inputs)
        prediction = layers.Dense(output_dim, activation="sigmoid")(merged)

        model = keras.Model(inputs=fused_inputs, outputs=prediction)
        model.summary()

        model.compile(
            loss="binary_crossentropy",
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            metrics=["accuracy", tf.keras.metrics.AUC()],
        )
        return model

    return create_model

## Parameters and Pre-processing

In [None]:
import pandas as pd
import numpy as np
from secretflow.utils.simulation.datasets import dataset

# Download the credit-card dataset into a pandas DataFrame.
raw_df = pd.read_csv(
    "https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv"
)

# Separate the rows by the value of the `Class` column.
raw_df_neg = raw_df.loc[raw_df["Class"] == 0]
raw_df_pos = raw_df.loc[raw_df["Class"] == 1]

# Down-sampling of the negatives is disabled, so every negative row is kept;
# the positives are appended after them.
down_df_neg = raw_df_neg
down_df = pd.concat([down_df_neg, raw_df_pos])

# Class counts and the share of positive rows.
neg, pos = np.bincount(down_df["Class"])
total = neg + pos
print(
    f"Examples:\n    Total: {total}\n    Positive: {pos} ({100 * pos / total:.2f}% of total)\n"
)

In [None]:
# Work on a copy so `down_df` itself is not modified by the pops below.
cleaned_df = down_df.copy()
# You don't want the `Time` column.
cleaned_df.pop("Time")
# The `Amount` column covers a huge range. Convert to log-space.
# `eps` is added before taking the log so a zero amount does not give log(0).
eps = 0.001  # 0 => 0.1¢
# `Amount` is removed and replaced by its log-transformed version.
# NOTE(review): the new column is spelled "Log Ammount"; later cells select it
# by exclusion rather than by name, so the spelling is left as is.
cleaned_df["Log Ammount"] = np.log(cleaned_df.pop("Amount") + eps)

In [None]:
# Columns held by the client: everything except the label and V1-V4.
client_data_index = [
    col for col in cleaned_df.columns if col not in ("Class", "V1", "V2", "V3", "V4")
]
client_data = cleaned_df.loc[:, client_data_index]
client_data

In [None]:
# Columns held by the server: features V1-V4 followed by the `Class` label.
server_data = cleaned_df[["V1", "V2", "V3", "V4", "Class"]]
server_data

In [None]:
# Place the client columns first and the server columns (label last) after
# them, side by side.
df = pd.concat([client_data, server_data], axis=1)
# Keep only the last 284160 rows.
# NOTE(review): the reason for this exact count is not stated — presumably it
# makes the 80% training split divide evenly into batches of 1024; confirm.
df = df[-284160:]
df

In [None]:
from secretflow.data.split import train_test_split
from sfl.ml.nn import SLModel

# Create an SPU device from the testing cluster definition for both parties.
# NOTE(review): `spu` is not referenced again in the cells shown here.
spu = sf.SPU(sf.utils.testing.cluster_def(["client", "server"]))

In [None]:
from secretflow.utils.simulation.data.dataframe import create_df

# Split `df` column-wise (axis=1) between the parties without shuffling.
# The tuples are index ranges along that axis — presumably half-open, so the
# client receives columns 0-24 and the server columns 25-28; confirm against
# the create_df documentation.
data = create_df(
    source=df,
    parts={client: (0, 25), server: (25, 29)},
    axis=1,
    shuffle=False,
)
# The label is the range (29, 30), held by the server only. `df` was built
# with `Class` as its last column.
label = create_df(
    source=df,
    parts={server: (29, 30)},
    axis=1,
    shuffle=False,
)

In [None]:
# Show the Python types of the partitioned label and feature objects.
print(f"label= {type(label)},\ndata = {type(data)}")

In [None]:
# Verify that the VDataFrame was stored correctly
# print(data.to_csv({server: "server"}))
# print(data.to_csv({client: "client"}))

In [None]:
from secretflow.preprocessing.scaler import MinMaxScaler, StandardScaler

# Fit SecretFlow's StandardScaler on the partitioned features and replace
# `data` with the transformed result.
scaler = StandardScaler()
data = scaler.fit_transform(data)
# data = data.clip(-5,5)  # commented out for now because sf does not implement clip

In [None]:
# Features and labels are split in two separate calls that share the same
# train_size and random_state (presumably so both calls select the same rows
# and features stay aligned with labels — confirm).
random_state = 1234
train_data, test_data = train_test_split(
    data, train_size=0.8, random_state=random_state
)
train_label, test_label = train_test_split(
    label, train_size=0.8, random_state=random_state
)

In [None]:
# Display the shape of the training labels.
train_label.shape

## Split Learning

### 实例化模型

In [None]:
# Output widths of the two base models; the fuse model below takes these as
# its two input dimensions.
hidden_dim_1 = 28
hidden_dim_2 = 4
# Create FirstNet model
# (input_dim=25 matches the client's column range (0, 25) in `create_df`.)
client_builder = create_first_net(input_dim=25, hidden_dim=hidden_dim_1)
# client_model = first_net_builder()

# Create ZeroNet model
# (input_dim=4 matches the server's column range (25, 29) in `create_df`.)
server_zero_builder = create_zero_net(input_dim=4, hidden_dim=hidden_dim_2)
# server_zero_model = zero_net_builder()

In [None]:
# Builder for the fuse model: its two inputs have the widths of the two base
# models' outputs, and it produces a single sigmoid output.
fuse_builder = create_fuse_model(
    input_dim_1=hidden_dim_1, input_dim_2=hidden_dim_2, party_nums=2, output_dim=1
)
# fuse_net_model = fuse_net_builder()

In [None]:
# Map each party's device to the builder of its base model.
base_model_dict = {client: client_builder, server: server_zero_builder}

### 增加DP模块

In [None]:
from sfl.security.privacy import DPStrategy, LabelDP
from sfl.security.privacy.mechanism.tensorflow import GaussianEmbeddingDP

# Define DP operations
train_batch_size = 1024
# Gaussian noise on embeddings; `num_samples` is the row count of the server's
# partition of the training features.
gaussian_embedding_dp = GaussianEmbeddingDP(
    noise_multiplier=0.5,
    l2_norm_clip=1.0,
    batch_size=train_batch_size,
    num_samples=train_data.values.partition_shape()[server][0],
    is_secure_generator=False,
)
label_dp = LabelDP(eps=64.0)
# The server gets the label-DP strategy, the client the embedding-DP strategy.
dp_strategy_server = DPStrategy(label_dp=label_dp)
dp_strategy_client = DPStrategy(embedding_dp=gaussian_embedding_dp)
# NOTE(review): `dp_strategy_dict` is not passed to either model constructor
# in the cells shown here; only `dp_spent_step_freq` is forwarded to `fit`.
# Confirm whether training is meant to run without DP.
dp_strategy_dict = {client: dp_strategy_client, server: dp_strategy_server}
dp_spent_step_freq = 10

In [None]:
# Plain split-learning model: per-party base models, with the fuse model and
# the label on the server.
# NOTE(review): `sl_model` is not used again in the cells shown here; training
# below goes through `normattacksplitnn` instead.
sl_model = SLModel(
    base_model_dict=base_model_dict, device_y=server, model_fuse=fuse_builder
)

### 训练SplitNN

In [None]:
from attack.labelleakage import NormAttackSplitNNManager_sf

# `manager.attach(SLModel)` returns a class that is instantiated below with
# the same arguments as the plain SLModel (presumably SLModel extended with
# the norm-attack methods such as `attack_grad` — confirm in
# attack.labelleakage).
manager = NormAttackSplitNNManager_sf(device="cpu")
NormAttackSplitNNAPI = manager.attach(SLModel)
normattacksplitnn = NormAttackSplitNNAPI(
    base_model_dict=base_model_dict, device_y=server, model_fuse=fuse_builder
)

In [None]:
# Train for 10 epochs with batches of 1024, validating on the held-out split
# after every epoch. `history` is indexed by metric name in the plots below.
train_batch_size = 1024
epochs = 10
history = normattacksplitnn.fit(
    train_data,
    train_label,
    validation_data=(test_data, test_label),
    epochs=epochs,
    batch_size=train_batch_size,
    shuffle=True,
    verbose=1,
    validation_freq=1,
    dp_spent_step_freq=dp_spent_step_freq,
)

### 评估SplitNN

In [None]:
# Training and validation loss over the epochs.
plt.plot(history["train_loss"], label="Train")
plt.plot(history["val_loss"], label="Val")
plt.title("Model loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Training and validation accuracy over the epochs.
plt.plot(history["train_accuracy"], label="Train")
plt.plot(history["val_accuracy"], label="Val")
plt.title("Model accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(loc="upper left")
plt.show()

In [None]:
# Training and validation Area Under Curve (AUC) over the epochs.
# The "auc_1" suffix is the name Keras gave the AUC metric instance.
plt.plot(history["train_auc_1"], label="Train")
plt.plot(history["val_auc_1"], label="Val")
plt.title("Model Area Under Curve")
plt.xlabel("Epoch")
plt.ylabel("Area Under Curve")
plt.legend(loc="upper left")
plt.show()

## Norm Attack实现

In [None]:
# Run the norm attack over the training data; the result is concatenated
# below and used as per-sample attack scores.
# NOTE(review): `shuffle=True` is passed here, while the ground-truth labels
# are later read from `train_label` in stored order. If `attack_grad` really
# shuffles the samples, scores and labels would be misaligned — confirm in
# attack.labelleakage.
g_norms = normattacksplitnn.attack_grad(
    train_data,
    train_label,
    validation_data=(test_data, test_label),
    epochs=epochs,
    batch_size=train_batch_size,
    shuffle=True,
    verbose=1,
    validation_freq=1,
)
# dp_spent_step_freq=dp_spent_step_freq,)

### Norm Attack评估

In [None]:
import jax.numpy as jnp

# jnp.sum(g_norms,axis=1)
# Join the sequence returned by `attack_grad` into a single array of scores.
normattack_pred = jnp.concatenate(g_norms)
normattack_pred.shape

In [None]:
import jax

# Reveal the server's partition of the training labels and flatten it to a
# one-dimensional array of ground-truth labels.
ground_label = sf.reveal(train_label.values.partitions[server])
ground_label = ground_label.flatten()
ground_label.shape

In [None]:
# Convert the label and score arrays to plain Python lists
y_true_numpy = ground_label.tolist()
y_pred_numpy = normattack_pred.tolist()

# Compute ROC-AUC of the attack scores against the true labels
roc_auc = roc_auc_score(y_true_numpy, y_pred_numpy)

In [None]:
# Display the ROC-AUC of the norm attack.
roc_auc