以下代码对应 #766 任务：结合 SLModel，在 Yahoo Answers 数据集上实现 Layer-wise 算法。数据集的封装由 datasets.py 中的 load_yahoo 函数完成。运行该代码需要满足以下前置条件：
1. 下载Yahoo Answers数据集至本地。
2. 为原始数据集的四列命名，分别表示问答类型（标签）、问题、问题补充说明与回答，处理完成后保存数据集，例如：
user_data = pd.read_csv(path, header=None, encoding="utf-8") 
user_data.columns = ['type', 'question', 'attention', 'answers'] 

3. 修改 datasets.py 中 load_yahoo 函数里的数据文件绝对路径。
4. 将 datasets.py 置于目录 secretflow/utils/simulation 下，替换原有的 datasets.py。

In [None]:
# Copyright 2022 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Yuanran Song
# E-mail: 809127446@qq.com

In [None]:
import secretflow as sf
import matplotlib.pyplot as plt
from typing import List
import math
import numpy as np
# Shut down any SecretFlow runtime that is still active before initialising a new one.
sf.shutdown()
# Initialise SecretFlow with two parties, using the 'local' address.
sf.init(['alice', 'bob'], address='local')
# One PYU device handle per party; these are used as dictionary keys below.
alice = sf.PYU('alice')
bob = sf.PYU('bob')
import pandas as pd
from secretflow.utils.simulation.datasets import dataset


from secretflow.data.split import train_test_split
from secretflow.ml.nn import SLModel
# spu = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))
from secretflow.utils.simulation.datasets import load_yahoo

Yahoo 数据集的读取与预处理：由于 Yahoo 数据集本身没有特征名与标签名，因此为其添加了四个属性（列名）。

In [None]:
from secretflow.data.split import train_test_split
from secretflow.preprocessing.encoder import LabelEncoder
from secretflow.preprocessing.scaler import MinMaxScaler

# Vertically partitioned features (axis=1): alice and bob each receive a column range.
# NOTE(review): the (start, end) pairs are presumably half-open column index
# ranges, i.e. alice -> 'question', bob -> 'attention' and 'answers' -- confirm
# against load_yahoo.
data = load_yahoo(parts={alice: (1, 2), bob: (2, 4)}, axis=1)
# Alice holds the label.
label = load_yahoo(parts={alice: (0, 1)}, axis=1)

# Encode every text column, and the label, with the same encoder instance;
# fit_transform is called once per column.
encoder = LabelEncoder()
for column in ('question', 'attention', 'answers'):
    data[column] = encoder.fit_transform(data[column])
label = encoder.fit_transform(label)
print(f"label= {type(label)},\ndata = {type(data)}")

# Rescale the encoded features.
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

# Features and labels are split separately but with the same train_size and
# random_state, so both calls receive identical split parameters.
random_state = 1234
train_data, test_data = train_test_split(
    data, train_size=0.8, random_state=random_state
)
train_label, test_label = train_test_split(
    label, train_size=0.8, random_state=random_state
)

创建拆分学习双方模型

In [None]:
def create_base_model(input_dim, output_dim,  name='base_model'):
    """Return a zero-argument builder for a party's base Keras model.

    The built model is a Sequential stack: an input of shape ``input_dim``,
    a 100-unit ReLU Dense layer, and an ``output_dim``-unit ReLU Dense layer.
    It is compiled with binary cross-entropy, the Adam optimizer, and
    accuracy/AUC metrics.

    Args:
        input_dim: Shape passed to ``keras.Input``.
        output_dim: Number of units in the final Dense layer.
        name: Accepted for interface compatibility; not used by the builder.

    Returns:
        A callable that builds, summarises, compiles and returns the model.
    """
    def create_model():
        # Imports stay inside the builder (presumably so the builder can run
        # in a process that has not imported TensorFlow yet -- TODO confirm).
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers

        stack = [
            keras.Input(shape=input_dim),
            layers.Dense(100, activation="relu"),
            layers.Dense(output_dim, activation="relu"),
        ]
        model = keras.Sequential(stack)
        model.summary()

        tracked_metrics = ["accuracy", tf.keras.metrics.AUC()]
        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=tracked_metrics,
        )
        return model

    return create_model
# Output width of each base model; also used as the per-party input width of
# the fuse model.
hidden_size = 64

# Read each party's feature count from the partition shapes, so the network
# input adapts automatically when the input data changes.
partition_shapes = train_data.values.partition_shape()
alice_input_feature_num = partition_shapes[alice][1]
bob_input_feature_num = partition_shapes[bob][1]

model_base_alice = create_base_model(alice_input_feature_num, hidden_size)
model_base_bob = create_base_model(bob_input_feature_num, hidden_size)

# Build each base model once here; the builders print the model summary.
model_base_alice()
model_base_bob()


def create_fuse_model(input_dim, output_dim, party_nums, name='fuse_model'):
    """Return a zero-argument builder for the fuse Keras model.

    The built model takes ``party_nums`` inputs of shape ``input_dim``,
    concatenates them, applies a 64-unit ReLU Dense layer and ends with an
    ``output_dim``-unit sigmoid Dense layer. It is compiled with binary
    cross-entropy, the Adam optimizer, and accuracy/AUC metrics.

    Args:
        input_dim: Shape passed to each ``keras.Input``.
        output_dim: Number of units in the output layer.
        party_nums: Number of input tensors (one per party).
        name: Accepted for interface compatibility; not used by the builder.

    Returns:
        A callable that builds, summarises, compiles and returns the model.
    """
    def create_model():
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers

        # One input tensor per participating party.
        party_inputs = [keras.Input(input_dim) for _ in range(party_nums)]

        joined = layers.concatenate(party_inputs)
        hidden = layers.Dense(64, activation='relu')(joined)
        prediction = layers.Dense(output_dim, activation='sigmoid')(hidden)

        model = keras.Model(inputs=party_inputs, outputs=prediction)
        model.summary()

        tracked_metrics = ["accuracy", tf.keras.metrics.AUC()]
        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=tracked_metrics,
        )
        return model

    return create_model
# Fuse-model builder: two parties, each contributing a hidden_size-wide input,
# and a single output unit.
model_fuse = create_fuse_model(hidden_size, output_dim=1, party_nums=2)
# Build it once here; the builder prints the model summary.
model_fuse()



接下来定义 lldp_strategy，用它可以对 base_model 与 fuse_model 逐层添加差分隐私噪声。
与隐语内置 dp_strategy 中的 embedding_dp（embedding 层加噪）和 label_dp（标签加噪）相比，使用 LLDP 策略的全局模型具有更快的收敛速度和更高的预测准确性，这些改进来自 LLDP 算法对隐私预算的层次化分配。
实验结果附于代码末尾。以下代码给出对 fuse_model 加噪的例子；将 base_model 传入该函数，同样可以对本地模型加噪，用户可根据隐私保护需求自由选择。

In [None]:
def create_lldp_strategy(model, epsilon=None, delta=None):
    """Wrap a model builder so each model it builds carries layer-wise noise.

    The returned builder calls ``model()`` and adds zero-mean Gaussian noise
    to every weight array of the freshly built model. The i-th weight array
    uses ``sigma_i = sqrt(2 * ln(1.25 / delta)) / epsilon[i]``. The noised
    weights are written back with ``set_weights`` and that same model is
    returned.

    The previous implementation noised the weights of one throwaway model,
    set them on a second throwaway model, and returned the original builder,
    so no model used afterwards was ever noised.

    Noise is drawn anew on every call of the returned builder, and it is added
    once to the built model's current weights only.

    Args:
        model: Zero-argument callable returning an object that exposes
            ``get_weights()`` and ``set_weights()`` (e.g. a Keras model).
        epsilon: Per-weight-array privacy budgets, in ``get_weights()`` order.
            Defaults to ``[80, 80, 40, 40, 30, 30]``, i.e. up to six weight
            arrays (presumably kernel + bias for three layers).
        delta: Delta of the Gaussian mechanism. Defaults to ``exp(-3)``.

    Returns:
        A zero-argument builder with the same calling convention as ``model``.

    Raises:
        ValueError: If an epsilon is not positive or delta is not in (0, 1)
            (raised immediately), or if the built model has more weight arrays
            than there are epsilon values (raised when the builder is called).
    """
    budgets = [80, 80, 40, 40, 30, 30] if epsilon is None else list(epsilon)
    if any(budget <= 0 for budget in budgets):
        raise ValueError("every epsilon must be positive")

    if delta is None:
        delta = math.exp(-3)
    if not 0 < delta < 1:
        raise ValueError("delta must be in the open interval (0, 1)")

    # The numerator depends only on delta, so compute it once.
    scale = math.sqrt(2 * math.log(1.25 / delta))
    sigmas = [scale / budget for budget in budgets]

    def create_noised_model():
        # Local import keeps the builder self-contained, like the other
        # model builders in this file.
        import numpy as np

        built = model()
        weights = built.get_weights()
        if len(weights) > len(sigmas):
            raise ValueError(
                f"model has {len(weights)} weight arrays but only "
                f"{len(sigmas)} epsilon values were provided"
            )

        # zip stops at the shorter sequence, so surplus budgets are ignored.
        noised = [
            weight + np.random.normal(0, sigma, weight.shape)
            for weight, sigma in zip(weights, sigmas)
        ]
        built.set_weights(noised)
        return built

    return create_noised_model
# Pass the fuse-model builder through the LLDP strategy, then build it once here.
model_fuse = create_lldp_strategy(model_fuse)
model_fuse()

# Each party's base-model builder, keyed by its PYU device.
base_model_dict = {alice: model_base_alice, bob: model_base_bob}

train_batch_size = 128

# alice is passed as device_y because she holds the label.
sl_model = SLModel(
    base_model_dict=base_model_dict,
    device_y=alice,
    model_fuse=model_fuse,
)

# Reveal alice's partitions of the test and train sets.
# NOTE(review): the revealed values are discarded here -- presumably kept for
# inspection when run as a notebook cell; confirm they are still needed.
for frame in (test_data, test_label, train_data, train_label):
    sf.reveal(frame.partitions[alice].data)

# Train for 10 epochs, validating on the held-out split after every epoch.
history = sl_model.fit(
    train_data,
    train_label,
    validation_data=(test_data, test_label),
    epochs=10,
    batch_size=train_batch_size,
    shuffle=True,
    verbose=1,
    validation_freq=1,
)
print(history)
print(history.keys())

# Evaluate on the test split with the same batch size (128).
global_metric = sl_model.evaluate(
    test_data, test_label, batch_size=train_batch_size
)
print(global_metric)