In [2]:
import pandas as pd

origin_data_path = (
    "/root/develop/Open_Source/ant-sf/secretflow/oscp/DCN/data/criteo_train_small.csv"
)
origin_df = pd.read_csv(origin_data_path)

In [4]:
columns = origin_df.columns
train_alice = origin_df.sample(frac=0.1, axis='columns')
train_bob = origin_df.drop(columns=train_alice.columns)
if 'label' not in train_alice.columns:
    train_alice['label'] = origin_df['label']
    train_bob = train_bob.drop(columns=['label'])

In [5]:
train_alice.to_csv(
    "/root/develop/Open_Source/ant-sf/secretflow/oscp/DCN/dcn_split/train_alice.csv",
    index=False,
    sep="|",
    encoding='utf-8',
)
train_bob.to_csv(
    "/root/develop/Open_Source/ant-sf/secretflow/oscp/DCN/dcn_split/train_bob.csv",
    index=False,
    sep="|",
    encoding='utf-8',
)

In [6]:
train_alice.columns

Index(['C21', 'C35', 'I13', 'I1', 'C33', 'C30', 'C38', 'I5', 'C37', 'I12',
       'C19', 'I11', 'I9', 'I8', 'I10', 'C24', 'C27', 'C34', 'label', 'C16'],
      dtype='object')

In [7]:
train_bob.columns

Index(['I2', 'I3', 'I4', 'I6', 'I7', 'C14', 'C15', 'C17', 'C18', 'C20', 'C22',
       'C23', 'C25', 'C26', 'C28', 'C29', 'C31', 'C32', 'C36', 'C39'],
      dtype='object')

In [20]:
origin_val_data_path = "/root/develop/secretflowdev/OSCP/DCN/data/criteo_val_small.csv"
origin_val_df = pd.read_csv(origin_val_data_path)

In [21]:
val_alice = origin_val_df[origin_val_df.columns.intersection(train_alice.columns)]
val_bob = origin_val_df[origin_val_df.columns.intersection(train_bob.columns)]
val_alice.to_csv("/root/develop/secretflowdev/OSCP/DCN/dcn_split/val_alice.csv", index=False, sep="|", encoding='utf-8')
val_bob.to_csv("/root/develop/secretflowdev/OSCP/DCN/dcn_split/val_bob.csv", index=False, sep="|", encoding='utf-8')


In [22]:
val_alice.columns

Index(['I2', 'I5', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'C17', 'C19', 'C24',
       'C25', 'C28', 'C31', 'C32', 'C33', 'C35', 'C36', 'C37', 'C39'],
      dtype='object')

构造dataloader

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from dataset import CriteoDataset

class AliceDataset(Dataset):
    def __init__(self, df, label_df, vocab_dir):
        self.df = df
        self.label_df = label_df

        cat_features = [x for x in self.df.columns if x.startswith('C')]
        num_features = [x for x in self.df.columns if x.startswith('I')]

        self.x_cat = torch.tensor(self.df[cat_features].values, dtype=torch.long) if cat_features else None
        self.x_num = torch.tensor(self.df[num_features].values, dtype=torch.float32) if num_features else None

        self.y = torch.tensor(self.label_df.values, dtype=torch.int)
        
    def __getitem__(self, index):
        # x_cat = torch.tensor([int(self.df['C'])])
        return ((self.x_cat[index], self.x_num[index]), self.y[index])
    
    def __len__(self):
        return len(self.y)

class BobDataset(Dataset):
    def __init__(self, df, vocab_dir):
        self.df = df

        cat_features = [x for x in self.df.columns if x.startswith('C')]
        num_features = [x for x in self.df.columns if x.startswith('I')]

        self.x_cat = torch.tensor(self.df[cat_features].values, dtype=torch.long) if cat_features else None
        self.x_num = torch.tensor(self.df[num_features].values, dtype=torch.float32) if num_features else None

        
    def __getitem__(self, index):
        # x_cat = torch.tensor([int(self.df['C'])])
        return (self.x_cat[index], self.x_num[index])
    
    def __len__(self):
        return len(self.df)




In [13]:
gen_data_path = r'/root/develop/secretflowdev/OSCP/DCN/dcn_split'


def create_dataset_builder_alice(batch_size=32):
    def dataset_builder(x):
        data_set = AliceDataset(x[0], x[1], gen_data_path)
        dataloader = DataLoader(
            dataset=data_set,
            batch_size=batch_size,
        )
        return dataloader

    return dataset_builder

In [14]:
def create_dataset_builder_bob(batch_size=32):
    def dataset_builder(x):
        data_set = BobDataset(x[0], gen_data_path)
        dataloader = DataLoader(
            dataset=data_set,
            batch_size=batch_size,
        )
        return dataloader

    return dataset_builder

In [15]:
import secretflow as sf

# Check the version of your SecretFlow
print('The version of SecretFlow: {}'.format(sf.__version__))

# In case you have a running secretflow runtime already.
sf.shutdown()
sf.init(['alice', 'bob'], address="local", log_to_driver=False)
alice, bob = sf.PYU('alice'), sf.PYU('bob')


batch_size = 8
dataset_buidler_dict = {
    alice: create_dataset_builder_alice(batch_size=batch_size),
    bob: create_dataset_builder_bob(
        batch_size=batch_size,
    ),
}

The version of SecretFlow: 1.3.0.dev20231120


2024-01-05 09:56:02,940	INFO worker.py:1538 -- Started a local Ray instance.


构建模型

In [16]:
from sl_dcn_torch import DCNBase, DCNFuse

train_alice = pd.read_csv("/root/develop/secretflowdev/OSCP/DCN/dcn_split/train_alice.csv", sep="|")
train_bob = pd.read_csv("/root/develop/secretflowdev/OSCP/DCN/dcn_split/train_bob.csv", sep="|")
cat_features_alice = [x for x in train_alice.columns if x.startswith('C')]
num_features_alice = [x for x in train_alice.columns if x.startswith('I')]
cat_features_bob = [x for x in train_bob.columns if x.startswith('C')]
num_features_bob = [x for x in train_bob.columns if x.startswith('I')]



In [17]:
d_numerical_alice = train_alice[num_features_alice].values.shape[1]
categorical_alice = [train_alice[col].max()+1 for col in cat_features_alice]

d_numerical_bob = train_bob[num_features_bob].values.shape[1]
categorical_bob = [train_bob[col].max()+1 for col in cat_features_bob]

In [19]:
d_numerical_alice, categorical_alice, d_numerical_bob, categorical_bob

(8,
 [30956, 16, 1855, 41283, 268488, 4, 16, 15, 4731, 3881, 120965, 8934],
 5,
 [1261,
  531,
  321439,
  267,
  10863,
  563,
  3,
  30792,
  3068,
  26,
  205924,
  10,
  240748,
  70])

In [20]:
def create_base_model_alice():
    model = DCNBase(
        d_numerical=d_numerical_alice,
        categories=categorical_alice,
        d_embed_max=8,
        n_cross=2,
        mlp_layers=[128, 64, 32],
        mlp_dropout=0.25
    )
    
    return model
    

In [21]:
def create_base_model_bob():
    model = DCNBase(
        d_numerical=d_numerical_bob,
        categories=categorical_bob,
        d_embed_max=8,
        n_cross=2,
        mlp_layers=[128, 64, 32],
        mlp_dropout=0.25
    )
    
    return model

In [22]:
def create_fuse_model():
    model = DCNFuse(
        n_classes= 1,
        deep_dim_out=9, 
        cross_dim_out= 32
    )
    
    return model

In [23]:
from torch import nn, optim
from torchmetrics import AUROC, Accuracy, Precision
from secretflow.ml.nn.fl.utils import metric_wrapper, optim_wrapper
from secretflow.ml.nn.utils import TorchModel
from secretflow.ml.nn import SLModel

In [24]:
loss_fn = nn.BCEWithLogitsLoss
optim_fn = optim_wrapper(optim.Adam, lr=0.002, weight_decay=0.001)

In [26]:
base_model_alice = TorchModel(
    model_fn=create_base_model_alice(),
    loss_fn=loss_fn,
    optim_fn=optim_fn,
    metrics=[
        metric_wrapper(Accuracy, task="binary", num_classes=2, average='micro'),
        metric_wrapper(AUROC, task="binary"),
        metric_wrapper(Precision, task="binary", num_classes=2, average='micro'),
    ]
)

base_model_bob = TorchModel(
    model_fn=create_base_model_bob(),
    loss_fn=loss_fn,
    optim_fn=optim_fn,
    metrics=[
        metric_wrapper(Accuracy, task="binary", num_classes=2, average='micro'),
        metric_wrapper(AUROC, task="binary"),
        metric_wrapper(Precision, task="binary", num_classes=2, average='micro'),
    ]
)

In [28]:
fuse_model = TorchModel(
    model_fn=create_fuse_model(),
    loss_fn=loss_fn,
    optim_fn=optim_fn,
    metrics=[
        metric_wrapper(Accuracy, task="binary", num_classes=1, average='micro'),
        metric_wrapper(Precision, task="binary", num_classes=1, average='micro'),
        metric_wrapper(AUROC, task="binary", num_classes=1),
    ],
)

AttributeError: cannot assign module before Module.__init__() call