In [1]:
!pip install opacus --no-deps

Collecting opacus
  Downloading opacus-1.5.3-py3-none-any.whl.metadata (8.4 kB)
Downloading opacus-1.5.3-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.7/251.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opacus
Successfully installed opacus-1.5.3


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from opacus import PrivacyEngine
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# 1. 加载并预处理Adult数据集
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
df = pd.read_csv(url, names=column_names, na_values=" ?", skipinitialspace=True)
df.dropna(inplace=True)

# Label encode categorical variables
categorical_cols = df.select_dtypes(include="object").columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# 分离特征与标签
X = df.drop("income", axis=1).values
y = df["income"].values

# 标准化
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 划分训练/测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转为 PyTorch 张量
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# 2. 定义简单的 MLP 模型
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x):
        return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(X_train.shape[1]).to(device)

# 3. 使用 Opacus 设置 DP-SGD
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.0,       # 噪声强度
    max_grad_norm=1.0           # 梯度裁剪
)

# 4. 训练模型
model.train()
for epoch in range(50):
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epsilon = privacy_engine.accountant.get_epsilon(delta=1e-5)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, ε: {epsilon:.2f}")

# 5. 保存模型
torch.save(model.state_dict(), "dp_mlp_adult.pth")
print("模型已保存为 dp_mlp_adult.pth")

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


Epoch 1, Loss: 247.8479, ε: 0.28
Epoch 2, Loss: 208.2311, ε: 0.37
Epoch 3, Loss: 206.3226, ε: 0.45
Epoch 4, Loss: 210.3760, ε: 0.51
Epoch 5, Loss: 209.7249, ε: 0.57
Epoch 6, Loss: 206.5736, ε: 0.63
Epoch 7, Loss: 199.6644, ε: 0.67
Epoch 8, Loss: 207.2675, ε: 0.72
Epoch 9, Loss: 206.5569, ε: 0.76
Epoch 10, Loss: 207.1718, ε: 0.81
Epoch 11, Loss: 208.1316, ε: 0.85
Epoch 12, Loss: 207.5018, ε: 0.88
Epoch 13, Loss: 206.0630, ε: 0.92
Epoch 14, Loss: 213.3688, ε: 0.96
Epoch 15, Loss: 212.4060, ε: 0.99
Epoch 16, Loss: 210.7302, ε: 1.03
Epoch 17, Loss: 210.0463, ε: 1.06
Epoch 18, Loss: 214.9727, ε: 1.09
Epoch 19, Loss: 215.2165, ε: 1.12
Epoch 20, Loss: 218.2425, ε: 1.15
Epoch 21, Loss: 213.2884, ε: 1.18
Epoch 22, Loss: 215.7030, ε: 1.21
Epoch 23, Loss: 217.3048, ε: 1.24
Epoch 24, Loss: 213.5284, ε: 1.27
Epoch 25, Loss: 214.9405, ε: 1.29
Epoch 26, Loss: 216.1917, ε: 1.32
Epoch 27, Loss: 218.8712, ε: 1.35
Epoch 28, Loss: 216.4158, ε: 1.37
Epoch 29, Loss: 217.5578, ε: 1.40
Epoch 30, Loss: 214.104

In [6]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
test_loader = DataLoader(test_dataset, batch_size=64)
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"测试集准确率: {acc:.4f}")

测试集准确率: 0.8512


  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
