In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

# 模型定义
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x):
        return self.net(x)

In [3]:
# 加载 Adult 数据集
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
df = pd.read_csv(url, names=column_names, na_values=" ?", skipinitialspace=True)
df.dropna(inplace=True)

# Label encode 所有分类变量
categorical_cols = df.select_dtypes(include="object").columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# 划分特征与标签
X = df.drop("income", axis=1).values
y = df["income"].values

# 标准化特征
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 划分训练/测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转换为 PyTorch 张量
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLP(input_dim=X.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 训练过程
for epoch in range(50):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 155.9559
Epoch 2, Loss: 134.7725
Epoch 3, Loss: 133.0155
Epoch 4, Loss: 131.9643
Epoch 5, Loss: 131.1339
Epoch 6, Loss: 131.1067
Epoch 7, Loss: 130.1360
Epoch 8, Loss: 129.1174
Epoch 9, Loss: 128.6521
Epoch 10, Loss: 128.1481
Epoch 11, Loss: 127.5576
Epoch 12, Loss: 126.9869
Epoch 13, Loss: 126.5614
Epoch 14, Loss: 126.1991
Epoch 15, Loss: 125.5653
Epoch 16, Loss: 125.3739
Epoch 17, Loss: 124.8498
Epoch 18, Loss: 124.3676
Epoch 19, Loss: 123.9509
Epoch 20, Loss: 123.8035
Epoch 21, Loss: 123.6361
Epoch 22, Loss: 122.7482
Epoch 23, Loss: 122.6213
Epoch 24, Loss: 122.1687
Epoch 25, Loss: 122.1375
Epoch 26, Loss: 121.4983
Epoch 27, Loss: 121.2784
Epoch 28, Loss: 120.9124
Epoch 29, Loss: 120.7904
Epoch 30, Loss: 120.7200
Epoch 31, Loss: 120.2634
Epoch 32, Loss: 119.9015
Epoch 33, Loss: 119.7356
Epoch 34, Loss: 119.4158
Epoch 35, Loss: 119.0794
Epoch 36, Loss: 118.8973
Epoch 37, Loss: 118.5654
Epoch 38, Loss: 118.4348
Epoch 39, Loss: 118.0890
Epoch 40, Loss: 118.2465
Epoch 41,

In [5]:
torch.save(model.state_dict(), "non_dp_mlp_adult.pth")
print("已保存非DP模型：non_dp_mlp_adult.pth")

已保存非DP模型：non_dp_mlp_adult.pth


In [6]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"测试集准确率: {acc:.4f}")

测试集准确率: 0.8529
