In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.20.1-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.38.8-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.38.8-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.16.0 (from sdv)
  Downloading rdt-1.16.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.20.1 (from sdv)
  Downloading sdmetrics-0.20.1-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.13.0,>=0.12.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3t

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

# 模型定义
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x):
        return self.net(x)

In [3]:
# 加载 Adult 数据集
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
df = pd.read_csv(url, names=column_names, na_values=" ?", skipinitialspace=True)
df.dropna(inplace=True)

from sdv.metadata import SingleTableMetadata

# create Metadata
metadata = SingleTableMetadata()

metadata.visualize()

# detect the metadata form the dataframe
metadata.detect_from_dataframe(df)

# initialize a synthetic data generator using the metadata
# from sdv.single_table import GaussianCopulaSynthesizer

# synthesizer = GaussianCopulaSynthesizer(metadata)
from sdv.single_table import CTGANSynthesizer
synthesizer = CTGANSynthesizer(metadata)

# train the synthesier to fit the data
synthesizer.fit(
    data=df
)

synthetic_data = synthesizer.sample(
    num_rows=df.shape[0]
)


# Label encode 所有分类变量
categorical_cols = synthetic_data.select_dtypes(include="object").columns
for col in categorical_cols:
    synthetic_data[col] = LabelEncoder().fit_transform(synthetic_data[col])

# 划分特征与标签
X = synthetic_data.drop("income", axis=1).values
y = synthetic_data["income"].values

# 标准化特征
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 划分训练/测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转换为 PyTorch 张量
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)



In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLP(input_dim=X.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 训练过程
for epoch in range(50):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 151.1878
Epoch 2, Loss: 128.0143
Epoch 3, Loss: 125.8620
Epoch 4, Loss: 125.0848
Epoch 5, Loss: 124.3182
Epoch 6, Loss: 123.7202
Epoch 7, Loss: 123.4643
Epoch 8, Loss: 122.7176
Epoch 9, Loss: 122.3516
Epoch 10, Loss: 121.4918
Epoch 11, Loss: 121.3454
Epoch 12, Loss: 120.7582
Epoch 13, Loss: 120.4347
Epoch 14, Loss: 120.1528
Epoch 15, Loss: 119.6856
Epoch 16, Loss: 119.4217
Epoch 17, Loss: 119.2653
Epoch 18, Loss: 119.0662
Epoch 19, Loss: 118.4365
Epoch 20, Loss: 118.1289
Epoch 21, Loss: 117.9664
Epoch 22, Loss: 117.8572
Epoch 23, Loss: 117.6244
Epoch 24, Loss: 117.4727
Epoch 25, Loss: 116.6937
Epoch 26, Loss: 116.6972
Epoch 27, Loss: 116.6163
Epoch 28, Loss: 116.3136
Epoch 29, Loss: 116.0305
Epoch 30, Loss: 115.5602
Epoch 31, Loss: 115.3368
Epoch 32, Loss: 115.3619
Epoch 33, Loss: 115.2926
Epoch 34, Loss: 115.0689
Epoch 35, Loss: 114.5750
Epoch 36, Loss: 114.5807
Epoch 37, Loss: 114.3585
Epoch 38, Loss: 114.5366
Epoch 39, Loss: 113.9037
Epoch 40, Loss: 113.6281
Epoch 41,

In [5]:
torch.save(model.state_dict(), "non_dp_mlp_adult.pth")
print("已保存非DP模型：non_dp_mlp_adult.pth")

已保存非DP模型：non_dp_mlp_adult.pth


In [5]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"测试集准确率: {acc:.4f}")
# 测试集准确率: 0.7413

测试集准确率: 0.8591


In [7]:
# Use the real data to test the performance
# Label encode 所有分类变量
categorical_cols = df.select_dtypes(include="object").columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# 划分特征与标签
X = df.drop("income", axis=1).values
y = df["income"].values

# 标准化特征
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 划分训练/测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转换为 PyTorch 张量
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"测试集准确率: {acc:.4f}")
# 测试集准确率: 0.7413

测试集准确率: 0.8153
