# Generate MIA Features

Based on the `di.py` script.

In [27]:
from utils import prepare_model
from metrics import aggregate_metrics, reference_model_registry
import json, os
import argparse
from datasets import load_dataset
from dataloader import load_data

In [28]:
# Name of the checkpoint handed to prepare_model below.
# Smaller alternative for quick experiments: "EleutherAI/pythia-410m-deduped"
model_name = "EleutherAI/pythia-2.8b"
# Forwarded to prepare_model as its cache_dir argument.
cache_dir = "/tmp"

In [29]:
# Load the model and its tokenizer (presumably downloading into cache_dir — confirm in utils.prepare_model).
model, tokenizer = prepare_model(model_name, cache_dir=cache_dir)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Model loaded


In [32]:
# NOTE(review): load_dataset is already imported in the first cell; this re-import is redundant.
from datasets import load_dataset

# Load the dataset; the cells below read its 'train' and 'validation' splits.
ds = load_dataset("haritzpuerto/the_pile_arxiv_1k_sample")

# Step 1: Aggregate Features with MIAs

In [35]:
# Keep rows 0..999 of each split.
pile_train = ds['train'].select(range(1000))
pile_val = ds['validation'].select(range(1000))

In [36]:
# Metric names requested from aggregate_metrics. Judging by the keys it returns
# (e.g. 'k_min_probs_0.05'), some names expand into several feature columns.
metric_list = ["k_min_probs", "ppl", "zlib_ratio", "k_max_probs"]

In [37]:
# Compute the requested metrics on the 'train' subset, 10 samples per batch.
# NOTE(review): the fifth positional argument is None — confirm its meaning in metrics.aggregate_metrics.
metrics_train = aggregate_metrics(model, tokenizer, pile_train, metric_list, None, batch_size = 10)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [04:13<00:00,  2.53s/it]


In [38]:
len(metrics_train['ppl'])

1000

In [39]:
# Compute the same metrics on the 'validation' subset, 10 samples per batch.
# NOTE(review): the fifth positional argument is None — confirm its meaning in metrics.aggregate_metrics.
metrics_val = aggregate_metrics(model, tokenizer, pile_val, metric_list, None, batch_size = 10)

100%|██████████| 100/100 [04:12<00:00,  2.53s/it]


# Step 2: Learn MIA Correlations

In this stage, we train a linear model to learn how much weight each MIA feature should receive in the final dataset inference procedure.

For each MIA feature, we first modify the 5% most extreme values (the outliers) by changing their values to the mean of the distribution. This step is crucial to prevent issues in Step 3, where the model might otherwise learn skewed correlations from a few outlier samples.

We then pass the data through a linear regression model to learn weights for each feature.
All ‘suspect’ samples in $A_{sus}$ are labeled as 0, and all validation samples in $A_{val}$ are labeled as 1.

A regressor is trained to predict the label given the samples, effectively learning the correlation between the features
and their membership status.

Based on the linear_di.py script

In [40]:
import os
import sys
import json
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, chi2, norm
import torch
import torch.nn as nn
import argparse
from tqdm import tqdm
from selected_features import feature_list

In [41]:
def split_train_val(metrics):
    """Randomly split every metric sequence into two disjoint halves.

    Args:
        metrics: dict mapping a metric name to a sequence of per-sample
            values. The same ids index every sequence, so all sequences are
            expected to have the length of the first one.

    Returns:
        (new_metrics_train, new_metrics_val): two dicts with the same keys as
        `metrics`. The first holds `num_elements // 2` randomly drawn samples
        (in draw order); the second holds the remaining samples in ascending
        index order.
    """
    keys = list(metrics.keys())
    num_elements = len(metrics[keys[0]])
    print (f"Using {num_elements} elements")
    # Draw half of the ids at random, without replacement.
    ids_train = np.random.choice(num_elements, num_elements//2, replace=False)
    # Complement of ids_train (sorted ascending). setdiff1d replaces the former
    # per-id `not in` scan over a numpy array, which was quadratic in num_elements.
    ids_val = np.setdiff1d(np.arange(num_elements), ids_train)
    new_metrics_train = {}
    new_metrics_val = {}
    for key in keys:
        values = np.array(metrics[key])
        new_metrics_train[key] = values[ids_train]
        new_metrics_val[key] = values[ids_val]
    return new_metrics_train, new_metrics_val

def remove_outliers(metrics, remove_frac=0.05, outliers = "zero"):
    """Neutralise the most extreme values of a 1-D metric array.

    The `remove_frac / 2` smallest and `remove_frac / 2` largest values
    (rounded down to whole elements) are treated as outliers. When that
    rounds down to zero elements per side, nothing is modified.

    Args:
        metrics: 1-D array-like of metric values.
        remove_frac: total fraction of elements treated as outliers, split
            evenly between the two tails.
        outliers: how the outliers are handled:
            "zero"                  -> set them to 0
            "mean" / "mean+p-value" -> set them to the mean of the whole array
                                       (outliers included in that mean)
            "clip"                  -> set each tail to the value of its
                                       innermost outlier
            "randomize"             -> drop them (the result is shorter;
                                       the order of the rest is kept)
            "keep" / "p-value"      -> leave the values untouched

    Returns:
        A new numpy array; the input is never modified.

    Raises:
        ValueError: if `remove_frac` is negative or so large that no element
            would be left.
        AssertionError: if `outliers` is not one of the modes listed above.
    """
    metrics = np.asarray(metrics)
    if remove_frac < 0:
        raise ValueError("remove_frac must be non-negative.")

    # Sort indices so both tails can be addressed by position.
    sorted_ids = np.argsort(metrics)

    # Calculate the number of elements to handle on each side
    total_elements = len(metrics)
    elements_to_remove_each_side = int(total_elements * remove_frac / 2)

    # Ensure we're not attempting to remove more elements than are present
    if elements_to_remove_each_side * 2 > total_elements:
        raise ValueError("remove_frac is too large, resulting in no elements left.")

    lowest_ids = sorted_ids[:elements_to_remove_each_side]
    # Slice from an explicit start index: `sorted_ids[-0:]` would select the
    # WHOLE array when there is nothing to remove, marking every element as an outlier.
    highest_ids = sorted_ids[total_elements - elements_to_remove_each_side:]
    all_ids = np.concatenate((lowest_ids, highest_ids))
    has_outliers = elements_to_remove_each_side > 0

    trimmed_metrics = np.copy(metrics)

    if outliers == "zero":
        trimmed_metrics[all_ids] = 0
    elif outliers == "mean" or outliers == "mean+p-value":
        if has_outliers:
            trimmed_metrics[all_ids] = np.mean(trimmed_metrics)
    elif outliers == "clip":
        if has_outliers:
            # The innermost outlier of each tail is the clipping threshold.
            highest_val_permissible = trimmed_metrics[highest_ids[0]]
            lowest_val_permissible = trimmed_metrics[lowest_ids[-1]]
            trimmed_metrics[highest_ids] = highest_val_permissible
            trimmed_metrics[lowest_ids] = lowest_val_permissible
    elif outliers == "randomize":
        # Despite the mode name, this deletes the outliers; the order of the
        # remaining values is unchanged.
        trimmed_metrics = np.delete(trimmed_metrics, all_ids)
    else:
        assert outliers in ["keep", "p-value"]

    return trimmed_metrics

def normalize_and_stack(train_metrics, val_metrics, normalize="train"):
    '''
    Standardise each feature and stack the features as columns.

    Args:
        train_metrics: list with one 1-D array per feature (train side).
        val_metrics: list with one 1-D array per feature (val side), in the
            same feature order as `train_metrics`.
        normalize: which statistics to standardise with:
            "combined" -> mean/std of train and val concatenated
            "no"       -> no normalisation, values are stacked as they are
            anything else (default "train") -> mean/std of the train array,
            applied to both train and val.

    Returns:
        (train_matrix, val_matrix): arrays of shape (n_train, n_features) and
        (n_val, n_features).
    '''
    new_train_metrics = []
    new_val_metrics = []
    for (tm, vm) in zip(train_metrics, val_metrics):
        tm = np.asarray(tm)
        vm = np.asarray(vm)

        if normalize == "no":
            normalized_tm = tm
            normalized_vm = vm
        else:
            reference = np.concatenate((tm, vm)) if normalize == "combined" else tm
            mean_tm = np.mean(reference)
            std_tm = np.std(reference)
            if std_tm == 0:
                # A constant feature has zero spread; dividing by it would fill
                # the column with NaN/inf. Only centre the feature instead.
                std_tm = 1.0
            # val is normalised with the same statistics as train
            normalized_tm = (tm - mean_tm) / std_tm
            normalized_vm = (vm - mean_tm) / std_tm

        new_train_metrics.append(normalized_tm)
        new_val_metrics.append(normalized_vm)

    train_metrics = np.stack(new_train_metrics, axis=1)
    val_metrics = np.stack(new_val_metrics, axis=1)
    return train_metrics, val_metrics

## Step 2.1 Remove Outliers

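For each MIA feature, we first modify the 5% most extreme values (the lowest 2.5% and the highest 2.5%). The procedure described here replaces them with the mean of the distribution (`outliers = "mean"`); note that the cell below is configured with `outliers = "clip"`, which clips them to the boundary values instead. This step is crucial to prevent issues in Step 3, where the model might otherwise learn skewed correlations from a few outlier samples.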

In [42]:
# Outlier handling, applied independently to every feature.
# Accepted modes: "randomize", "keep", "zero", "mean", "clip", "mean+p-value", "p-value"
outliers = "clip"

# Feature names, in the column order of the matrices built below.
keys = list(metrics_train.keys())

# Treat the lowest 2.5% and highest 2.5% of each feature as outliers.
trimmed_train = [
    remove_outliers(np.array(metrics_train[key]), remove_frac = 0.05, outliers = outliers)
    for key in keys
]
trimmed_val = [
    remove_outliers(np.array(metrics_val[key]), remove_frac = 0.05, outliers = outliers)
    for key in keys
]

# Normalise every feature and stack the features as columns: (n_samples, n_features).
train_metrics, val_metrics = normalize_and_stack(trimmed_train, trimmed_val)

In [43]:
keys

['ppl',
 'k_min_probs_0.05',
 'k_min_probs_0.1',
 'k_min_probs_0.2',
 'k_min_probs_0.3',
 'k_min_probs_0.4',
 'k_min_probs_0.5',
 'k_min_probs_0.6',
 'k_max_probs_0.05',
 'k_max_probs_0.1',
 'k_max_probs_0.2',
 'k_max_probs_0.3',
 'k_max_probs_0.4',
 'k_max_probs_0.5',
 'k_max_probs_0.6',
 'zlib_ratio']

In [44]:
print(train_metrics.shape)
print(val_metrics.shape)

(1000, 16)
(999, 16)


In [45]:
# aux function
def get_dataset_splits(_train_metrics, _val_metrics, num_samples):
    """Build labelled classifier-training and held-out sets.

    The first `num_samples` rows of each matrix form the classifier's training
    set; the remaining rows form the held-out set. Rows taken from
    `_train_metrics` are labelled 0 and rows from `_val_metrics` are labelled 1.

    Args:
        _train_metrics: 2-D array, one row per sample and one column per feature.
        _val_metrics: 2-D array with the same column layout.
        num_samples: number of leading rows of each matrix used for training.

    Returns:
        ((train_x, train_y), (val_x, val_y)) as float32 torch tensors. In each
        x tensor the rows from `_train_metrics` come first.
    """
    # get the train and val sets
    for_train_train_metrics = _train_metrics[:num_samples]
    for_train_val_metrics = _val_metrics[:num_samples]
    for_val_train_metrics = _train_metrics[num_samples:]
    for_val_val_metrics = _val_metrics[num_samples:]

    def _stack_with_labels(label0_rows, label1_rows):
        # Concatenate both groups and emit matching 0/1 labels. Plain zeros are
        # used: the former `-1*np.zeros(...)` produced negative-zero labels.
        x = np.concatenate((label0_rows, label1_rows), axis=0)
        y = np.concatenate((np.zeros(label0_rows.shape[0]), np.ones(label1_rows.shape[0])))
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    train_x, train_y = _stack_with_labels(for_train_train_metrics, for_train_val_metrics)
    val_x, val_y = _stack_with_labels(for_val_train_metrics, for_val_val_metrics)

    return (train_x, train_y), (val_x, val_y)

In [46]:
# aux functions about MIA classifier

def train_model(inputs, y, num_epochs=10000):
    """Fit the membership classifier with full-batch Adam.

    Args:
        inputs: 2-D float tensor, one row per sample and one column per feature.
        y: 1-D tensor of 0/1 labels, one per row of `inputs`.
        num_epochs: number of optimisation steps (each step uses all of `inputs`).

    Returns:
        The trained model created by `get_model(num_features)`.
    """
    num_features = inputs.shape[1]
    model = get_model(num_features)

    criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy on raw logits
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # BCEWithLogitsLoss needs float targets
    y_float = y.float()

    with tqdm(range(num_epochs)) as pbar:
        for epoch in pbar:
            optimizer.zero_grad()
            # Drop only the trailing output dimension. A bare squeeze() would also
            # collapse a single-sample batch to a 0-d tensor, which no longer
            # matches the shape of y and makes the loss raise.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, y_float)
            loss.backward()
            optimizer.step()
            pbar.set_description('loss {}'.format(loss.item()))
    return model

def get_model(num_features, linear = True):
    """Create an untrained model mapping `num_features` inputs to one logit.

    Args:
        num_features: size of the input feature vector.
        linear: when True (default) return a single linear layer; otherwise
            return a two-layer network with a 10-unit ReLU hidden layer.

    Returns:
        An `nn.Linear` or an `nn.Sequential` whose output has one unit.
    """
    if not linear:
        hidden_units = 10
        layers = [
            nn.Linear(num_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),  # single output neuron
        ]
        return nn.Sequential(*layers)
    return nn.Linear(num_features, 1)

def get_predictions(model, val, y):
    """Score samples with a trained model and report the loss.

    Args:
        model: module mapping a (n_samples, n_features) tensor to one logit per row.
        val: 2-D float tensor of samples to score.
        y: 1-D tensor of 0/1 labels, one per row of `val`.

    Returns:
        (preds, loss): `preds` is a 1-D numpy array of raw logits (one per
        sample) and `loss` is the binary cross-entropy as a Python float.
    """
    with torch.no_grad():
        # squeeze(-1) removes only the output dimension, so a single-sample
        # batch keeps shape (1,) and still matches y in the loss below.
        preds = model(val).detach().squeeze(-1)
    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(preds, y.float())
    return preds.numpy(), loss.item()

## Step 2.2: Learn the weights of each feature

We then pass the data through a linear model (a single linear layer trained with a binary cross-entropy loss) to learn a weight for each feature.

All ‘suspect’ samples in $A_{sus}$ are labeled as 0, and all validation samples in $A_{val}$ are labeled as 1.

In [66]:
# aux functions about p-values

# Default grid of sample counts at which the p-value is evaluated.
list_number_samples = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

def get_p_value_list(heldout_train, heldout_val, list_number_samples=tuple(list_number_samples)):
    """Run a one-sided t-test on growing prefixes of the two score arrays.

    For each `num_samples`, the first `num_samples` scores of each array are
    compared with `ttest_ind(..., alternative='less')`, i.e. the alternative
    hypothesis is that `heldout_train` scores are lower than `heldout_val` scores.

    Args:
        heldout_train: 1-D array of classifier scores for the label-0 samples.
        heldout_val: 1-D array of classifier scores for the label-1 samples.
        list_number_samples: sample counts to evaluate. Defaults to a snapshot
            of the module-level `list_number_samples` taken when this function
            is defined, so callers may omit it. Counts larger than an array
            simply use the whole array.

    Returns:
        List of p-values, one per entry of `list_number_samples`.
    """
    # list_number_samples is used to see how the p-values change across different numbers of samples
    p_value_list = []
    for num_samples in list_number_samples:
        heldout_train_curr = heldout_train[:num_samples]
        heldout_val_curr = heldout_val[:num_samples]
        t, p_value = ttest_ind(heldout_train_curr, heldout_val_curr, alternative='less')
        p_value_list.append(p_value)
    return p_value_list
    
    

def split_train_val(metrics):
    """Randomly split every metric sequence into two disjoint halves.

    NOTE(review): this re-defines split_train_val from the earlier helper cell
    (In [41]) and silently shadows it; keep a single definition.

    Args:
        metrics: dict mapping a metric name to a sequence of per-sample
            values. The same ids index every sequence, so all sequences are
            expected to have the length of the first one.

    Returns:
        (new_metrics_train, new_metrics_val): two dicts with the same keys as
        `metrics`. The first holds `num_elements // 2` randomly drawn samples
        (in draw order); the second holds the remaining samples in ascending
        index order.
    """
    keys = list(metrics.keys())
    num_elements = len(metrics[keys[0]])
    print (f"Using {num_elements} elements")
    # Draw half of the ids at random, without replacement.
    ids_train = np.random.choice(num_elements, num_elements//2, replace=False)
    # Complement of ids_train (sorted ascending). setdiff1d replaces the former
    # per-id `not in` scan over a numpy array, which was quadratic in num_elements.
    ids_val = np.setdiff1d(np.arange(num_elements), ids_train)
    new_metrics_train = {}
    new_metrics_val = {}
    for key in keys:
        values = np.array(metrics[key])
        new_metrics_train[key] = values[ids_train]
        new_metrics_val[key] = values[ids_val]
    return new_metrics_train, new_metrics_val

In [55]:
# Number of rows of each matrix used to fit the classifier; the rest is held out.
num_samples = 500

# Shuffle the rows of both feature matrices in place before splitting.
for feature_matrix in (train_metrics, val_metrics):
    np.random.shuffle(feature_matrix)

# Build the classifier's training set and the held-out set.
splits = get_dataset_splits(train_metrics, val_metrics, num_samples)
(train_x, train_y), (val_x, val_y) = splits

# NOTE(review): this rebinds `model`, which until now referred to the model
# returned by prepare_model.
model = train_model(train_x, train_y, num_epochs = 1000)

# One learned weight per feature, reported as that feature's importance.
weights = model.weight.data.squeeze().tolist()
features = keys
feature_importance = dict(zip(features, weights))
df = pd.DataFrame(
    list(feature_importance.items()),
    columns=['Feature', 'Importance'],
)

loss 0.6831618547439575: 100%|██████████| 1000/1000 [00:01<00:00, 514.16it/s]


In [59]:
df

Unnamed: 0,Feature,Importance
0,ppl,0.024805
1,k_min_probs_0.05,0.283683
2,k_min_probs_0.1,-0.39049
3,k_min_probs_0.2,0.381211
4,k_min_probs_0.3,0.246895
5,k_min_probs_0.4,-0.033674
6,k_min_probs_0.5,-1.09652
7,k_min_probs_0.6,0.597002
8,k_max_probs_0.05,0.211646
9,k_max_probs_0.1,-0.004144


# Step 3: Dataset Inference



In [67]:
# Score the classifier's own training set and the held-out set.
preds_train, loss_train = get_predictions(model, train_x, train_y)
preds, loss = get_predictions(model, val_x, val_y)

# Scores on the classifier's training data, by label (0 = train rows, 1 = val rows).
og_train, og_val = preds_train[train_y == 0], preds_train[train_y == 1]

# Scores on the held-out data, by label.
# alternate hypothesis: heldout_train < heldout_val
heldout_train, heldout_val = preds[val_y == 0], preds[val_y == 1]

# The "p-value" modes additionally drop the extreme 5% of the scores themselves.
if outliers in ("p-value", "mean+p-value"):
    heldout_train, heldout_val = (
        remove_outliers(scores, remove_frac = 0.05, outliers = "randomize")
        for scores in (heldout_train, heldout_val)
    )

# One t-test, using up to 1000 scores from each group.
p_value_list = get_p_value_list(heldout_train, heldout_val, list_number_samples=[1000])

In [68]:
p_value_list

[0.0038565105899075196]

In [48]:
num_random = 1 # How many random runs to do?
num_samples = 50 # How many samples to use for training and validation?
for i in range(num_random):
    # Reshuffle the rows in place so every run draws a different split.
    np.random.shuffle(train_metrics)
    np.random.shuffle(val_metrics)
    
    # train a model by creating a train set and a held out set
    (train_x, train_y), (val_x, val_y) = get_dataset_splits(train_metrics, val_metrics, num_samples)
    
    model = train_model(train_x, train_y, num_epochs = 1000)
    preds, loss = get_predictions(model, val_x, val_y)
    preds_train, loss_train = get_predictions(model, train_x, train_y)
    og_train = preds_train[train_y == 0]
    og_val = preds_train[train_y == 1]

    heldout_train = preds[val_y == 0]
    heldout_val = preds[val_y == 1]
    # alternate hypothesis: heldout_train < heldout_val
    
    if outliers == "p-value" or outliers == "mean+p-value":
        heldout_train = remove_outliers(heldout_train, remove_frac = 0.05, outliers = "randomize")
        heldout_val = remove_outliers(heldout_val, remove_frac = 0.05, outliers = "randomize")

    # Pass the grid of sample counts explicitly; the call previously omitted
    # this argument. Only the p-values of the last run are kept.
    p_value_list = get_p_value_list(heldout_train, heldout_val, list_number_samples)



loss 0.6395527124404907: 100%|██████████| 1000/1000 [00:01<00:00, 631.87it/s]


In [23]:
heldout_val

array([ 2.88661   , -0.3874984 , -0.79101145, -0.01261542,  0.46413305,
       -0.13811317, -0.41857734, -0.3877732 , -0.14110729,  0.01094624,
       -0.10262962, -0.7831085 ,  0.49617842, -0.20826474,  0.14064273,
       -0.03651986,  1.8923998 ,  1.2640717 , -0.43924025,  0.06790521,
       -0.19424674,  0.7400899 , -0.28640273,  2.265852  ,  1.4073684 ,
       -0.07915518, -0.15063187, -0.8372953 ,  0.99971545,  0.05996384,
        0.7860302 ,  0.3840671 , -0.10859922,  1.0737193 ,  0.5344206 ,
        5.2956033 ,  0.23978582, -0.60437536,  1.1431992 , -0.63941866,
        0.36124316, -1.4273818 ,  2.5828536 , -0.14944503,  0.70359766,
        0.79249525, -0.04778824,  0.6618693 , -0.4120402 , -1.0859761 ],
      dtype=float32)

In [49]:
df

Unnamed: 0,Feature,Importance
0,ppl,1.540416
1,k_min_probs_0.05,0.165446
2,k_min_probs_0.1,-0.637058
3,k_min_probs_0.2,0.328577
4,k_min_probs_0.3,0.423384
5,k_min_probs_0.4,0.048116
6,k_min_probs_0.5,-0.419963
7,k_min_probs_0.6,-1.439138
8,k_max_probs_0.05,2.402489
9,k_max_probs_0.1,-3.714744


In [50]:
p_value_list

[0.4107766773806425,
 0.36980158876864666,
 0.04752750963490783,
 0.11867921856676313,
 0.37389410780078125,
 0.22268208266235634,
 0.06599884749456343,
 0.020113088684094473,
 0.011704511371354429,
 0.00850177909203288,
 0.042105663716985556,
 0.008986398306532558,
 0.0007218499858375457,
 0.0003354090956819818,
 0.001154656311558945,
 0.0035443146056995225]

In [50]:
heldout_train

array([ 0.51663136, -0.1560684 ,  0.5157334 , -0.21615733,  0.52323455,
        1.8859843 ,  0.511774  , -0.625938  , -0.99483424, -0.41054124,
        1.4793929 ,  0.22003429, -0.9509558 ,  0.13129918, -0.22658832,
       -1.0837651 , -0.5716633 , -0.11621578,  0.5561568 , -0.8394647 ,
        0.4006781 , -0.09853537,  0.88472337,  0.8128087 ,  0.18891428,
       -0.06233807, -0.15602644,  0.5759302 , -1.1861213 ,  0.3780684 ,
       -0.85187584,  0.44206327, -1.2670323 ,  1.8527066 ,  0.04044168,
       -0.7800655 ,  1.0773345 ,  1.4372088 ,  0.412579  , -0.24890132,
       -0.2590168 , -0.1424504 ,  0.50125974,  0.04421322,  1.0535867 ,
        0.71610206, -1.673247  , -0.5741055 , -1.6307869 ,  0.26277822],
      dtype=float32)