# Generate MIA Features

Based on the di.py script

In [1]:
from utils import prepare_model
from metrics import aggregate_metrics, reference_model_registry
import json, os
import argparse
from datasets import load_dataset
from dataloader import load_data

In [2]:
# Identifier of the model handed to prepare_model below.
model_name = "EleutherAI/pythia-410m-deduped"
# Directory passed to prepare_model as its cache_dir argument.
cache_dir = "/tmp"

In [3]:
# Obtain the model and its tokenizer from the project helper, using the
# identifier and cache directory configured above.
model, tokenizer = prepare_model(model_name, cache_dir=cache_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded


In [4]:
from datasets import load_dataset

# Load the dataset; the cells below read its 'train' and 'validation' splits.
ds = load_dataset("haritzpuerto/the_pile_arxiv_1k_sample")

In [24]:
# Keep only the first 100 examples of each split.
pile_train = ds['train'].select(range(100))
pile_val = ds['validation'].select(range(100))

In [25]:
# Names of the metrics requested from aggregate_metrics for both splits.
metric_list = ["k_min_probs", "ppl", "zlib_ratio", "k_max_probs"]

In [36]:
# Compute the requested metrics for the train-split subset with batch_size 10.
# NOTE(review): the fifth positional argument is passed as None; its meaning is
# defined by metrics.aggregate_metrics - confirm against that function.
metrics_train = aggregate_metrics(model, tokenizer, pile_train, metric_list, None, batch_size = 10)

100%|██████████| 10/10 [00:04<00:00,  2.05it/s]


In [37]:
# Compute the same metrics for the validation-split subset with batch_size 10.
# NOTE(review): the fifth positional argument is passed as None; its meaning is
# defined by metrics.aggregate_metrics - confirm against that function.
metrics_val = aggregate_metrics(model, tokenizer, pile_val, metric_list, None, batch_size = 10)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


# Training the MIA Classifier

Based on the linear_di.py script

In [38]:
import os
import sys
import json
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, chi2, norm
import torch
import torch.nn as nn
import argparse
from tqdm import tqdm
from selected_features import feature_list

In [39]:
def split_train_val(metrics):
    """Randomly split every metric in ``metrics`` into two disjoint halves.

    Args:
        metrics: Mapping from metric name to a sequence of per-example values.
            The length of the first entry determines how many ids are split,
            and the same ids are applied to every entry.

    Returns:
        A ``(new_metrics_train, new_metrics_val)`` pair of dicts with the same
        keys as ``metrics`` and NumPy-array values. The first dict holds
        ``num_elements // 2`` randomly drawn examples (in draw order); the
        second holds all remaining examples in ascending index order.
    """
    keys = list(metrics.keys())
    num_elements = len(metrics[keys[0]])
    print (f"Using {num_elements} elements")
    # Draw half of the ids without replacement from NumPy's global RNG.
    ids_train = np.random.choice(num_elements, num_elements//2, replace=False)
    # Build the complement with a boolean mask: linear time (the previous
    # per-id ``in`` test against an array was quadratic) and always an integer
    # index array, even when there are no elements at all.
    is_val = np.ones(num_elements, dtype=bool)
    is_val[ids_train] = False
    ids_val = np.flatnonzero(is_val)
    new_metrics_train = {}
    new_metrics_val = {}
    for key in keys:
        values = np.array(metrics[key])
        new_metrics_train[key] = values[ids_train]
        new_metrics_val[key] = values[ids_val]
    return new_metrics_train, new_metrics_val

def remove_outliers(metrics, remove_frac=0.05, outliers = "zero"):
    """Neutralise the most extreme values at both ends of a 1-D metric array.

    ``int(len(metrics) * remove_frac / 2)`` entries are selected at each end of
    the sorted order and handled according to ``outliers``:

    * ``"zero"``: set the selected entries to 0.
    * ``"mean"`` / ``"mean+p-value"``: set them to the mean of the input
      (the mean is computed with the selected entries still included).
    * ``"clip"``: set the selected highest entries to the smallest value among
      them and the selected lowest entries to the largest value among them.
    * ``"randomize"``: delete the selected entries, so the result is shorter;
      the remaining entries keep their original order.
    * ``"keep"`` / ``"p-value"``: leave the values untouched.

    When the computed count is 0 (short input or small ``remove_frac``) nothing
    is selected and a plain copy is returned for every valid mode.

    Args:
        metrics: 1-D sequence or array of numeric values.
        remove_frac: Total fraction of entries to treat as outliers, split
            evenly between the low and the high end.
        outliers: One of the mode names listed above.

    Returns:
        A new NumPy array; ``metrics`` itself is never modified.

    Raises:
        ValueError: If ``remove_frac`` selects more entries than exist.
        AssertionError: If ``outliers`` is not a known mode.
    """
    # Sort the array to work with ordered data
    sorted_ids = np.argsort(metrics)

    # Calculate the number of elements to remove from each side
    total_elements = len(metrics)
    elements_to_remove_each_side = max(int(total_elements * remove_frac / 2), 0)

    # Ensure we're not attempting to remove more elements than are present
    if elements_to_remove_each_side * 2 > total_elements:
        raise ValueError("remove_frac is too large, resulting in no elements left.")

    lowest_ids = sorted_ids[:elements_to_remove_each_side]
    # Slice from an explicit start index: ``sorted_ids[-0:]`` would select the
    # WHOLE array when there is nothing to remove.
    highest_ids = sorted_ids[total_elements - elements_to_remove_each_side:]
    all_ids = np.concatenate((lowest_ids, highest_ids))

    trimmed_metrics = np.copy(metrics)

    if outliers == "zero":
        trimmed_metrics[all_ids] = 0
    elif outliers == "mean" or outliers == "mean+p-value":
        trimmed_metrics[all_ids] = np.mean(trimmed_metrics)
    elif outliers == "clip":
        # With nothing selected there is no boundary value to clip to.
        if elements_to_remove_each_side > 0:
            # NOTE(review): the boundary values are themselves taken from the
            # selected entries, so with one entry per side this is a no-op.
            # Confirm this is the intended clipping rule.
            highest_val_permissible = trimmed_metrics[highest_ids[0]]
            lowest_val_permissible = trimmed_metrics[lowest_ids[-1]]
            trimmed_metrics[highest_ids] =  highest_val_permissible
            trimmed_metrics[lowest_ids] =   lowest_val_permissible
    elif outliers == "randomize":
        # Despite the mode name, this drops the selected entries; the order of
        # the remaining values is unchanged.
        trimmed_metrics = np.delete(trimmed_metrics, all_ids)
    else:
        assert outliers in ["keep", "p-value"]

    return trimmed_metrics

def normalize_and_stack(train_metrics, val_metrics, normalize="train"):
    """Standardise each metric and stack the metrics as columns.

    Args:
        train_metrics: List with one array of values per metric
            (expected to be 1-D).
        val_metrics: List with one array per metric, in the same order as
            ``train_metrics``.
        normalize: ``"combined"`` takes mean/std from the concatenation of the
            train and val values; ``"no"`` leaves the values untouched; any
            other value (including the default ``"train"``) takes mean/std
            from the train values only. In every normalising mode the same
            statistics are applied to both splits.

    Returns:
        ``(train, val)``: each the per-metric arrays stacked along axis 1, so
        column ``j`` holds metric ``j``.
    """
    new_train_metrics = []
    new_val_metrics = []
    for (tm, vm) in zip(train_metrics, val_metrics):
        # Accept plain sequences as well as arrays.
        tm = np.asarray(tm)
        vm = np.asarray(vm)

        if normalize == "no":
            new_train_metrics.append(tm)
            new_val_metrics.append(vm)
            continue

        reference = np.concatenate((tm, vm)) if normalize == "combined" else tm
        mean_ref = np.mean(reference)
        std_ref = np.std(reference)
        # A constant metric has zero spread; dividing by it would fill the
        # column with NaN/inf. Use a unit scale so the column is only centred.
        if std_ref == 0:
            std_ref = 1.0

        new_train_metrics.append((tm - mean_ref) / std_ref)
        new_val_metrics.append((vm - mean_ref) / std_ref)

    train_metrics = np.stack(new_train_metrics, axis=1)
    val_metrics = np.stack(new_val_metrics, axis=1)
    return train_metrics, val_metrics

In [40]:
# Remove outliers

outliers = "clip" #  choices=["randomize", "keep", "zero", "mean", "clip", "mean+p-value", "p-value"]

keys = list(metrics_train.keys())

# Treat 5% of every metric as outliers (half at each end), separately for the
# train-split and validation-split values.
train_metrics = [
    remove_outliers(np.array(metrics_train[key]), remove_frac = 0.05, outliers = outliers)
    for key in keys
]
val_metrics = [
    remove_outliers(np.array(metrics_val[key]), remove_frac = 0.05, outliers = outliers)
    for key in keys
]

# Normalise every metric and stack the metrics as columns of one array per split.
train_metrics, val_metrics = normalize_and_stack(train_metrics, val_metrics)

In [41]:
# Show the shape of each stacked feature array, one per line.
print(train_metrics.shape, val_metrics.shape, sep="\n")

(100, 16)
(100, 16)


In [None]:
# aux function
def get_dataset_splits(_train_metrics, _val_metrics, num_samples):
    """Build labelled fit and held-out sets from two feature arrays.

    The first ``num_samples`` rows of each input form the fit set and the
    remaining rows form the held-out set. Rows taken from ``_train_metrics``
    are labelled 0 and rows taken from ``_val_metrics`` are labelled 1; within
    each set the ``_train_metrics`` rows come first.

    Args:
        _train_metrics: Array-like of per-example feature rows.
        _val_metrics: Array-like of per-example feature rows.
        num_samples: Number of leading rows of each input used for fitting.

    Returns:
        ``((train_x, train_y), (val_x, val_y))`` as ``torch.float32`` tensors.
    """
    train_rows = np.asarray(_train_metrics)
    val_rows = np.asarray(_val_metrics)

    def _features_and_labels(label0_rows, label1_rows):
        # Concatenate the two groups and label them 0 / 1 in the same order.
        features = np.concatenate((label0_rows, label1_rows), axis=0)
        # Plain zeros: the former ``-1 * np.zeros(...)`` was a no-op that
        # produced negative zeros.
        labels = np.concatenate((np.zeros(len(label0_rows)), np.ones(len(label1_rows))))
        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.float32),
        )

    fit_set = _features_and_labels(train_rows[:num_samples], val_rows[:num_samples])
    heldout_set = _features_and_labels(train_rows[num_samples:], val_rows[num_samples:])
    return fit_set, heldout_set

In [44]:
# aux functions about MIA classifier

def train_model(inputs, y, num_epochs=10000):
    """Fit a binary classifier on ``inputs`` using full-batch Adam.

    The model is created by ``get_model(num_features)`` with its default
    arguments and optimised against ``nn.BCEWithLogitsLoss`` for
    ``num_epochs`` steps; every step uses all rows of ``inputs`` at once.

    Args:
        inputs: 2-D float tensor; ``inputs.shape[1]`` is the feature count.
        y: Tensor of binary labels, one per row of ``inputs``.
        num_epochs: Number of optimisation steps.

    Returns:
        The trained model. Its outputs are raw logits (no sigmoid is applied).
    """
    num_features = inputs.shape[1]
    classifier = get_model(num_features)

    criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy Loss for binary classification
    optimizer = torch.optim.Adam(classifier.parameters(), lr=0.01)

    # Convert y to float tensor for BCEWithLogitsLoss
    y_float = y.float()

    with tqdm(range(num_epochs)) as pbar:
        for epoch in pbar:
            optimizer.zero_grad()
            # Drop only the trailing output dimension. A bare squeeze() would
            # also collapse a single-row batch to a 0-d tensor, which no longer
            # matches the shape of y and makes the loss raise.
            outputs = classifier(inputs).squeeze(-1)
            loss = criterion(outputs, y_float)
            loss.backward()
            optimizer.step()
            pbar.set_description('loss {}'.format(loss.item()))
    return classifier

def get_model(num_features, linear = True):
    """Create the classifier network.

    Args:
        num_features: Size of the input feature vector.
        linear: When true (default) return a single linear layer; otherwise
            return a small MLP with one hidden layer of 10 ReLU units.

    Returns:
        An ``nn.Module`` mapping ``num_features`` inputs to a single output.
    """
    if not linear:
        hidden_units = 10
        return nn.Sequential(
            nn.Linear(num_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),  # single output neuron
        )
    return nn.Linear(num_features, 1)

def get_predictions(model, val, y):
    """Run ``model`` on ``val`` and score its outputs against ``y``.

    Args:
        model: Callable module returning one logit per input row.
        val: Input tensor passed to ``model``.
        y: Tensor of binary labels, one per row of ``val``.

    Returns:
        ``(preds, loss)``: the raw model outputs as a 1-D NumPy array and the
        ``BCEWithLogitsLoss`` value as a Python float.
    """
    with torch.no_grad():
        # Drop only the trailing output dimension so a single-row input still
        # yields a 1-D tensor whose shape matches y.
        preds = model(val).squeeze(-1)
    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(preds, y.float())
    return preds.numpy(), loss.item()

In [45]:
# aux functions about p-values
# Sample sizes at which the t-test below is evaluated.
p_sample_list = [2, 5, 10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

def get_p_value_list(heldout_train, heldout_val):
    """Compute one-sided t-test p-values for growing prefixes of the scores.

    For every size in ``p_sample_list`` the first ``size`` entries of both
    inputs are compared with ``ttest_ind(..., alternative='less')``.
    Slicing past the end of an input simply returns the whole input, so every
    size at or beyond the input length produces the same p-value.

    Args:
        heldout_train: Sliceable sequence of scores (first t-test sample).
        heldout_val: Sliceable sequence of scores (second t-test sample).

    Returns:
        List of p-values, one per entry of ``p_sample_list``, in order.
    """
    return [
        ttest_ind(
            heldout_train[:sample_count],
            heldout_val[:sample_count],
            alternative='less',
        ).pvalue
        for sample_count in p_sample_list
    ]
    
    

# NOTE(review): split_train_val is also defined in an earlier cell of this
# notebook; once this cell runs, this later definition is the one in effect.
def split_train_val(metrics):
    """Randomly split every metric in ``metrics`` into two disjoint halves.

    Args:
        metrics: Mapping from metric name to a sequence of per-example values.
            The length of the first entry determines how many ids are split,
            and the same ids are applied to every entry.

    Returns:
        A ``(new_metrics_train, new_metrics_val)`` pair of dicts with the same
        keys as ``metrics`` and NumPy-array values. The first dict holds
        ``num_elements // 2`` randomly drawn examples (in draw order); the
        second holds all remaining examples in ascending index order.
    """
    keys = list(metrics.keys())
    num_elements = len(metrics[keys[0]])
    print (f"Using {num_elements} elements")
    # Draw half of the ids without replacement from NumPy's global RNG.
    ids_train = np.random.choice(num_elements, num_elements//2, replace=False)
    # Build the complement with a boolean mask: linear time (the previous
    # per-id ``in`` test against an array was quadratic) and always an integer
    # index array, even when there are no elements at all.
    is_val = np.ones(num_elements, dtype=bool)
    is_val[ids_train] = False
    ids_val = np.flatnonzero(is_val)
    new_metrics_train = {}
    new_metrics_val = {}
    for key in keys:
        values = np.array(metrics[key])
        new_metrics_train[key] = values[ids_train]
        new_metrics_val[key] = values[ids_val]
    return new_metrics_train, new_metrics_val

In [47]:
num_random = 1 # How many random runs to do?
num_samples = 50 # How many samples to use for training and validation?
# NOTE: every iteration rebinds the result names below, so with num_random > 1
# only the values of the last run remain available to the following cells.
for i in range(num_random):
    # Shuffle the rows on copies so the stacked feature arrays built in the
    # earlier cell are not modified and this cell can be re-run safely.
    shuffled_train_metrics = np.random.permutation(train_metrics)
    shuffled_val_metrics = np.random.permutation(val_metrics)

    # train a model by creating a train set and a held out set
    (train_x, train_y), (val_x, val_y) = get_dataset_splits(
        shuffled_train_metrics, shuffled_val_metrics, num_samples
    )

    # Use a dedicated name for the classifier: binding it to `model` would
    # overwrite the language model loaded at the top of the notebook and break
    # any later re-run of the feature-generation cells.
    mia_classifier = train_model(train_x, train_y, num_epochs = 1000)
    preds, loss = get_predictions(mia_classifier, val_x, val_y)
    preds_train, loss_train = get_predictions(mia_classifier, train_x, train_y)

    # preds are NumPy arrays, so build the boolean masks as NumPy arrays too.
    train_labels = train_y.numpy()
    heldout_labels = val_y.numpy()
    og_train = preds_train[train_labels == 0]
    og_val = preds_train[train_labels == 1]

    heldout_train = preds[heldout_labels == 0]
    heldout_val = preds[heldout_labels == 1]
    # alternate hypothesis: heldout_train < heldout_val

    if outliers == "p-value" or outliers == "mean+p-value":
        heldout_train = remove_outliers(heldout_train, remove_frac = 0.05, outliers = "randomize")
        heldout_val = remove_outliers(heldout_val, remove_frac = 0.05, outliers = "randomize")

    p_value_list = get_p_value_list(heldout_train, heldout_val)

    # Pair every feature name with its weight in the linear classifier and
    # collect the pairs in a DataFrame. reshape(-1) keeps the weights a list
    # even when there is only a single feature.
    weights = mia_classifier.weight.detach().reshape(-1).tolist()
    features = keys
    feature_importance = dict(zip(features, weights))
    df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])

loss 0.6218028664588928: 100%|██████████| 1000/1000 [00:01<00:00, 707.72it/s]


In [55]:
# Classifier outputs for the held-out rows labelled 1 (rows taken from val_metrics).
heldout_val

array([ 2.6639361 , -0.36645943,  0.5381946 , -2.1395907 , -0.18874298,
       -0.7441065 ,  0.26053375,  0.79215133,  0.40245932,  0.80567044,
       -1.3478304 , -1.5925108 , -0.6377495 ,  1.7314936 , -0.23978443,
       -0.28761864,  1.7737526 , -0.71792346, -1.0783162 ,  1.3108569 ,
        1.2593244 ,  1.0556872 ,  0.32088178, -0.41791135,  1.4035882 ,
       -0.5711216 ,  0.21770804,  0.69964176, -1.6696621 ,  0.56620973,
        1.1084148 , -0.15289994, -1.7719492 ,  1.919026  , -0.14105304,
       -0.42632312,  0.35625046,  0.5107425 ,  1.079502  ,  1.1825978 ,
        0.5884947 ,  0.7529997 , -0.02442701,  0.3700201 ,  0.17306606,
        0.47691858, -1.4506899 ,  1.0371825 , -0.02043207,  0.99005145],
      dtype=float32)

In [48]:
# Feature-name / weight table built from the trained classifier's weights.
df

Unnamed: 0,Feature,Importance
0,ppl,-0.199422
1,k_min_probs_0.05,0.103656
2,k_min_probs_0.1,-0.416267
3,k_min_probs_0.2,2.08343
4,k_min_probs_0.3,0.78407
5,k_min_probs_0.4,-4.846397
6,k_min_probs_0.5,0.682575
7,k_min_probs_0.6,2.636874
8,k_max_probs_0.05,0.410194
9,k_max_probs_0.1,-1.433336


In [49]:
# One p-value per entry of p_sample_list, in the same order.
p_value_list

[0.298164900660768,
 0.5653108109572574,
 0.46028305708833517,
 0.4521887443828846,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057,
 0.19495937087732057]

In [50]:
# Classifier outputs for the held-out rows labelled 0 (rows taken from train_metrics).
heldout_train

array([ 0.51663136, -0.1560684 ,  0.5157334 , -0.21615733,  0.52323455,
        1.8859843 ,  0.511774  , -0.625938  , -0.99483424, -0.41054124,
        1.4793929 ,  0.22003429, -0.9509558 ,  0.13129918, -0.22658832,
       -1.0837651 , -0.5716633 , -0.11621578,  0.5561568 , -0.8394647 ,
        0.4006781 , -0.09853537,  0.88472337,  0.8128087 ,  0.18891428,
       -0.06233807, -0.15602644,  0.5759302 , -1.1861213 ,  0.3780684 ,
       -0.85187584,  0.44206327, -1.2670323 ,  1.8527066 ,  0.04044168,
       -0.7800655 ,  1.0773345 ,  1.4372088 ,  0.412579  , -0.24890132,
       -0.2590168 , -0.1424504 ,  0.50125974,  0.04421322,  1.0535867 ,
        0.71610206, -1.673247  , -0.5741055 , -1.6307869 ,  0.26277822],
      dtype=float32)