# Detecting Pretraining Data from Large Language Models

This notebook implements the methods for detecting whether a piece of text was part of a language model's pretraining data. It includes functionality for:
- Loading and preparing models
- Calculating perplexity
- Evaluating detection metrics
- Visualizing results

In [None]:
# Import required libraries
import logging
logging.basicConfig(level='ERROR')
import numpy as np
from pathlib import Path
from openai import OpenAI
import torch
import zlib
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
import os
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
import matplotlib
import random

## Model Loading and Setup

Functions for loading and configuring the language models.

In [None]:
def load_model(name1, name2):
    """Load both models and their tokenizers.

    A name containing ``"davinci"`` is not loaded locally: ``None`` is
    returned for both its model and its tokenizer (elsewhere in this file such
    names are scored through the OpenAI API instead).

    Args:
        name1: Name/path of the first model
        name2: Name/path of the second model

    Returns:
        Tuple of (model1, model2, tokenizer1, tokenizer2)
    """

    def _load(name):
        # "davinci" names get placeholders; nothing is loaded for them.
        if "davinci" in name:
            return None, None
        loaded = AutoModelForCausalLM.from_pretrained(name, return_dict=True, device_map='auto')
        loaded.eval()  # switch to evaluation mode
        return loaded, AutoTokenizer.from_pretrained(name)

    model1, tokenizer1 = _load(name1)
    model2, tokenizer2 = _load(name2)
    return model1, model2, tokenizer1, tokenizer2

## Perplexity Calculation

Functions for calculating perplexity using both OpenAI and HuggingFace models.

In [None]:
def calculatePerplexity_gpt3(prompt, modelname):
    """Calculate perplexity of ``prompt`` using OpenAI's completions API.

    The prompt is sent with ``echo=True`` and ``logprobs=5`` so the response
    carries per-token log-probabilities; ``None`` entries in
    ``token_logprobs`` are dropped before averaging.

    Args:
        prompt: Text to score. NUL characters are stripped before sending.
        modelname: OpenAI model name. ``text-davinci-002`` / ``-003`` are
            mapped to ``gpt-3.5-turbo-instruct``.

    Returns:
        Tuple ``(perplexity, all_prob, loss)``: ``all_prob`` is the list of
        token log-probabilities, ``loss`` is their negated mean and
        ``perplexity`` is ``exp(loss)``. The third element uses the same sign
        convention as ``calculatePerplexity`` so the two backends can be
        subtracted from each other.

    Raises:
        openai.BadRequestError: If the API rejects the request (for example
            because the prompt exceeds the context window). A rejected
            request is reported and re-raised rather than retried, since
            resending the identical request cannot succeed.
    """
    # Local import: the file only imports the ``OpenAI`` client class, so the
    # bare ``openai`` module name is not in scope for the except clause.
    from openai import BadRequestError

    prompt = prompt.replace('\x00', '')
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    # Map old model names to new ones
    model_mapping = {
        "text-davinci-003": "gpt-3.5-turbo-instruct",
        "text-davinci-002": "gpt-3.5-turbo-instruct"
    }
    modelname = model_mapping.get(modelname, modelname)
    try:
        # NOTE(review): with max_tokens=1 the sampled token's logprob is
        # presumably part of token_logprobs as well -- confirm whether it
        # should be excluded from the average.
        responses = client.completions.create(
            model=modelname,
            prompt=prompt,
            max_tokens=1,
            temperature=1.0,
            logprobs=5,
            echo=True)
    except BadRequestError as e:
        print(f"OpenAI API Error: {str(e)}")
        if "maximum context length" in str(e).lower():
            print("The input text is too long for the model's context window.")
        elif "logprobs" in str(e).lower():
            print("The logprobs parameter is not supported or exceeds the maximum value of 5.")
        else:
            print("Please check the OpenAI API documentation for more details.")
        raise
    data = responses.choices[0].logprobs
    all_prob = [d for d in data.token_logprobs if d is not None]
    loss = -np.mean(all_prob)
    return np.exp(loss), all_prob, loss

def calculatePerplexity(sentence, model, tokenizer, gpu):
    """Calculate perplexity using HuggingFace models.

    Args:
        sentence: Text to score.
        model: Callable causal LM; ``model(input_ids, labels=input_ids)`` must
            return an output whose first two items are ``(loss, logits)``.
        tokenizer: Object with an ``encode(text)`` method returning token ids.
        gpu: Device on which to place the input ids. When ``None`` the
            model's own ``device`` attribute is used.

    Returns:
        Tuple ``(perplexity, all_prob, loss)``: ``perplexity`` is
        ``exp(loss)``, ``all_prob`` is the list of log-probabilities of each
        token given its prefix (one entry per token after the first), and
        ``loss`` is the model's loss as a Python float.
    """
    # Honour the caller-supplied device instead of forcing "mps" and mutating
    # torch's global default device, which breaks on non-Apple hardware.
    device = gpu if gpu is not None else model.device
    input_ids = torch.tensor(tokenizer.encode(sentence), device=device).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss, logits = outputs[:2]

    # Log-softmax over the vocabulary gives per-position log-probabilities.
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    # Position i predicts token i+1, so pair log_probs[:-1] with input_ids[1:].
    targets = input_ids[0, 1:].to(log_probs.device)
    token_log_probs = log_probs[0, :-1].gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    all_prob = token_log_probs.tolist()
    return torch.exp(loss).item(), all_prob, loss.item()

## Inference and Evaluation

Functions for performing inference and evaluating results.

In [None]:
def inference(model1, model2, tokenizer1, tokenizer2, text, ex, modelname1, modelname2):
    """Perform inference using both models and calculate metrics.

    Args:
        model1: Target model (unused when ``modelname1`` contains "davinci").
        model2: Reference model (unused when ``modelname2`` contains "davinci").
        tokenizer1: Tokenizer paired with ``model1``.
        tokenizer2: Tokenizer paired with ``model2``.
        text: The text to score.
        ex: Example dict; it is mutated in place by storing the scores under
            ``ex["pred"]``.
        modelname1: Name of the target model; selects the scoring backend.
        modelname2: Name of the reference model; selects the scoring backend.

    Returns:
        The same ``ex`` object, with ``ex["pred"]`` mapping metric names to
        scores.
    """

    def _score(sample, model, tokenizer, modelname):
        # Names containing "davinci" are scored through the OpenAI helper,
        # everything else through the local HuggingFace model.
        if "davinci" in modelname:
            return calculatePerplexity_gpt3(sample, modelname)
        return calculatePerplexity(sample, model, tokenizer, gpu=model.device)

    pred = {}

    p1, all_prob, p1_likelihood = _score(text, model1, tokenizer1, modelname1)
    p_lower, _, _ = _score(text.lower(), model1, tokenizer1, modelname1)
    _, _, p_ref_likelihood = _score(text, model2, tokenizer2, modelname2)

    # Calculate various metrics
    pred["ppl"] = p1
    pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = p1_likelihood-p_ref_likelihood
    pred["ppl/lowercase_ppl"] = -(np.log(p_lower) / np.log(p1)).item()
    zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
    pred["ppl/zlib"] = np.log(p1)/zlib_entropy

    # Calculate min-k probabilities: mean of the k lowest token log-probs.
    # The sort does not depend on the ratio, so do it once.
    sorted_prob = np.sort(all_prob)
    for ratio in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
        # Keep at least one token: for short texts int(len * ratio) is 0 and
        # the mean of an empty slice is NaN.
        k_length = max(1, int(len(all_prob)*ratio))
        pred[f"Min_{ratio*100}% Prob"] = -np.mean(sorted_prob[:k_length]).item()

    ex["pred"] = pred
    return ex

def evaluate_data(test_data, model1, model2, tokenizer1, tokenizer2, col_name, modelname1, modelname2):
    """Run ``inference`` on every example and collect the results.

    The text of each example is read from ``ex[col_name]``; the list of
    examples returned by ``inference`` is returned in input order.
    """
    print(f"all data size: {len(test_data)}")
    return [
        inference(model1, model2, tokenizer1, tokenizer2, sample[col_name], sample, modelname1, modelname2)
        for sample in tqdm(test_data)
    ]

## Visualization and Metrics

Functions for plotting results and calculating metrics.

In [None]:
def sweep(score, x):
    """Compute the ROC curve of ``-score`` against labels ``x``.

    Returns:
        Tuple ``(fpr, tpr, auc, acc)`` where ``acc`` is the maximum over the
        curve of ``1 - (fpr + (1 - tpr)) / 2``.
    """
    false_pos, true_pos, _thresholds = roc_curve(x, -score)
    # Average of the two error rates (false positives, false negatives).
    mean_error = (false_pos + (1 - true_pos)) / 2
    best_acc = np.max(1 - mean_error)
    return false_pos, true_pos, auc(false_pos, true_pos), best_acc

def do_plot(prediction, answers, sweep_fn=sweep, metric='auc', legend="", output_dir=None):
    """Plot one ROC curve on the current figure and report its metrics.

    Args:
        prediction: Scores, one per example.
        answers: Labels, one per example (converted to a boolean array).
        sweep_fn: Callable returning ``(fpr, tpr, auc, acc)``.
        metric: ``'auc'`` or ``'acc'``; chooses the number appended to the
            curve's legend label. Any other value appends nothing.
        legend: Label prefix for the curve.
        output_dir: Not used by this function.

    Returns:
        Tuple ``(legend, auc_score, acc, low)`` where ``low`` is the TPR at
        the last ROC point whose FPR is below 0.05.
    """
    scores = np.array(prediction)
    labels = np.array(answers, dtype=bool)
    fpr, tpr, auc_score, acc = sweep_fn(scores, labels)

    under_threshold = np.where(fpr<.05)[0]
    low = tpr[under_threshold[-1]]
    print('Attack %s   AUC %.4f, Accuracy %.4f, TPR@5%%FPR of %.4f\n'%(legend, auc_score, acc, low))

    if metric == 'auc':
        suffix = 'auc=%.3f'%auc_score
    elif metric == 'acc':
        suffix = 'acc=%.3f'%acc
    else:
        suffix = ''

    plt.plot(fpr, tpr, label=legend+suffix)
    return legend, auc_score, acc, low

def fig_fpr_tpr(all_output, output_dir):
    """Generate and save FPR-TPR plots.

    Collects the scores of every metric found in ``ex["pred"]`` across
    ``all_output`` (skipping metrics whose name contains "raw" but not
    "clf"), plots one ROC curve per metric on log-log axes and writes:

    - ``{output_dir}/auc.txt``: one summary line per metric
    - ``{output_dir}/auc.png``: the figure

    Args:
        all_output: Examples carrying ``"label"`` and ``"pred"`` entries.
        output_dir: Existing directory that receives both files.
    """
    print("output_dir", output_dir)
    answers = []
    metric2predictions = defaultdict(list)
    for ex in all_output:
        answers.append(ex["label"])
        for metric, value in ex["pred"].items():
            if ("raw" in metric) and ("clf" not in metric):
                continue
            metric2predictions[metric].append(value)

    plt.figure(figsize=(4,3))
    with open(f"{output_dir}/auc.txt", "w") as f:
        for metric, predictions in metric2predictions.items():
            legend, auc_score, acc, low = do_plot(predictions, answers, legend=metric, metric='auc', output_dir=output_dir)
            # do_plot reports the TPR at FPR < 5%, so label it as such (the
            # previous "0.1%" label did not match the value written).
            f.write('%s   AUC %.4f, Accuracy %.4f, TPR@5%%FPR of %.4f\n'%(legend, auc_score, acc, low))

    plt.semilogx()
    plt.semilogy()
    plt.xlim(1e-5,1)
    plt.ylim(1e-5,1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], ls='--', color='gray')
    plt.subplots_adjust(bottom=.18, left=.18, top=.96, right=.96)
    plt.legend(fontsize=8)
    plt.savefig(f"{output_dir}/auc.png")

## Utility Functions

Helper functions for data loading and manipulation.

In [None]:
def load_jsonl(input_path):
    """Load data from a JSONL file and shuffle it deterministically.

    The file is decoded as UTF-8 regardless of the platform locale, and blank
    lines are skipped instead of crashing the JSON parser.

    Note:
        This reseeds the global ``random`` generator with 0 before shuffling,
        so the returned order is the same on every call.

    Args:
        input_path: Path of the JSONL file to read.

    Returns:
        List of decoded records in shuffled order.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in tqdm(f) if line.strip()]
    random.seed(0)
    random.shuffle(data)
    return data

def dump_jsonl(data, path):
    """Write each item of ``data`` to ``path`` as one JSON document per line."""
    with open(path, 'w') as out:
        out.writelines(json.dumps(record) + "\n" for record in tqdm(data))

def read_jsonl(path):
    """Read data from a JSONL file, preserving file order.

    The file is decoded as UTF-8 regardless of the platform locale, and blank
    lines are skipped instead of crashing the JSON parser.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        List of decoded records, one per non-blank line.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in tqdm(f) if line.strip()]

def convert_huggingface_data_to_list_dic(dataset):
    """Materialize an indexable dataset as a plain list of its rows.

    Rows are fetched by integer index, ``dataset[0]`` through
    ``dataset[len(dataset) - 1]``, and returned in that order.
    """
    return [dataset[idx] for idx in range(len(dataset))]

## Example Usage

Here's how to use the functions above:

In [None]:
# Example usage
if __name__ == "__main__":
    # Set up output directory
    output_dir = "output"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Load models
    # The target is named with its "davinci" alias on purpose: load_model and
    # inference pick the OpenAI backend by looking for "davinci" in the name,
    # and calculatePerplexity_gpt3 maps "text-davinci-003" to
    # "gpt-3.5-turbo-instruct". Passing "gpt-3.5-turbo-instruct" directly
    # would make load_model try to load it as a HuggingFace checkpoint.
    target_model = "text-davinci-003"
    ref_model = "huggyllama/llama-7b"
    model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)

    # Load data
    dataset = load_dataset("swj0419/WikiMIA", split="WikiMIA_length64")
    data = convert_huggingface_data_to_list_dic(dataset)

    # Evaluate: the text to score is read from each example's "input" field
    all_output = evaluate_data(data, model1, model2, tokenizer1, tokenizer2, "input", target_model, ref_model)

    # Plot results
    fig_fpr_tpr(all_output, output_dir)