In [72]:
### Imports
import random
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from transformers.optimization import AdamW
from datetime import datetime
import arxivscraper.arxivscraper as ax
import pubmed_parser as pp
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
### Set Random Seed
seed = 229
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [88]:
### Model Specs

model_name = "EleutherAI/pythia-70m-deduped"
model_revision = "step3000"
model_cache_dir = "./pythia-70m-deduped/step3000"

model = GPTNeoXForCausalLM.from_pretrained(
  model_name,
  revision=model_revision,
  cache_dir=model_cache_dir,
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
  model_name,
  revision=model_revision,
  cache_dir=model_cache_dir,
)

In [17]:
### Loading the Data

# def collate_fn(batch):
#     tokens = [tokenizer.encode(example["text"], return_tensors="pt", truncation=True) for example in batch]
#     max_length = max([t.size(1) for t in tokens])
#     tokens_padded = [torch.cat([t, t.new_zeros(t.size(0), max_length - t.size(1))], dim=1) for t in tokens]
#     tokens_padded = torch.cat(tokens_padded, dim=0)
#     return tokens_padded

val_dataset = load_dataset("the_pile_val.py", split="validation")
old_pubmed_abstracts = val_dataset.filter(lambda x: x["meta"]["pile_set_name"] == "PubMed Abstracts")
# old_dataloader = DataLoader(old_pubmed_abstracts, batch_size = bs, collate_fn=collate_fn)

No config specified, defaulting to: the_pile_val/all
Found cached dataset the_pile_val (/home/ubuntu/.cache/huggingface/datasets/the_pile_val/all/0.0.0/33c9237089c5fb09f83b2ab7ac73d703de97fc4d122e4b82a7777d85b0919e30)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/the_pile_val/all/0.0.0/33c9237089c5fb09f83b2ab7ac73d703de97fc4d122e4b82a7777d85b0919e30/cache-05320e510b09147c.arrow


In [18]:
# Sanity Testing
first_batch = next(iter(old_dataloader))
x = first_batch
y = [old_pubmed_abstracts[i]["meta"] for i in range(4)]
print("Text (x):", x)
print("Metadata (y):", y)

Text (x): tensor([[15131,   273,  4600,  ...,     0,     0,     0],
        [43493, 40411, 21473,  ...,     0,     0,     0],
        [10697,   302,   301,  ...,     0,     0,     0],
        ...,
        [29845,   474,  9953,  ...,     0,     0,     0],
        [ 6075,   412,   250,  ...,     0,     0,     0],
        [   52,  4066,   451,  ...,     0,     0,     0]])
Metadata (y): [{'pile_set_name': 'PubMed Abstracts'}, {'pile_set_name': 'PubMed Abstracts'}, {'pile_set_name': 'PubMed Abstracts'}, {'pile_set_name': 'PubMed Abstracts'}]


In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed23n1165.xml.gz -P pubmed
!gzip -d pubmed/pubmed23n1165.xml.gz

In [58]:
pubmed_xml = pp.parse_medline_xml('pubmed/pubmed23n1165.xml')

In [59]:
new_pubmed_abstracts = [record for record in pubmed_xml if record["pubdate"] == "2022"]

In [60]:
# def collate_fn_new(batch):
#     tokens = [tokenizer.encode(example["abstract"], return_tensors="pt", truncation=True) for example in batch]
#     max_length = max([t.size(1) for t in tokens])
#     tokens_padded = [torch.cat([t, t.new_zeros(t.size(0), max_length - t.size(1))], dim=1) for t in tokens]
#     tokens_padded = torch.cat(tokens_padded, dim=0)
#     return tokens_padded
# new_dataloader = DataLoader(new_pubmed_abstracts, batch_size = bs, collate_fn=collate_fn_new)

In [None]:
# ### Training Shadow Learners - not really necessary, since we're just loss thresholding and not fine-tuning on any new data

# N = 2

# # Initialize the model and optimizer
# models = [GPTNeoXForCausalLM.from_pretrained(model_name, revision=model_revision, cache_dir=model_cache_dir).to(device) for i in range(N)]

# ### Membership Inference Attacks

# # In-distribution data (split dataset into chunks of size 1/(N))
# dataset = torch.utils.data.random_split(val_dataset, [1/(N) for i in range(N)])
# dataloaders = [DataLoader(dataset[i], batch_size = bs, collate_fn=collate_fn) for i in range(N)]

In [None]:
# # Get data that is out of distribution - see data sheet (https://arxiv.org/abs/2201.07311)
# scraper = ax.Scraper(category='cs',date_from='2023-02-15',date_until='2023-02-22',t=10)
# output = scraper.scrape()
# filtered_output = [record for record in output if datetime(2023,2,15)<=datetime.strptime(record["created"], "%Y-%m-%d")<=datetime(2023,2,22)]
# print(filtered_output[0])
# # print([record["created"] for record in filtered_output])

In [65]:
inference_dataset = [(record["text"],1) for record in old_pubmed_abstracts] + [(record["abstract"],0) for record in new_pubmed_abstracts]

In [73]:
def collate_fn(batch):
    tokens = [tokenizer.encode(example[0], return_tensors="pt", truncation=True) for example in batch]
    labels = [example[1] for example in batch]
    max_length = max([t.size(1) for t in tokens])
    tokens_padded = [torch.cat([t, t.new_zeros(t.size(0), max_length - t.size(1))], dim=1) for t in tokens]
    tokens_padded = torch.cat(tokens_padded, dim=0)
    return tokens_padded, labels
pubmed_abstracts = DataLoader(inference_dataset, batch_size = 1, collate_fn=collate_fn)

In [90]:
# Get loss on these tokens fragments vs in-dist data
model.eval()
netValLoss = 0.0
nTotal = 0
with torch.no_grad():
    for i, data in enumerate(tqdm(pubmed_abstracts)):
        tokens, label = data
        outputs = model(tokens.to(device))
        print(outputs)
        break
#         , labels=data["y"]["input_ids"].to(device)).loss
#         netValLoss += loss.item() * len(data["x"]["input_ids"])
#         nTotal += len(data["x"]["input_ids"])
# print(f"Epoch {epoch} Val Loss: {netValLoss/nTotal}")

# Loss Thresholding
threshold = 0.1

# Compute loss on N shadow learners vs. true dataset on each of N dataset parts (train/test)
# TODO

# Compute ROC curve for different loss thresholds
# TODO
# %%


  0%|          | 0/58768 [00:00<?, ?it/s]

CausalLMOutputWithPast(loss=None, logits=tensor([[[-6.4428, -7.2423,  2.9016,  ..., -7.6228, -6.6135, -6.6238],
         [-4.9570, -4.9083, -0.7677,  ..., -4.9640, -4.7972, -4.5522],
         [-5.6739, -5.5564,  2.6232,  ..., -5.6942, -4.5192, -5.8052],
         ...,
         [-5.7245, -4.3139, -0.5978,  ..., -4.8841, -4.8563, -4.8166],
         [-5.9730, -5.9575,  4.7044,  ..., -6.4474, -5.7025, -6.4462],
         [-4.5142, -5.0790,  0.7325,  ..., -5.1604, -4.7003, -4.6883]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.7525e-02,  3.8135e-01, -3.3033e-01,  ..., -5.2595e-02,
           -2.7351e-01,  9.7222e-01],
          [ 6.8990e-01,  1.8075e+00, -1.9392e-01,  ..., -4.3601e-01,
           -1.7871e-01,  8.2021e-01],
          [-5.1992e-01,  4.7992e-02,  6.2647e-01,  ...,  3.2847e-01,
            6.1152e-01, -5.8120e-01],
          ...,
          [ 1.8042e-03, -1.0538e+00,  6.0481e-01,  ..., -3.4079e-01,
           -7.0248e-01,  1.8663e-01],



