In [1]:
import torch
import open_clip
from tqdm import tqdm
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, SequentialSampler

import pickle
import os

import faiss
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# import sys
# sys.path.append("..")
# from src.text_data import TokenizedTextClassificationDataset

In [10]:
# Notebook-wide configuration constants.
SEED = 11182023
DEVICE = 'cuda:0'      # device the model and the token tensors are moved to
MODEL_NAME = "gpt2"    # also used as the prefix of the saved scores file
DATASET = "sst2"
# Directory used as the dataset cache and as the output location for scores.
DATA_PATH = f"/mnt/ssd/ronak/datasets/{DATASET}"

In [11]:
%%capture
train = load_dataset('sst2', split='train', cache_dir=DATA_PATH)

### Use GPT-2 Tokenizer

In [12]:
def get_param_shapes(parameters):
    """Collect the shape of every tensor in ``parameters``.

    Args:
        parameters: iterable of tensors (a generator such as
            ``model.parameters()`` is fine; it is consumed once).

    Returns:
        list of 1-D ``torch.Tensor``, one per input tensor, each holding that
        tensor's dimension sizes.
    """
    shapes = []
    for param in parameters:
        shapes.append(torch.tensor(param.shape))
    return shapes

def get_num_parameters(param_shapes):
    """Return the total number of scalar entries described by ``param_shapes``.

    Args:
        param_shapes: iterable of shapes. Each shape may be a 1-D tensor of
            dimension sizes (as produced by ``get_param_shapes``) or any
            sequence of ints such as ``torch.Size`` or a tuple.

    Returns:
        int: the sum, over all shapes, of the product of their dimensions.
        An empty ``param_shapes`` yields ``0``; an empty shape (a scalar
        tensor) counts as one entry.
    """
    # Reduce every shape to a Python int first and add with the builtin sum:
    # this keeps the result an int even for empty input and avoids building a
    # tensor out of a list of 0-d tensors.
    return sum(int(torch.as_tensor(shape).prod()) for shape in param_shapes)

def flatten_gradient(grads):
    """Concatenate a collection of tensors into a single 1-D tensor.

    Args:
        grads: iterable of tensors of arbitrary shapes.

    Returns:
        torch.Tensor: every input reshaped to 1-D and joined end to end, in
        iteration order.
    """
    flat_pieces = []
    for grad in grads:
        flat_pieces.append(grad.reshape(-1))
    return torch.cat(flat_pieces)

In [13]:
# Load the pretrained tokenizer and language model identified by MODEL_NAME,
# so the checkpoint used here always matches the name under which the scores
# file is later saved. The model is moved to DEVICE.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(DEVICE)
print(f"number of parameters in foundation model: {get_num_parameters(get_param_shapes(model.parameters()))}")

number of parameters in foundation model: 124439808


In [14]:
# Tokenize each training sentence on its own (one tokenizer call per sentence)
# and keep only its `input_ids` tensor, moved to DEVICE.
texts = [
    tokenizer(sentence, return_tensors='pt')['input_ids'].to(DEVICE)
    for sentence in tqdm(train['sentence'])
]

100%|██████████| 67349/67349 [00:11<00:00, 5645.68it/s]


### Apply Model and Compute Scores

In [27]:
# Dense Gaussian random projection of shape (n_components, n_parameters),
# scaled by 1/sqrt(n_components) and stored as float16.
n_components = 12
n_parameters = get_num_parameters(get_param_shapes(model.parameters()))
np.random.seed(123)

# Keep the scale in float16: dividing a float16 array by a float64 scalar only
# stays float16 under NumPy's legacy value-based casting; with NEP 50
# promotion (NumPy 2) it would silently upcast the whole matrix to float64.
proj_scale = np.float16(np.sqrt(n_components))

# Fill one row at a time so only a single float64 row (8 * n_parameters bytes)
# lives next to the float16 matrix, instead of a full float64 matrix that is
# four times the size of the final result. Rows are drawn in order from the
# same seeded global generator, so the draw order matches a single
# (n_components, n_parameters) call.
rand_project = np.empty((n_components, n_parameters), dtype=np.float16)
for row_idx in range(n_components):
    rand_project[row_idx] = np.random.normal(size=n_parameters).astype(np.float16) / proj_scale

In [26]:
# Rough size of a float16 projection matrix (2 bytes per entry) if a much
# larger number of components were used. The count lives in its own name so
# the `n_components` that sized `rand_project` is not overwritten -- otherwise
# re-running the projection cell afterwards would try to allocate this much.
n_components_large = 32000
f"memory at {n_components_large} n_components: {int(n_components_large * n_parameters * 2 / 1e9):04d} GB"

'memory at 32000 n_components: 7964 GB'

In [28]:
# For each example: gradient of the language-modelling loss (labels = inputs)
# with respect to every model parameter, flattened and compressed to
# n_components numbers by the random projection.

# Number of examples to process. The previous `if i > 2000: break` ran after
# the append, so it actually let 2002 examples through.
max_examples = 2000

# Loop-invariant setup, done once instead of on every iteration.
grad_params = tuple(model.parameters())  # autograd.grad takes a sequence of tensors
model.zero_grad(set_to_none=True)        # drop any stale .grad buffers
torch.cuda.empty_cache()

grads = []
# Wrap the list itself (not `enumerate(...)`) so tqdm knows the total and can
# show a percentage and ETA.
for text in tqdm(texts[:max_examples]):
    with torch.enable_grad():
        output = model(input_ids=text, labels=text, output_hidden_states=False, output_attentions=False)
        # autograd.grad returns the gradients directly; it does not accumulate
        # into .grad, so no per-step zero_grad is required.
        g_out = torch.autograd.grad(outputs=output.loss, inputs=grad_params)
    # The returned gradients carry no graph, so no surrounding no_grad block is
    # needed. Cast to float16 before the host copy, then project on the CPU.
    grads.append(rand_project @ flatten_gradient(g_out).half().detach().cpu().numpy())

5it [00:56, 11.29s/it]


KeyboardInterrupt: 

In [31]:
# Stack the per-example projections into a single array, report its shape, and
# write it to "<MODEL_NAME>_scores.npy" inside DATA_PATH.
scores_path = os.path.join(DATA_PATH, f"{MODEL_NAME}_scores.npy")
grads = np.stack(grads)
print(grads.shape)
np.save(scores_path, grads)

(6, 12)


In [32]:
# Display the stacked array of projected gradients (one row per processed example).
grads

array([[  6.07  ,  48.78  , -10.84  , -12.93  ,  25.67  , -24.38  ,
         -1.136 ,  39.03  ,  26.98  , -29.83  , -11.02  , -22.98  ],
       [ 20.52  ,  17.73  ,  -8.59  ,  25.83  ,  -3.47  , -18.31  ,
         -1.494 ,  25.42  ,   6.164 ,  -4.277 , -31.89  , -19.78  ],
       [  8.97  ,  -7.6   ,   6.887 , -54.1   , -25.97  ,  15.5   ,
         11.2   ,   2.47  , -15.69  ,   0.8696,  -7.473 , -29.27  ],
       [ -9.55  ,  33.88  ,   1.843 ,  27.84  ,  31.34  ,   1.576 ,
         27.06  ,   9.8   ,   2.244 , -13.74  ,  12.19  ,  24.7   ],
       [ -5.08  ,   6.668 ,   1.905 ,  21.84  ,   4.645 ,   5.27  ,
          8.9   ,   2.875 ,  16.14  ,  -0.1884,  -2.309 ,   2.852 ],
       [  7.434 , -10.58  ,   5.49  ,   1.426 ,  12.266 ,  23.55  ,
        -18.55  , -21.88  ,  -0.342 ,   0.1289, -11.49  ,  -3.13  ]],
      dtype=float16)