# Setup, including model loading

In [1]:
# Enable IPython's autoreload extension so edits to imported local modules
# are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [36]:
import transformers
import importlib
import torch as t
import gptj_parallel
import os

In [4]:
# slow, ~90secs
# Load the GPT-J checkpoint with a sequence-classification head and switch it
# to evaluation mode. The bare `model.eval()` on the last line also displays
# the module structure as the cell output.
checkpoint_name = "EleutherAI/gpt-j-6B"
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint_name)
model.eval()

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForSequenceClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing GPTJForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTJForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPTJForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-j-6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPTJForSequenceClassification(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm

# Check working model

In [5]:
# Tokenizer that matches the checkpoint loaded for `model`.
tokenizer_checkpoint = "EleutherAI/gpt-j-6B"
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
# Encode a short probe sentence and add a leading batch dimension,
# giving a tensor of shape (1, number_of_token_ids).
probe_token_ids = tokenizer.encode("Two tokens")
tokens = t.tensor(probe_token_ids)[None, :]

In [44]:
def test_components(model, components):
    model.eval()
    # components = [gptj_parallel.GPTJComponent(i, model) for i in range(4)]
    x = tokens.clone()
    x2 = tokens.clone()

    for i in range(4):
        print(i)
        x = components[i](x)

    x2 = model(x2, output_hidden_states=True)

    print(x[1], x2.logits)
    assert t.allclose(x[1], x2.logits)
    print("Passed closeness test")

# Save and load models

In [35]:
# slow, ~45s?
# Cache each of the four pipeline components on disk. A component whose file
# already exists is skipped, so re-running this cell is cheap.
# NOTE(review): `components` is not defined in any cell visible here; it is
# presumably a list of four gptj_parallel component modules built from
# `model` -- confirm it is created before this cell on a fresh kernel.
# NOTE: t.save on a whole object pickles it, so loading later requires the
# same class definitions to be importable.
for i in range(4):
    component_path = f"component{i}.pt"
    if not os.path.exists(component_path):
        t.save(components[i], component_path)

In [40]:
# Read the four serialized components back from disk, preserving their order.
# NOTE(review): t.load unpickles the files, so only load files you trust.
# Recent PyTorch releases may default to weights-only loading, in which case
# whole-module files like these need that option disabled -- confirm against
# the installed version.
component_paths = [f"component{i}.pt" for i in range(4)]
loaded_components = [t.load(path) for path in component_paths]

In [43]:
# Verify that the components reloaded from disk still reproduce the model's logits.
test_components(model=model, components=loaded_components)

0
1
2
3
tensor([[ 1.7715, -0.0593]], grad_fn=<SelectBackward0>) tensor([[ 1.7715, -0.0593]], grad_fn=<IndexBackward0>)


# Get dataset

In [10]:
# This is slow, beware! ~20 seconds
# Fetch the IMDB train/test splits (cached under ".data") and materialize both
# as lists so they can be indexed and iterated more than once.
print("Begin getting IMDB...",end="")
import torchtext
imdb_train_split, imdb_test_split = torchtext.datasets.IMDB(root=".data", split=("train", "test"))
data_train, data_test = list(imdb_train_split), list(imdb_test_split)
print("done")

Begin getting IMDB...done


In [11]:
num_batches = 1000
processed_data = []
# Element 1 of every training example is passed to the tokenizer
# (presumably the review text, with the label at index 0 -- confirm).
all_texts = [example[1] for example in data_train]
print("Begin processing...",end="")
# Tokenize everything in one call and keep only the token-id lists.
encoded_batch = tokenizer(all_texts)
tokenized = encoded_batch["input_ids"]
print("done.")

Begin processing...

Token indices sequence length is longer than the specified maximum sequence length for this model (2072 > 2048). Running this sequence through the model will result in indexing errors


done.


In [28]:
SEQ_LEN = 1024   # fixed number of token ids per row
PAD_TOKEN = 0    # id used to fill the unused positions of short rows

# One row per tokenized review, pre-filled with the padding id.
training_data = t.full(size=(len(tokenized), SEQ_LEN), fill_value=PAD_TOKEN, dtype=t.long)

for i, sentence in enumerate(tokenized):
    # Keep at most the first SEQ_LEN token ids of the review.
    tensorized_sentence = t.tensor(sentence[:SEQ_LEN]).long()

    if len(sentence) >= SEQ_LEN:
        # Long review: the truncated ids fill the whole row.
        training_data[i] = tensorized_sentence
    else:
        # Short review: right-align the ids so the padding sits on the left.
        # An empty review would make the slice `[-0:]` cover the whole row,
        # hence the guard.
        assert len(sentence) > 0
        training_data[i][-len(sentence):] = tensorized_sentence

In [31]:
# Persist the padded token matrix. The sequence length is embedded in the file
# name (currently "training_data_1024.pt") so it stays in sync with SEQ_LEN.
t.save(training_data, f"training_data_{SEQ_LEN}.pt")

In [56]:
# Load the previously saved activations and report their dimensions.
our_activations = t.load("our_activations.pt")
print(our_activations.size())

torch.Size([2, 2])


In [67]:
# Compare the reference model's logits for one saved input row against the
# corresponding row of `our_activations`.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because other cells may refer to it.
input = t.load("input_test.pt").to(model.device)
print(input.shape)
# Inference only: no autograd graph is needed for the comparison.
with t.no_grad():
    # The slice 1:2 selects row 1 while keeping the batch dimension.
    output = model(input[1:2,])
print(output.logits.shape)
t.isclose(output.logits.cpu(), our_activations[1].cpu())

torch.Size([2, 10])
torch.Size([1, 2])


tensor([[True, True]])

In [68]:
# Tell the classification head which id is padding, using the same constant
# that was used to pad `training_data` so the two cannot drift apart.
# NOTE(review): the rows are left-padded; how the classification head locates
# the last real token with left padding depends on the transformers version --
# verify against the installed release.
model.config.pad_token_id = PAD_TOKEN