In [1]:
import transformers
import torch
from torch import nn

In [2]:
# Load the GPT-J checkpoint wrapped with a sequence-classification head.
# The checkpoint has no weights for the `score` head, so that layer starts
# out randomly initialised (see the warning printed below this cell).
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/gpt-j-6B"
)

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing GPTJForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTJForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPTJForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-j-6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Display the module tree to see the submodule names (wte, drop, h, ...) used
# further down when the model is split into stages.
model

GPTJForSequenceClassification(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm

In [4]:
# Inspect the transformer block at index 27 (presumably the last one - confirm
# with len(model.transformer.h)).
model.transformer.h[27]

GPTJBlock(
  (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (attn): GPTJAttention(
    (attn_dropout): Dropout(p=0.0, inplace=False)
    (resid_dropout): Dropout(p=0.0, inplace=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
  )
  (mlp): GPTJMLP(
    (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
    (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
)

In [5]:
# Smoke test of the unsplit model: one random sequence of 1024 token ids.
# 50400 is the size of the `wte` embedding table shown in the model repr;
# torch.randint already allocates on the CPU, so no extra `.cpu()` is needed.
token_ids = torch.randint(0, 50400, size=(1, 1024), dtype=torch.long)

# NOTE: despite its name, `model_half` is just another reference to `model`
# (Module.cpu() returns the module itself); no precision change happens here.
model_half = model.cpu()
model_half.eval()

# Inference only: skip autograd bookkeeping so activations are not retained.
with torch.no_grad():
    output = model_half(token_ids)

In [6]:
# Show the logits returned by the forward pass of the full model.
output.logits

tensor([[0.5029, 0.1519]], grad_fn=<IndexBackward0>)

In [38]:
# Scratch check: sum() with an empty-list start value concatenates the
# sub-lists into one flat list.
sum([[1,2], [3, 4]], [])

[1, 2, 3, 4]

In [6]:
class GiveMeFirst(nn.Module):
    """Adapter module that forwards only element 0 of its input.

    Placed after each transformer block when the blocks are chained in an
    ``nn.Sequential``, so the next module receives ``x[0]`` instead of the
    block's whole return value (presumably a tuple whose first entry is the
    hidden states - confirm against the block implementation).
    """

    def forward(self, x):
        """Return ``x[0]``; ``x`` may be anything that supports indexing."""
        first_item = x[0]
        return first_item

class GiveMeLastSeq(nn.Module):
    """Adapter module that keeps only the last entry along dimension 1.

    The indexing ``x[:, -1, :]`` requires an input with at least three
    dimensions; dimension 1 is assumed to be the sequence axis
    (batch, seq, hidden) - confirm against the caller.
    """

    def forward(self, x):
        """Return ``x[:, -1, :]``, dropping dimension 1 from the result."""
        last_position = x[:, -1, :]
        return last_position

In [7]:
def add_give_me_first_layers(layers):
    """Interleave a fresh ``GiveMeFirst`` adapter after every layer.

    Args:
        layers: iterable of modules.

    Returns:
        A flat list ``[layer0, GiveMeFirst(), layer1, GiveMeFirst(), ...]``,
        ready to be unpacked into ``nn.Sequential``. An empty input gives an
        empty list.
    """
    # A single flat comprehension builds the list in one pass; the previous
    # sum(list_of_pairs, []) re-copied the accumulated list on every addition.
    return [module for layer in layers for module in (layer, GiveMeFirst())]

In [8]:
# Stage 0 of the split model: token embedding, embedding dropout, then the
# transformer blocks h[:14], each followed by a GiveMeFirst adapter.
# The submodules are shared with `model`, not copied.
model1 = nn.Sequential(
    model.transformer.wte,
    model.transformer.drop,
    *add_give_me_first_layers(model.transformer.h[:14]),
)

In [9]:
# Stage 1 of the split model: the remaining blocks h[14:] (each followed by a
# GiveMeFirst adapter), then keep only the last sequence position before the
# final layer norm and the classification head.
model2 = nn.Sequential(
    *add_give_me_first_layers(model.transformer.h[14:]),
    GiveMeLastSeq(),
    model.transformer.ln_f,
    model.score,
)

In [19]:
# Run the same token ids through the two stages back to back, on the CPU.
# Device placement does not need to happen under no_grad, so it is done first.
model1.cpu()
model2.cpu()

with torch.no_grad():
    hidden = model1(token_ids)
    output = model2(hidden)

# Must be the last top-level expression of the cell to be displayed; inside
# the `with` block it was evaluated and silently discarded.
output.shape

In [36]:
# Scratch check: embed the tokens, run them through block 0 only, and keep
# element 0 of what the block returns.
first = GiveMeFirst()
first(
    model.transformer.h[0](
        model.transformer.wte(token_ids)
    )
)

tensor([[[-0.0968, -0.9653, -0.1791,  ..., -0.2034, -0.4834, -0.0325],
         [ 1.3574,  0.1792,  0.8643,  ...,  0.9263, -0.7329, -0.4111],
         [ 0.4150, -0.4248, -0.1935,  ...,  1.4170, -0.8735, -0.2559],
         ...,
         [-0.1338, -0.1221, -0.0857,  ..., -0.0287,  0.1124,  0.0516],
         [-0.3884, -0.5869,  0.2297,  ..., -1.0078,  0.7026,  0.1633],
         [ 0.2778,  0.2360, -1.0928,  ...,  1.2744, -0.3684, -0.3665]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [20]:
# Logits produced by the model1 -> model2 pipeline; these are expected to
# match `output.logits` of the unsplit model for the same token ids.
output

tensor([[0.5029, 0.1519]])

In [16]:
# One target device per pipeline stage.
DEVICES = [
    "cuda:0",
    "cuda:1",
]

# Module.to() moves the module in place and returns it, so the move and the
# save can be written as one call. The whole module object is pickled, not
# just a state_dict.
torch.save(model1.to(DEVICES[0]), "model/gptj-0.pt")

In [1]:
# Same as for stage 0: move the second stage to its device and pickle the
# whole module. Requires `model2` and `DEVICES` from the earlier cells, so run
# the notebook top to bottom in one kernel session.
torch.save(model2.to(DEVICES[1]), "model/gptj-1.pt")

NameError: name 'model2' is not defined

In [38]:
from torch.utils.data import DataLoader
import torchtext

# Maximum number of tokens kept per review (default for load_data).
SEQ_LEN = 128
# Number of reviews per batch handed to the model.
MICROBATCH_SIZE = 1

tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
# Reuse the end-of-sequence token as the padding token so that
# padding='longest' can be used when tokenising a batch.
tokenizer.pad_token = tokenizer.eos_token

def wrap_loader(dataloader, device, max_seq_len):
    """Turn raw (ratings, texts) batches into (labels, token ids) tensors.

    Args:
        dataloader: iterable yielding ``(targets, inputs)`` pairs, where
            ``targets`` is a sequence of rating values and ``inputs`` a
            sequence of texts.
        device: device on which both tensors are created.
        max_seq_len: texts are truncated to at most this many tokens.

    Yields:
        ``(targets_tensor, tokens_tensor)``, both ``torch.long``. A rating
        equal to ``'pos'`` becomes 1, anything else 0. Token ids are padded
        to the longest text in the batch.

    Uses the module-level ``tokenizer``. Only ``input_ids`` is kept from the
    tokenizer output, so no attention mask is passed on.
    """
    for ratings, texts in dataloader:
        labels = torch.tensor(
            [rating == 'pos' for rating in ratings],
            dtype=torch.long,
            device=device,
        )
        encoding = tokenizer(
            list(texts),
            padding='longest',
            max_length=max_seq_len,
            truncation=True,
        )
        token_ids = torch.tensor(encoding.input_ids, dtype=torch.long, device=device)
        yield labels, token_ids

def load_data(device, batch_size, max_seq_len=SEQ_LEN, shuffle_train=False):
    """Build generators of (labels, token ids) batches for the IMDB dataset.

    Args:
        device: device on which the batch tensors are created.
        batch_size: number of reviews per batch.
        max_seq_len: reviews are truncated to at most this many tokens.
        shuffle_train: whether to shuffle the training split. Defaults to
            False, the value previously hard-coded here for debugging; pass
            True for real training runs.

    Returns:
        ``(train_batches, test_batches)``: two generators produced by
        ``wrap_loader``. Generators are single-pass, so call ``load_data``
        again to iterate over the data a second time.
    """
    data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))
    # The training split is materialised into a list (presumably so that it
    # can be shuffled); the test split is passed to DataLoader as returned.
    train_dataloader = DataLoader(list(data_train), batch_size=batch_size, shuffle=shuffle_train)
    test_dataloader = DataLoader(data_test, batch_size=batch_size)

    def wrap(data):
        # Same tensor conversion settings for both splits.
        return wrap_loader(data, device=device, max_seq_len=max_seq_len)

    return wrap(train_dataloader), wrap(test_dataloader)

# Build the batch generators on the CPU, MICROBATCH_SIZE reviews per batch.
# Both are single-pass generators: re-run this cell to start over.
training_data, test_data = load_data(device="cpu", batch_size=MICROBATCH_SIZE)

In [39]:

# Reference run of the unsplit model on one real batch, on the CPU in eval
# mode. Both Module.cpu() and Module.eval() return the module, so they chain.
model.cpu().eval()
loss_fn = nn.CrossEntropyLoss()

# Consumes one batch from the generator; re-running this cell moves on to the
# next batch rather than repeating this one.
targets, inputs = next(training_data)
print("DEBUGGING: input[0,:5]", inputs[0,:5])
print(f"{inputs.shape=}")

output = model(inputs)
loss = loss_fn(output.logits, targets)
print(f"{loss=} {output.logits=}")

DEBUGGING: input[0,:5] tensor([   40, 26399,   314,  3001,   327])
inputs.shape=torch.Size([1, 128])
loss=tensor(3.5530, grad_fn=<NllLossBackward0>) output.logits=tensor([[-1.7711,  1.7528]], grad_fn=<IndexBackward0>)
