In [51]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.optim import AdamW

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset

from tqdm import tqdm

In [52]:
device = 'cuda'

# Model setup

In [None]:
vocab_size = 50257

class TweakActivations(nn.Module):
    def __init__(self):
        super().__init__()

        self.k = nn.Parameter(torch.randn(vocab_size))
        self.p = nn.Parameter(torch.randn(vocab_size)) 

    def forward(self, activations):
        # .shape = (batch, seq_len, vocab_size)
        # a = a * (1 + sin(idx*k)/p)

        indices = torch.arange(1, vocab_size+1, 1, device=device).float()
        x = 1 + torch.sin(indices*self.k) / self.p

        last_token_modified = activations[:, -1, :] * x  # (batch, vocab_size)

        modified_actv = torch.cat(
            [activations[:, :-1, :], last_token_modified.unsqueeze(1)],
            dim=1
        )
        return modified_actv

class GPT2Modified(GPT2LMHeadModel):
    def __init__(self):
        super().__init__(GPT2LMHeadModel.from_pretrained("gpt2").config)

        base = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

        self.tweak = TweakActivations().to(device)

        self.transformer = base.transformer
        self.lm_head = nn.Sequential(
            base.lm_head,
            self.tweak
        )

        self.transformer.requires_grad_(False)
        self.lm_head.requires_grad_(False)
        self.tweak.requires_grad_(True)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2Modified()
model.config.loss_type = "ForCausalLMLoss"

In [54]:
# sanity check 
def infer(model, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_new_tokens=8)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

prompt = "my favorite italian food is"
print("\n")
print(infer(model, prompt))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




my favorite italian food is Collect Collect Collect Collect Collect Collect Collect Collect


In [55]:
model.tweak.k, model.tweak.p

(Parameter containing:
 tensor([-0.7382, -1.8602, -0.1141,  ..., -0.2604, -0.5621,  1.0173],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([-0.8936, -0.3015,  0.2686,  ..., -0.2973,  0.3002, -0.5685],
        device='cuda:0', requires_grad=True))

# Find parameters

In [56]:
ds = load_dataset("Hello-SimpleAI/HC3", "open_qa")

df_train = ds["train"].to_pandas()[["question", "human_answers"]]
df_train["human_answers"] = df_train["human_answers"].map(lambda x: x[0])

df_train

Unnamed: 0,question,human_answers
0,what composer used sound mass,"Composers and works include Barbara Kolb , Pau..."
1,where did the persian war take place,The Greco-Persian Wars (also often called the ...
2,what are add ons,"Plug-in (computing) , a piece of software whic..."
3,how does a dredge work?,Dredging is an excavation activity or operatio...
4,what classes are considered humanities,The humanities are academic disciplines that s...
...,...,...
1182,when did secretariat win,"Secretariat (March 30, 1970 – October 4, 1989)..."
1183,what is a full job time??,Full-time employment is employment in which a ...
1184,what are the three primary colors in the subtr...,"The overlapping subtractive yellow, cyan and r..."
1185,what are layers of the ionosphere,The ionosphere is a region of the upper atmosp...


In [57]:
class HC3Dataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.max_length = 128
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def tokenize(self, series):
        return self.tokenizer(
            series,
            padding="max_length",
            truncation=True, 
            max_length=self.max_length,
            return_tensors="pt",
            return_attention_mask=True,
            return_token_type_ids=False, 
        )

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = row["question"] + " " + row["human_answers"]
        encoding = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone() # predict same stuff
        }

    def __len__(self):
        # 1187 original length
        return len(self.df)
    
train_dataset = HC3Dataset(df_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

train_dataset[0]["input_ids"].shape

torch.Size([128])

In [61]:
optimizer = AdamW(model.parameters(), lr=5e-6)

model.train()
for epoch in range(5):
    len_dataloader = len(train_loader)
    average_loss = 0

    tk0 = tqdm(enumerate(train_loader), total=len_dataloader)
    for batch_number, batch in tk0:
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()
        labels = batch["labels"].cuda()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # 2. backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # 3. stats
        average_loss += loss.cpu().item()
        tk0.set_postfix(
            loss=average_loss / (batch_number + 1), epoch=epoch
        )

100%|██████████| 297/297 [00:35<00:00,  8.30it/s, epoch=0, loss=1.98]
100%|██████████| 297/297 [00:35<00:00,  8.33it/s, epoch=1, loss=1.72]
100%|██████████| 297/297 [00:35<00:00,  8.34it/s, epoch=2, loss=1.61]
100%|██████████| 297/297 [00:32<00:00,  9.00it/s, epoch=3, loss=1.54]
100%|██████████| 297/297 [00:29<00:00, 10.02it/s, epoch=4, loss=1.5] 


In [62]:
model.tweak.k, model.tweak.p

(Parameter containing:
 tensor([-0.7380, -1.8599, -0.1141,  ..., -0.2603, -0.5619,  1.0170],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([-0.8935, -0.3014,  0.2685,  ..., -0.2972,  0.3001, -0.5683],
        device='cuda:0', requires_grad=True))

In [60]:
print(infer(model, prompt))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


my favorite italian food is pubs pubs pubs pubs pubs pubs pubs pubs
