In [1]:
from bisect import bisect_left
from itertools import accumulate
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc
import json
import random
import torch
import wandb

In [3]:
class RewardModelDataset(Dataset):
    
    def __init__(self, json_file,tokenizer):
        self.data = [sample for sample in self.load_data_from_file(json_file) if len(sample["label"]["steps"]) > 0]
        
        self.total_length = sum(len(sample["label"]["steps"]) for sample in self.data)
        
        self.slots = list(accumulate([len(sample["label"]["steps"]) for sample in self.data]))
        self.ctx_target_pairs = []
        self.tokenizer = tokenizer
        
        def find_pos(numbers, x):
            if x < numbers[0]:
                return 0  # Insert at the beginning
            elif x > numbers[-1]:
                return len(numbers)  # Insert at the end
            else:
                return bisect_left(numbers, x)
            
        def tokenize_and_detokenize(sentence):
            tokens = tokenizer.tokenize(sentence)
            selected_tokens = tokens[:random.randint(0, len(tokens))]
            detokenized_prefix = tokenizer.convert_tokens_to_string(selected_tokens)
            return detokenized_prefix

        
        for idx in tqdm(range(self.total_length)):
            slot_idx = find_pos(self.slots, idx)
            sample_idx = self.slots[slot_idx] - idx 
            sample = self.data[sample_idx]

            question = sample["question"]["problem"]
            steps = sample["label"]["steps"][:sample_idx + 1]
            context = question+"[ANS]"

            targets = []

            for step in steps[:-1]:
                completion = random.choice(step["completions"])
                context += f"  [SEP]{completion['text']} <[RATING]> {completion['rating']}"

            final_ctx = random.choice(steps[-1]["completions"])
            
            # randomly split the context as a prefix
            rnd_ctx = tokenize_and_detokenize(final_ctx['text'])
            context += f"  [SEP]{rnd_ctx} <[RATING]>"
            target = ({-1: 0.0, 0: 0.5, 1: 1.0 }.get(final_ctx['rating'], 0.0)*idx)/self.total_length # trick to convert feedback to reward
            self.ctx_target_pairs.append({"context": context, "target": target})


    def load_data_from_file(self, json_file):
        with open(json_file, 'r') as file:
            data = [json.loads(line) for line in file]
        return data

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        return self.ctx_target_pairs[idx]

In [9]:
model_name = "distilgpt2"
out_size = 768

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [14]:
model = AutoModelForCausalLM.from_pretrained(model_name)
model.lm_head = torch.nn.Linear(out_size, 1)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=1, bias=True)
)

In [None]:
train_path = "prmdata/train.jsonl"
val_path = "prmdata/test.jsonl"

train_dataset = RewardModelDataset(train_path,tokenizer)
validation_dataset = RewardModelDataset(val_path,tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) # change to 64
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)

 16%|█████████████████████████▌                                                                                                                                         | 109120/696998 [00:08<00:45, 12935.22it/s]

In [16]:
wandb.init(project='qstar_rm', name='continuous-loss-plotting')

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

train_losses = []

[34m[1mwandb[0m: Currently logged in as: [33mram77[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
model.train()

epochs = 100

for epoch in range(epochs):
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - Training"):
        context = batch["context"]
        target = batch["target"].to(device)

        inputs = tokenizer(context, return_tensors="pt", padding=True, truncation=True)

        inputs = {key: value.to(device) for key, value in inputs.items()}
        optimizer.zero_grad()

        outputs = model(**inputs)
        loss = criterion(outputs.logits, target)

        loss.backward()
        
        optimizer.step()

        total_train_loss += loss.item()

        wandb.log({'train_batch_loss': loss.item()})
        
        del loss, context, target, outputs, inputs
       
        torch.cuda.empty_cache()

    average_train_loss = total_train_loss / len(train_loader)
    train_losses.append(average_train_loss)
    wandb.log({'train_loss': average_train_loss})
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {average_train_loss}")

model.save_pretrained("qstar_rm")
tokenizer.save_pretrained("qstar_rm")

wandb.finish()

Epoch 1/100 - Training:   0%|                                                                                                                                                            | 0/43563 [00:00<?, ?it/s]Using pad_token, but it is not set yet.
Epoch 1/100 - Training:   0%|                                                                                                                                                            | 0/43563 [00:00<?, ?it/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.