In [1]:
from bisect import bisect_left
from itertools import accumulate
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
import json
import random
import torch
import wandb
import gc

In [3]:
class RewardModelDataset(Dataset):
    
    def __init__(self, json_file):
        self.data = [sample for sample in self.load_data_from_file(json_file) if len(sample["label"]["steps"]) > 0]
        
        self.total_length = sum(len(sample["label"]["steps"]) for sample in self.data)
        
        self.slots = list(accumulate([len(sample["label"]["steps"]) for sample in self.data]))
        self.ctx_target_pairs = []
        
        def find_pos(numbers, x):
            if x < numbers[0]:
                return 0  # Insert at the beginning
            elif x > numbers[-1]:
                return len(numbers)  # Insert at the end
            else:
                return bisect_left(numbers, x)
            
        def numerical_to_one_hot(target):
            # Convert numerical target to one-hot encoding
            if target == -1:
                return torch.tensor([1, 0, 0], dtype=torch.float32)
            elif target == 0:
                return torch.tensor([0, 1, 0], dtype=torch.float32)
            elif target == 1:
                return torch.tensor([0, 0, 1], dtype=torch.float32)
            else:
                return torch.tensor([0, 1, 0], dtype=torch.float32)
     
        
        for idx in tqdm(range(self.total_length)):
            slot_idx = find_pos(self.slots, idx)
            sample_idx = self.slots[slot_idx] - idx 
            sample = self.data[sample_idx]

            question = sample["question"]["problem"]
            steps = sample["label"]["steps"][:sample_idx + 1]
            context = question+"[ANS]"

            targets = []

            for step in steps[:-1]:
                completion = random.choice(step["completions"])
                context += f"  [SEP]{completion['text']} <[RATING]> {completion['rating']}"

            #print(steps[-1])
            final_ctx = random.choice(steps[-1]["completions"])

            context += f"  [SEP]{final_ctx['text']} <[RATING]>"
            target = numerical_to_one_hot(final_ctx['rating'])
            self.ctx_target_pairs.append({"context": context, "target": target})



        
    def load_data_from_file(self, json_file):
        with open(json_file, 'r') as file:
            data = [json.loads(line) for line in file]
        return data

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        return self.ctx_target_pairs[idx]

In [4]:
train_path = "prmdata/train.jsonl"
test_path = "prmdata/test.jsonl"


reward_model_dataset = RewardModelDataset(train_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 696998/696998 [00:05<00:00, 129574.94it/s]


In [5]:
validation_size = int(0.1 * len(reward_model_dataset))
train_size = len(reward_model_dataset) - validation_size

# Split the dataset into train and validation sets
train_dataset, validation_dataset = random_split(reward_model_dataset, [train_size, validation_size])

In [6]:
# Create DataLoader instances for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=4, shuffle=False)

In [7]:
test_set = RewardModelDataset("prmdata/test.jsonl")
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=4, shuffle=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19456/19456 [00:00<00:00, 203146.05it/s]


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
num_classes = 3

In [10]:
tokenizer = AutoTokenizer.from_pretrained("ChaiML/reward_models_100_170000000_cp_498032")
model = AutoModelForSequenceClassification.from_pretrained("ChaiML/reward_models_100_170000000_cp_498032")
model.score = torch.nn.Linear(768, num_classes)
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=True)
)

In [11]:
# Assuming tokenizer is an instance of your tokenizer class
tokenizer.pad_token = tokenizer.eos_token

In [12]:
wandb.init(project='prm', name='continuous-loss-plotting')

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

# Lists to store the training losses
train_losses = []

validation_losses = []

[34m[1mwandb[0m: Currently logged in as: [33mram77[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Training loop
model.train()  # Set the model to training mode

epochs = 10
for epoch in range(epochs):
    # Training
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - Training"):
        context = batch["context"]
        target = batch["target"].to(device)

        # Tokenize the context
        inputs = tokenizer(context, return_tensors="pt", padding=True, truncation=True)

        # Move tensors to GPU
        inputs = {key: value.to(device) for key, value in inputs.items()}
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs)
        loss = criterion(outputs.logits, target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Log the loss to wandb
        wandb.log({'train_batch_loss': loss.item()})
        
        del loss, context, target, outputs, inputs
        # Clear GPU memory
       
        torch.cuda.empty_cache()

    average_train_loss = total_train_loss / len(train_loader)
    train_losses.append(average_train_loss)
    wandb.log({'train_loss': average_train_loss})
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {average_train_loss}")

# Save the fine-tuned model
model.save_pretrained("prm")
tokenizer.save_pretrained("prm")

# Close wandb run
wandb.finish()

Epoch 1/10 - Training:   0%|                                                                                                                                                | 28/156825 [00:06<10:08:03,  4.30it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 11.76 GiB total capacity; 10.86 GiB already allocated; 69.12 MiB free; 11.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF