In [95]:
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
import wandb
import json
import random
from torch.utils.data import random_split, DataLoader

In [73]:
class RewardModelDataset(Dataset):
    def __init__(self, json_file):
        self.data = self.load_data_from_file(json_file)

    def load_data_from_file(self, json_file):
        with open(json_file, 'r') as file:
            data = [json.loads(line) for line in file]
        return data

    def __len__(self):
        return sum(len(sample["label"]["steps"]) for sample in self.data)

    def get_random_completion(self, step):
        completions = step["completions"]
        return random.choice(completions)

    def __getitem__(self, idx):
        # Find the sample and completion index for the given global index
        sample_idx = 0
        while idx >= len(self.data[sample_idx]["label"]["steps"]):
            idx -= len(self.data[sample_idx]["label"]["steps"])
            sample_idx += 1

        sample = self.data[sample_idx]
        question = sample["question"]["problem"]
        steps = sample["label"]["steps"][:idx + 1]  # Include completions up to the current step

        # Create context by concatenating question, completions, and ratings of all previous steps
        context = question
        targets = []

        for i, step in enumerate(steps):
            completion = self.get_random_completion(step)
            context += f" <[STEP]> {completion['text']} <[RATING]> {completion['rating']}"
            targets.append(step["completions"][0]["rating"])

        # Predict only the last rating for the current step
        target = targets[-1]

        return {"context": context, "target": target}

In [None]:
train_path = "prmdata/train.jsonl"
test_path = "prmdata/test.jsonl"


reward_model_dataset = RewardModelDataset(train_path)
validation_size = int(0.1 * len(reward_model_dataset))
train_size = len(reward_model_dataset) - validation_size

# Split the dataset into train and validation sets
train_dataset, validation_dataset = random_split(reward_model_dataset, [train_size, validation_size])

# Create DataLoader instances for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)


test_set = RewardModelDataset("prmdata/test.jsonl")
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=64, shuffle=True)

In [90]:
print(train_set[42]["context"])

Let $\alpha$ and $\beta$ be the roots of $x^2 + px + 1 = 0,$ and let $\gamma$ and $\delta$ are the roots of $x^2 + qx + 1 = 0.$  Express
\[(\alpha - \gamma)(\beta - \gamma)(\alpha + \delta)(\beta + \delta)\]in terms of $p$ and $q.$ <[STEP]> I notice that both equations have the same constant term, $1,$ which means that the product of the roots in each equation is also $1.$ <[RATING]> 1 <[STEP]> That is, $\alpha \beta = 1$ and $\gamma \delta = 1.$ <[RATING]> 1 <[STEP]> This suggests that I can use the difference of squares identity to simplify the expression I want to find. <[RATING]> 0 <[STEP]> Specifically, I can rewrite $(\alpha - \gamma)(\beta - \gamma)$ as $(\alpha \beta - \gamma (\alpha + \beta) + \gamma^2),$ and $(\alpha + \delta)(\beta + \delta)$ as $(\alpha \beta + \delta (\alpha + \beta) + \delta^2).$ <[RATING]> 0 <[STEP]> Then, multiplying these two expressions, I get $(\alpha \beta - \gamma (\alpha + \beta) + \gamma^2)(\alpha \beta + \delta (\alpha + \beta) + \delta^2)$ <[RA

In [98]:
train_set[42]["target"]

-1

In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped", revision="step143000")
# model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped", revision="step143000")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ChaiML/gpt2_medium_retry_and_continue_12m_reward_model")
model = AutoModelForSequenceClassification.from_pretrained("ChaiML/gpt2_medium_retry_and_continue_12m_reward_model")

model.to(device)

tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=1024, out_features=2, bias=False)
)

In [None]:
wandb.init(project='fine_tuned_prm', name='continuous-loss-plotting')

# Set the model to training mode
model.train()

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# Lists to store the training losses
losses = []

# Training loop
epochs = 5
for epoch in range(epochs):
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        sample = batch[0]  # Assuming each batch contains one sample

        context = sample["context"]
        target = sample["target"]

        # Tokenize the context
        inputs = tokenizer(context, return_tensors="pt", padding=True, truncation=True)

        # Move tensors to GPU
        inputs = {key: value.to(device) for key, value in inputs.items()}
        target = target.to(device)

        # Forward pass
        outputs = model(**inputs)

        # Compute loss
        loss = criterion(outputs.logits, target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Append the loss to the list
        losses.append(loss.item())

        # Log the loss to wandb
        wandb.log({'loss': loss.item()})

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_prm")
tokenizer.save_pretrained("fine_tuned_prm")

# Close wandb run
wandb.finish()


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))