<a href="https://colab.research.google.com/github/rishabhranawat/prm-explorations/blob/main/Evaluate_Your_Reward_Model_for_PRM_Token_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the requirements
# file and the model checkpoint from paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the dependencies listed in the requirements file stored on Drive.
# The %%capture cell magic above must stay on the first line of the cell; it
# suppresses pip's output.
!pip install -r /content/drive/MyDrive/colab/requirements.txt

In [None]:
import os
import time

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

from datasets import load_dataset

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from peft import LoraConfig, get_peft_model, TaskType
from trl import PRMConfig, PRMTrainer
import wandb

# Ask the PyTorch CUDA allocator to use expandable segments.
# NOTE(review): this assignment runs AFTER `import torch` in this cell, not
# before it. Presumably the setting is only read when CUDA memory is first
# allocated, in which case this still takes effect — TODO confirm, or move
# this line above the torch import.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
#@title Specify Model and Evaluation Dataset

# Path (on the mounted Drive) of the checkpoint directory to evaluate.
model_name = "/content/drive/MyDrive/qwen-toy-rm-peft/v1/checkpoint-1000/" #@param {type:"string"}
# Dataset identifier passed to `load_dataset`.
dataset_name = "trl-lib/prm800k" #@param {type:"string"}
# Split expression for the evaluation data: the first 10% of the test split.
test_split = "test[:10%]" #@param {type:"string"}

In [None]:
# Load the checkpoint as a two-label token-classification model.
# NOTE(review): the saved output of this cell warns that `score.bias` and
# `score.weight` were newly initialized for the base model; confirm that the
# checkpoint actually restores the trained classification head.
tuned_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Tokenizer from the same checkpoint, configured for left padding so the last
# position of every padded sequence is a real token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    truncation=True,
    padding_side='left',
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Some weights of Qwen2ForTokenClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Request the train split and the configured test slice in a single call; the
# result is unpacked into one dataset per requested split, in the same order.
# NOTE(review): `train_dataset` is not used by the cells visible here —
# confirm it is needed before paying for the download.
splits = ["train", test_split]
train_dataset, test_dataset = load_dataset(dataset_name, split=splits)

In [None]:
def flatten_steps_classification(examples, separator="\n"):
    """Expand each example into one (text, label) pair per completion step.

    For an example with k completions, k texts are produced. The i-th text is
    the prompt followed by the first i completions, joined with ``separator``
    and terminated by one trailing ``separator``. Its label is the label of
    the i-th completion.

    Args:
        examples: Iterable of mappings with "prompt", "completions" and
            "labels" entries.
        separator: String placed between the prompt and the steps, and
            appended once at the end of every text.

    Returns:
        A dict with two parallel lists: "steps" (the cumulative texts) and
        "labels" (the label of the last step included in each text).
    """
    flat_texts = []
    flat_labels = []

    for record in examples:
        completions = record["completions"]
        for last in range(len(completions)):
            # Prompt plus every step up to and including position `last`.
            pieces = [record["prompt"], *completions[: last + 1]]
            flat_texts.append(separator.join(pieces) + separator)
            flat_labels.append(record["labels"][last])

    return {"steps": flat_texts, "labels": flat_labels}

# Expand the test split into one cumulative prompt+steps text per completion
# step, each paired with that step's label.
flattened_test_dataset = flatten_steps_classification(test_dataset)

In [None]:
def collate_fn(batch):
    """Tokenize a batch of (text, label) pairs and left-pad the token ids.

    Uses the module-level ``tokenizer``. Each text is tokenized on its own,
    then all sequences are padded on the left with ``tokenizer.pad_token_id``
    up to the longest sequence in this batch.

    Args:
        batch: Sequence of ``(text, label)`` pairs.

    Returns:
        ``(padded_inputs, labels)``: a batch-first tensor of padded token ids
        and a tensor holding one label per pair. No attention mask is
        returned.
    """
    token_tensors = []
    for text, _ in batch:
        token_tensors.append(torch.tensor(tokenizer(text).input_ids))

    labels = torch.tensor([target for _, target in batch])

    # Left padding keeps the final real token of every row in the last column.
    padded_inputs = pad_sequence(
        token_tensors,
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
        padding_side="left",
    )
    return padded_inputs, labels

# Pull the parallel text/label lists out of the flattened test set.
steps, labels = flattened_test_dataset["steps"], flattened_test_dataset["labels"]

# Serve (text, label) pairs in their original order; tokenization and padding
# happen per batch inside collate_fn.
test_dataloader = DataLoader(
    list(zip(steps, labels)),
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=64,
)

In [None]:
def _left_padding_mask(input_ids, pad_token_id):
  """Build an attention mask (1 = attend) that hides only leading padding."""
  if pad_token_id is None:
    return torch.ones_like(input_ids)
  # A position counts as real once any non-pad token has been seen at or
  # before it, so genuine occurrences of the pad/EOS id inside the text stay
  # visible even though the pad token is the EOS token.
  mask = (input_ids != pad_token_id).long().cumsum(dim=1).clamp(max=1)
  # Never leave a row fully masked: the last position is the one being scored.
  mask[:, -1] = 1
  return mask


def batched_evaluate(model, tokenizer, test_dataloader, device=None):
  """Evaluate a token-classification model on left-padded batches.

  For every batch the model's logits at the last sequence position are
  reduced with argmax to a boolean prediction and compared with the batch
  labels. Running accuracy/F1 and the mean batch time are printed every 10
  batches; the metrics over the full dataloader are returned.

  Args:
    model: Model called as ``model(input_ids, attention_mask=...)`` whose
      output exposes ``.logits`` with the sequence on dimension 1.
    tokenizer: Tokenizer whose ``pad_token_id`` was used to left-pad the
      batches; used here to mask the padding.
    test_dataloader: Iterable yielding ``(input_ids, labels)`` tensor pairs.
    device: Device to run on. Defaults to "cuda" when available, else "cpu".

  Returns:
    dict with "accuracy" and "f1" over every evaluated example (both NaN if
    the dataloader yielded no batches).
  """
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"

  all_predictions = []
  all_labels = []
  time_per_batch = []
  model = model.to(device)
  model.eval()
  pad_token_id = tokenizer.pad_token_id

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for i, (input_batch, label_batch) in enumerate(test_dataloader):
      start_time = time.time()
      # Report only once there is something to score; at batch 0 the lists
      # are empty and the statistics would be NaN / undefined.
      if i % 10 == 0 and all_predictions:
        print(f"Batch {i}, Average Time per 10 batches {np.mean(time_per_batch)}")
        acc = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions)
        print(f"Stats as of Batch {i}, Accuracy: {acc}, F1: {f1}")

        time_per_batch = []

      input_batch = input_batch.to(device, non_blocking=True)
      attention_mask = _left_padding_mask(input_batch, pad_token_id)
      outputs = model(input_batch, attention_mask=attention_mask)

      # With left padding the last position is the final real token. Softmax
      # is monotonic, so argmax over the raw logits gives the same class.
      batch_preds = outputs.logits[:, -1].argmax(dim=-1).bool()

      all_predictions.extend(batch_preds.cpu().tolist())
      all_labels.extend(label_batch.bool().tolist())
      time_per_batch.append(time.time() - start_time)

  if not all_predictions:
    return {"accuracy": float("nan"), "f1": float("nan")}

  # Final metrics include the trailing batches that the periodic report above
  # never covers.
  return {
      "accuracy": accuracy_score(all_labels, all_predictions),
      "f1": f1_score(all_labels, all_predictions),
  }

# Run the evaluation over the test dataloader with the loaded checkpoint;
# the function prints running accuracy/F1 as it goes.
batched_evaluate(tuned_model, tokenizer, test_dataloader)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Batch 0, Average Time per 10 batches nan
Accuracy: 0.671875, F1: 0.7961165048543689
Accuracy: 0.5546875, F1: 0.7015706806282722
Accuracy: 0.5572916666666666, F1: 0.7038327526132404
Accuracy: 0.64453125, F1: 0.7775061124694377
Accuracy: 0.684375, F1: 0.8068833652007649
Accuracy: 0.6901041666666666, F1: 0.8120063191153238
Accuracy: 0.6986607142857143, F1: 0.8183041722745625
Accuracy: 0.69921875, F1: 0.8188235294117647
Accuracy: 0.7170138888888888, F1: 0.8317853457172343
Accuracy: 0.721875, F1: 0.8351851851851851
Batch 10, Average Time per 10 batches 7.7063373804092405
Accuracy: 0.7230113636363636, F1: 0.8359966358284272
Accuracy: 0.73046875, F1: 0.8413793103448276
Accuracy: 0.7223557692307693, F1: 0.8355871886120997
Accuracy: 0.7198660714285714, F1: 0.8341044282881692
Accuracy: 0.71875, F1: 0.8333333333333334
Accuracy: 0.7158203125, F1: 0.831304347826087
Accuracy: 0.7196691176470589, F1: 0.833968426782798
Accuracy: 0.7170138888888888, F1: 0.8321318228630278
Accuracy: 0.7162828947368421, 