In [1]:
!pip install tf-keras 
!pip install flash-attn 
!pip install wandb 
!pip install 'accelerate>=0.26.0'
!pip install transformers 
!pip install datasets 

import random
import copy
import re
import os
import numpy as np
import wandb

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

from transformers import AutoModelForCausalLM, AutoTokenizer

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_random_seed(42)

os.environ["WANDB_API_KEY"] = "d687aedb7a16ede6abad66b2adade95edf09f216"
os.environ["WANDB_PROJECT"] = "test"

Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<2.19,>=2.18
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting ml-dtypes<0.5.0,>=0.4.0
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h5py>=3.11.0
  Downloading h5py-3.13.0-cp310-cp310-manylinu

In [2]:
from datasets import load_dataset, concatenate_datasets

In [3]:
def extract_boxed_answer(text):
    match = re.search(
        r'\\boxed{\s*(-?\d+(?:\.\d+)?|-\s*\\frac{\s*(\d+)\s*}{\s*(\d+)\s*}|\\frac{\s*(\d+)\s*}{\s*(\d+)\s*})\s*}', 
        text
    )
    if match:
        if match.group(2) and match.group(3):  # Negative fraction case (-\frac{a}{b})
            return f"-{match.group(2)}/{match.group(3)}"
        elif match.group(4) and match.group(5):  # Positive fraction case (\frac{a}{b})
            return f"{match.group(4)}/{match.group(5)}"
        else:  # Whole number or decimal case
            return match.group(1).replace(" ", "")  # Remove spaces for consistency

    return None

import re
from fractions import Fraction

def extract_last_number(text):
    text = text.replace('$', '').replace('%', '')
    pattern = r'(-?\d+\.\d+|-?\d+/\d+|-?\d+)'
    matches = re.findall(pattern, text)
    if matches:
        last_match = matches[-1]
        if '/' in last_match:
            return float(Fraction(last_match))
        else:
            return float(last_match)
    return None


In [4]:
SYSTEM_PROMPT = r"""<｜begin_of_sentence｜>  
Below is a Math problem that is on the difficulty level of national olympiads. 
You are a international gold medalist in Math, so you should be able to solve this problem. 
You should expect the difficulty of the problems to be roughly at the level of a national Olympiad, 
although some problems are slightly easier and some are slightly harder. 
The problems are all in LaTeX format. 
Answers may require basic computations, e.g., square roots, absolute values. 
Provide the final answer in the end as a numerical value.
This is mandatory and you will be punished for not doing so.
Now here is your question: 
### Problem <｜User｜> {} -------------- 
Provide the answer here: 
### Answer <｜Assistant｜> <think> {}  answer: {} </think> """

# r"""<｜begin_of_sentence｜>  Below is a Math problem that is on the difficulty level of national olympiads. You are a international gold medalist in Math, so you should be able to solve this problem. You should expect the difficulty of the problems to be roughly at the level of a national Olympiad, although some problems are slightly easier and some are slightly harder. The problems are all in LaTeX format. Answers may require basic computations, e.g., square roots, absolute values. Provide the final answer inside $\boxed{{}}. Example: $\boxed{{9}}, $\boxed{\frac{13}{6}}, $\boxed{-7}. This is mandatory and you will be punished for not doing so. Now here is your question: ### Problem <｜User｜> {} -------------- Provide the answer here: ### Answer <｜Assistant｜> <think> {} </think> """

In [5]:
EOS_TOKEN = "<｜end_of_sentence｜>"
def formatting_prompts_func(examples, split):
    problems = examples["problem"]
    solutions = examples["solution"]
    texts = []
    count = 0
    if split == "train":
        for problem, solution in zip(problems, solutions):
            if solution.count("$\\boxed{") == 1:
                answer = extract_boxed_answer(solution)
                if answer is not None:
                    text = SYSTEM_PROMPT.format(problem, solution, answer) + EOS_TOKEN
                    texts.append({"text" : text})
                    count = count + 1
    elif split == "test":
        for problem, solution in zip(problems, solutions):
            if solution.count("$\\boxed{") == 1:
                answer = extract_boxed_answer(solution)
                if answer is not None:
                    text = SYSTEM_PROMPT.format(problem, "", "") + EOS_TOKEN
                    texts.append({"text" : text, "answer": answer})
                    count = count + 1
    print("count", count)
    return texts

from datasets import load_dataset
train_dataset = load_dataset("AI-MO/NuminaMath-CoT", split = "train")
test_dataset = load_dataset("AI-MO/NuminaMath-CoT", split = "test")
combined_dataset = concatenate_datasets([train_dataset, test_dataset])

shuffled_dataset = combined_dataset.shuffle(seed=42)
final_dataset = shuffled_dataset.train_test_split(test_size=0.0001)
# print(type(train_dataset))

train_dataset = formatting_prompts_func(final_dataset["train"], "train")
test_dataset = formatting_prompts_func(final_dataset["test"], "test")
# test_dataset = load_dataset("AI-MO/NuminaMath-CoT", split = "test")
# test_dataset = formatting_prompts_func(test_dataset, "test")

# all_data = train_dataset + test_dataset
# random.shuffle(all_data)
# size_of_eval_data = 22334
# train_dataset = all_data[:size_of_eval_data]
# test_dataset = all_data[size_of_eval_data:]

README.md:   0%|          | 0.00/2.68k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/166k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/859494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

count 223313
count 29


In [6]:
def evaluate_model(model, tokenizer, eval_examples, device):
   model.eval()
   correct = 0
   total = len(eval_examples)
   print("\n" + "="*50)
   print("EVALUATION ON", total, "EXAMPLES")
   print("="*50)

   for example in eval_examples:
       full_prompt = example["text"]
       expected = example["answer"]

       inputs = tokenizer.encode(full_prompt, return_tensors="pt").to(device)
       with torch.no_grad():
           outputs = model.generate(
               inputs,
               max_new_tokens=32768,
               temperature=0.7,
               pad_token_id=tokenizer.pad_token_id,
               eos_token_id=tokenizer.eos_token_id,
               forced_eos_token_id=tokenizer.eos_token_id,
               early_stopping=False,
           )
       response = tokenizer.decode(outputs[0], skip_special_tokens=True)

       try:
           predicted = extract_boxed_answer(response)
           if predicted == expected:  
               is_correct = True
           else:
               pred_num = extract_last_number(response)
               is_correct = (pred_num is not None and expected is not None and
                           pred_num == expected)

           if is_correct:
               correct += 1
           print("\nResponse")
           print(response)
           print("\nExpected Answer:")
           print(expected)
           print("\nExtracted Answer:")
           print(predicted)
           print("\nCorrect:", "✓" if is_correct else "✗")
           print("-"*50)

       except Exception as e:
           print("\nFailed to parse model output for prompt:")
           print(full_prompt)
           print("Error:", e)
           print("-"*50)

   accuracy = (correct / total) * 100
   print(f"\nAccuracy: {accuracy:.2f}% ({correct}/{total})")
   print("="*50)

   model.train()
   return accuracy

In [7]:
def correctness_reward(prompts, completions, answer, **kwargs):
   responses = [completion[0]['content'] for completion in completions]
   extracted = [extract_boxed_answer(r) for r in responses]
   rewards = []
   for r, a in zip(extracted, answer):
       if r == a:  
           rewards.append(2.0)
       else:
           r_num = extract_last_number(str(r))
           a_num = extract_last_number(str(a))
           if r_num is not None and a_num is not None and r_num == a_num:
               rewards.append(1.5)
           else:
               rewards.append(0.0)
   completion_lengths = [len(response.split()) for response in responses]
   return rewards


def format_reward(completions, **kwargs):
   responses = [completion[0]['content'] for completion in completions]
   rewards = []
   format_scores = []
   for response in responses:
       score = 0.0
       if "<think>" in response: score += 0.2
       if "</think>" in response: score += 0.2
       if "\boxed" in response: score += 0.2
       rewards.append(score)
       format_scores.append(score)
   return rewards


def combined_reward(prompts, completions, answer):
   correctness_scores = correctness_reward(prompts=prompts, completions=completions, answer=answer)
   format_scores = format_reward(completions=completions)

   combined_rewards = []
   for c_score, f_score in zip(correctness_scores, format_scores):
       # Correctness score range: 0.0 to 2.0
       # Format score range: 0.0 to 0.6
       # Total range: 0.0 to 2.6
       combined_rewards.append(c_score + f_score)

   return combined_rewards

In [8]:
def selective_log_softmax(logits, input_ids):
    log_probs = nn.functional.log_softmax(logits, dim=-1)
    return log_probs.gather(dim=-1, index=input_ids.unsqueeze(-1)).squeeze(-1)

def compute_log_probs(model, input_ids, attention_mask, logits_to_keep):
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[:, :-1, :]
    input_ids = input_ids[:, -logits_to_keep:]
    logits = logits[:, -logits_to_keep:, :]
    return selective_log_softmax(logits, input_ids)

def create_completion_mask(completion_ids, eos_token_id):
    is_eos = completion_ids == eos_token_id
    eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=completion_ids.device)
    mask_exists = is_eos.any(dim=1)
    eos_idx[mask_exists] = is_eos.int().argmax(dim=1)[mask_exists]
    sequence_indices = torch.arange(is_eos.size(1), device=completion_ids.device).expand(is_eos.size(0), -1)
    return (sequence_indices <= eos_idx.unsqueeze(1)).int()

def generate_completions(model, tokenizer, prompts, num_generations=4, max_completion_length=32768):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
    prompt_ids = inputs["input_ids"].to(device)
    prompt_mask = inputs["attention_mask"].to(device)
    print(f"Input batch size: {prompt_ids.size(0)}, Device before model: {prompt_ids.device}")
    prompt_length = prompt_ids.size(1)
    prompt_ids = prompt_ids.repeat_interleave(num_generations, dim=0)
    prompt_mask = prompt_mask.repeat_interleave(num_generations, dim=0)
    outputs = model.generate(
        prompt_ids,
        attention_mask=prompt_mask,
        max_new_tokens=max_completion_length,
        do_sample=True,
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        early_stopping=False
    )
    print(f"Output batch size: {outputs.size(0)}, Device after model: {outputs.device}")
    completion_ids = outputs[:, prompt_length:]
    completion_mask = create_completion_mask(completion_ids, tokenizer.eos_token_id)
    return prompt_ids, prompt_mask, completion_ids, completion_mask

def generate_rollout_data(model, ref_model, tokenizer, batch_samples, num_generations, max_completion_length):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    prompts = [sample["text"] if isinstance(sample, dict) else sample[0] for sample in batch_samples]
    answers = [extract_boxed_answer(sample["text"]) if isinstance(sample, dict) else sample[1] for sample in batch_samples]
    with torch.no_grad():
        prompt_ids, prompt_mask, completion_ids, completion_mask = generate_completions(
            model, tokenizer, prompts, num_generations, max_completion_length
        )
        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)
        old_log_probs = compute_log_probs(model, input_ids, attention_mask, logits_to_keep)
        ref_log_probs = compute_log_probs(ref_model, input_ids, attention_mask, logits_to_keep)
    formatted_completions = [[{'content': tokenizer.decode(ids, skip_special_tokens=True)}] for ids in completion_ids]
    repeated_prompts = [p for p in prompts for _ in range(num_generations)]
    repeated_answers = [a for a in answers for _ in range(num_generations)]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "completion_mask": completion_mask,
        "old_log_probs": old_log_probs,
        "ref_log_probs": ref_log_probs,
        "formatted_completions": formatted_completions,
        "repeated_prompts": repeated_prompts,
        "repeated_answers": repeated_answers,
        "logits_to_keep": logits_to_keep,
        "batch_size": len(prompts),
        "num_generations": num_generations
    }

def grpo_loss(model, ref_model, rollout_data, tokenizer, reward_function, beta=0.01, epsilon=0.2):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    input_ids = rollout_data["input_ids"]
    attention_mask = rollout_data["attention_mask"]
    completion_mask = rollout_data["completion_mask"]
    logits_to_keep = rollout_data["logits_to_keep"]
    old_log_probs = rollout_data["old_log_probs"]
    ref_log_probs = rollout_data["ref_log_probs"]
    token_log_probs = compute_log_probs(model, input_ids, attention_mask, logits_to_keep)
    ratio = torch.exp(token_log_probs - old_log_probs)
    rewards = torch.tensor(
        reward_function(prompts=rollout_data["repeated_prompts"], completions=rollout_data["formatted_completions"], answer=rollout_data["repeated_answers"]),
        dtype=torch.float32,
        device=device
    )
    #print(f"Rewards: {rewards}")  # Debug rewards
    batch_size = rollout_data["batch_size"]
    num_generations = rollout_data["num_generations"]
    rewards = rewards.view(batch_size, num_generations)
    avg_reward = rewards.mean().item()
    print("Average Reward:", avg_reward)
    mean_rewards = rewards.mean(dim=1).repeat_interleave(num_generations)
    std_rewards = rewards.std(dim=1).repeat_interleave(num_generations)
    advantages = ((rewards.view(-1) - mean_rewards) / (std_rewards + 1e-4)).unsqueeze(1)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
    surrogate_loss = torch.min(surr1, surr2)
    kl = torch.exp(ref_log_probs - token_log_probs) - (ref_log_probs - token_log_probs) - 1
    per_token_loss = surrogate_loss - beta * kl
    loss = -((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
    return loss, avg_reward

def train_with_grpo(model, tokenizer, train_data, num_iterations=1, num_steps=500, batch_size=4,
                              num_generations=4, max_completion_length=128, beta=0.1,
                              learning_rate=5e-6, mu=3, epsilon=0.2, reward_function=None, device_ids=None):
    assert device_ids is not None and len(device_ids) > 1, "This code needs at least 2 GPU cores to run!"

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = nn.DataParallel(model, device_ids=device_ids)
    print(f"Model wrapped with DataParallel across GPUs: {device_ids}")

    for iteration in range(num_iterations):
        print(f"\nIteration {iteration+1}/{num_iterations}")

        ref_model = copy.deepcopy(model.module)
        ref_model.eval()
        for param in ref_model.parameters():
            param.requires_grad = False
        print("Reference model created.")

        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        model.train()

        for step in range(num_steps):
            batch_samples = random.sample(train_data, batch_size)
            with torch.no_grad():
                rollout_data = generate_rollout_data(
                    model.module,
                    ref_model,
                    tokenizer,
                    batch_samples,
                    num_generations,
                    max_completion_length
                )
            for grpo_iter in range(mu):
                loss, avg_reward = grpo_loss(
                    model.module,
                    ref_model,
                    rollout_data,
                    tokenizer,
                    reward_function,
                    beta=beta,
                    epsilon=epsilon
                )
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
                optimizer.step()
                wandb.log({
                    "loss": loss.item(),
                    "average_reward": avg_reward,
                    "iteration": iteration + 1,
                    "step": step + 1,
                    "grpo_iter": grpo_iter + 1
                })
                print(f"Iteration {iteration+1}/{num_iterations}, Step {step+1}/{num_steps}, "
                      f"GRPO iter {grpo_iter+1}/{mu}, loss: {loss.item():.4f}")
                for i in range(torch.cuda.device_count()):
                   print(f"GPU {i} Usage: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MiB, "
                         f"Utilization: {torch.cuda.utilization(i)}%")
    return model.module

In [9]:
def optimize_model_memory(model):
    model.train()
    model.config.use_cache = False

    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    model.gradient_checkpointing_enable()

    return model

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using primary device: {device}")

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
output_dir = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B-GRPO"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print("Model downloaded")

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

num_gpus = torch.cuda.device_count()
print(f"Detected {num_gpus} GPUs")
device_ids = list(range(num_gpus)) if num_gpus > 1 else None

# all_data = train_dataset + test_dataset
# random.shuffle(all_data)
# size_of_eval_data = 3
# eval_data = all_data[:size_of_eval_data]
# train_data = all_data[size_of_eval_data:]

eval_data = test_dataset
train_data = train_dataset

print(eval_data[0])

print("\nInitial model evaluation before finetuning:")
# pre_grpo_accuracy = evaluate_model(model, tokenizer, eval_data, device)
# print(f"Pre-GRPO Accuracy: {pre_grpo_accuracy:.2f}%")

model = optimize_model_memory(model)

print("\nStarting RL fine-tuning using GRPO...")
training_config = {
    'num_iterations': 1,
    'num_steps': 15,
    'batch_size': 7,
    'num_generations': 6,
    'max_completion_length': 200, 
    'beta': 0.04,
    'learning_rate': 5e-6,
    'mu': 1,
    'epsilon': 0.1
}

wandb.init(project=os.environ["WANDB_PROJECT"], reinit=True)
print("Weights & Biases initialized.")

model = train_with_grpo(
    model=model,
    tokenizer=tokenizer,
    train_data=train_data,
    reward_function=combined_reward,
    device_ids=device_ids,
    **training_config
)

wandb.finish()
print("Training completed and wandb run finished.")

print("\nFinal model evaluation after GRPO RL fine-tunin.g:")
post_grpo_accuracy = evaluate_model(model, tokenizer, eval_data, device)
print(f"Post-GRPO Accuracy: {post_grpo_accuracy:.2f}%")



Using primary device: cuda:0


config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

2025-03-06 09:52:01.076841: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741254721.087290     546 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741254721.090454     546 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 09:52:01.104088: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Model downloaded


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Detected 8 GPUs
{'text': '<｜begin_of_sentence｜>  \nBelow is a Math problem that is on the difficulty level of national olympiads. \nYou are a international gold medalist in Math, so you should be able to solve this problem. \nYou should expect the difficulty of the problems to be roughly at the level of a national Olympiad, \nalthough some problems are slightly easier and some are slightly harder. \nThe problems are all in LaTeX format. \nAnswers may require basic computations, e.g., square roots, absolute values. \nProvide the final answer in the end as a numerical value.\nThis is mandatory and you will be punished for not doing so.\nNow here is your question: \n### Problem <｜User｜> Pedro goes to the market and buys a total of 32 plums and peaches for 52 dollars. A plum costs 2 dollars and a peach costs 1 dollar. How many plums did Pedro buy? -------------- \nProvide the answer here: \n### Answer <｜Assistant｜> <think>   answer:  </think> <｜end_of_sentence｜>', 'answer': '20'}

Initial 

[34m[1mwandb[0m: Currently logged in as: [33msindhusatish155[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Weights & Biases initialized.
Model wrapped with DataParallel across GPUs: [0, 1, 2, 3, 4, 5, 6, 7]

Iteration 1/1


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Reference model created.
Input batch size: 7, Device before model: cuda:0




Output batch size: 42, Device after model: cuda:0
Average Reward: 0.8666667342185974


OutOfMemoryError: CUDA out of memory. Tried to allocate 13.81 GiB. GPU 0 has a total capacity of 21.95 GiB of which 7.90 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 13.81 GiB is allocated by PyTorch, and 16.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
print("\nSaving GRPO fine-tuned model...")
model.save_pretrained("grpo_finetuned_model")
tokenizer.save_pretrained("grpo_finetuned_model")


Saving GRPO fine-tuned model...


('grpo_finetuned_model/tokenizer_config.json',
 'grpo_finetuned_model/special_tokens_map.json',
 'grpo_finetuned_model/tokenizer.json')