In [None]:
!pip install datasets peft trl transformers bitsandbytes accelerate torch tqdm pandas numpy Levenshtein


In [2]:
from datasets import load_dataset,Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import GRPOConfig, GRPOTrainer

import datetime

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    PrinterCallback,
)
from tqdm import tqdm
import torch
import time
import transformers
import pandas as pd
import numpy as np

from Levenshtein import ratio as levenshtein_ratio
transformers.set_seed(42)


In [24]:
class CFG:
    MAX_TRAIN = 100
    MAX_TOKENS = 2048
    NUM_GENERATIONS = 4
    USE_PEFT = True
    BATCH_SIZE=4
    MAX_STEPS = 40

    BETA = 0.04
    LR = 1.e-5

    model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
    splitter = '<｜Assistant｜>'

    step_count=10
    DEBUG = False



In [4]:
import re

def extract_boxed_text(text):
    pattern = r'oxed{(.*?)}'
    matches = re.findall(pattern, text)
    if not matches:
        return ""
    for match in matches[::-1]:
        if match != "":
            return match
    return ""

In [6]:
df = pd.read_parquet('/content/math_problems.parquet')
df = df.reset_index().rename({'index': 'id'}, axis=1)
df['answer'] = df['solution'].map(extract_boxed_text)

def is_valid_answer(s):
    try:
        if float(s) == int(s):
            i = int(s)
            return 0<=i<1000
        else:
            return False
    except ValueError:
        return False

mask = df['answer'].map(is_valid_answer)
df = df[mask]

In [7]:
df = df.iloc[:CFG.MAX_TRAIN]

In [8]:
dataset = Dataset.from_pandas(df)

In [9]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 10
    })
})

In [10]:
def create_prompt(sample):
    question = sample['problem']
    chat = [{"role": "system", "content": "A conversation between User and Assistant. The user asks a question, and the Assistant solves it.  The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>"},
            {"role": "user", "content": question + ' Return final answer within \\boxed{}, after taking modulo 1000.'},]
    sample['prompt'] = tokenizer.apply_chat_template(
            conversation=chat,
            tokenize=False,
            add_generation_prompt=True
        )
    return sample

In [11]:
dataset['train'][0]

{'id': 8,
 'problem': 'Given a set of data $x_1, x_2, x_3, x_4, x_5$ with a mean of 8 and variance of 2, find the mean and variance of a new set of data: $4x_1+1, 4x_2+1, 4x_3+1, 4x_4+1, 4x_5+1$.',
 'solution': "Firstly, let's focus on the mean of the new set. Since the original set has a mean of 8, when we apply the transformation $y_i = 4x_i + 1$ to each element, the mean will be scaled by a factor of 4 and then increased by 1. Therefore, the mean of the new set is given by:\n\\[\n\\text{New mean} = 4 \\cdot \\text{Mean of original set} + 1 = 4 \\cdot 8 + 1 = 32 + 1 = \\boxed{33}.\n\\]\n\nNext, we consider the change in variance due to this transformation. Adding 1, a constant, does not affect the variance. However, multiplying by 4 scales the variance by the square of that factor. Therefore, the variance of the new set is given by:\n\\[\n\\text{New variance} = 4^2 \\cdot \\text{Variance of original set} = 16 \\cdot 2 = \\boxed{32}.\n\\]\n\nHence, the correct answer for the mean and 

In [12]:
## We would also want a reward function based on accuracy
# split after </think>, then get the answer within bbox

## We can also do a reward based on Similarity of

import re

def format_reward_func(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<think>.*?</think>.*?oxed{(.*?)}.*?$"
    matches = [re.match(pattern, content, re.DOTALL) for content in completions]
    return [1.0 if match else 0.0 for match in matches]


def extract_boxed_text(text):
    pattern = r'oxed{(.*?)}'
    matches = re.findall(pattern, text)
    if not matches:
        return ""
    for match in matches[::-1]:
        if match != "":
            return match
    return ""

def accuracy_reward_func(completions, answer, **kwargs):
    # Regular expression to capture content inside \boxed{}
    contents = [extract_boxed_text(completion) for completion in completions]
    # Reward 1 if the content is the same as the ground truth, 0 otherwise
    return [1.0 if c == str(gt) else 0.0 for c, gt in zip(contents, answer)]

In [13]:
def levenshtein_reward_func(completions, solution, **kwargs):
    res = []
    for completion, sol in zip(completions, solution):
        if '</think>' in completion:
            t = completion.split('</think>')[-1]
            res.append(levenshtein_ratio(t, sol))
        else:
            res.append(0.0)
    return res

In [14]:
device_map = 'auto'
if CFG.USE_PEFT:
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=False,
        )
    original_model = AutoModelForCausalLM.from_pretrained(CFG.model_name,
                                                          device_map=device_map,
                                                          quantization_config=bnb_config,
                                                          trust_remote_code=True)
else:
    original_model = AutoModelForCausalLM.from_pretrained(CFG.model_name,
                                                          device_map=device_map,
                                                          trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [15]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name,trust_remote_code=True,padding_side="left")

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [16]:
dataset = dataset.map(create_prompt)#, batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [17]:
def gen(model, text, max_tokens):
    model_input = tokenizer(text, return_tensors='pt').to(model.device)
    model.eval()
    with torch.no_grad():
        tok = model.generate(**model_input, max_new_tokens=max_tokens, pad_token_id=tokenizer.pad_token_type_id)
        outputs = []
        for i in range(len(tok)):
            res = tokenizer.decode(tok[i], skip_special_tokens=True)
            output = res.split(CFG.splitter)[-1]
            outputs.append(output)
        return outputs[0] if len(outputs) == 1 else outputs

In [18]:
def evaluate_rewards(model, dataset, reward_functions: dict[str, callable], max_tokens: int, num_generations: int):
    completions = []
    other_info = []
    for example in tqdm(dataset):
        txt = example['prompt']
        kw = {k: v for k, v in example.items() if k not in {'prompt', 'completion'}}
        for _ in range(num_generations):
            other_info.append(kw)

        completion = gen(model, [txt]*num_generations, max_tokens)
        if isinstance(completion, str):
            completions.append(completion)
        else:
            completions += completion

    kwargs = {k: [d[k] for d in other_info] for k in other_info[0].keys()}
    res = {}
    for nm, reward_func in reward_functions.items():
        v = reward_func(completions=completions, **kwargs)
        print(nm, np.mean(v))
        res[nm] = np.mean(v)
    return res

In [19]:


reward_functions = {'formatting': format_reward_func, 'accuracy': accuracy_reward_func, 'solution_quality': levenshtein_reward_func}



In [20]:
if not CFG.DEBUG:
    original_rewards = evaluate_rewards(model=original_model, dataset=dataset['test'], reward_functions=reward_functions, max_tokens=CFG.MAX_TOKENS, num_generations=CFG.NUM_GENERATIONS)

100%|██████████| 10/10 [14:12<00:00, 85.27s/it]

formatting 0.65
accuracy 0.55
solution_quality 0.282795764042829





In [21]:
dtstr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
output_directory=f"./DEEPSEEK-GRPO-{dtstr}"


training_args = GRPOConfig(
    output_dir=output_directory,

    learning_rate=CFG.LR,

    per_device_train_batch_size=CFG.BATCH_SIZE,

    gradient_accumulation_steps=1,
    max_steps=CFG.MAX_STEPS,

    max_completion_length=CFG.MAX_TOKENS,  #8192
    num_generations=CFG.NUM_GENERATIONS,
    beta=CFG.BETA,

    logging_steps=CFG.step_count,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=CFG.step_count,
#     eval_strategy="steps",
#     eval_steps=CFG.step_count,
#     do_eval=True,
    # gradient_checkpointing=True,  # Will crash the whole thing
    report_to="none",
    overwrite_output_dir = 'True',
)

# Will typically use the AdamW optimizer

In [25]:
if CFG.USE_PEFT:
    peft_config = LoraConfig(
        r=16, #Rank
        lora_alpha=32,
        target_modules=[
            'q_proj',
            'k_proj',
            'v_proj',
            'dense'
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )
    trainer = GRPOTrainer(
        model=original_model,
        reward_funcs=list(reward_functions.values()),
        args=training_args,
        train_dataset=dataset['train'],
        peft_config=peft_config,
        callbacks=[PrinterCallback()]
    )
else:
    trainer = GRPOTrainer(
        model=original_model,
        reward_funcs=list(reward_functions.values()),
        args=training_args,
        train_dataset=dataset['train'],
        callbacks=[PrinterCallback()]
    )

In [26]:
trainer.train()

Step,Training Loss
10,-0.0
20,-0.0
30,-0.0
40,-0.0
50,-0.0
60,-0.0
70,-0.0


{'loss': -0.0, 'grad_norm': 0.05637193098664284, 'learning_rate': 8.750000000000001e-06, 'rewards/format_reward_func': 0.0, 'rewards/accuracy_reward_func': 0.25, 'rewards/levenshtein_reward_func': 0.17943935766816138, 'reward': 0.4294393636286259, 'reward_std': 0.2010581212118268, 'completion_length': 1633.725, 'kl': -4.929304122924805e-06, 'epoch': 0.1111111111111111}
{'loss': -0.0, 'grad_norm': 0.0, 'learning_rate': 7.500000000000001e-06, 'rewards/format_reward_func': 0.0, 'rewards/accuracy_reward_func': 0.375, 'rewards/levenshtein_reward_func': 0.19461265504360198, 'reward': 0.5696126520633698, 'reward_std': 0.08434975519776344, 'completion_length': 1414.825, 'kl': -6.55055046081543e-06, 'epoch': 0.2222222222222222}
{'loss': -0.0, 'grad_norm': 0.05015848949551582, 'learning_rate': 6.25e-06, 'rewards/format_reward_func': 0.0, 'rewards/accuracy_reward_func': 0.525, 'rewards/levenshtein_reward_func': 0.32836783602833747, 'reward': 0.8533678218722344, 'reward_std': 0.2889576876536012, '

Step,Training Loss
10,-0.0
20,-0.0
30,-0.0
40,-0.0
50,-0.0
60,-0.0
70,-0.0
80,-0.0


{'loss': -0.0, 'grad_norm': 0.049999602138996124, 'learning_rate': 0.0, 'rewards/format_reward_func': 0.0, 'rewards/accuracy_reward_func': 0.475, 'rewards/levenshtein_reward_func': 0.3049303561449051, 'reward': 0.7799303531646729, 'reward_std': 0.27239095997065305, 'completion_length': 1525.425, 'kl': -6.252527236938477e-06, 'epoch': 0.8888888888888888}
{'train_runtime': 10937.0447, 'train_samples_per_second': 0.029, 'train_steps_per_second': 0.007, 'train_loss': -2.771159415715374e-07, 'epoch': 0.8888888888888888}


TrainOutput(global_step=80, training_loss=-2.771159415715374e-07, metrics={'train_runtime': 10937.0447, 'train_samples_per_second': 0.029, 'train_steps_per_second': 0.007, 'total_flos': 0.0, 'train_loss': -2.771159415715374e-07})

In [27]:
if CFG.USE_PEFT:
    print('Loading trained model')
    CHKPT = CFG.MAX_STEPS
    adapter_model_name = f'{output_directory}/checkpoint-{CHKPT}/'
    new_model = PeftModel.from_pretrained(original_model, adapter_model_name)
else:
    new_model = original_model

Loading trained model


In [28]:
rewards = evaluate_rewards(model=new_model, dataset=dataset['test'], reward_functions=reward_functions, max_tokens=CFG.MAX_TOKENS, num_generations=CFG.NUM_GENERATIONS)
rewards

100%|██████████| 10/10 [19:46<00:00, 118.64s/it]

formatting 0.675
accuracy 0.5
solution_quality 0.2654145397995742





{'formatting': 0.675, 'accuracy': 0.5, 'solution_quality': 0.2654145397995742}