# **Install Dependencies**

In [None]:
!pip install unsloth vllm
!pip install --upgrade pillow
!pip install datasets



# **Import Required Libraries**

In [None]:
import os
import re
import torch
from unsloth import is_bf16_supported
from datasets import load_dataset
from typing import Optional, Dict, List
from unsloth import FastLanguageModel, PatchFastRL
from vllm import SamplingParams
from trl import GRPOConfig
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# **Model Configuration & Parameters**

In [None]:
max_seq_lenghth = 512 # Maximum sequence length the model can handle.
lora_rank = 8  # The rank of the LoRA adapters, which determines how much memory and compute are used for fine-tuning
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
gpu_memory_utilization = 0.6

# **Model Loading & Preparation**

*Load Pre-trained Model*

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_lenghth,
    load_in_4bit = True,
    fast_inference = True,
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6,
)

INFO 03-10 17:10:22 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.43%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.7 GB. Also swap space = 2 GB.
INFO 03-10 17:10:52 config.py:549] This model supports multiple tasks: {'classify', 'generate', 'score', 'embed'

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-10 17:11:17 model_runner.py:1115] Loading model weights took 5.5976 GB
INFO 03-10 17:11:17 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-10 17:11:22 worker.py:267] Memory profiling takes 4.13 seconds
INFO 03-10 17:11:22 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.59) = 8.76GiB
INFO 03-10 17:11:22 worker.py:267] model weights take 5.60GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.74GiB; the rest of the memory reserved for KV Cache is 2.40GiB.
INFO 03-10 17:11:22 executor_base.py:111] # cuda blocks: 1226, # CPU blocks: 1024
INFO 03-10 17:11:22 executor_base.py:116] Maximum concurrency for 512 tokens per request: 38.31x
INFO 03-10 17:11:24 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occu

Capturing CUDA graph shapes: 100%|██████████| 23/23 [00:41<00:00,  1.81s/it]

INFO 03-10 17:12:06 model_runner.py:1562] Graph capturing finished in 42 secs, took 0.59 GiB
INFO 03-10 17:12:06 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 48.54 seconds





tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

*Apply LoRA Adapters*

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth")



Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.9 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


*Structuring system prompt*

In [None]:
# Define a structured system prompt
SYSTEM_PROMPT = """
You are a math assistant. Answer the given question based on your calculations.

Format:
Question: <Problem Statement>
Answer: <Final Answer>
"""

# **Dataset preparation**

In [None]:
def extract_hash_answer(text: str) -> Optional[str]:
    """
    Extracts the answer from text using the '####' delimiter.

    Args:
        text (str): The raw answer text from the dataset.

    Returns:
        Optional[str]: The extracted answer or None if not found.
    """
    match = re.search(r"####\s*(.*)", text)
    return match.group(1).strip() if match else None

def create_dataset(split: str = "train", dataset_name: str = "openai/gsm8k") -> List[Dict]:
    """
    Loads and processes the GSM8K dataset into a structured prompt-answer format.

    Args:
        split (str): Dataset split to use ('train' or 'test').
        dataset_name (str): The Hugging Face dataset name (default: 'openai/gsm8k').

    Returns:
        List[Dict]: Processed dataset with system prompts, user queries, and extracted answers.
    """
    # Load dataset with error handling
    try:
        dataset = load_dataset(dataset_name, 'main')[split]
    except Exception as e:
        raise RuntimeError(f"Error loading dataset {dataset_name}: {e}")

    # Process dataset into structured format
    processed_data = dataset.map(lambda x: {
        'prompt': [
            {'role': "system", "content": SYSTEM_PROMPT},  # System instruction
            {'role': "user", "content": x['question']}  # User question
        ],
        'answer': extract_hash_answer(x['answer'])  # Extracted answer
    })

    # Debugging: print a sample
    print(f"Sample Processed Data (First Entry):\n{processed_data[0]}")

    return processed_data

# Create dataset (default to 'train' split)
dataset = create_dataset()


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Sample Processed Data (First Entry):
{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': '72', 'prompt': [{'content': '\nYou are a math assistant. Answer the given question based on your calculations.\n\nFormat:\nQuestion: <Problem Statement>\nAnswer: <Final Answer>\n', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]}


# **Reward Functions**

In [None]:
def extract_xml_answer(text: str) -> Optional[str]:
    """Extracts the answer from XML-like formatted text."""
    if not text.strip():  # Check if text is empty or only contains spaces
        return None
    return text.strip()  # Remove extra spaces and return cleaned text


def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^\n.*?\n\n\n.*?\n\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r".*?\s*.*?"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text: str) -> float:
    """Counts occurrences of newlines and assigns a score based on structure."""
    count = 0.0
    count += 0.125 if text.count("\n") == 1 else 0.0
    count += 0.125 if text.count("\n\n") == 1 else 0.0
    if text.count("\n\n") == 1:
        count -= len(text.split("\n\n")[-1]) * 0.001
    if text.count("\n") == 1:
        count -= (len(text.split("\n")[-1]) - 1) * 0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]


# **Training setup**

In [None]:
# -------------------------------
# Precision Handling
# -------------------------------
BF16_SUPPORTED = is_bf16_supported()
FP16 = not BF16_SUPPORTED

# -------------------------------
# Load Configurations from Environment Variables
# -------------------------------
# General settings
USE_VLLM = os.getenv("USE_VLLM", "True") == "True"
REPORT_TO = os.getenv("REPORT_TO", "none")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "outputs")

# Optimization parameters
LEARNING_RATE = float(os.getenv("LEARNING_RATE", 5e-6))
ADAM_BETA1 = float(os.getenv("ADAM_BETA1", 0.9))
ADAM_BETA2 = float(os.getenv("ADAM_BETA2", 0.99))
WEIGHT_DECAY = float(os.getenv("WEIGHT_DECAY", 0.1))
MAX_GRAD_NORM = float(os.getenv("MAX_GRAD_NORM", 0.1))
OPTIMIZER = os.getenv("OPTIMIZER", "paged_adamw_8bit")

# Scheduler settings
WARMUP_RATIO = float(os.getenv("WARMUP_RATIO", 0.1))
LR_SCHEDULER_TYPE = os.getenv("LR_SCHEDULER_TYPE", "cosine")

# Training parameters
PER_DEVICE_TRAIN_BATCH_SIZE = int(os.getenv("PER_DEVICE_TRAIN_BATCH_SIZE", 1))
GRADIENT_ACCUMULATION_STEPS = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", 1))
MAX_STEPS = int(os.getenv("MAX_STEPS", 250))
SAVE_STEPS = int(os.getenv("SAVE_STEPS", 250))

# Logging settings
LOGGING_STEPS = int(os.getenv("LOGGING_STEPS", 1))

# Model generation settings
NUM_GENERATIONS = int(os.getenv("NUM_GENERATIONS", 6))
MAX_PROMPT_LENGTH = int(os.getenv("MAX_PROMPT_LENGTH", 256))
MAX_COMPLETION_LENGTH = int(os.getenv("MAX_COMPLETION_LENGTH", 200))

# -------------------------------
# Training Configuration
# -------------------------------
training_args = GRPOConfig(
    use_vllm=USE_VLLM,
    learning_rate=LEARNING_RATE,
    adam_beta1=ADAM_BETA1,
    adam_beta2=ADAM_BETA2,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    optim=OPTIMIZER,

    # Precision settings
    bf16=BF16_SUPPORTED,
    fp16=FP16,

    # Training settings
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_steps=MAX_STEPS,
    save_steps=SAVE_STEPS,
    max_grad_norm=MAX_GRAD_NORM,

    # Logging & Output
    logging_steps=LOGGING_STEPS,
    report_to=REPORT_TO,
    output_dir=OUTPUT_DIR,

    # Generation settings
    num_generations=NUM_GENERATIONS,
    max_prompt_length=MAX_PROMPT_LENGTH,
    max_completion_length=MAX_COMPLETION_LENGTH,
)

# -------------------------------
# Print Configuration for Verification
# -------------------------------
print("Training Configuration Loaded Successfully!")


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 6
Training Configuration Loaded Successfully!


# **Model Training**

In [10]:
from trl import GRPOTrainer
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs= [
                   correctness_reward_func,
                   int_reward_func,
                   strict_format_reward_func,
                   soft_format_reward_func,
                   xmlcount_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train() # The GRPOTrainer class has a train() method to start the training process.

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 6 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (6 x 1 x 1) = 6
 "-____-"     Trainable parameters = 6,815,744/4,635,496,448 (0.15% trained)


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
To find the total amount Mr. Benson paid, first, we need to find the cost of the tickets after the discount. 

The first 10 tickets cost $40 each, so the cost of the first 10 tickets is:
10 * $40 = $400

Since Mr. Benson bought 12 tickets in total, he bought 12 - 10 = 2 tickets with a discount.

For 2 tickets that exceed 10 with a 5% discount, the cost per ticket would be 95% of $40.
95% of $40 = (95/100) * $40 = $38

So, the cost of the first 10 tickets and the 2 tickets with a discount is:
$400 + 2 * $38 = $400 + $76 = $476

The total amount Mr. Benson paid is $476. 
Extracted:
To find the total amount Mr. Benson paid, first, we need to find the cost of the tickets after the discount. 

The first 10 tickets cost $40 each, so the cost of the first 10 tickets is:
10 * $40 =

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / correctness_reward_func,rewards / int_reward_func,rewards / strict_format_reward_func,rewards / soft_format_reward_func,rewards / xmlcount_reward_func
1,0.0,0.450333,0.121658,179.666672,0.0,0.0,0.0,0.0,0.5,-0.049667
2,0.0,0.5,0.0,192.0,0.0,0.0,0.0,0.0,0.5,0.0
3,0.0,0.5,0.0,181.333344,6e-06,0.0,0.0,0.0,0.5,0.0
4,0.0,0.5,0.0,178.0,5e-06,0.0,0.0,0.0,0.5,0.0
5,0.0,0.5,0.0,128.666672,5e-06,0.0,0.0,0.0,0.5,0.0
6,0.0,0.5,0.0,186.333344,4e-06,0.0,0.0,0.0,0.5,0.0
7,0.0,0.5,0.0,182.333344,5e-06,0.0,0.0,0.0,0.5,0.0
8,0.0,0.5,0.0,136.166672,9e-06,0.0,0.0,0.0,0.5,0.0
9,0.0,0.5,0.0,176.666672,6e-06,0.0,0.0,0.0,0.5,0.0
10,0.0,0.5,0.0,200.0,5e-06,0.0,0.0,0.0,0.5,0.0


-------------------- Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
To find out how much more is the monthly payment on the house compared to the trailer, we need to calculate the monthly payment for each option.

The formula to calculate the monthly payment is:

Monthly Payment = Total Amount / (1 + (Interest Rate / 12) * (1 + (Interest Rate / 12)^ (Number of Payments))

However, the interest rate is not provided. Let's assume a standard 20-year mortgage with a fixed interest rate. Assuming an interest rate of 4% for simplicity.

For the house:
Monthly Payment (House) = $480,000 / (1 + (0.04 / 12) * (1 + (0.04 / 12)^ (12*20))
Monthly Payment (House) ≈ $3,500.69

For the trailer:
Monthly Payment (Trailer) = $120,000 / (1 + (0.04 / 12) * (1 + (0.04 / 

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / correctness_reward_func,rewards / int_reward_func,rewards / strict_format_reward_func,rewards / soft_format_reward_func,rewards / xmlcount_reward_func
1,0.0,0.450333,0.121658,179.666672,0.0,0.0,0.0,0.0,0.5,-0.049667
2,0.0,0.5,0.0,192.0,0.0,0.0,0.0,0.0,0.5,0.0
3,0.0,0.5,0.0,181.333344,6e-06,0.0,0.0,0.0,0.5,0.0
4,0.0,0.5,0.0,178.0,5e-06,0.0,0.0,0.0,0.5,0.0
5,0.0,0.5,0.0,128.666672,5e-06,0.0,0.0,0.0,0.5,0.0
6,0.0,0.5,0.0,186.333344,4e-06,0.0,0.0,0.0,0.5,0.0
7,0.0,0.5,0.0,182.333344,5e-06,0.0,0.0,0.0,0.5,0.0
8,0.0,0.5,0.0,136.166672,9e-06,0.0,0.0,0.0,0.5,0.0
9,0.0,0.5,0.0,176.666672,6e-06,0.0,0.0,0.0,0.5,0.0
10,0.0,0.5,0.0,200.0,5e-06,0.0,0.0,0.0,0.5,0.0


-------------------- Question:
Ben has $2000 for his business operations costs. He orders goods from his supplier and writes them a cheque for $600. His debtor pays him $800 from the purchases they had made on credit. Mr. Ben then decides to do equipment maintenance and spends $1200 on the whole operation. How much money is Mr. Ben remaining with? 
Answer:
1000 
Response:
Payment made by Ben to the supplier = $600.
Amount received from debtor = $800.
Since Ben had $2000 initially, he deducts $600 to get $1400. Then he adds $800 to get $2200. After that, he subtracts $1200 to find the remaining amount.

Remaining amount = $2200 - $1200 
Remaining amount = $1000.

The final answer is $1000. 
Extracted:
Payment made by Ben to the supplier = $600.
Amount received from debtor = $800.
Since Ben had $2000 initially, he deducts $600 to get $1400. Then he adds $800 to get $2200. After that, he subtracts $1200 to find the remaining amount.

Remaining amount = $2200 - $1200 
Remaining amount = $1

TrainOutput(global_step=250, training_loss=5.733436279626858e-07, metrics={'train_runtime': 10867.8522, 'train_samples_per_second': 0.138, 'train_steps_per_second': 0.023, 'total_flos': 0.0, 'train_loss': 5.733436279626858e-07})

# *Comparing Regular model and Model with LoRA weights*

In [15]:
model.save_lora("grpo_saved_lora")

In [14]:
# Regular model
query = "which is bigger 9.11 or 9.9?"

text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate([text],
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = None
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s, est. speed input: 60.71 toks/s, output: 13.63 toks/s]

9.11 is bigger than 9.9





In [16]:
# Model with LoRA weights

query = "which is bigger 9.11 or 9.9?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it, est. speed input: 48.52 toks/s, output: 15.55 toks/s]

To determine which is bigger, we can compare the two numbers.

9.11 is greater than 9.9.





In [18]:
query = "Look at this series: 12, 11, 13, 12, 14, 13, … What number should come next?"

text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate([text],
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = None
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.38s/it, est. speed input: 8.95 toks/s, output: 18.99 toks/s]

The series is alternating between two patterns: one increasing by 1 and another increasing by 2.

When we look at the series:

- 12 is followed by 11 (decrease of 1)
- 11 is followed by 13 (increase of 2)
- 13 is followed by 12 (decrease of 1)
- 12 is followed by 14 (increase of 2)
- 14 is followed by 13 (decrease of 1)

Following this pattern, the next number should be an increase of 2 from 13:

13 + 2 = 15

So, the next number in the series should be 15.





In [11]:
query = "Look at this series: 12, 11, 13, 12, 14, 13, … What number should come next?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)

# 15

Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.14s/it, est. speed input: 10.40 toks/s, output: 18.06 toks/s]

To find the pattern in the series, let's analyze the given numbers:

12, 11, 13, 12, 14, 13

The pattern alternates between two sequences: 
- One sequence is increasing by 1: 12, 13, 14...
- The other sequence is decreasing by 1: 12, 11, 10...

Since the given sequence starts with 12 (from the increasing sequence) and then decreases to 11 (the start of the decreasing sequence), then it increases to 13 (from the increasing sequence) and so on.

Now that we understand the pattern, the next number in the series would be the next number in the increasing sequence, which is: 15.

Therefore, the next number in the series is: 15.





In [17]:
query = "A bag contains 5 red, 4 blue, and 3 green balls. You randomly pick two balls. What is the probability that both balls are red?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:14<00:00, 14.77s/it, est. speed input: 6.64 toks/s, output: 17.34 toks/s]

To find the probability that both balls are red, we need to first find the total number of ways to pick 2 balls out of 12, and then find the number of ways to pick 2 red balls out of 5.

Total number of ways to pick 2 balls out of 12:
This can be calculated using the combination formula: nCr = n! / (r!(n-r)!), where n is the total number of balls and r is the number of balls picked.
So, the total number of ways to pick 2 balls out of 12 is 12C2 = 12! / (2!(12-2)!) = 66.

Total number of ways to pick 2 red balls out of 5:
This can also be calculated using the combination formula: 5C2 = 5! / (2!(5-2)!) = 10.

Probability that both balls are red:
This can be calculated as the number of ways to pick 2 red balls out of 5 divided by the total number of ways to pick 2 balls out of 12:
Probability = 10/66 = 5/33.

So, the probability that both balls are red is 5/33.





In [23]:
query = "A father is twice as old as his son. 20 years ago, the father's age was 12 times the son's age. How old is the son now?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.18s/it, est. speed input: 10.90 toks/s, output: 18.64 toks/s]

Let's break this down step by step.

Let the son's current age be x.

Since the father is twice as old as his son, the father's current age is 2x.

Twenty years ago, the son's age was x - 20 and the father's age was 2x - 20.

According to the problem, 20 years ago, the father's age was 12 times the son's age. So, we can write an equation:

2x - 20 = 12(x - 20)

Simplifying the equation:

2x - 20 = 12x - 240
2x - 12x = -240 + 20
-10x = -220
x = 22

Since x is the son's current age, the son is 22 years old.





In [24]:
query = "A 5-digit number is written on a piece of paper, but one digit is smudged and unreadable: 98_23. If this number is divisible by 9, what is the missing digit?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.13s/it, est. speed input: 10.76 toks/s, output: 18.46 toks/s]

To find the missing digit, we need to find a number that when added to 98_23 will result in a number that is divisible by 9.

The sum of the digits of a number that is divisible by 9 must be divisible by 9. 

First, let's add the known digits: 9 + 8 + 2 + 3 = 22.

Since the number is divisible by 9, the sum of its digits must be divisible by 9. Therefore, the missing digit must be a value such that 22 + x is divisible by 9.

Let's find the smallest multiple of 9 that is greater than 22: 27 (which is 9 * 3).

Now, let's find the difference between 27 and 22: 27 - 22 = 5.

So, the missing digit is 5.

The final answer is 5.





In [31]:
query = """
A two-digit number is four times the sum of its digits. When the digits are reversed, the new number is 9 more than the original. What is the original number?

"""

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : query},
], tokenize = False, add_generation_prompt = True)


output = model.fast_generate(text,
                             sampling_params = SamplingParams(
                                 temperature = 0.8,
                                 top_p = 0.95,
                                 max_tokens = 1024),
                             lora_request = model.load_lora("grpo_saved_lora"),
                             )[0].outputs[0].text

print(output)



Processed prompts: 100%|██████████| 1/1 [00:19<00:00, 19.30s/it, est. speed input: 5.23 toks/s, output: 19.02 toks/s]

Let's break down the problem. 

Let the original number be 10a + b, where a is the tens digit and b is the ones digit.

Since the number is four times the sum of its digits, we can write an equation:

10a + b = 4(a + b)

Expanding the equation, we get:
10a + b = 4a + 4b

Subtracting 4a from both sides:
6a + b = 4b

Subtracting b from both sides:
6a = 3b

Dividing both sides by 3:
2a = b

Now we have a relation between a and b: b = 2a.

Now, let's consider the second condition. The new number, when the digits are reversed, is 9 more than the original. So, the new number is 10b + a.

The equation is:
10b + a = (10a + b) + 9

Expanding the equation:
10b + a = 10a + b + 9

Subtracting 10a and b from both sides:
9b - 9a = 9

Dividing both sides by 9:
b - a = 1

We already know that b = 2a. Substituting this into the equation above:
2a - a = 1

Solving for a:
a = 1

Since b = 2a, we get:
b = 2(1) = 2

Now that we have a = 1 and b = 2, we can find the original number:
Original number = 10a + 


