In [None]:
%%capture
# Install Unsloth and vLLM. The branch is chosen by checking whether any
# environment variable name contains "COLAB_".
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Elsewhere use [[pip install unsloth vllm]].
    # Here pip skips dependency resolution (--no-deps) and vLLM is pinned.
    !pip install --no-deps unsloth vllm==0.8.5.post1

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Elsewhere use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm==0.8.5.post1
    # Skip restarting message in Colab: snapshot the loaded module names, then
    # drop every module whose name contains "PIL" or "google" from sys.modules.
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    # Pinned / dependency-free installs first, then the packages that may
    # resolve their own dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Download vLLM's common requirements file and delete every occurrence of
    # "transformers", "numpy" or "xformers" together with the rest of its line,
    # so pip does not touch those packages.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

## Proofs

| Pramāṇa (Proof Type)               | Description                                 | Formal Representation                                                                                                                                                  |
| ---------------------------------- | ------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Pratyakṣa (Perception)**         | Knowledge through direct sensory contact    | $\text{See}(x) \Rightarrow \text{Know}(x)$<br>e.g., $\text{Visual}(Smoke) \Rightarrow \text{Belief}(Smoke)$                                                            |
| **Anumāna (Inference)**            | Infer unknown from known via a logical rule | **Syllogism**:<br>$\text{Smoke}(x) \land (\text{Smoke}(x) \Rightarrow \text{Fire}(x)) \Rightarrow \text{Fire}(x)$<br>Formal: $P(x) \rightarrow Q(x),\; P(x) \vdash Q(x)$ |
| **Upamāna (Comparison)**           | Knowledge by analogy                        | $A \sim B \Rightarrow \text{Properties}(B) \approx \text{Properties}(A)$<br>E.g., $\text{Gavaya} \sim \text{Cow} \Rightarrow \text{Tameable}(Gavaya)$                  |
| **Arthāpatti (Postulation)**       | Best explanation for conflicting data       | **Abductive Rule**:<br>$E: \text{Fat}(Devadatta) \land \neg \text{Eats}_{\text{day}} \Rightarrow \text{Hypothesis: } \text{Eats}_{\text{night}}$                           |
| **Anupalabdhi (Non-Apprehension)** | Inference from absence                      | $\neg \text{See}(x) \land \text{ShouldBeVisible}(x) \Rightarrow \neg \text{Exists}(x)$<br>E.g., $\neg \text{Perceive}(Pot) \Rightarrow \neg \text{Pot}$                |
| **Śabda (Testimony)**              | Valid knowledge from reliable source        | $\text{Authority}(A) \land \text{Statement}(A, p) \Rightarrow \text{Belief}(p)$<br>e.g., $\text{Vedas}(p) \Rightarrow \text{True}(p)$ if $A$ is reliable               |


## Reasoning

| Reasoning Type     | Description                                 | Formal Representation                                                                                                                                                            |
| ------------------ | ------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Deductive**      | General → Specific (truth-preserving)       | If $P \Rightarrow Q$, and $P$ is true, then $Q$ is true.<br>$P, P \rightarrow Q \vdash Q$                                                                                        |
| **Inductive**      | Specific → General (probabilistic)          | Observe: $Q(a_1), Q(a_2), ..., Q(a_n) \Rightarrow \forall x, Q(x)$ (tentative)<br>Not truth-preserving, but supportive                                                           |
| **Abductive**      | Best explanation (reverse inference)        | $H \Rightarrow E,\; E$ is observed ⟹ infer $H$ as best explanation<br>e.g., $H \vdash E, \text{observe } E \Rightarrow \text{prefer } H$                                         |
| **Causal**         | Understanding cause-effect structure        | Use **Structural Equation Models (SEMs)** or **Causal Graphs (DAGs)**<br>$X \rightarrow Y \Rightarrow \text{do}(X=x) \Rightarrow Y$                                              |
| **Counterfactual** | Reasoning about what would happen otherwise | $Y_{x'}$: Value of $Y$ if $X$ were $x'$<br>$X = x, Y = y \Rightarrow \text{what if } X = x'? \Rightarrow Y_{x'}$<br>Formally: $Y_{x'} \neq Y_x \Rightarrow \text{causal effect}$ |


# Pydantic Models

In [None]:
from pydantic import BaseModel
from typing import List, Optional

In [None]:
# ─── INDIVIDUAL PROOF MODELS ────────────────────────────────────────────────────

class Perception(BaseModel):
    """Structured record for the "Perception" proof type."""

    object_perceived: str  # what was perceived
    sense_modality: str  # e.g., "vision", "touch"
    is_direct: bool  # presumably True for unmediated perception — TODO confirm
    confidence: float  # 0.0 to 1.0 (range is documented only; not validated here)


class Inference(BaseModel):
    """Structured record for the "Inference" proof type: two premises and a conclusion."""

    major_premise: str
    minor_premise: str
    conclusion: str
    valid: bool  # caller-supplied flag; nothing in this model checks the syllogism

class Comparison(BaseModel):
    """Structured record for the "Comparison" (analogy) proof type."""

    known_object: str
    compared_object: str
    shared_attributes: List[str]
    # presumably properties carried over from the known object — TODO confirm
    inferred_properties: List[str]

class Postulation(BaseModel):
    """Structured record for the "Postulation" proof type.

    Holds an observed fact, an optional fact that conflicts with it, and the
    hypothesis put forward to explain them.
    """

    observed_fact: str
    # Explicit None default: under Pydantic v2 a bare Optional[...] annotation
    # is still a *required* field, so without it callers could never omit this.
    contradictory_fact: Optional[str] = None
    inferred_hypothesis: str
    is_best_explanation: bool


class NonApprehension(BaseModel):
    """Structured record for the "Non-Apprehension" proof type (inference from absence)."""

    expected_object: str
    observation_context: str
    result: str  # e.g., "absent", "not found"


class Testimony(BaseModel):
    """Structured record for the "Testimony" proof type.

    Holds a statement, the authority it is attributed to, and whether that
    source is trusted.
    """

    authority: str
    statement: str
    is_trusted_source: bool
    # Explicit None default: under Pydantic v2 a bare Optional[...] annotation
    # is still a *required* field, so without it callers could never omit this.
    domain_of_claim: Optional[str] = None

In [None]:


# ─── INDIVIDUAL REASONING MODELS ────────────────────────────────────────────────

class DeductiveReasoning(BaseModel):
    """Structured record for a deductive argument: premises plus a conclusion."""

    premises: List[str]
    conclusion: str
    is_valid: bool  # caller-supplied flag; validity is not checked by this model


class InductiveReasoning(BaseModel):
    """Structured record for an inductive generalization drawn from observations."""

    observations: List[str]
    generalized_conclusion: str
    probability_confidence: float  # 0.0 to 1.0 (documented range; not validated here)


class AbductiveReasoning(BaseModel):
    """Structured record for abductive reasoning: evidence, candidates, chosen explanation."""

    evidence: str
    possible_explanations: List[str]
    # presumably one of possible_explanations; not enforced by this model
    best_explanation: str
    plausibility_score: float  # 0.0 to 1.0 (documented range; not validated here)


class CausalReasoning(BaseModel):
    """Structured record for a cause-effect claim and the evidence backing it."""

    cause: str
    effect: str
    evidence_type: str  # e.g., "empirical", "statistical"
    confidence: float  # 0.0 to 1.0 (documented range; not validated here)


class CounterfactualReasoning(BaseModel):
    """Structured record for a counterfactual: what happened vs. a hypothetical alternative."""

    actual_event: str
    counterfactual_scenario: str
    inferred_outcome: str
    # presumably whether inferred_outcome differs from actual_event — TODO confirm
    differs_from_actual: bool





In [None]:
# ─── WRAPPER MODEL FOR ALL REASONING TYPES ──────────────────────────────────────

class Reasoning(BaseModel):
    """Bundle holding one instance of each reasoning model; every field is required."""

    deductive: DeductiveReasoning
    inductive: InductiveReasoning
    abductive: AbductiveReasoning
    causal: CausalReasoning
    counterfactual: CounterfactualReasoning

In [None]:
# ─── WRAPPER MODEL FOR ALL PROOFS ────────────────────────────────────────────────

class Proofs(BaseModel):
    """Bundle holding one instance of each proof model; every field is required."""

    perception: Perception
    inference: Inference
    comparison: Comparison
    postulation: Postulation
    non_apprehension: NonApprehension
    testimony: Testimony

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared hyper-parameters: the sequence length is passed to the loader and the
# LoRA rank is used for max_lora_rank, r and lora_alpha below.
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

# Load the base model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank, # alpha is set equal to the rank
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407, # fixed seed value passed to get_peft_model
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-27 08:38:23 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-27 08:38:23 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.43%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0.

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 05-27 08:38:54 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-27 08:38:54 [cuda.py:289] Using XFormers backend.
INFO 05-27 08:38:54 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 05-27 08:38:54 [model_runner.py:1108] Starting to load model unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit...
INFO 05-27 08:38:55 [loader.py:1187] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 05-27 08:38:55 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

INFO 05-27 08:39:56 [weight_utils.py:281] Time spent downloading weights for unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit: 60.765634 seconds
INFO 05-27 08:39:56 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 05-27 08:40:35 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 05-27 08:40:36 [model_runner.py:1140] Model loading took 5.7737 GiB and 100.755685 seconds
INFO 05-27 08:40:46 [worker.py:287] Memory profiling takes 10.17 seconds
INFO 05-27 08:40:46 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.59) = 8.76GiB
INFO 05-27 08:40:46 [worker.py:287] model weights take 5.77GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.74GiB; the rest of the memory reserved for KV Cache is 2.22GiB.
INFO 05-27 08:40:47 [executor_base.py:112] # cuda blocks: 1134, # CPU blocks: 0
INFO 05-27 08:40:47 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 17.72x
INFO 05-27 08:40:47 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If o

Capturing CUDA graph shapes:   0%|          | 0/23 [00:00<?, ?it/s]

INFO 05-27 08:41:38 [model_runner.py:1592] Graph capturing finished in 51 secs, took 0.53 GiB
INFO 05-27 08:41:38 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 62.39 seconds
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm']


tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.5.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:

def extract_xml_answer(text: str) -> str:
    """Return the stripped text after the last ``<answer>`` tag.

    The result is cut at the first ``</answer>`` that follows. A missing
    opening tag means the whole text is used; a missing closing tag means
    everything to the end is kept.
    """
    _, _, after_open = text.rpartition("<answer>")
    inner, _, _ = after_open.partition("</answer>")
    return inner.strip()

In [None]:
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

In [None]:
# Load the "train" split of the "main" configuration of openai/gsm8k.
from datasets import load_dataset
dataset = load_dataset("openai/gsm8k", "main", split = "train")
dataset  # displayed as the cell output

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [None]:
# reasoning_start = "<start_working_out>"
# reasoning_end   = "<end_working_out>"
# solution_start = "<SOLUTION>"
# solution_end = "</SOLUTION>"

# system_prompt = \
# f"""You are given a problem.
# Think about the problem and provide your working out.
# Place it between {reasoning_start} and {reasoning_end}.
# Then, provide your solution between {solution_start}{solution_end}"""
# system_prompt

In [None]:
# Delimiter tags interpolated into the system prompt below and reused by the
# reward functions' regexes / counters.
reasoning_start = "<start_working_out>"
reasoning_end = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

# System prompt sent with every training example. It is an f-string, so the
# tags above are substituted in; the text starts and ends with a newline.
system_prompt = f"""
You are an advanced AI model. Solve the given problem by first identifying a suitable proof method, and then applying the correct form of reasoning based on it.

Use the following structure:
1. Enclose your reasoning process between {reasoning_start} and {reasoning_end}.
2. Specify the proof used (e.g. <PERCEPTION>) and then the reasoning it enables (e.g. <DEDUCTIVE_REASONING>).
3. Provide your final answer between {solution_start} and {solution_end}.

###  Available Proof Methods (Epistemic Sources):

- <PERCEPTION>: Direct sense experience or observation.
- <INFERENCE>: Deriving knowledge based on observed patterns, cause-effect, or logic.
- <COMPARISON>: Knowledge through analogy or similarity.
- <POSTULATION>: Assuming the existence of something to explain a known effect.
- <NON_APPREHENSION>: Understanding through absence (e.g., lack of evidence or perception).
- <TESTIMONY>: Accepting knowledge from reliable authority or communication.

###  Reasoning Types Based on Proofs:

- <DEDUCTIVE_REASONING>: From general truths (e.g., from inference or testimony) to specific conclusions.
- <INDUCTIVE_REASONING>: From repeated observations (e.g., via perception) to general principles.
- <ABDUCTIVE_REASONING>: From effect to most plausible cause (e.g., postulation).
- <CAUSAL_REASONING>: Understanding causes from effects (usually based on inference).
- <COUNTERFACTUAL_REASONING>: Hypothetical reasoning about what could have happened differently (requires inference or postulation).

###  Example Format:

{reasoning_start}
<INFERENCE>
<DEDUCTIVE_REASONING>
[Logical explanation and steps go here.]
</DEDUCTIVE_REASONING>
</INFERENCE>
{reasoning_end}

{solution_start}
[Your final answer here.]
{solution_end}
"""

In [None]:
# Add two columns to every row: "prompt", a two-message chat (system prompt +
# the row's question), and "answer", replaced by the text after "####" in the
# original answer (None when the marker is absent, per extract_hash_answer).
dataset = dataset.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": x["question"]},
    ],
    "answer": extract_hash_answer(x["answer"]),
})
dataset[0]  # show the first mapped row as the cell output

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': '72',
 'prompt': [{'content': '\nYou are an advanced AI model. Solve the given problem by first identifying a suitable proof method, and then applying the correct form of reasoning based on it.\n\nUse the following structure:\n1. Enclose your reasoning process between <start_working_out> and <end_working_out>.\n2. Specify the proof used (e.g. <PERCEPTION>) and then the reasoning it enables (e.g. <DEDUCTIVE_REASONING>).\n3. Provide your final answer between <SOLUTION> and </SOLUTION>.\n\n###  Available Proof Methods (Epistemic Sources):\n\n- <PERCEPTION>: Direct sense experience or observation.\n- <INFERENCE>: Deriving knowledge based on observed patterns, cause-effect, or logic.\n- <COMPARISON>: Knowledge through analogy or similarity.\n- <POSTULATION>: Assuming the existence of something to explain a know

In [None]:
import re

# Regex for a well-formed completion: optional whitespace, a reasoning section
# delimited by reasoning_start/reasoning_end, any text, then a solution section
# delimited by solution_start/solution_end, and optional trailing whitespace.
# Group 1 captures the text between the solution tags.
# DOTALL lets "." span newlines; MULTILINE makes "^" and "$" match at line
# boundaries, so the pattern may also match starting at any line of the text.
# The tags are interpolated without re.escape; the current tag values contain
# no regex metacharacters.
match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{reasoning_start}.+?{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

In [None]:
# Sanity check: a minimal completion with both tag pairs should produce a match.
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>

In [None]:
def match_format_exactly(completions, **kwargs):
    """Reward completions that follow the full reasoning/solution format.

    Args:
        completions: One entry per generated sample. Each entry is either the
            response text itself or a chat-style list whose first message
            carries the text under ``"content"`` (the shape the other reward
            functions in this file index into).
        **kwargs: Ignored; accepted so the trainer can pass extra columns.

    Returns:
        A list with ``3.0`` for every completion matched by ``match_format``
        and ``0`` otherwise, in input order.
    """
    scores = []
    for completion in completions:
        # Plain strings are used as-is; chat-style completions are unwrapped so
        # the regex never receives a list (which would raise TypeError).
        if isinstance(completion, str):
            response = completion
        else:
            response = completion[0]["content"]
        scores.append(3.0 if match_format.search(response) is not None else 0)
    return scores

In [None]:
def match_format_approximately(completions, **kwargs):
    """Give partial credit per delimiter tag that appears exactly once.

    For each completion, every one of the four tags (reasoning start/end,
    solution start/end) contributes +0.5 when it occurs exactly once in the
    response and -0.5 otherwise (missing or repeated), so a score lies between
    -2.0 and 2.0.
    """
    tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        # Missing tags and duplicated tags are penalized alike.
        total = sum(0.5 if response.count(tag) == 1 else -0.5 for tag in tags)
        scores.append(total)
    return scores

In [None]:
def check_answer(prompts, completions, answer, **kwargs):
    """Score the solution captured by ``match_format`` against the reference.

    Args:
        prompts: Unused; accepted for reward-function signature compatibility.
        completions: Chat-style completions; the text is read from
            ``completion[0]["content"]``.
        answer: Reference answers, aligned with ``completions``. An entry may
            be ``None`` (``extract_hash_answer`` returns ``None`` when the
            ``####`` marker is missing).
        **kwargs: Ignored.

    Returns:
        One score per completion: ``0`` when the format did not match or no
        reference is available, ``3.0`` for an exact match, ``1.5`` for a match
        after stripping whitespace, ``0.5`` / ``0.25`` when the numeric ratio
        guess/reference is within 10% / 20%, ``-1.0`` for other numeric
        answers and ``-0.5`` when the ratio cannot be computed.
    """
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_format.search(r)) is not None else None
        for r in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        # Nothing to grade: either the format failed or there is no reference.
        if guess is None or true_answer is None:
            scores.append(0)
            continue

        score = 0
        # Correct answer gets 3 points!
        if guess == true_answer:
            score += 3.0
        # Match if spaces are seen
        elif guess.strip() == true_answer.strip():
            score += 1.5
        else:
            # We also reward it if the answer is close via ratios!
            # Ie if the answer is within some range, reward it!
            try:
                ratio = float(guess) / float(true_answer)
            except (ValueError, ZeroDivisionError):
                # Non-numeric text, or a reference of zero: ratio is undefined.
                score -= 0.5 # Penalize
            else:
                if 0.9 <= ratio <= 1.1:
                    score += 0.5
                elif 0.8 <= ratio <= 1.2:
                    score += 0.25
                else:
                    score -= 1.0 # Penalize wrong answers
        scores.append(score)
    return scores

In [None]:
# Regex that captures the first run of digits and dots appearing anywhere after
# solution_start (the lazy ".*?" with DOTALL skips any intervening text,
# newlines included). The character class has no sign or comma, so "-5" is
# captured as "5" and "1,500" as "1".
match_numbers = re.compile(
    rf"{solution_start}.*?([\d\.]{{1,}})",
    flags = re.MULTILINE | re.DOTALL
)
# Quick check of the pattern on a padded decimal.
match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>")

['0.34']

In [None]:
def check_numbers(prompts, completions, answer, **kwargs):
    """Reward completions whose first number after the solution tag equals the reference.

    Args:
        prompts: Chat-style prompts; the last message of the first prompt is
            printed as the question for debugging.
        completions: Chat-style completions; the text is read from
            ``completion[0]["content"]``.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        One score per completion: ``1.5`` when both values parse as floats and
        are equal, ``0.0`` when they parse but differ, and ``0`` when no number
        was found, the reference is missing, or either value fails to parse.
    """
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_numbers.search(r)) is not None else None
        for r in responses
    ]

    scores = []
    # Debug trace of the first sample in the batch.
    print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None or true_answer is None:
            scores.append(0)
            continue
        # Convert to numbers
        try:
            true_value = float(true_answer.strip())
            guess_value = float(guess.strip())
        except (ValueError, AttributeError, TypeError):
            # Unparseable text such as "1.2.3" or "1,500", or a non-string reference.
            scores.append(0)
        else:
            scores.append(1.5 if guess_value == true_value else 0.0)
    return scores

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


# Logic and Proofs rewards

In [None]:
import re

def check_logical_ordering(completions, **kwargs):
    """Reward completions whose proof tag is paired with a compatible reasoning tag.

    The reasoning section is the text between ``<start_working_out>`` and
    ``<end_working_out>``. Inside it, the first proof tag and the first
    reasoning tag are looked up (case-insensitively) and checked against the
    allowed proof -> reasoning table.

    Args:
        completions: Chat-style completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored.

    Returns:
        One score per completion: ``1`` for an allowed pairing, ``-0.9`` for a
        disallowed one, and ``0`` when the reasoning section or either tag is
        missing.
    """
    # The closing delimiter is "<end_working_out>" (no slash), matching the tag
    # the system prompt asks for; a "</end_working_out>" pattern never matches.
    reasoning_block_re = re.compile(
        r"<start_working_out>\s*(.*?)\s*<end_working_out>", re.DOTALL | re.IGNORECASE
    )
    proof_tag_re = re.compile(
        r"<(PERCEPTION|INFERENCE|COMPARISON|POSTULATION|NON_APPREHENSION|TESTIMONY)>", re.IGNORECASE
    )
    reasoning_tag_re = re.compile(
        r"<(DEDUCTIVE_REASONING|INDUCTIVE_REASONING|ABDUCTIVE_REASONING|CAUSAL_REASONING|COUNTERFACTUAL_REASONING)>", re.IGNORECASE
    )

    # Which reasoning types each proof type is allowed to introduce.
    LOGICAL_MAPPING = {
        "PERCEPTION": {"INDUCTIVE_REASONING"},
        "INFERENCE": {"DEDUCTIVE_REASONING", "CAUSAL_REASONING", "COUNTERFACTUAL_REASONING"},
        "COMPARISON": {"INDUCTIVE_REASONING"},
        "POSTULATION": {"ABDUCTIVE_REASONING", "COUNTERFACTUAL_REASONING"},
        "NON_APPREHENSION": {"COUNTERFACTUAL_REASONING"},
        "TESTIMONY": {"DEDUCTIVE_REASONING"},
    }

    scores = []
    for completion in completions:
        score = 0
        response = completion[0]["content"]

        reasoning_match = reasoning_block_re.search(response)
        if not reasoning_match:
            scores.append(score)
            continue

        reasoning_block = reasoning_match.group(1)

        proof_match = proof_tag_re.search(reasoning_block)
        reasoning_tag_match = reasoning_tag_re.search(reasoning_block)

        if not proof_match or not reasoning_tag_match:
            scores.append(score)
            continue

        # Tags are matched case-insensitively, so normalize before the lookup.
        proof = proof_match.group(1).upper()
        reasoning = reasoning_tag_match.group(1).upper()

        valid_reasonings = LOGICAL_MAPPING.get(proof, set())
        if reasoning in valid_reasonings:
            score += 1
        else:
            score -= 0.9

        scores.append(score)

    return scores


In [None]:
import re

def check_logical_ordering(completions, **kwargs):
    """Score whether the proof tag and reasoning tag in a completion are compatible.

    Only the text between ``<start_working_out>`` and ``<end_working_out>`` is
    inspected. The first proof tag and first reasoning tag found there
    (case-insensitive) are compared against the allowed pairings.

    Returns one score per completion: ``0.5`` for an allowed pairing, ``-0.6``
    for a disallowed one, ``0`` if the section or either tag is absent.
    """
    # Allowed reasoning types for each proof type.
    allowed = {
        "PERCEPTION": {"INDUCTIVE_REASONING"},
        "INFERENCE": {"DEDUCTIVE_REASONING", "CAUSAL_REASONING", "COUNTERFACTUAL_REASONING"},
        "COMPARISON": {"INDUCTIVE_REASONING"},
        "POSTULATION": {"ABDUCTIVE_REASONING", "COUNTERFACTUAL_REASONING"},
        "NON_APPREHENSION": {"COUNTERFACTUAL_REASONING"},
        "TESTIMONY": {"DEDUCTIVE_REASONING"},
    }
    section_re = re.compile(r"<start_working_out>(.*?)<end_working_out>", re.DOTALL | re.IGNORECASE)
    proof_re = re.compile(r"<(PERCEPTION|INFERENCE|COMPARISON|POSTULATION|NON_APPREHENSION|TESTIMONY)>", re.IGNORECASE)
    kind_re = re.compile(r"<(DEDUCTIVE_REASONING|INDUCTIVE_REASONING|ABDUCTIVE_REASONING|CAUSAL_REASONING|COUNTERFACTUAL_REASONING)>", re.IGNORECASE)

    def grade(text):
        # No reasoning section at all -> neutral score.
        section = section_re.search(text)
        if section is None:
            return 0
        inner = section.group(1)
        proof_hit = proof_re.search(inner)
        kind_hit = kind_re.search(inner)
        # Either tag missing -> neutral score.
        if proof_hit is None or kind_hit is None:
            return 0
        proof_name = proof_hit.group(1).upper()
        kind_name = kind_hit.group(1).upper()
        if kind_name in allowed.get(proof_name, set()):
            return 0.5  # Correct logical ordering
        return -0.6  # Incorrect logical pairing

    return [grade(completion[0]["content"]) for completion in completions]

In [None]:
# Build the trainer with the LoRA-wrapped model, the tokenizer, the mapped
# dataset and the five reward functions, then start training.
# NOTE(review): neither `GRPOTrainer` nor `training_args` is defined or imported
# in any cell visible in this notebook — confirm that the cell creating them
# (the stray "per_device_train_batch_size" output above suggests one existed)
# is restored before running.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    # One reward column is logged per function listed here.
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
        check_logical_ordering
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)


******************** Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<start_working_out>
<INFERENCE>
<DEDUCTIVE_REASONING>

Let's break down this problem step by step:

1. First, we know that a concert ticket costs $40 and Mr. Benson bought 12 tickets.
2. For every ticket bought that exceeds 10, Mr. Benson received a 5% discount. This means for 2 tickets (12 - 10 = 2), he got a 5% discount.
3. The discount price for one ticket is (100 - 5)% = 95% of the original price, which is $40 * 0.95 = $38.
4. The discount for two tickets is $38 * 2 = $76.
5. The remaining 10 tickets don't have any discount, so the total price for those is 10 * 40 = $400.
6. To find the total price paid by Mr. Benson, we need to add the price of the 10 tickets with no discount and the price of the 2 tickets with the 5% discount. So the total price is $400 + $76.

Calcul

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers,rewards / check_logical_ordering
1,-0.0,5.575,2.05,219.25,0.0,2.25,1.75,-0.375,1.5,0.45
2,-0.0,2.175,2.997082,355.25,0.0,0.75,0.5,-0.125,0.75,0.3
3,0.0,5.575,2.05,246.0,7e-06,2.25,1.75,-0.375,1.5,0.45
4,0.0,6.575,2.875616,230.0,4e-06,2.25,1.75,0.625,1.5,0.45
5,0.0,1.375,0.75,188.25,8e-06,0.0,1.0,0.0,0.375,0.0
6,0.0,5.05,3.055596,209.0,2.5e-05,1.5,1.5,0.25,1.5,0.3
7,0.0,1.675,0.618466,273.25,4.6e-05,0.0,1.0,0.0,0.375,0.3
8,0.0,4.3,3.897008,164.0,0.000207,1.5,1.5,0.25,0.75,0.3
9,0.0,3.425,2.028752,265.75,0.0002,1.5,1.5,-0.25,0.375,0.3
10,0.0,4.45,1.786057,334.25,0.000264,2.25,1.75,-0.375,0.75,0.075


******************** Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
(start_working_out)
<INFERENCE>
<DEDUCTIVE_REASONING>
To determine the difference in monthly payments, we first need to calculate the monthly payment for each option. 

The formula for monthly payment calculation is given by M = P [ i(1 + i)^n ] / [ (1 + i)^n - 1], where:
- M = monthly payment,
- P = principal amount (initial price of the house or trailer),
- i = monthly interest rate as a decimal,
- n = number of payments (20 years * 12 months/year).

Since the interest rates for both loans are not given, let's assume the interest rates are equal and let's use 4% as an example interest rate (0.04 as a decimal).

For a house, P = $480,000 and i = 0.04. The number of payments is 20*12

TrainOutput(global_step=50, training_loss=0.000133209696755614, metrics={'train_runtime': 3434.4214, 'train_samples_per_second': 0.058, 'train_steps_per_second': 0.015, 'total_flos': 0.0, 'train_loss': 0.000133209696755614})