In [1]:
# unsloth is imported ahead of vllm here; keep this order.
# NOTE(review): Unsloth prints that it patches the environment on import --
# confirm whether it must precede other ML imports in your setup.
from unsloth import FastLanguageModel, PatchFastRL
import re
from vllm import SamplingParams

# Apply Unsloth's "GRPO" patch to FastLanguageModel before any model is loaded.
# NOTE(review): presumably required for the vLLM-backed helpers used in later
# cells (fast_generate / load_lora) -- confirm against the Unsloth docs.
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-09 06:59:09 __init__.py:190] Automatically detected platform cuda.


2025-02-09 06:59:10,668	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Context window and LoRA rank shared by the loading and adapter calls below.
max_seq_length = 32000  # Raise for longer reasoning traces
lora_rank = 64  # Larger rank = smarter, but slower

# Load the base model in 16-bit with vLLM-backed fast inference enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length = max_seq_length,
    max_lora_rank = lora_rank,
    fast_inference = True,  # Enable vLLM fast inference
    load_in_4bit = False,  # False for LoRA 16bit
    gpu_memory_utilization = 0.95,  # Lower this if you run out of memory
)

# Wrap the base model with LoRA adapters (alpha is set equal to the rank).
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,  # Any value > 0; suggested 8, 16, 32, 64, 128
    lora_alpha = lora_rank,
    target_modules = [
        # Attention projections (remove these if out of memory)
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    random_state = 3407,
    use_gradient_checkpointing = "unsloth",  # Enable long context finetuning
)

==((====))==  Unsloth 2025.2.4: Fast Qwen2 patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A10G. Max memory: 22.184 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/DeepSeek-R1-Distill-Qwen-1.5B with actual GPU utilization = 49.44%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 22.18 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32000. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 7.52 GB. Also swap space = 6 GB.
INFO 02-09 06:59:28 config.py:542] This model supports multiple tasks: {'reward', 'generate', 'score', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 02-09 06:59:28 llm_engine.py:234] Initializing a V0 LLM



INFO 02-09 06:59:30 weight_utils.py:252] Using model weights format ['*.safetensors']
INFO 02-09 06:59:30 weight_utils.py:297] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.24it/s]



INFO 02-09 06:59:30 model_runner.py:1115] Loading model weights took 3.3470 GB
INFO 02-09 06:59:31 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-09 06:59:34 worker.py:267] Memory profiling takes 3.13 seconds
INFO 02-09 06:59:34 worker.py:267] the current vLLM instance can use total_gpu_memory (22.18GiB) x gpu_memory_utilization (0.49) = 10.97GiB
INFO 02-09 06:59:34 worker.py:267] model weights take 3.35GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 5.54GiB.
INFO 02-09 06:59:34 executor_base.py:110] # CUDA blocks: 12967, # CPU blocks: 14043
INFO 02-09 06:59:34 executor_base.py:115] Maximum concurrency for 32000 tokens per request: 6.48x
INFO 02-09 06:59:38 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error 

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:15<00:00,  1.75it/s]

INFO 02-09 06:59:54 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.27 GiB
INFO 02-09 06:59:54 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 23.15 seconds



Unsloth 2025.2.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
def extract_boxed_answer(text: str) -> str:
    r"""
    Extract the content of the first balanced ``\boxed{...}`` in the answer.

    Only the text after the last ``</think>`` tag is searched, so boxed
    expressions inside the reasoning trace are ignored. Braces are matched by
    depth, so nested LaTeX such as ``\boxed{\frac{1}{2}}`` is returned whole,
    and the boxed content may span several lines.

    Args:
        text: Text containing LaTeX boxed content
    Returns:
        Content within boxed notation, or an empty string if ``text`` is not
        a string or contains no balanced ``\boxed{...}``
    """
    if not isinstance(text, str):
        # Callers previously got "" for bad input (via a bare except); keep that.
        return ""

    answer = text.split("</think>")[-1]
    marker = "\\boxed{"

    start = answer.find(marker)
    while start != -1:
        content_start = start + len(marker)
        depth = 1  # the opening brace of \boxed{ is already consumed
        for pos in range(content_start, len(answer)):
            char = answer[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return answer[content_start:pos]
        # This occurrence never closed; try the next \boxed{ (if any).
        start = answer.find(marker, content_start)

    return ""

In [17]:

# Decoding settings shared by both generation cells below.
sampling_params = SamplingParams(
    max_tokens = 32000,  # same value as max_seq_length used when loading the model
    top_p = 0.95,
    temperature = 0.6,
)

# The next two cells run inference twice on the same question:
# first without a LoRA adapter, then with the trained LoRA adapter.

In [None]:
# The backslash is doubled so the prompt contains a literal "\boxed{}".
# In a normal (non-raw) string literal "\b" is the backspace character, which
# would silently corrupt the instruction the model is asked to follow.
question = """Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers?
Please reason step by step, and put your final answer within \\boxed{}"""

# Render the single-turn conversation as a prompt string (not token ids),
# ending with the generation prompt so the model starts its reply.
text = tokenizer.apply_chat_template(
    [{"role" : "user", "content" : question}],
    tokenize = False,
    add_generation_prompt = True,
)

# Baseline run: base model only, no LoRA adapter attached.
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

print(output)

In [None]:

# Rebuild the prompt from `question` (defined in the previous cell) so this
# cell does not depend on that cell's `text` variable.
text = tokenizer.apply_chat_template(
    [{"role" : "user", "content" : question}],
    tokenize = False,
    add_generation_prompt = True,
)

# TODO: set this to the directory of the trained LoRA checkpoint before running.
lora_path = "" # path to the LoRA checkpoint
lora_request = model.load_lora(lora_path)

# Same call shape as the baseline cell (a list with one prompt), but with the
# trained LoRA adapter applied, so the two outputs are directly comparable.
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = lora_request,
)[0].outputs[0].text

print(output)