In [None]:
!pip install unsloth vllm trl

In [None]:
# Step 1: Patch GRPO for reinforcement learning
import logging

from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported

# Apply the Unsloth GRPO patch *before* binding the TRL classes, so that the
# names imported below refer to the patched implementations rather than to the
# stock TRL ones captured at import time.
PatchFastRL("GRPO", FastLanguageModel)

from trl import GRPOConfig, GRPOTrainer  # noqa: E402 - must follow PatchFastRL

# Configure logging: timestamped INFO-level messages via the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
import torch
# Step 2: Define model and training parameters
# Hub ID of the base model handed to FastLanguageModel.from_pretrained.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
# Passed as max_seq_length when the model is loaded.
MAX_SEQ_LENGTH = 1024
# Used as max_lora_rank at load time and as both r and lora_alpha in get_peft_model.
LORA_RANK = 64
# Passed as gpu_memory_utilization when the model is loaded.
GPU_MEMORY_UTIL = 0.6

In [None]:
# Step 3: Load model with LoRA fine-tuning
# Loads the base model and its tokenizer using the constants from Step 2.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,  # request 4-bit loading of the base weights
    fast_inference=True,  # presumably enables the vLLM-backed generation used by use_vllm in Step 6 - confirm
    max_lora_rank=LORA_RANK,
    gpu_memory_utilization=GPU_MEMORY_UTIL,
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",],  # modules that receive LoRA adapters
    lora_alpha=LORA_RANK,  # same value as r
    use_gradient_checkpointing="unsloth",
    random_state=42,  # fixed seed passed to get_peft_model

)

In [None]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
# System message placed first in every prompt built by get_gsm8k_questions;
# it asks for a <reasoning> section followed by an <answer> section.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template for a fully formatted response with {reasoning} and {answer}
# placeholders. NOTE(review): not referenced by any other visible cell.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text found inside the last ``<answer>`` section.

    Everything after the final ``<answer>`` tag is kept, then cut at the first
    ``</answer>`` that follows. A missing tag simply means nothing is cut on
    that side, so untagged text comes back stripped but otherwise unchanged.
    """
    _, _, after_open_tag = text.rpartition("<answer>")
    inner, _, _ = after_open_tag.partition("</answer>")
    return inner.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "train") -> Dataset:
    """Load a split of ``openai/gsm8k`` ("main" config) and add chat-style fields.

    Each row gains a ``prompt`` (system message with SYSTEM_PROMPT followed by
    the question as the user message) and an ``answer`` holding the text after
    the ``####`` marker of the original answer (``None`` if there is no marker).
    """
    def _to_chat_example(row):
        system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_turn = {'role': 'user', 'content': row['question']}
        return {
            'prompt': [system_turn, user_turn],
            'answer': extract_hash_answer(row['answer']),
        }

    gsm8k = load_dataset('openai/gsm8k', 'main') # type: ignore
    return gsm8k[split].map(_to_chat_example) # type: ignore

# Training split used by the GRPO trainer.
dataset = get_gsm8k_questions()

In [None]:
from sentence_transformers import SentenceTransformer, util
# Step 5: Define optimized reward functions
# Sentence-embedding model used by reward_correctness; created once here so the
# reward function does not rebuild it on every call.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
#
def reward_correctness(completions, answer, **kwargs):
    """Reward semantic similarity between the predicted answer and the reference.

    Only the ``<answer>`` section of each completion (via ``extract_xml_answer``)
    is embedded, so the reasoning text does not dominate the similarity score.

    Args:
        completions: One entry per sample; each entry is a list whose first
            element is a message dict with a ``"content"`` string.
        answer: Reference answers aligned with ``completions``. Entries may be
            ``None`` (``extract_hash_answer`` returns ``None`` when the ``####``
            marker is missing).
        **kwargs: Extra columns forwarded by the trainer; ignored.

    Returns:
        One float per completion: the cosine similarity between the embedded
        predicted answer and the embedded reference, or ``0.0`` when the
        reference is missing or empty.
    """
    if not completions:
        return []

    predicted_texts = [extract_xml_answer(completion[0]['content']) for completion in completions]
    # A missing reference becomes "" so encoding never sees None; such rows are
    # scored 0.0 below instead of being compared with an empty string.
    answer_texts = [(a or "").strip() for a in answer]

    embeddings_pred = embed_model.encode(predicted_texts, convert_to_tensor=True)
    embeddings_ans = embed_model.encode(answer_texts, convert_to_tensor=True)
    # Row i of the similarity matrix is compared with column i only.
    similarities = util.pytorch_cos_sim(embeddings_pred, embeddings_ans).diagonal()

    return [float(sim) if ref else 0.0 for sim, ref in zip(similarities, answer_texts)]

#
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that contain a reasoning block followed by an answer block.

    The check is "soft": a ``<reasoning>...</reasoning>`` section followed
    (after optional whitespace) by an ``<answer>...</answer>`` section may
    appear anywhere in the completion, and either section may span lines.

    Args:
        completions: One entry per sample; each entry is a list whose first
            element is a message dict with a ``"content"`` string.
        **kwargs: Extra columns forwarded by the trainer; ignored.

    Returns:
        ``0.5`` for each completion that contains the pattern, else ``0.0``.
    """
    # The tags must be part of the pattern; without them the regex matches the
    # empty string and every completion would receive the reward.
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # DOTALL lets ".*?" cross the newlines inside each section.
    return [0.5 if re.search(pattern, r, re.DOTALL) else 0.0 for r in responses]
#
def strict_format_reward_func(completions, **kwargs):
    """Score 0.5 when the whole completion is a reasoning block then an answer block.

    The completion must start with ``<reasoning>`` and end with ``</answer>``
    (a single trailing newline is tolerated by ``$``); only whitespace may sit
    between the two sections. Anything else scores 0.0.
    """
    strict_layout = re.compile(r"^<reasoning>.*?</reasoning>\s*<answer>.*?</answer>$", re.DOTALL)
    rewards = []
    for completion in completions:
        text = completion[0]["content"]
        rewards.append(0.5 if strict_layout.match(text) else 0.0)
    return rewards
#
def int_reward_func(completions, **kwargs):
    """Score 0.5 for every completion that contains at least one digit.

    The whole completion text is scanned, not just an extracted answer, so a
    digit anywhere (including the reasoning) earns the reward.
    """
    rewards = []
    for completion in completions:
        text = completion[0]['content']
        contains_digit = any(map(str.isdigit, text))
        rewards.append(0.5 if contains_digit else 0.0)
    return rewards
#
def count_xml(text) -> float:
    """Give partial credit for each correctly placed XML tag in ``text``.

    Each of the four tags of the expected layout earns 0.125 when it appears
    exactly once in its expected position relative to newlines:

        <reasoning>\\n ... \\n</reasoning>\\n<answer>\\n ... \\n</answer>\\n

    Text that trails the closing ``</answer>`` tag is penalized at 0.001 per
    character (applied once with the answer-open check and once with the
    answer-close check), so a clean response scores 0.5 and chatter after the
    answer lowers it.

    Args:
        text: The raw completion string.

    Returns:
        The accumulated score as a float.
    """
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        # Penalize whatever follows the closing tag line.
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        # The "- 1" forgives the single newline that normally ends the response.
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count
#
def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

# extra
def reward_stepwise_accuracy(completions, **kwargs):
    """Score 0.7 for each completion whose text contains ``<reasoning>``, else 0.0.

    Only the presence of the opening tag is checked; the reasoning itself is
    not inspected.
    """
    scores = []
    for completion in completions:
        has_reasoning_tag = "<reasoning>" in completion[0]['content']
        scores.append(0.7 if has_reasoning_tag else 0.0)
    return scores

def reward_length_penalty(completions, **kwargs):
    """Score -0.2 for completions longer than 150 whitespace-separated words, else 0.2.

    There is no lower bound: short and even empty completions receive 0.2.
    """
    def _score(text):
        word_count = len(text.split())
        if word_count > 150:
            return -0.2
        return 0.2

    return [_score(completion[0]['content']) for completion in completions]

In [None]:
# Step 6: Configure training parameters
# Re-imported after PatchFastRL("GRPO", ...) has run in Step 1; presumably this
# rebinds the names to the Unsloth-patched classes - confirm before removing.
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    # Exactly one of bf16 / fp16 is enabled, chosen by is_bfloat16_supported().
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    # NOTE(review): prompt + completion are capped at 256 + 200 here while the
    # model was loaded with MAX_SEQ_LENGTH = 1024 - confirm these caps are intended.
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 150,
    save_steps = 150, # same value as max_steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

In [None]:
# Step 7: Initialize and train model
logger.info("Initializing trainer...")

# Builds the GRPO trainer from the LoRA-wrapped model, the GSM8K prompts and
# the reward functions defined in Step 5, then runs training.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        strict_format_reward_func, # whole completion must be <reasoning>...</reasoning> then <answer>...</answer>
        int_reward_func, # completion contains at least one digit
        reward_correctness, # embedding cosine similarity against the reference answer
        reward_stepwise_accuracy, # completion contains a <reasoning> tag
        reward_length_penalty, # -0.2 above 150 words, +0.2 otherwise
        xmlcount_reward_func, # per-completion score from count_xml
        soft_format_reward_func, # looser format check than strict_format_reward_func
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()





In [None]:
# Save the trained LoRA adapter to the relative folder "grpo_saved_lora";
# the upload cells further down read from this folder.
model.save_lora("grpo_saved_lora")

In [None]:
# model.push_to_hub_merged("rushigulum/GRPO4", tokenizer, save_method = "merged_16bit")

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login; presumably required before the upload cells
# below can write to the Hub.
notebook_login()


In [None]:
import os
# List the entries of /content (presumably the Colab working directory) to
# check that the saved adapter folder is there before uploading.
os.listdir("/content")


In [None]:
from huggingface_hub import HfApi

api = HfApi()
# exist_ok=True keeps this cell re-runnable: without it a second run fails
# because the repository already exists.
api.create_repo("rushigulum/grpo", private=False, exist_ok=True)

# Upload the saved LoRA adapter folder to the repository created above.
api.upload_folder(
    folder_path="/content/grpo_saved_lora",  # Change this to your model's path
    repo_id="rushigulum/grpo",
)


In [None]:
from huggingface_hub import HfApi

# NOTE(review): "rushigulum" looks like a user name rather than a repository
# name - confirm the intended target repository.
repo_name = "rushigulum"
# Ensure model_path points to a directory containing the model files
model_path = "grpo_saved_lora"  # Changed from 'optimized_outputs/best_model'

# Upload model
api = HfApi()
# create_repo resolves a bare name to "<namespace>/<name>"; upload to the
# repository it reports so both calls target the same repo.
repo_url = api.create_repo(repo_name, exist_ok=True)
api.upload_folder(folder_path=model_path, repo_id=repo_url.repo_id, repo_type="model")

In [None]:
# Push to the "rushigulum/GRPO4" Hub repo with save_method="lora".
# NOTE(review): despite the method name, "lora" presumably uploads only the
# adapter rather than merged weights - confirm against the Unsloth docs.
model.push_to_hub_merged("rushigulum/GRPO4", tokenizer, save_method = "lora")

In [None]:
!pip install transformers torch

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Replace with your Hugging Face repo ID
# NOTE(review): this ID differs from the repositories used in the upload cells
# ("rushigulum/grpo", "rushigulum", "rushigulum/GRPO4") - confirm it exists.
MODEL_REPO = "rushigulum/q3b-grpo"

# Load the model and tokenizer
# These assignments rebind `tokenizer` and `model`, replacing the training-time
# objects of the same names for every cell that runs afterwards.
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
model = AutoModelForCausalLM.from_pretrained(MODEL_REPO, torch_dtype=torch.float16, device_map="auto")


In [None]:
def generate_response(prompt, max_new_tokens=200, temperature=0.7):
    """Generate a sampled continuation of ``prompt`` with the loaded model.

    Args:
        prompt: Plain-text prompt; it is tokenized as-is (no chat template).
        max_new_tokens: Upper bound on the number of generated tokens. The
            prompt length does not count against this budget.
        temperature: Sampling temperature.

    Returns:
        The decoded sequence (prompt followed by the generated text) with
        special tokens removed.
    """
    # Keep the whole encoding so generate() also receives the attention mask.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # explicit, so temperature applies regardless of the model's default config
        temperature=temperature,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example Test
# Smoke test: send one plain-text question through generate_response and print
# the decoded result.
prompt = "What is the capital of France?"
response = generate_response(prompt)
print("Model Response:", response)
