In [1]:
BASE_ID = "unsloth/Llama-3.2-3B-Instruct"

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Can increase for longer RL output
lora_rank = 128        # Larger rank = smarter, but slower
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_ID,
    load_in_4bit = False,
    max_seq_length = max_seq_length,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.
==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    AMD Radeon Graphics. Num GPUs = 1. Max memory: 191.688 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+rocm6.4. ROCm Toolkit: 6.4.43482-0f2d60242. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100% 2/2 [00:02<00:00,  1.21s/it]


In [3]:
model_policy = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)
model_policy.print_trainable_parameters()

Unsloth 2025.10.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


trainable params: 194,510,848 || all params: 3,407,260,672 || trainable%: 5.7087


In [4]:
import os, sys, subprocess

working_directory = "/shared-docker/OpenEnv"  # repo root containing src/
port = "8050"
keepalive = "1000"

cmd = [
    sys.executable, "-m", "uvicorn",
    "envs.cartpole_env.server.app:app",  # <— our new app path
    "--host", "0.0.0.0",
    "--port", port,
    "--timeout-keep-alive", keepalive,
]

env = {
    **os.environ,
    "PYTHONPATH": f"{working_directory}/src",

    # CartPole knobs (analogous to OPENSPIEL_*):
    "CARTPOLE_ENV_ID": "CartPole-v1",
    "CARTPOLE_SEED": "123",
    "CARTPOLE_MAX_EPISODE_STEPS": "1000",
    "CARTPOLE_RENDER_MODE": "none",  # or "rgb_array"
}

#proc = subprocess.Popen(cmd, env=env, cwd=working_directory,
#                        stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

#print("CartPole server starting at http://localhost:8020 … PID:", proc.pid)


In [5]:
import httpx, time
time.sleep(1)
r = httpx.post("http://localhost:8070/reset", json={}); print(r.json())

INFO:httpx: HTTP Request: POST http://localhost:8070/reset "HTTP/1.1 200 OK"


{'observation': {'state': [-5.636215064441785e-05, 1.4085109233856201, -0.0057279132306575775, -0.10707679390907288, 7.21305186743848e-05, 0.0012974588898941875, 0.0, 0.0], 'legal_actions': 'continuous actions: main_engine [-1,1], lateral_engine [-1,1]', 'episode_length': 0, 'total_reward': 0.0}, 'reward': 0.0, 'done': False}


In [6]:
import requests
import time
time.sleep(5) # Wait 5 seconds for OpenEnv to start!
from envs.lunarlander_environment import LunarLanderEnv, LunarLanderAction
import httpx

# Same shape as: OpenSpielEnv(base_url=..., request_timeout_s=...)
base_url = "http://localhost:8070"
request_timeout_s = 1000  # seconds

openenv_process = openenv_process = LunarLanderEnv(
        base_url=base_url,
        request_timeout_s=request_timeout_s
    )
# quick smoke test
# info = openenv_process.info()

# print("CartPole info:", info)

state = openenv_process.reset()
print("reset:", state)

# step with discrete action 0/1
state = openenv_process.step(
                LunarLanderAction(main_engine=0.0, lateral_engine=0.0))
print("step:", state)

reset: StepResult(observation=LunarLanderObservation(done=False, reward=0.0, metadata={}, state=[-0.005594158079475164, 1.4150160551071167, -0.5666389465332031, 0.18202506005764008, 0.006488978862762451, 0.12835219502449036, 0.0, 0.0], legal_actions='continuous actions: main_engine [-1,1], lateral_engine [-1,1]', episode_length=0, total_reward=0.0), reward=0.0, done=False)
step: StepResult(observation=LunarLanderObservation(done=False, reward=-0.1806059112571461, metadata={}, state=[-0.011188698001205921, 1.4185343980789185, -0.5658724904060364, 0.15633147954940796, 0.012831433676183224, 0.12686122953891754, 0.0, 0.0], legal_actions='continuous actions: main_engine [-1,1], lateral_engine [-1,1]', episode_length=1, total_reward=-0.1806059112571461), reward=-0.1806059112571461, done=False)


In [7]:
def strategy_simple(state):
    # state = [x, dx, angle, dangle]
    return 0  # left or 1 right


def build_user_prompt():
    return (
        """You are an expert LunarLander (continuous control) pilot and a precise Python code generator.

Context / How this will be used
- Your function will be called every environment step to control the entire episode of LunarLanderContinuous-v2 (OpenAI Gym/Gymnasium, Box2D).
- The simulator runs at ~50 Hz. The episode ends when the lander is safely on the pad, crashes, flies off-screen, or hits the environment step cap.
- The reward is shaped for soft, centered landings: proximity to the pad and zeroing velocity/tilt are good; fuel use and large forces are penalized; leg contacts yield bonuses; crashing yields large negatives.

Your objective: achieve a safe, fuel-efficient landing near the center pad by minimizing horizontal/vertical speeds and tilt while avoiding hard thrusting.

What you must write
- A single Python function with this exact signature (no extras):
    def lunarlander_strategy(state):
- Input `state` is a list of 8 values: [x, y, vx, vy, theta, vtheta, left_contact, right_contact]
  - x, y       = position relative to pad center (0,0)
  - vx, vy     = horizontal and vertical velocities
  - theta      = lander angle (radians; 0 is upright; positive leans right)
  - vtheta     = angular velocity
  - left_contact, right_contact = leg contact flags (0.0 or 1.0)
- Output: return a length-2 sequence of floats **[main, lateral]**, each in **[-1.0, 1.0]**
  - `main`   = main engine command (upward thrust). Negative values are treated as “off”; positive values increase thrust.
  - `lateral`= side thruster command (negative pushes left, positive pushes right).

Design guidance for long-horizon control
- Prioritize **vertical stabilization** (reduce `vy`) and **upright attitude** (`theta≈0`, `vtheta≈0`), then **center horizontally** (`x≈0`, `vx≈0`).
- Use **smooth, deterministic** control laws (e.g., weighted linear feedback with deadbands) and **clamp** outputs to [-1, 1].
- Add small **dead-zones/hysteresis** around zero for `vx`, `vy`, and `theta` to avoid jitter and fuel waste.
- Useful heuristics:
  - Main thrust should counteract downward speed: increase with positive descent rate (`vy < 0`) and with tilt error magnitude.
  - Lateral thruster should reduce horizontal error and velocity and help re-center the pad under the craft.
  - Reduce thrust when either leg has contact and vertical speed is small to prevent bouncing.
  - Cap commands gently to avoid saturating engines; prefer incremental adjustments as you approach touchdown.

Hard constraints
- Do not import, print, read/write files, use globals, randomness, or any I/O.
- Keep the code short and stateless; identical inputs must produce identical outputs.
- The output must be exactly one fenced code block in Python, with nothing before or after.
  - The first line inside the block must be: def lunarlander_strategy(state):
  - The last line of your entire response must be the closing backticks to clearly end the program. No trailing commentary.

Output format reminder (dummy example — do NOT copy this logic):

```
def lunarlander_strategy(state):
    x, y, vx, vy, th, vth, lc, rc = state
    m = x
    lat = y
    return [m, lat] # example 
        ```
        All helper functions should be inside def lunarlander_strategy. Only output the short function `strategy`.
        """.strip())

print(build_user_prompt())

You are an expert LunarLander (continuous control) pilot and a precise Python code generator.

Context / How this will be used
- Your function will be called every environment step to control the entire episode of LunarLanderContinuous-v2 (OpenAI Gym/Gymnasium, Box2D).
- The simulator runs at ~50 Hz. The episode ends when the lander is safely on the pad, crashes, flies off-screen, or hits the environment step cap.
- The reward is shaped for soft, centered landings: proximity to the pad and zeroing velocity/tilt are good; fuel use and large forces are penalized; leg contacts yield bonuses; crashing yields large negatives.

Your objective: achieve a safe, fuel-efficient landing near the center pad by minimizing horizontal/vertical speeds and tilt while avoiding hard thrusting.

What you must write
- A single Python function with this exact signature (no extras):
    def lunarlander_strategy(state):
- Input `state` is a list of 8 values: [x, y, vx, vy, theta, vtheta, left_contact, right_c

In [8]:
def extract_function(text):
    if text.count("```") >= 2:
        first = text.find("```") + 3
        second = text.find("```", first)
        fx = text[first : second].strip()
        fx = fx[fx.find("def"):]
        if fx.startswith("def lunarlander_strategy(state):"): return fx
    return None
print(extract_function(build_user_prompt()))

def lunarlander_strategy(state):
    x, y, vx, vy, th, vth, lc, rc = state
    m = x
    lat = y
    return [m, lat] # example


In [9]:
# Minimal safe executor (reuse your nb's create_locked_down_function if available)
from unsloth import create_locked_down_function

from unsloth import check_python_modules

def _safe_compile(func_src: str):
    # Use the notebook's 'create_locked_down_function' if present
    return create_locked_down_function(func_src)



In [10]:
import numpy as np
from random import randint
global _PRINT_COUNTER
_PRINT_COUNTER = 0


def shaped_reward(s, s_next, r_env, a, noop_streak, gamma=0.99):
    # unpack state as needed
    x,y,vx,vy,theta,omega,left_leg,right_leg = s
    x2,y2,vx2,vy2,theta2,omega2,ll2,rl2 = s_next

    Phi = lambda x_,y_,vx_,vy_,th_,om_,ll_,rl_: (
        -(0.8*abs(x_) + 0.8*abs(y_-0.0) + 0.4*abs(vx_) + 0.6*abs(vy_) +
          0.4*abs(th_) + 0.2*abs(om_)) + 0.1*(ll_+rl_)
    )

    r_shape = gamma*Phi(x2,y2,vx2,vy2,theta2,omega2,ll2,rl2) - \
              Phi(x,y,vx,vy,theta,omega,left_leg,right_leg)

    # time cost
    r = r_env + r_shape - 0.01

    # determine if "safe" (don’t penalize idling here)
    safe = (y < 0.7 and abs(vx) < 0.1 and abs(vy) < 0.1 and abs(theta) < 0.1)

    # dead-zone idle penalty (continuous)
    if not safe and (a < 0.01):  # or np.linalg.norm(a) < eps
        if noop_streak >= 3:
            r -= 0.02 * (noop_streak - 2)

    # small control cost
    r -= 0.001 * (a**2)

    return r
    
def execute_strategy(strategy_fn, initial_state, max_steps):
    """Run strategy on CartPole server until done or step limit.
       Returns (steps_survived, done_flag)."""
    steps = 0
    done = False
    state = initial_state
    reward = 0
    last_state = None
    noop_streak = 0
    while not done and steps < max_steps:
        # print("try strategy")
        main,lateral = strategy_fn(state.observation.state)
        # print(a)
        res = openenv_process.step(LunarLanderAction(
                main_engine=main, lateral_engine=lateral))
        # print(res)
        last_state = state
        state = res
        done = bool(res.done)
        reward +=  shaped_reward(last_state.observation.state ,state.observation.state, res.reward, abs(main)+ abs(lateral),  noop_streak)
        if((abs(main)+ abs(lateral)) < 0.01):
            noop_streak += 1
        steps += 1
    return reward, steps, done


def function_works(completions, **kwargs):
    scores = []
    for completion in completions:
        score = 0
        response = completion[0]["content"]
        function = extract_function(response)
        if function is not None:
            ok, info = check_python_modules(function)
        if function is None or "error" in info:
            score = -2.0
        else:
            try:
                new_strategy = create_locked_down_function(function)
                score = randint(7, 20)
            except:
                score = -0.5
        scores.append(score)
    return scores
    
def strategy_succeeds(completions, **kwargs):
    """completions: list of candidate generations (like the nb).
       Returns list[float] rewards (higher is better)."""
    global _PRINT_COUNTER
    scores = []

    # Reset the env once per candidate to a fresh start
    for completion in completions:
        try:
            # The notebook packs text like completion[0]["content"]
            response = completion[0]["content"]
        except Exception:
            scores.append(0.0); continue

        # Optional: print every 5th for debugging
        if _PRINT_COUNTER % 5 == 0:
            try:
                print(response.splitlines()[0][:120])
            except:
                print("...candidate omitted...")
        _PRINT_COUNTER += 1
        #print(response)
        # Parse the function from the text (reuse nb's helpers if present)
        try:
            func_src = extract_function(response)
            print(func_src)
            strategy_fn = _safe_compile(func_src)
        except Exception as e:
            print("Compile error:", e)
            scores.append(0.0); continue
        print(func_src)
        print(strategy_fn)
        # Rollout on CartPole
        try:
            # Fresh episode
            res0 = openenv_process.reset()
            current_state = res0
            reward, steps, finished = execute_strategy(strategy_fn, current_state, max_steps=10000)

            # Reward = steps survived (CartPole native) with mild shaping:
            # huge bonus if finished by reaching max steps (i.e., perfect 500).
            

            # print a brief trace for failing candidates occasionally
            if steps > 100 and (_PRINT_COUNTER % 7 == 0):
                print("Long episode:", steps)

            scores.append(reward)
        except TimeoutError:
            # keep the semantics from the nb
            scores.append(-1.0)
        except Exception as e:
            # print(f"Exception = {str(e)}")
            scores.append(-3.0)

    return scores

In [11]:
toy = """
```
def lunarlander_strategy(state):
    # state: [x, v]
    print(state)
    x, y, z, m , v1,v2, v3, v4 = state
    score = x + 0.1 * v1
    return -0.5,0.0
    ```
"""




    
fn1 = extract_function(toy)

fn = _safe_compile(fn1)

s0 = openenv_process.reset();
# print(s0.observation.values[0],s0.observation.values[1],s0.observation.values[2],s0.observation.values[3])
print(fn(s0.observation.state))
done = False

reward, steps, done = execute_strategy(fn, s0, 100)
print("Toy strategy survived steps reward:", steps, reward)

[0.00595931988209486, 1.401628851890564, 0.6035951972007751, -0.4129604399204254, -0.006898547522723675, -0.1367233395576477, 0.0, 0.0]
(-0.5, 0.0)
[0.00595931988209486, 1.401628851890564, 0.6035951972007751, -0.4129604399204254, -0.006898547522723675, -0.1367233395576477, 0.0, 0.0]
[0.011918830685317516, 1.3917615413665771, 0.602779746055603, -0.4385990500450134, -0.013654747977852821, -0.13513651490211487, 0.0, 0.0]
[0.017878437414765358, 1.3812940120697021, 0.602800726890564, -0.4652988612651825, -0.02040674351155758, -0.13505271077156067, 0.0, 0.0]
[0.023838330060243607, 1.3702269792556763, 0.6028205752372742, -0.4919743239879608, -0.027157841250300407, -0.13503488898277283, 0.0, 0.0]
[0.029798507690429688, 1.358560562133789, 0.6028403043746948, -0.5186463594436646, -0.033907875418663025, -0.13501325249671936, 0.0, 0.0]
[0.03575878217816353, 1.346294641494751, 0.6028602123260498, -0.5453177094459534, -0.04065680503845215, -0.13499125838279724, 0.0, 0.0]
[0.04171943664550781, 1.3334

In [12]:
from datasets import Dataset
dataset = Dataset.from_list([{"prompt" : [{"role": "user", "content": build_user_prompt().strip()}], "answer" : 0, "reasoning_effort": "low"}]*1000)
maximum_length = len(tokenizer.apply_chat_template([{"role": "user", "content": build_user_prompt().strip()}], add_generation_prompt = True))
print(maximum_length)

820


In [13]:
dataset[0]

{'prompt': [{'content': 'You are an expert LunarLander (continuous control) pilot and a precise Python code generator.\n\nContext / How this will be used\n- Your function will be called every environment step to control the entire episode of LunarLanderContinuous-v2 (OpenAI Gym/Gymnasium, Box2D).\n- The simulator runs at ~50 Hz. The episode ends when the lander is safely on the pad, crashes, flies off-screen, or hits the environment step cap.\n- The reward is shaped for soft, centered landings: proximity to the pad and zeroing velocity/tilt are good; fuel use and large forces are penalized; leg contacts yield bonuses; crashing yields large negatives.\n\nYour objective: achieve a safe, fuel-efficient landing near the center pad by minimizing horizontal/vertical speeds and tilt while avoiding hard thrusting.\n\nWhat you must write\n- A single Python function with this exact signature (no extras):\n    def lunarlander_strategy(state):\n- Input `state` is a list of 8 values: [x, y, vx, vy,

In [14]:
max_prompt_length = maximum_length + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    temperature = 1.0,
    learning_rate = 5e-6,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 35,
    save_steps = 100,
    report_to = "trackio", # Can use Weights & Biases, TrackIO
    output_dir = "outputs",

    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [15]:
# For optional training + evaluation
# new_dataset = dataset.train_test_split(test_size = 0.01)

trainer = GRPOTrainer(
    model = model_policy,
    processing_class = tokenizer,
    reward_funcs = [
        function_works,
        strategy_succeeds,
    ],
    args = training_args,
    train_dataset = dataset,

    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)

INFO:httpx: HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


In [16]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 35
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 194,510,848 of 3,407,260,672 (5.71% trained)


* Trackio project initialized: huggingface
* Trackio metrics logged to: /root/.cache/huggingface/trackio


* Created new run: luminous-heron-56


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 131072, 'temperature': 0.6, 'top_p': 0.9}. If this is not desired, please set these values explicitly.


```python
def lunarlander_strategy(state):
    x, y, vx, vy, theta, vtheta, left_contact, right_contact = state
    
    # Dead zones for vx, vy, and theta
    vx_deadzone = 0.1
    vy_deadzone = 0.1
    theta_deadzone = 0.1
    
    # Vertical stabilization
    vth = 0
    if vy < 0:
        vth += 0.5 * vy
    elif vy > 0:
        vth -= 0.5 * vy
    
    # Main engine command
    main = 0
    if abs(theta) > theta_deadzone:
        main += 0.5 * (theta / abs(theta))
    elif vy < 0:
        main += 0.5 * abs(vy)
    
    # Lateral thruster command
    lat = 0
    if vx > 0:
        lat -= 0.5 * vx
    elif vx < 0:
        lat += 0.5 * vx
    
    # Clamp outputs to [-1, 1]
    main = max(-1, min(1, main))
    lat = max(-1, min(1, lat))
    
    return [main, lat]
def lunarlander_strategy(state):
    x, y, vx, vy, theta, vtheta, left_contact, right_contact = state
    
    # Dead zones for vx, vy, and theta
    vx_deadzone = 0.1
    vy_deadzone = 0.1
    theta_deadzone = 0.1
    
   

Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / function_works / mean,rewards / function_works / std,rewards / strategy_succeeds / mean,rewards / strategy_succeeds / std
1,0.0,-64.480583,176.721771,1107.625,272.0,1227.0,0.875,272.0,272.0,272.0,0,0,0,0,0,0.000261,-0.875,3.181981,-63.605583,179.903763
2,0.0,-68.687508,188.620758,1103.875,242.0,1227.0,0.875,242.0,242.0,242.0,No Log,No Log,No Log,No Log,No Log,0.000279,-0.125,5.303301,-68.562508,193.924072
3,0.0,-23.704559,61.389763,1112.875,314.0,1227.0,0.875,314.0,314.0,314.0,No Log,No Log,No Log,No Log,No Log,0.000289,-0.625,3.889087,-23.079559,65.278854
4,0.0,-0.625,3.889087,1115.875,338.0,1227.0,0.875,338.0,338.0,338.0,No Log,No Log,No Log,No Log,No Log,0.00052,-0.25,4.949748,-0.375,1.06066
5,0.0,-37.471153,100.327576,1115.0,331.0,1227.0,0.875,331.0,331.0,331.0,No Log,No Log,No Log,No Log,No Log,0.000871,-0.5,4.24264,-36.971153,104.570206
6,0.0,-16.146255,40.011654,1089.75,129.0,1227.0,0.875,129.0,129.0,129.0,No Log,No Log,No Log,No Log,No Log,0.004813,-0.125,5.303301,-16.021255,45.314953
7,0.0,-18.338549,46.212395,1118.0,355.0,1227.0,0.875,355.0,355.0,355.0,No Log,No Log,No Log,No Log,No Log,0.017728,-0.375,4.596194,-17.963549,50.80859
8,0.0001,-39.713135,106.668854,1097.625,192.0,1227.0,0.875,192.0,192.0,192.0,No Log,No Log,No Log,No Log,No Log,0.068833,0.25,6.363961,-39.963135,113.032822
9,0.0002,-13.887166,33.621983,1104.0,243.0,1227.0,0.875,243.0,243.0,243.0,No Log,No Log,No Log,No Log,No Log,0.198403,0.75,7.778175,-14.637166,41.400158
10,0.0007,-19.189554,48.6194,1112.5,311.0,1227.0,0.875,311.0,311.0,311.0,No Log,No Log,No Log,No Log,No Log,0.725491,-0.25,4.949748,-18.939554,53.569149


Unsloth: Will smartly offload gradients to save VRAM!
def lunarlander_strategy(state):
    x, y, vx, vy, theta, vtheta, left_contact, right_contact = state
    
    # Dead zones for velocity and angle
    vx_deadzone = 0.1
    vy_deadzone = 0.1
    theta_deadzone = 0.1
    
    # Calculate control commands
    main = max(-1, min(1, -vy * 10))  # Counteract downward speed
    main += max(-1, min(1, -theta * 10)) * 5  # Counteract tilt error
    
    lateral = max(-1, min(1, -vx * 2))  # Reduce horizontal error and velocity
    
    # Cap commands to prevent saturation
    main = max(-1, min(main, 1))
    lateral = max(-1, min(lateral, 1))
    
    # Reduce thrust when either leg has contact and vertical speed is small
    if left_contact or right_contact:
        main = max(-1, min(main, 1 - abs(vy) * 2))
    
    return [main, lateral]
def lunarlander_strategy(state):
    x, y, vx, vy, theta, vtheta, left_contact, right_contact = state
    
    # Dead zones for velocity and angle
    v

TrainOutput(global_step=35, training_loss=0.16003739131348474, metrics={'train_runtime': 1555.6975, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.022, 'total_flos': 0.0, 'train_loss': 0.16003739131348474})

In [18]:
os.makedirs("adapters", exist_ok=True)
model_policy.save_pretrained("adapters/lunarlander-lora-grpo_trained")
tokenizer.save_pretrained("adapters/lunarlander-lora-grpo_trained")

('adapters/lunarlander-lora-grpo_trained/tokenizer_config.json',
 'adapters/lunarlander-lora-grpo_trained/special_tokens_map.json',
 'adapters/lunarlander-lora-grpo_trained/chat_template.jinja',
 'adapters/lunarlander-lora-grpo_trained/tokenizer.json')

Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.
==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    AMD Radeon Graphics. Num GPUs = 1. Max memory: 191.688 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+rocm6.4. ROCm Toolkit: 6.4.43482-0f2d60242. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100% 2/2 [00:02<00:00,  1.18s/it]


CartPole LoRA (GRPO): 9.2 ± 0.9
