In [18]:
BASE_ID = "unsloth/Llama-3.2-3B-Instruct"

In [19]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Can increase for longer RL output
lora_rank = 128        # Larger rank = smarter, but slower
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_ID,
    load_in_4bit = False,
    max_seq_length = max_seq_length,
)

Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.
==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    AMD Radeon Graphics. Num GPUs = 1. Max memory: 191.688 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+rocm6.4. ROCm Toolkit: 6.4.43482-0f2d60242. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100% 2/2 [00:02<00:00,  1.21s/it]


In [20]:
model_policy = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)
model_policy.print_trainable_parameters()

trainable params: 194,510,848 || all params: 3,407,260,672 || trainable%: 5.7087


In [4]:
import os, sys, subprocess

working_directory = "/shared-docker/OpenEnv"  # repo root containing src/
port = "8050"
keepalive = "1000"

cmd = [
    sys.executable, "-m", "uvicorn",
    "envs.cartpole_env.server.app:app",  # <— our new app path
    "--host", "0.0.0.0",
    "--port", port,
    "--timeout-keep-alive", keepalive,
]

env = {
    **os.environ,
    "PYTHONPATH": f"{working_directory}/src",

    # CartPole knobs (analogous to OPENSPIEL_*):
    "CARTPOLE_ENV_ID": "CartPole-v1",
    "CARTPOLE_SEED": "123",
    "CARTPOLE_MAX_EPISODE_STEPS": "1000",
    "CARTPOLE_RENDER_MODE": "none",  # or "rgb_array"
}

#proc = subprocess.Popen(cmd, env=env, cwd=working_directory,
#                        stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

#print("CartPole server starting at http://localhost:8020 … PID:", proc.pid)


In [5]:
import httpx, time
time.sleep(1)
r = httpx.post("http://localhost:8050/reset", json={}); print(r.json())

INFO:httpx: HTTP Request: POST http://localhost:8050/reset "HTTP/1.1 200 OK"


{'observation': {'state': [-0.45442235469818115, 0.0], 'legal_actions': [-1.0, 1.0], 'episode_length': 0, 'total_reward': 0.0}, 'reward': 0.0, 'done': False}


In [6]:
import requests
import time
time.sleep(5) # Wait 5 seconds for OpenEnv to start!
from envs.mountaincarcontinuous_environment import MountainCarContinuousEnv, MountainCarContinuousAction
import httpx

# Same shape as: OpenSpielEnv(base_url=..., request_timeout_s=...)
base_url = "http://localhost:8050"
request_timeout_s = 1000  # seconds

openenv_process = openenv_process = MountainCarContinuousEnv(
        base_url=base_url,
        request_timeout_s=request_timeout_s
    )
# quick smoke test
# info = openenv_process.info()

# print("CartPole info:", info)

state = openenv_process.reset()
print("reset:", state)

# step with discrete action 0/1
state = openenv_process.step(MountainCarContinuousAction(
                engine_force=0.1))
print("step:", state)

reset: StepResult(observation=MountainCarContinuousObservation(done=False, reward=0.0, metadata={}, state=[-0.45953240990638733, 0.0], legal_actions=[-1.0, 1.0], episode_length=0, total_reward=0.0), reward=0.0, done=False)
step: StepResult(observation=MountainCarContinuousObservation(done=False, reward=-0.0010000000000000002, metadata={}, state=[-0.4598599374294281, -0.000327545014442876], legal_actions=[-1.0, 1.0], episode_length=1, total_reward=-0.0010000000000000002), reward=-0.0010000000000000002, done=False)


In [7]:
def strategy_simple(state):
    # state = [x, dx, angle, dangle]
    return 0  # left or 1 right


def build_user_prompt():
    return (
        """You are an expert MountainCarContinuous player and a precise Python code generator.

Context / How this will be used
- Your function will be called every environment step to control the entire episode of MountainCarContinuous-v0 (OpenAI Gym/Gymnasium style, continuous actions).
- The episode ends when the car reaches the goal (typically x ≥ 0.45–0.5) or when the environment hits its max step limit.
- Reward shaping encourages reaching the goal quickly while using as little engine force as possible (penalizes large |action|).

Your objective: reach the goal position as fast as possible by smartly building and exploiting momentum on the hills, while avoiding wasteful throttle.

What you must write
- A single Python function with this exact signature (no extras):
    def mountaincar_strategy(state):
- Input `state` is a list of 2 floats: [x, v]
  - x = car position (range about [-1.2, 0.6])
  - v = car velocity (range about [-0.07, 0.07])
- Output: return a **float** action in **[-1.0, 1.0]**
  - negative = throttle left, positive = throttle right, zero ≈ coasting.
  - The caller will pass this scalar to the env (wrapping into a length-1 array if needed).

Design guidance for long-horizon control
- MountainCarContinuous requires **momentum pumping**: generally apply force in the direction of current velocity to amplify swings, then **time the reversal** to crest the right hill.
- Prefer a **short, deterministic** control law (e.g., a weighted linear rule with saturating clamp to [-1, 1]).
- Add a **deadband/hysteresis** around v ≈ 0 to avoid rapid action sign flips; coasting (≈0) can be beneficial when switching directions.
- Useful heuristics:
  - If v > +ε, bias action right; if v < -ε, bias action left.
  - When far on the left (x < -0.5), allow larger left pushes to harvest momentum.
  - Near the final ascent on the right (x > -0.2), bias action right and reduce needless oscillation.
  - Optionally damp action magnitude by |v| to avoid over-throttling at high speeds.

Hard constraints
- Do not import, print, read/write files, use globals, randomness, or any I/O.
- Keep the code short and stateless; identical inputs must produce identical outputs.
- The output must be exactly one fenced code block in Python, with nothing before or after.
  - The first line inside the block must be: def mountaincar_strategy(state):
  - The last line of your entire response must be the closing backticks to clearly end the program. No trailing commentary.

Output format reminder (dummy example — do NOT copy this logic):
```
def mountaincar_strategy(state):
    x, v = state
    return 1 * x * v # dummy example
        ```
        All helper functions should be inside def mountaincar_strategy. Only output the short function `strategy`.
        """.strip())

print(build_user_prompt())

You are an expert MountainCarContinuous player and a precise Python code generator.

Context / How this will be used
- Your function will be called every environment step to control the entire episode of MountainCarContinuous-v0 (OpenAI Gym/Gymnasium style, continuous actions).
- The episode ends when the car reaches the goal (typically x ≥ 0.45–0.5) or when the environment hits its max step limit.
- Reward shaping encourages reaching the goal quickly while using as little engine force as possible (penalizes large |action|).

Your objective: reach the goal position as fast as possible by smartly building and exploiting momentum on the hills, while avoiding wasteful throttle.

What you must write
- A single Python function with this exact signature (no extras):
    def mountaincar_strategy(state):
- Input `state` is a list of 2 floats: [x, v]
  - x = car position (range about [-1.2, 0.6])
  - v = car velocity (range about [-0.07, 0.07])
- Output: return a **float** action in **[-1.0, 1.

In [8]:
def extract_function(text):
    if text.count("```") >= 2:
        first = text.find("```") + 3
        second = text.find("```", first)
        fx = text[first : second].strip()
        fx = fx[fx.find("def"):]
        if fx.startswith("def mountaincar_strategy(state):"): return fx
    return None
print(extract_function(build_user_prompt()))

def mountaincar_strategy(state):
    x, v = state
    return 1 * x * v # dummy example


In [9]:
# Minimal safe executor (reuse your nb's create_locked_down_function if available)
from unsloth import create_locked_down_function
from random import randint
from unsloth import check_python_modules

def _safe_compile(func_src: str):
    # Use the notebook's 'create_locked_down_function' if present
    return create_locked_down_function(func_src)



In [10]:
import numpy as np
from random import randint
global _PRINT_COUNTER
_PRINT_COUNTER = 0

def mc_simple_reward(s, a, r_env):
    """
    Simple shaping for MountainCarContinuous using only (state, action, env reward).
      s: (x, v)      # position, velocity
      a: float       # throttle in [-1, 1]
      r_env: float   # environment reward at this step
    Returns: float
    """
    import math
    x, v = float(s[0]), float(s[1])

    # Terrain geometry from x
    height = math.sin(3.0 * x)          # in [-1, 1]  (higher is better)
    slope  = 3.0 * math.cos(3.0 * x)    # proportional to uphill direction

    # ---- small, interpretable weights (tune if needed) ----
    w_height   = 0.5    # prefer higher positions overall
    w_push     = 0.20   # reward pushing in uphill direction (a * slope)
    w_synergy  = 0.06   # action aligned with current velocity (a * v)
    w_slow     = 0.10   # gently slow near crests (gate * v^2)
    time_cost  = 0.01   # discourage dithering
    effort_c   = 0.0005 # tiny control cost
    eps_a      = 0.04   # "idle" action band
    v_idle     = 0.02   # moving but not actuating -> penalize
    idle_pen   = 0.02

    # 1) Positional progress: higher is better (no goal hardcoding)
    progress = w_height * height

    # 2) Push uphill: align throttle with uphill direction from slope
    #    - On left hill (slope < 0) pushing left (a<0) is rewarded to build momentum
    #    - On right hill (slope > 0) pushing right (a>0) is rewarded to finish
    push_uphill = w_push * (a * slope)

    # 3) Synergy: using throttle with your current motion helps build momentum
    synergy = w_synergy * (a * v)

    # 4) Near-crest slowdown: penalize kinetic energy more as height → crest (h≈1)
    crest_gate = 0.5 * (height + 1.0)   # maps [-1,1] -> [0,1]
    slow_near_crest = w_slow * crest_gate * (v * v)

    # 5) Anti-idle: if action ~0 while moving OR terrain is steep, penalize
    steep = abs(slope) > 0.5
    idle = idle_pen if (abs(a) < eps_a and (abs(v) > v_idle or steep)) else 0.0

    shaped = (
        r_env
        + progress
        + push_uphill
        + synergy
        - slow_near_crest
        - time_cost
        - effort_c * (a * a)
        - idle
    )
    return float(shaped)
    
    
def execute_strategy(strategy_fn, initial_state, max_steps):
    """Run strategy on CartPole server until done or step limit.
       Returns (steps_survived, done_flag)."""
    steps = 0
    done = False
    state = initial_state
    reward = 0
    while not done and steps < max_steps:
        # print("try strategy")
        a = strategy_fn(state.observation.state)
        # print(a)
        if a < -1 :
            # clamp invalid actions
            a = -1 
        if a > 1:
            a = 1
        res = openenv_process.step(MountainCarContinuousAction(
                engine_force=a))
        reward += mc_simple_reward(state.observation.state, a, res.reward)
        done = bool(res.done)
        steps += 1
    return reward, steps, done


def function_works(completions, **kwargs):
    scores = []
    for completion in completions:
        score = 0
        response = completion[0]["content"]
        function = extract_function(response)
        if function is not None:
            ok, info = check_python_modules(function)
        if function is None or "error" in info:
            score = -2.0
        else:
            try:
                new_strategy = create_locked_down_function(function)
                score =  randint(7, 20)
            except:
                score = -0.5
        scores.append(score)
    return scores
    
def strategy_succeeds(completions, **kwargs):
    """completions: list of candidate generations (like the nb).
       Returns list[float] rewards (higher is better)."""
    global _PRINT_COUNTER
    scores = []

    # Reset the env once per candidate to a fresh start
    for completion in completions:
        try:
            # The notebook packs text like completion[0]["content"]
            response = completion[0]["content"]
        except Exception:
            scores.append(0.0); continue

        # Optional: print every 5th for debugging
        if _PRINT_COUNTER % 5 == 0:
            try:
                print(response.splitlines()[0][:120])
            except:
                print("...candidate omitted...")
        _PRINT_COUNTER += 1
        #print(response)
        # Parse the function from the text (reuse nb's helpers if present)
        try:
            func_src = extract_function(response)
            print(func_src)
            strategy_fn = _safe_compile(func_src)
        except Exception as e:
            print("Compile error:", e)
            scores.append(0.0); continue
        print(func_src)
        print(strategy_fn)
        # Rollout on CartPole
        try:
            # Fresh episode
            res0 = openenv_process.reset()
            current_state = res0
            reward, steps, finished = execute_strategy(strategy_fn, current_state, max_steps=10000)

            # Reward = steps survived (CartPole native) with mild shaping:
            # huge bonus if finished by reaching max steps (i.e., perfect 500).
            

            # print a brief trace for failing candidates occasionally
            if steps > 100 and (_PRINT_COUNTER % 7 == 0):
                print("Long episode:", steps)

            scores.append(reward - 0.1 * steps)
        except TimeoutError:
            # keep the semantics from the nb
            scores.append(-1.0)
        except Exception as e:
            # print(f"Exception = {str(e)}")
            scores.append(-3.0)

    return scores

In [11]:
toy = """
```
def mountaincar_strategy(state):
    # state: [x, v]
    print(state)
    x, v = state
    score = x + 0.1 * v
    return -0.5
    ```
"""




    
fn1 = extract_function(toy)

fn = _safe_compile(fn1)

s0 = openenv_process.reset();
# print(s0.observation.values[0],s0.observation.values[1],s0.observation.values[2],s0.observation.values[3])
print(fn(s0.observation.state))
done = False

reward, steps, done = execute_strategy(fn, s0, 100)
print("Toy strategy survived steps reward:", steps, reward)

[-0.5655260682106018, 0.0]
-0.5
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0.0]
[-0.5655260682106018, 0

In [12]:
from datasets import Dataset
dataset = Dataset.from_list([{"prompt" : [{"role": "user", "content": build_user_prompt().strip()}], "answer" : 0, "reasoning_effort": "low"}]*1000)
maximum_length = len(tokenizer.apply_chat_template([{"role": "user", "content": build_user_prompt().strip()}], add_generation_prompt = True))
print(maximum_length)

690


In [13]:
dataset[0]

{'prompt': [{'content': 'You are an expert MountainCarContinuous player and a precise Python code generator.\n\nContext / How this will be used\n- Your function will be called every environment step to control the entire episode of MountainCarContinuous-v0 (OpenAI Gym/Gymnasium style, continuous actions).\n- The episode ends when the car reaches the goal (typically x ≥ 0.45–0.5) or when the environment hits its max step limit.\n- Reward shaping encourages reaching the goal quickly while using as little engine force as possible (penalizes large |action|).\n\nYour objective: reach the goal position as fast as possible by smartly building and exploiting momentum on the hills, while avoiding wasteful throttle.\n\nWhat you must write\n- A single Python function with this exact signature (no extras):\n    def mountaincar_strategy(state):\n- Input `state` is a list of 2 floats: [x, v]\n  - x = car position (range about [-1.2, 0.6])\n  - v = car velocity (range about [-0.07, 0.07])\n- Output: 

In [14]:
max_prompt_length = maximum_length + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    temperature = 1.0,
    learning_rate = 5e-6,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 600,
    save_steps = 100,
    report_to = "trackio", # Can use Weights & Biases, TrackIO
    output_dir = "outputs",

    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [15]:
# For optional training + evaluation
# new_dataset = dataset.train_test_split(test_size = 0.01)

trainer = GRPOTrainer(
    model = model_policy,
    processing_class = tokenizer,
    reward_funcs = [
        function_works,
        strategy_succeeds,
    ],
    args = training_args,
    train_dataset = dataset,

    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)

INFO:httpx: HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 600
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 194,510,848 of 3,407,260,672 (5.71% trained)


* Trackio project initialized: huggingface
* Trackio metrics logged to: /root/.cache/huggingface/trackio


* Created new run: imaginative-robin-53


`generation_config` default values have been modified to match model-specific defaults: {'max_length': 131072, 'temperature': 0.6, 'top_p': 0.9}. If this is not desired, please set these values explicitly.


```python
def mountaincar_strategy(state):
    x, v = state
    if abs(v) < 0.01:
        return 0
    if x > 0.45:
        return 0
    if v > 0:
        return min(1, max(-1, 1 + 0.05 * v))
    if v < -0.05:
        return min(1, max(-1, -1 - 0.05 * v))
    if x < -0.5:
        return min(1, max(-1, 1 - 0.01 * x))
    if x > -0.2:
        return min(1, max(-1, 1 + 0.01 * x))
def mountaincar_strategy(state):
    x, v = state
    if abs(v) < 0.01:
        return 0
    if x > 0.45:
        return 0
    if v > 0:
        return min(1, max(-1, 1 + 0.05 * v))
    if v < -0.05:
        return min(1, max(-1, -1 - 0.05 * v))
    if x < -0.5:
        return min(1, max(-1, 1 - 0.01 * x))
    if x > -0.2:
        return min(1, max(-1, 1 + 0.01 * x))
<function mountaincar_strategy at 0x7b0b241657e0>
None
Compile error: compile() arg 1 must be a string, bytes or AST object
None
Compile error: compile() arg 1 must be a string, bytes or AST object
None
Compile error: compile() arg 1 must be a string

Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / function_works / mean,rewards / function_works / std,rewards / strategy_succeeds / mean,rewards / strategy_succeeds / std
1,0.0,-76.83448,211.663864,1207.625,162.0,1357.0,0.875,162.0,162.0,162.0,0,0,0,0,0,0.000218,-0.875,3.181981,-75.95948,214.845871
2,0.0,-75.44323,207.728836,1202.375,120.0,1357.0,0.875,120.0,120.0,120.0,No Log,No Log,No Log,No Log,No Log,0.000297,-0.125,5.303301,-75.31823,213.03212
3,0.0,-75.626846,208.248184,1198.5,89.0,1357.0,0.875,89.0,89.0,89.0,No Log,No Log,No Log,No Log,No Log,0.000292,-0.625,3.889087,-75.001846,212.137268
4,0.0,-76.436897,210.539337,1203.5,129.0,1357.0,0.875,129.0,129.0,129.0,No Log,No Log,No Log,No Log,No Log,0.000281,-0.25,4.949748,-76.186897,215.48909
5,0.0,-75.968102,209.213394,1199.0,93.0,1357.0,0.875,93.0,93.0,93.0,No Log,No Log,No Log,No Log,No Log,0.000244,-0.5,4.24264,-75.468102,213.456055
6,0.0,-76.600204,211.001236,1196.5,73.0,1357.0,0.875,73.0,73.0,73.0,No Log,No Log,No Log,No Log,No Log,0.000314,-0.125,5.303301,-76.475204,216.304535
7,0.0,-78.035172,215.059937,1196.125,70.0,1357.0,0.875,70.0,70.0,70.0,No Log,No Log,No Log,No Log,No Log,0.000279,-0.375,4.596194,-77.660172,219.656143
8,0.0,-76.602638,211.008133,1205.875,148.0,1357.0,0.875,148.0,148.0,148.0,No Log,No Log,No Log,No Log,No Log,0.000363,0.25,6.363961,-76.852638,217.372086
9,0.0,-75.457695,207.76973,1202.375,120.0,1357.0,0.875,120.0,120.0,120.0,No Log,No Log,No Log,No Log,No Log,0.000345,0.75,7.778175,-76.207695,215.547913
10,0.0,-77.340515,213.095154,1195.0,61.0,1357.0,0.875,61.0,61.0,61.0,No Log,No Log,No Log,No Log,No Log,0.000479,-0.25,4.949748,-77.090515,218.044907


Unsloth: Will smartly offload gradients to save VRAM!
def mountaincar_strategy(state):
    x, v = state
    epsilon = 0.01
    if abs(v) < epsilon:
        action = 0
    elif x < -0.5:
        action = max(-1, min(1, -v * 0.1))
    elif x > -0.2:
        action = max(-1, min(1, v * 0.1))
    else:
        action = max(-1, min(1, v * 0.05))
    return action
def mountaincar_strategy(state):
    x, v = state
    epsilon = 0.01
    if abs(v) < epsilon:
        action = 0
    elif x < -0.5:
        action = max(-1, min(1, -v * 0.1))
    elif x > -0.2:
        action = max(-1, min(1, v * 0.1))
    else:
        action = max(-1, min(1, v * 0.05))
    return action
<function mountaincar_strategy at 0x7b0a8ea1d750>
None
Compile error: compile() arg 1 must be a string, bytes or AST object
``` 
None
Compile error: compile() arg 1 must be a string, bytes or AST object
None
Compile error: compile() arg 1 must be a string, bytes or AST object
None
Compile error: compile() arg 1 must be a string, b

In [17]:
os.makedirs("adapters", exist_ok=True)
model_policy.save_pretrained("adapters/mountaincart-lora-grpo_trained")
tokenizer.save_pretrained("adapters/mountaincart-lora-grpo_trained")

('adapters/mountaincart-lora-grpo_trained/tokenizer_config.json',
 'adapters/mountaincart-lora-grpo_trained/special_tokens_map.json',
 'adapters/mountaincart-lora-grpo_trained/chat_template.jinja',
 'adapters/mountaincart-lora-grpo_trained/tokenizer.json')