# Qwen Sales Agent — GSPO Fine-tuning

This notebook fine-tunes a Qwen model for **sales** conversations using **GSPO** (Group Sequence Policy Optimization) with **LoRA** adapters.

Notes:
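- For a laptop-friendly run, start with the 3B model (`Qwen/Qwen2.5-3B-Instruct`, the default below).
- For 7B, you’ll likely want `use_4bit=True` and shorter `max_prompt_length` / `max_completion_length` values.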


In [None]:
import os

# Tokenizer parallelism is always forced to "false". WANDB_MODE is only given
# a default of "disabled", so a value already present in the environment wins.
os.environ.update(TOKENIZERS_PARALLELISM="false")
os.environ.setdefault("WANDB_MODE", "disabled")


In [None]:
# Base checkpoint used for both the agent and the GSPO config.
# Try the 7B variant with use_4bit=True.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Passed to GSPOConfig as output_dir.
OUTPUT_DIR = "./outputs/notebooks/qwen_sales_gspo"


In [None]:
from stateset_agents import MultiTurnAgent
from stateset_agents.core.agent import AgentConfig
from stateset_agents.core.environment import ConversationEnvironment, CONVERSATION_CONFIGS
from stateset_agents.rewards import create_domain_reward
from stateset_agents.training import GSPOConfig, train_with_gspo

# Instructions handed to the agent as its system prompt.
system_prompt = (
    "You are Qwen, a helpful sales assistant. "
    "Ask a couple of clarifying questions, recommend the best option, "
    "clearly explain value, handle objections professionally, and suggest next steps."
)

# Extra scenarios appended after the ones that ship with the "sales" preset.
extra_sales_scenarios = [
    dict(
        topic="discovery",
        user_goal="Get a recommendation",
        context="User describes their team size and goals and wants the best plan recommendation.",
    ),
    dict(
        topic="pricing_objection",
        user_goal="Address pricing concerns",
        context="User likes the product but says it’s too expensive. Handle objections and propose options.",
    ),
    dict(
        topic="security_objection",
        user_goal="Build trust",
        context="User asks about security, data handling, and compliance before buying.",
    ),
    dict(
        topic="onboarding",
        user_goal="Explain onboarding",
        context="User asks how long setup takes and what onboarding help is available.",
    ),
    dict(
        topic="upsell",
        user_goal="Choose higher tier",
        context="User is on the basic tier and asks what they gain by upgrading to premium.",
    ),
]

# Work on a copy of the preset and bind a *new* concatenated list to
# "scenarios", so the preset's own scenario list is not extended in place.
env_config = CONVERSATION_CONFIGS["sales"].copy()
env_config["scenarios"] = env_config["scenarios"] + extra_sales_scenarios

environment = ConversationEnvironment(**env_config)
reward_model = create_domain_reward("sales")

# Generation settings for the agent being trained.
agent_config = AgentConfig(
    model_name=MODEL_NAME,
    system_prompt=system_prompt,
    max_new_tokens=384,
    temperature=0.7,
)
agent = MultiTurnAgent(agent_config, memory_window=8)


In [None]:
# The GSPOConfig keyword arguments are grouped by concern below; every value is
# passed through unchanged.

# Which model to train, where to write results, and how often to log/save.
_run_settings = {
    "model_name": MODEL_NAME,
    "output_dir": OUTPUT_DIR,
    "report_to": "none",
    "logging_steps": 1,
    "save_steps": 2,
}

# Iteration counts. generations_per_iteration is tied to the number of
# scenarios held by the environment.
_iteration_settings = {
    "num_outer_iterations": 5,
    "num_iterations": 1,
    "generations_per_iteration": len(environment.scenarios),
    "num_generations": 4,
}

# LoRA adapter hyperparameters.
_lora_settings = {
    "use_lora": True,
    "lora_r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.05,
}

# Memory and precision switches (set use_4bit=True when trying the 7B model).
_memory_settings = {
    "gradient_checkpointing": True,
    "use_4bit": False,
    "use_8bit": False,
    "bf16": True,
}

# Optimisation step sizing.
_optimizer_settings = {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 8e-6,
}

# Prompt/completion length limits and sampling parameters.
_sampling_settings = {
    "max_prompt_length": 512,
    "max_completion_length": 384,
    "temperature": 0.7,
    "top_p": 0.9,
}

config = GSPOConfig(
    **_run_settings,
    **_iteration_settings,
    **_lora_settings,
    **_memory_settings,
    **_optimizer_settings,
    **_sampling_settings,
)


In [None]:
trained_agent = await train_with_gspo(
    config=config,
    agent=agent,
    environment=environment,
    reward_model=reward_model,
)


In [None]:
test_messages = [
    {
        "role": "user",
        "content": "I have a 10-person team and we’re evaluating plans. What do you recommend and why?",
    }
]

print(await trained_agent.generate_response(test_messages))


## Artifacts

- Checkpoints: `OUTPUT_DIR/checkpoint-*`
- Final save: `OUTPUT_DIR/final_model`

If `use_lora=True`, the saved folder contains only the **LoRA adapter** weights (not the full base model), so the base model (`MODEL_NAME`) is still needed to load and run it.
