In [None]:
from sglang.utils import (
    execute_shell_command,
    wait_for_server,
    terminate_process,
    print_highlight,
)

# Start an SGLang server through a shell command and keep the returned handle
# in `server_process` so it can be passed to `terminate_process` later.
# The command serves Qwen/Qwen2.5-1.5B-Instruct on port 30333.
# NOTE(review): `--host 0.0.0.0` typically exposes the server on all network
# interfaces — confirm this is intended outside a sandboxed environment.
server_process = execute_shell_command(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-1.5B-Instruct --mem-fraction-static 0.8 --port 30333 --host 0.0.0.0"  # Qwen2.5-1.5B-Instruct
)
# Presumably blocks until the server at this URL is ready to accept requests.
wait_for_server("http://localhost:30333")

## Multi-turn conversation

SGLang provides a simple API for building multi-turn conversations.

Define the prompt template.

In [None]:
from sglang import (
    function,
    system,
    user,
    assistant,
    gen,
    set_default_backend,
    RuntimeEndpoint,
)


@function
def multi_turn_conversation(s, country: str):
    """Hold a two-question conversation about ``country`` on the state ``s``.

    A system prompt is appended first. Each user question is then followed by
    an assistant turn whose generation is stored under its own name:
    ``"capital"`` for the first answer and ``"building"`` for the second.
    Both generations are limited to 250 tokens.
    """
    s += system("You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")

    # (user question, name under which the assistant's answer is stored)
    exchanges = (
        (f"What is the capital of {country}?", "capital"),
        ("Name an interesting building in this city.", "building"),
    )
    for prompt, answer_name in exchanges:
        s += user(prompt)
        s += assistant(gen(answer_name, max_tokens=250))

Set the default backend to the SGLang server launched above.

In [None]:
# Route subsequent `.run(...)` calls to the server at localhost:30333
# (the same port the launch command above was given).
set_default_backend(RuntimeEndpoint("http://localhost:30333"))

Execute the function and extract the results.

In [None]:
# Run the conversation for one country and keep the resulting state.
state = multi_turn_conversation.run(country="Germany")

# Dump the full transcript, one "role : content" line per message.
for message in state.messages():
    print(message["role"], ":", message["content"])

# Then show each named generation on its own, preceded by a separator line.
separator = "#" * 50
for answer_name in ("capital", "building"):
    print(separator)
    print(state[answer_name])

We can use SGLang for OpenAI models as well.

To do so, set the API key with `export OPENAI_API_KEY=<your-openai-api-key>` and then call `set_default_backend(OpenAI(<chosen-model>))`, with `OpenAI` imported from `sglang`. Everything else stays exactly the same as above.

## Control Flow

The `choices` argument of `gen` restricts the model's output to one of a fixed set of options, which makes it a powerful tool for controlling the flow of a conversation.

In [None]:
@function
def control_flow(s, question: str):
    """Classify ``question`` and answer it in the matching style.

    The model first picks one of two labels, stored as ``"type"``. If the
    label is ``"a technical query"`` the follow-up generation is stored as
    ``"technical_response"``; otherwise it is stored as
    ``"creative_response"``. Only one of the two response names is filled
    per run. The follow-up generation is limited to 250 tokens.
    """
    s += user(question)

    classification = gen("type", choices=["a technical query", "a creative request"])
    s += assistant("Based on the question, this seems like " + classification)

    # Choose the lead-in text and the storage name from the chosen label,
    # then append a single assistant turn for whichever branch applies.
    if s["type"] == "a technical query":
        lead_in, response_name = "Here's a technical explanation: ", "technical_response"
    else:
        lead_in, response_name = "Here's a creative response: ", "creative_response"

    s += assistant(lead_in + gen(response_name, max_tokens=250))


# Run `control_flow` once per example question. Which branch is taken is the
# model's decision, so the response is read from the slot that matches the
# recorded "type" rather than from a hard-coded one: `control_flow` fills only
# one of "technical_response" / "creative_response" per run, and looking up the
# other would presumably fail.
for question in (
    "What is the main difference between a CPU and a GPU?",
    "Can you help me write a story about time travel?",
):
    state = control_flow.run(question=question)
    if state["type"] == "a technical query":
        response_name = "technical_response"
    else:
        response_name = "creative_response"
    print_highlight(state[response_name])
    print("#" * 50)

## Parallelism

Sometimes it is useful to evaluate a model on the same prompt multiple times. We can use `fork` to create several copies of the current state and generate from them in parallel.

In [None]:
@function
def parallel_sample(s, question, n):
    """Ask ``question`` once, then generate an answer on ``n`` forks of ``s``.

    The user turn is appended before forking. The same assistant turn, a
    generation named ``"answer"`` sampled at temperature 0.7, is then
    appended to the fork group, and the group is joined.
    """
    s += user(question)

    sampled_turn = assistant(gen("answer", temperature=0.7))
    branches = s.fork(n)
    branches += sampled_turn
    branches.join()


# Sample five answers to the same question and print them one by one,
# each followed by a divider line.
states = parallel_sample.run(
    question="What does the integral of sin(x) from 0 to 2pi evaluate to? Answer without calculation.",
    n=5,
)

divider = "-" * 50
for answer in states["answer"]:
    print_highlight(answer)
    print_highlight(divider)

## Multi-modal

TODO

## Constrained Decoding

We can use a regular expression to constrain the output of the model. This is only supported for local models.

In [None]:
@function
def regular_expression_gen(s):
    """Ask for Albert Einstein's birth date with a regex-constrained answer.

    The generation is stored as ``"answer"``, uses temperature 0, and is
    constrained by a pattern of the form: one or two digits, a slash, one or
    two digits, a slash, then two to four digits.
    """
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{2,4}"

    s += user("What is the birth date of Albert Einstein?")
    constrained_answer = gen("answer", temperature=0, regex=date_pattern)
    s += assistant(constrained_answer)


# Run the regex-constrained function (it takes no arguments besides the state)
# and show the generation stored under "answer".
state = regular_expression_gen.run()
print_highlight(state["answer"])

In [None]:
# Shut down the server process started at the top of the notebook.
terminate_process(server_process)