# Frontend: Structured Generation Language (SGLang)

The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.

Start the server.

In [None]:
from sglang.utils import (
    execute_shell_command,
    wait_for_server,
    terminate_process,
    print_highlight,
)

# Launch a local SGLang server serving Qwen/Qwen2.5-1.5B-Instruct on port 30333.
# The returned handle is kept in `server_process` for later use.
server_process = execute_shell_command(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-1.5B-Instruct --mem-fraction-static 0.8 --port 30333 --host 0.0.0.0"
)
# Called with the server URL before any request is sent; presumably blocks
# until the server is reachable (helper from sglang.utils -- confirm there).
wait_for_server("http://localhost:30333")

Set up the default backend for SGLang.

In [None]:
from sglang import set_default_backend, RuntimeEndpoint

# Register the local server (same URL as launched above) as the default backend.
set_default_backend(RuntimeEndpoint("http://localhost:30333"))

## Multi-turn conversation

SGLang provides a simple API to build multi-turn conversations. Prompt templates can be defined intuitively with the `function` decorator.

In [None]:
from sglang import function, system, user, assistant, gen


@function
def multi_turn_conversation(s, country: str):
    """Two-turn chat: ask for the capital of `country`, then for a building there.

    The assistant replies are generated under the names "capital" and
    "building", each with max_tokens=250.
    """
    system_prompt = (
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
    )
    capital_question = f"What is the capital of {country}?"
    building_question = "Name an interesting building in this city."

    # First turn: system prompt, question, generated answer.
    s += system(system_prompt)
    s += user(capital_question)
    capital_reply = gen("capital", max_tokens=250)
    s += assistant(capital_reply)

    # Second turn is appended to the same state `s`.
    s += user(building_question)
    building_reply = gen("building", max_tokens=250)
    s += assistant(building_reply)


# Run the program with country="Germany".
state = multi_turn_conversation.run(
    country="Germany",
)

# Print each message of the conversation as "<role> : <content>".
for m in state.messages():
    print(m["role"], ":", m["content"])

# Show the two named generations on their own, separated by divider lines.
print_highlight("#" * 50)
print_highlight(state["capital"])
print_highlight("#" * 50)
print_highlight(state["building"])

We can use SGLang for OpenAI models as well.

For that we only need to execute `export OPENAI_API_KEY=<your-openai-api-key>` and then `set_default_backend(OpenAI(<chosen-model>))`. Everything else stays exactly the same as above.

## Control Flow

SGLang's choices method is a powerful tool to control the flow of the conversation.

In [None]:
@function
def control_flow(s, question: str):
    """Classify `question` with a constrained choice, then answer accordingly.

    The "type" generation is restricted to two choices. The chosen value
    decides whether "technical_response" or "creative_response" is generated
    (max_tokens=250 each); only one of the two is produced per run.
    """
    s += user(question)

    query_type = gen("type", choices=["a technical query", "a creative request"])
    s += assistant("Based on the question, this seems like " + query_type)

    # Branch on the choice selected above and build the matching reply.
    if s["type"] == "a technical query":
        reply = "Here's a technical explanation: " + gen(
            "technical_response", max_tokens=250
        )
    else:
        reply = "Here's a creative response: " + gen(
            "creative_response", max_tokens=250
        )
    s += assistant(reply)


# NOTE(review): control_flow generates "technical_response" only when the
# model picks "a technical query" (and "creative_response" otherwise), so the
# lookups below presumably fail if the model classifies a question the other
# way -- confirm how a missing name is reported.
state = control_flow.run(
    question="What is the main difference between a CPU and a GPU?"
)
print_highlight(state["technical_response"])
print_highlight("#" * 50)
# Second run: a question expected to take the creative branch.
state = control_flow.run(question="Can you help me write a story about time travel?")
print_highlight(state["creative_response"])
print_highlight("#" * 50)

## Parallelism

SGLang supports parallelism. `fork` can be used to launch multiple prompts in parallel.

In [None]:
@function
def parallel_sample(s, question, n):
    """Ask `question` once, then generate "answer" on `n` forks of the state.

    Calls `s.fork(n)`, appends an assistant turn that generates "answer"
    with temperature=0.7 to the forks, and then joins them.
    """
    s += user(question)
    branches = s.fork(n)
    sampled_answer = gen("answer", temperature=0.7)
    branches += assistant(sampled_answer)
    branches.join()


# Run the program with n=5 forks.
states = parallel_sample.run(
    question="What does the integral of sin(x) from 0 to 2pi evaluate to? Answer without calculation.",
    n=5,
)
# states["answer"] is iterated here; each entry is printed followed by a divider.
for answer in states["answer"]:
    print_highlight(answer)
    print_highlight("-" * 50)

## Constrained Decoding

SGLang supports constrained decoding for structured outputs. The output format can be specified in the form of a regular expression.

*Note: This is only supported for local models.*

In [None]:
@function
def regular_expression_gen(s):
    """Ask for Einstein's birth date with the answer constrained by a regex.

    The pattern is 1-2 digits, "/", 1-2 digits, "/", 2-4 digits; the reply is
    generated under "answer" with temperature=0.
    """
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{2,4}"
    s += user("What is the birth date of Albert Einstein?")
    constrained_answer = gen("answer", temperature=0, regex=date_pattern)
    s += assistant(constrained_answer)


# Run the program (it takes no arguments) and show the regex-constrained answer.
state = regular_expression_gen.run()
print_highlight(state["answer"])

Regular expressions can also be used for schema extraction.

In [None]:
import json

# Regex describing the exact JSON layout of a character record. Braces are
# escaped, "\n" matches the line break after each field, and the spaces inside
# each fragment (including indentation) are matched literally.
character_regex = (
    r"""\{\n"""
    + r"""    "name": "[\w\d\s]{1,16}",\n"""
    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
    + r"""    "wand": \{\n"""
    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
    + r"""        "core": "[\w\d\s]{1,16}",\n"""
    # At least one fractional digit is required: a bare trailing dot such as
    # `11.` is not a valid JSON number and would break json.loads on the output.
    + r"""        "length": [0-9]{1,2}\.[0-9]{1,2}\n"""
    + r"""    \},\n"""
    + r"""    "alive": "(Alive|Deceased)",\n"""
    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
    + r"""\}"""
)


@function
def generate_character(s, name):
    """Extract a character record for `name`, constrained by `character_regex`.

    The record is generated under "character" with max_tokens=256.
    """
    instructions = (
        "You are a helpful assistant that extracts information about a character from a text."
    )
    request = f"Extract the relevant information about {name}."

    s += system(instructions)
    s += user(request)
    record = gen("character", regex=character_regex, max_tokens=256)
    s += assistant(record)


# Run the extraction for "Harry Potter" and show the regex-constrained record.
state = generate_character.run(name="Harry Potter")
print_highlight(state["character"])

## Batching

`run_batch` can be used to run prompts with continuous batching.

In [None]:
@function
def simple_qa(s, question: str):
    """Single-turn Q&A whose reply is generated under "answer".

    Generation uses max_tokens=128, temperature=0 and the stop string
    "assistant".
    """
    s += user(question)
    reply = gen("answer", max_tokens=128, stop=["assistant"], temperature=0)
    s += assistant(reply)


# Each dict supplies the keyword arguments for one run of simple_qa.
states = simple_qa.run_batch(
    [
        {"question": "Who was the first man on the moon?"},
        {"question": "Who was Lev Landau?"},
        {"question": "Please tell me a joke about a chicken."},
    ]
)

# Print the "answer" of every returned state, followed by a divider.
for state in states:
    print_highlight(f"Answer: {state['answer']}")
    print_highlight("-" * 50)

## Streaming

`stream` can be used to stream the response from the model.

*Note: We use* `print_highlight` *here to keep the color convention. In practice, we would use* `print(out, end="", flush=True)` *to stream the response.*

In [None]:
@function
def stream_qa(s, question: str):
    """Single-turn Q&A used by the streaming example; reply name is "answer".

    Generation uses max_tokens=128, temperature=0 and the stop string
    "assistant".
    """
    sampling = {"max_tokens": 128, "stop": ["assistant"], "temperature": 0}
    s += user(question)
    s += assistant(gen("answer", **sampling))


# Run with stream=True, then print each piece yielded by text_iter() as it arrives.
state = stream_qa.run(question="Who was the first man on the moon?", stream=True)
for out in state.text_iter():
    print_highlight(out)

## Roles

`[user|assistant|system]_[begin|end]` can be used to define more complex prompts.

In [None]:
from sglang import user_begin, user_end, assistant_begin, assistant_end


@function
def roles(s):
    """Build a pirate-themed chat using explicit role begin/end markers.

    The assistant turn starts with a fixed sentence and continues with a
    generation named "story" (max_tokens=128).
    """
    persona = (
        "You talk like a pirate and use frequently phrases like 'arrr' and 'yo-ho-ho'."
    )
    s += system(persona)

    # User turn, delimited explicitly.
    s += user_begin()
    s += "Hello, how do you like life as a pirate?"
    s += user_end()

    # Assistant turn: fixed opening text followed by the generated story.
    s += assistant_begin()
    story = gen("story", max_tokens=128)
    s += "There is much to tell about the life of a pirate." + story
    s += assistant_end()


# Run the program and show only the generated "story" part.
state = roles.run()
print_highlight(state["story"])

In [None]:
# Shut down the text-model server; the next section starts another server on the same port.
terminate_process(server_process)

## Multi-modal

SGLang supports a variety of [multi-modal models](https://docs.sglang.ai/backend/openai_api_vision.html).

In [None]:
# Launch a server for the multi-modal model lmms-lab/llama3-llava-next-8b on
# port 30333, rebinding `server_process` to the new handle.
server_process = execute_shell_command(
    "python3 -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --mem-fraction-static 0.8 --port 30333 --host 0.0.0.0"
)
# Called with the server URL before any request is sent; presumably blocks
# until the server is reachable (helper from sglang.utils -- confirm there).
wait_for_server("http://localhost:30333")

In [None]:
# Register the newly started server (same URL) as the default backend.
set_default_backend(RuntimeEndpoint("http://localhost:30333"))

Use `image` to pass an image to the model.

In [None]:
# Download the example image from the SGLang repository and save it as example_image.png.
!wget -O example_image.png https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true

In [None]:
from sglang import image


@function
def image_qa(s, image_file, question):
    """Ask `question` about the image at `image_file`.

    The user turn is the image followed by the question; the reply is
    generated under "answer" with max_tokens=128 and the stop string
    "assistant".
    """
    prompt = image(image_file) + question
    s += user(prompt)
    reply = gen("answer", max_tokens=128, stop=["assistant"])
    s += assistant(reply)


# Ask for a one-sentence description of the downloaded example image.
state = image_qa.run(
    image_file="example_image.png", question="Describe the image in one short sentence."
)
print_highlight(state["answer"])

In [None]:
# Remove the downloaded example image.
!rm example_image.png

In [None]:
# Shut down the multi-modal server started above.
terminate_process(server_process)

## Going further

To get more familiar with SGLang, we recommend studying the [benchmark scripts](https://github.com/sgl-project/sglang/tree/main/benchmark).