In [1]:
from llama_cpp.llama import LlamaGrammar
from llama_cpp import Llama
import json

In [2]:
# Load the quantized Mistral-7B-OpenOrca weights (GGUF) through llama.cpp.
# Weights: https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF
#
# NOTE: stop sequences are a per-completion option, not a load-time one.
# The constructor only swallows unknown keywords via **kwargs without using
# them, so the ChatML end marker "<|im_end|>" has to be passed as `stop=` on
# the completion call for it to have any effect.
model = Llama(
    model_path="./mistral-7b-openorca.Q6_K.gguf",
    n_gpu_layers=-1,  # Offload all layers to GPU
    n_batch=512,
    n_ctx=2048,
    verbose=False,
)

In [3]:
# GBNF grammar that restricts generation to a single JSON object.
# Kept as a raw string so the backslash escapes reach the grammar parser
# untouched. Numeric literals are length-bounded (at most 16 integer digits
# and 16 exponent digits) to keep the model from emitting endless digit runs.
# The exponent accepts any digits ([0-9]{1,16}) so that valid JSON such as
# 1e10 or 2E-05 is not rejected.
grammar_str = r'''root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws

object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}" ws

array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]" ws

string ::=
  "\"" (
    [^"\\\x7F\x00-\x1F] |
    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
  )* "\"" ws

number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9]{1,16})? ws

# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= | " " | "\n" [ \t]{0,20}
'''

# Compile the GBNF text into a grammar object that can be handed to the model
# at sampling time; verbose=False keeps the parsed rules out of the cell output.
json_grammar = LlamaGrammar.from_string(
    grammar_str,
    verbose=False,
)

In [4]:
prompt = """
<|im_start|>system
You are MistralOrca, a large language model trained by Alignment Lab AI. Answer the questions as succinctly as possible.
<|im_end|>
<|im_start|>user
Tell me about Massachusetts.
<|im_end|>
<|im_start|>assistant
"""

In [5]:
# Run one grammar-constrained completion and keep only the generated text.
# - grammar: restricts sampling to strings accepted by json_grammar.
# - stop: the ChatML end-of-turn marker; it must be given here, on the
#   completion call, to be honoured.
# - max_tokens=None: no explicit cap on the number of generated tokens.
# - temperature=1: plain sampling, so the text can differ from run to run.
response = model(
    prompt,
    grammar=json_grammar,
    stop=["<|im_end|>"],
    max_tokens=None,
    temperature=1,
)['choices'][0]['text']

In [6]:
# Show the generated text as-is; print() is used (rather than a bare
# expression) so newlines render as line breaks instead of "\n" escapes.
print(response)

{
    "State": "Massachusetts",
    "Capital": "Boston",
    "Nickname": "The Bay State",
    "Location": "New England region",
    "Neighbors": "Connecticut, Rhode Island, New Hampshire, and Vermont",
    "Notable Cities": "Boston, Worcester, Springfield, Cambridge, and Lowell",
    "Size": "35,058 square miles",
    "Population": "6.9 million (as of 2020)",
    "Major Attractions": "Fenway Park, Freedom Trail, Cape Cod, Faneuil Hall, and Salem"
}

