In [1]:
from lark import Lark
import re
from dotenv import load_dotenv
import os

In [2]:
def convert_gbnf_to_lark_ebnf(gbnf_content):
    # Create output buffer
    ebnf_lines = ["// Auto-converted from GBNF to Lark EBNF format"]
    
    # Helper function to convert camelCase or PascalCase to snake_case
    def to_snake_case(name):
        # Insert underscore before uppercase letters and convert to lowercase
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
        return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    
    # Process line by line
    lines = gbnf_content.split('\n')
    rule_names = []
    terminal_rules = set()
    rule_definitions = {}
    
    # Maintain a mapping of original rule names to snake_case versions
    name_mapping = {}
    
    # First pass: collect all rule names and their definitions
    for line in lines:
        # Skip comments and empty lines
        if line.strip().startswith('#') or line.strip().startswith('//') or not line.strip():
            continue
        
        # Extract rule name and definition
        match = re.match(r'([a-zA-Z0-9_]+)\s*::=\s*(.*)', line)
        if match:
            original_name = match.group(1)
            snake_name = to_snake_case(original_name)
            definition = match.group(2).strip()
            
            name_mapping[original_name] = snake_name
            rule_names.append(snake_name)
            rule_definitions[snake_name] = definition
    
    # Second pass: identify terminals
    for rule_name, definition in rule_definitions.items():
        # Skip WS terminal - we'll use the common one
        if rule_name.lower() == 'ws':
            continue
            
        # Check if definition contains character classes or string literals but no rule references
        has_char_class = '[' in definition and ']' in definition
        has_string_literal = '"' in definition
        
        # Get all potential rule references (words that could be rule names)
        words = re.findall(r'\b([a-zA-Z][a-zA-Z0-9_]*)\b', definition)
        # Map original names to snake_case for checking references
        has_rule_references = any(to_snake_case(word) in rule_names for word in words)
        
        # A rule is likely a terminal if it has character classes or literals but no rule references
        if (has_char_class or has_string_literal) and not has_rule_references:
            terminal_rules.add(rule_name)
    
    # Add start rule
    if 'root' in rule_names:
        ebnf_lines.append("start: root")
        ebnf_lines.append("")
    
    # Prepare separate lists for non-terminal and terminal rules
    non_terminal_rules = []
    terminal_rule_lines = []
    
    # Third pass: convert each rule and organize into appropriate lists
    for line in lines:
        if line.strip().startswith('#'):
            # Skip comments
            continue
        elif line.strip().startswith('//') or not line.strip():
            # Add empty lines or comments to appropriate section
            if len(non_terminal_rules) == 0 and len(terminal_rule_lines) == 0:
                ebnf_lines.append(line)
            elif len(terminal_rule_lines) > 0:
                terminal_rule_lines.append(line)
            else:
                non_terminal_rules.append(line)
            continue
        
        # Convert rule definition
        if '::=' in line:
            # Split into name and definition
            name, definition = line.split('::=', 1)
            original_name = name.strip()
            snake_name = name_mapping.get(original_name, to_snake_case(original_name))
            
            # Skip the WS rule - we'll use the common one
            if snake_name.lower() == 'ws':
                continue
                
            definition = definition.strip()
            
            # Convert character classes
            definition = re.sub(r'\[([^\]]+)\](\+|\*)?', lambda m: f"/[{m.group(1)}]/{m.group(2) if m.group(2) else ''}", definition)
            
            # Handle special string escaping
            definition = re.sub(r'"([^"]*)"', lambda m: f'"{m.group(1).replace("\\", "\\\\")}"', definition)
            
            # Fix quoted attribute values
            definition = definition.replace('\\"', '\\\\"')
            
            # Update rule references to use snake_case
            for original, snake in name_mapping.items():
                # Skip replacing WS references - use the common one
                if snake.lower() == 'ws':
                    # Replace references to user-defined WS with WS
                    definition = re.sub(r'\b' + re.escape(original) + r'\b', "WS", definition)
                    continue
                    
                # Use word boundary to ensure we're replacing whole words
                definition = re.sub(r'\b' + re.escape(original) + r'\b', snake, definition)
            
            # Capitalize and convert references to terminal rules in the definition
            for terminal in terminal_rules:
                # Use word boundary to ensure we're replacing whole words
                definition = re.sub(r'\b' + re.escape(terminal) + r'\b', terminal.upper(), definition)
            
            # Capitalize name if it's a terminal rule
            output_name = snake_name.upper() if snake_name in terminal_rules else snake_name
            rule_line = f"{output_name}: {definition}"
            
            # Add rule to appropriate list based on whether it's a terminal
            if snake_name in terminal_rules:
                terminal_rule_lines.append(rule_line)
            else:
                non_terminal_rules.append(rule_line)
    
    # Add non-terminal rules first
    ebnf_lines.extend(non_terminal_rules)
    
    # Add a separator comment between non-terminals and terminals
    if terminal_rule_lines:
        ebnf_lines.append("")
        ebnf_lines.append("// Terminal definitions")
        ebnf_lines.append("")
        
        # Then add terminal rules
        ebnf_lines.extend(terminal_rule_lines)
    
    # Add terminal imports
    ebnf_lines.append("")
    ebnf_lines.append("%import common.WS")
    ebnf_lines.append("%ignore WS")
    
    return "\n".join(ebnf_lines)

In [3]:
# We need to convert GBNF into an EBNF snippet for Lark to be able to parse it
with open('../html_grammar.gbnf', 'r') as f:
    grammar = f.read()
    ebnf_grammar = convert_gbnf_to_lark_ebnf(grammar)

parser = Lark(ebnf_grammar, start="start", parser="earley", lexer="basic", ambiguity="explicit")

In [4]:
def parse_html(html_str):
    return parser.parse(html_str)

html_sample = """
<html>
    <head>
        <title>Sample Page</title>
    </head>
    <body>
        Foo Bar!
        <p class="foobar">Hello, World!</p>
        <a href="https://example.com" target="_blank">Click Here</a>
        <div>
            <p>Nested Paragraph</p>
            <h1>Hello World</h1>
        </div>
        <br/>
    </body>
</html>
"""

# Parse the sample HTML
tree = parse_html(html_sample)
print(tree.pretty())

UnexpectedToken: Unexpected token Token('ATTRIBUTE_NAME', 'Sample') at line 4, column 16.
Expected one of: 
	* __ANON_7
	* TEXT


# AI Grammar Mode

In [5]:
from fireworks.client import Fireworks
import os
client = Fireworks(
	api_key=os.environ["FIREWORKS_API_KEY"]
)

In [6]:
with open('../html_grammar.gbnf', 'r') as f:
	html_grammar = f.read()

In [7]:
chat_completion = client.chat.completions.create(
    model="accounts/fireworks/models/llama-v3p1-405b-instruct",
    response_format={"type": "grammar", "grammar": html_grammar},
    messages=[
        {
            "role": "system",
            "content": "Given the symptoms try to guess the possible diagnosis. Return the output in a fully structured HTML format with more information about the diagnosis. Make sure you do NOT use any Markdown formatting, convert any formatting directly into HTML for styling purposes.",
        },
        {
            "role": "user",
            "content": "I have been having trouble with my muscles and joints. My neck is really tight and my muscles feel weak. I have swollen joints and it is hard to move around without becoming stiff. It is also really uncomfortable to walk.",
        },
    ],
)
print(chat_completion.choices[0].message.content)
llm_html = chat_completion.choices[0].message.content

<html><head><title>Possible Diagnosis</title></head><body>

<h1>Possible Diagnosis: Rheumatoid Arthritis (RA) or Fibromyalgia</h1>

<p>Based on the symptoms you've described, it's possible that you may be experiencing Rheumatoid Arthritis (RA) or Fibromyalgia. Both conditions can cause muscle and joint pain, stiffness, and swelling. Here's more information about each condition:</p>

<h2>Rheumatoid Arthritis (RA)</h2>

<p>RA is a chronic autoimmune disorder that causes inflammation and pain in the joints. It can also affect other parts of the body, such as the skin, eyes, and internal organs. Common symptoms of RA include:</p>

<ul><li>Pain, stiffness, and swelling in the joints, particularly in the hands and feet</li><li>Morning stiffness that lasts for hours</li><li>Loss of range of motion and flexibility</li><li>Fatigue and weakness</li></ul>

<p>RA can be treated with medications, physical therapy, and lifestyle changes. Early diagnosis and treatment can help manage symptoms and slo

In [8]:
from IPython.display import display, HTML
display(HTML(llm_html))