## Schema Guided Prompt Generation

In [3]:
import yaml

def generate_sparql_prompt_from_yaml(yaml_str, user_question, named_graph_uri):
    schema = yaml.safe_load(yaml_str)

 
    prefixes = schema.get("prefixes", {})
    prefix_lines = []
    for prefix, uri in prefixes.items():
        if isinstance(uri, dict):
            uri_value = uri.get("prefix_prefix", uri.get("prefix_reference", ""))
        else:
            uri_value = uri
        prefix_lines.append(f"PREFIX {prefix}: <{uri_value}>")
    prefixes_block = "\n".join(prefix_lines)

    print("*"*100)
    print(prefixes)
    print("*"*100)
    
    summary_lines = []
    if "classes" in schema:
        for cname, cdata in schema["classes"].items():
            summary_lines.append(f"- Class: {cname}")
            for slot in cdata.get("attributes", {}):
                summary_lines.append(f"  - Slot: {slot}")
    schema_summary = "\n".join(summary_lines)

    print("*"*100)
    print(prefixes)
    print("*"*100)

    
    prompt = f"""You are a SPARQL assistant that converts user questions into SPARQL queries using RDF data based on the provided LinkML schema.

            ### Graph Context:
            All queries should use the named graph:
            GRAPH <{named_graph_uri}> {{ ... }}
            
            ### Prefixes (from schema):
            {prefixes_block}
            
            ### RDF Type Encoding:
            Entity types use:
            ?entity biolink:category "bican:<ClassName>"^^xsd:anyURI
            
            ### LinkML Schema:
            {schema_summary}
            
            ### Instructions:
            1. Use classes and attributes from the schema to interpret the user question.
            2. Write a SPARQL query that retrieves appropriate answers.
            3. Return only the query, wrapped in triple backticks.
            
            ### User Question:
            "{user_question}"
            
            ### SPARQL Query:
            """
    return prompt


In [4]:
with open("genome_annotation.yaml") as f:
    yaml_data = f.read()

question = "List all genes with their molecular type"
graph_uri = "https://www.brainkb.org/version01"

prompt = generate_sparql_prompt_from_yaml(yaml_data, question, graph_uri)
print(prompt)


****************************************************************************************************
{'linkml': 'https://w3id.org/linkml/', 'bican': 'https://identifiers.org/brain-bican/vocab/', 'schema': 'http://schema.org/', 'ncbi': 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 'NCBIAssembly': 'https://www.ncbi.nlm.nih.gov/assembly/'}
****************************************************************************************************
****************************************************************************************************
{'linkml': 'https://w3id.org/linkml/', 'bican': 'https://identifiers.org/brain-bican/vocab/', 'schema': 'http://schema.org/', 'ncbi': 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 'NCBIAssembly': 'https://www.ncbi.nlm.nih.gov/assembly/'}
****************************************************************************************************
You are a SPARQL assistant that converts user questions into SPA