# JSON Index
The JSON index is useful for querying JSON documents that conform to a JSON schema.

The JSON schema is included in a prompt that converts a natural language query into a structured JSONPath query. That JSONPath query is then executed against the JSON document to retrieve the data needed to answer the question.

In [1]:
# First, install the jsonpath-ng package which is used by default to parse & execute the JSONPath queries.
!pip install jsonpath-ng



In [2]:
import logging
import sys

# Route INFO-level (and above) log records to stdout so they show up in the cell output.
# basicConfig already attaches a stdout StreamHandler to the root logger; attaching a
# second one by hand makes every record print twice (and once more per re-run of the cell).
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
import dotenv

# Load environment variables from the .env file three directories up.
# The call is the cell's last expression, so its return value is shown as the cell output.
dotenv.load_dotenv(dotenv_path="../../../.env")

False

In [4]:
from IPython.display import Markdown, display

### Let's start with a toy JSON

This is a very simple JSON object containing data from a blog post site with user comments.

We will also provide a JSON schema (which we were able to generate by giving ChatGPT a sample of the JSON).

In [5]:
# Sample document: blog posts plus the user comments attached to them.
json_value = {
    "blogPosts": [
        {"id": 1, "title": "First blog post", "content": "This is my first blog post"},
        {"id": 2, "title": "Second blog post", "content": "This is my second blog post"},
    ],
    "comments": [
        {"id": 1, "content": "Nice post!", "username": "user2", "blogPostId": 1},
        {"id": 2, "content": "Interesting thoughts", "username": "user1", "blogPostId": 2},
        {"id": 3, "content": "Loved reading this!", "username": "user1", "blogPostId": 2},
    ],
}

# JSON Schema describing the document above. Key order is kept exactly as authored
# because the schema is serialized verbatim into the JSONPath prompt.
json_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "description": "Schema for a very simple blog post app",
    "type": "object",
    "properties": {
        "blogPosts": {
            "description": "List of blog posts",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"description": "Unique identifier for the blog post", "type": "integer"},
                    "title": {"description": "Title of the blog post", "type": "string"},
                    "content": {"description": "Content of the blog post", "type": "string"},
                },
                "required": ["id", "title", "content"],
            },
        },
        "comments": {
            "description": "List of comments on blog posts",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"description": "Unique identifier for the comment", "type": "integer"},
                    "content": {"description": "Content of the comment", "type": "string"},
                    "username": {"description": "Username of the commenter", "type": "string"},
                    "blogPostId": {
                        "description": "Identifier for the blog post to which the comment belongs",
                        "type": "integer",
                    },
                },
                "required": ["id", "content", "username", "blogPostId"],
            },
        },
    },
    "required": ["blogPosts", "comments"],
}



In [6]:
from llama_index import LLMPredictor
from llama_index.indices.service_context import ServiceContext
from langchain.llms.openai import OpenAI
from llama_index.indices.struct_store import GPTJSONIndex

# "gpt-4" is the OpenAI model identifier ("gpt4" is not a valid model name).
llm = OpenAI(model_name="gpt-4")
# Pass the LLM to the service context explicitly; with a bare from_defaults()
# the `llm` above is never used and the library's default model is used instead.
service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
# Build the index over the sample JSON document and its schema.
index = GPTJSONIndex(json_value=json_value, json_schema=json_schema, service_context=service_context)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
NumExpr defaulting to 2 threads.


  from .autonotebook import tqdm as notebook_tqdm


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [7]:
# verbose=True makes the engine print the JSONPath prompt it builds.
query_engine = index.as_query_engine(verbose=True)
# Ask a natural language question over the indexed JSON document.
response = query_engine.query("What comments has user1 been writing?")

> JSONPath Prompt: We have provided a JSON schema below:
{"$schema": "http://json-schema.org/draft-07/schema#", "description": "Schema for a very simple blog post app", "type": "object", "properties": {"blogPosts": {"description": "List of blog posts", "type": "array", "items": {"type": "object", "properties": {"id": {"description": "Unique identifier for the blog post", "type": "integer"}, "title": {"description": "Title of the blog post", "type": "string"}, "content": {"description": "Content of the blog post", "type": "string"}}, "required": ["id", "title", "content"]}}, "comments": {"description": "List of comments on blog posts", "type": "array", "items": {"type": "object", "properties": {"id": {"description": "Unique identifier for the comment", "type": "integer"}, "content": {"description": "Content of the comment", "type": "string"}, "username": {"description": "Username of the commenter", "type": "string"}, "blogPostId": {"description": "Identifier for the blog post to which t

In [8]:
# Render the response's string form in bold.
display(Markdown("<b>{}</b>".format(response)))

<b>[{'id': 2, 'content': 'Interesting thoughts', 'username': 'user1', 'blogPostId': 2}, {'id': 3, 'content': 'Loved reading this!', 'username': 'user1', 'blogPostId': 2}]</b>

In [9]:
# Show the JSONPath query string stored in the response's extra info.
json_path_query = response.extra_info["json_path_response_str"]
print(json_path_query)

 $.comments[?(@.username == 'user1')]
