In [1]:
print("hello, world!")

hello, world!


In [2]:
# system packages 
import os
import json

# internal packages 

# external packages
from openai import OpenAI
from dotenv import load_dotenv
from graphql import build_schema, parse
from jinja2 import Environment, FileSystemLoader

load_dotenv()
open_ai_api_key = os.getenv("OPENAI_API_KEY")

## GraphQL Schema Parsing

In [3]:

# with open('../assets/schemas/opensea/opensea_original_schema.graphql', 'r') as schema_file:
#     schema_str = schema_file.read()

# # TEMP: Try to parse the schema given the errors we are seeing
# custom_definitions = '''
# scalar BigDecimal
# scalar BigInt

# directive @entity on OBJECT
# directive @dailySnapshot on OBJECT
# directive @regularPolling on OBJECT
# directive @derivedFrom on OBJECT
# directive @transaction on OBJECT
# '''

# full_schema_str = custom_definitions + schema_str
# schema = build_schema(full_schema_str)

# for type_name, graphql_type in schema.type_map.items():
#     if type_name.startswith('__'):
#         continue  # Skip introspection types
#     print(f"Type: {type_name}")
#     for field_name, field in graphql_type.fields.items():
#         print(f"  Field: {field_name} (type: {field.type})")

## LangGraph

In [4]:
# import getpass
# import os

# if "OPENAI_API_KEY" not in os.environ:
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
# open_ai_api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# from typing import Annotated

# from langchain_openai import OpenAI
# from typing_extensions import TypedDict

# from langgraph.graph import StateGraph
# from langgraph.graph.message import add_messages


# class State(TypedDict):
#     messages: Annotated[list, add_messages]


# graph_builder = StateGraph(State)


# llm = OpenAI(model="gpt-4")


# def chatbot(state: State):
#     return {"messages": [llm.invoke(state["messages"])]}

# graph_builder.add_node("chatbot", chatbot)
# graph_builder.set_entry_point("chatbot")
# graph_builder.set_finish_point("chatbot")
# graph = graph_builder.compile()

In [6]:
# from IPython.display import Image, display

# try:
#     display(Image(graph.get_graph().draw_mermaid_png()))
# except Exception:
#     pass

## OpenAI Directly

In [7]:
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

def gpt_chat_completion(client, message, model="gpt-4o"):
    return client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": message,
        }
    ],
    model=model,
)

print(gpt_chat_completion(client, "Hello! How are you?").choices[0].message.content)

Hello! I'm here and ready to help. How can I assist you today?


## Initial Test Set 

### Prompts 

- [ ] One entity quality vs. another entity quality 
- [ ] One column quality vs. another column quality 
- [ ] One schema quality vs. another schema quality 

### Demonstrations 

- [ ] A high quality entity (description)
- [ ] A high quality column (description)
- [ ] A high quality schema (description)

### Quality Categories

- [ ] 

In [8]:
with open("tests/assets/entity_comparison_assets.json", "r") as f:
    entity_comparison_assets = json.load(f)

gold_entity_comparison = "".join(entity_comparison_assets["gold"]["prompt"])
four_entity_comparison = "".join(entity_comparison_assets["four"]["prompt"])
three_entity_comparison = "".join(entity_comparison_assets["three"]["prompt"])
two_entity_comparison = "".join(entity_comparison_assets["two"]["prompt"])
one_entity_comparison = "".join(entity_comparison_assets["one"]["prompt"])

<Template 'entity_comparison_prompt.txt'>

In [44]:
# Function to reload the template
def reload_template(template_path="../assets/prompts", template_name="entity_comparison_prompt.txt"):
    env = Environment(loader=FileSystemLoader(template_path))
    env.cache.clear()
    entity_comparison_prompt_template = env.get_template(template_name)
    return entity_comparison_prompt_template

def parse_response(response): 
    response = response.choices[0].message.content
    response = response.strip('```json').strip('```').strip()
    response = json.loads(response)
    return response

example_prompt = reload_template()
example_prompt_filled = example_prompt.render({"entity_pred": "<example prediction goes here>", "entity_gold": "<example gold goes here>"})

entity_comparison_prompt_template = reload_template()
gold_output = entity_comparison_prompt_template.render({"entity_pred": {gold_entity_comparison}, "entity_gold": {gold_entity_comparison}})
four_output = entity_comparison_prompt_template.render({"entity_pred": {four_entity_comparison}, "entity_gold": {gold_entity_comparison}})
three_output = entity_comparison_prompt_template.render({"entity_pred": {three_entity_comparison}, "entity_gold": {gold_entity_comparison}})
two_output = entity_comparison_prompt_template.render({"entity_pred": {two_entity_comparison}, "entity_gold": {gold_entity_comparison}})
one_output = entity_comparison_prompt_template.render({"entity_pred": {one_entity_comparison}, "entity_gold": {gold_entity_comparison}})

print("Expecting output: 3 \n")
gold_comparison = parse_response(gpt_chat_completion(client, gold_output))
print(gold_comparison)

print("\n Expecting output: 4 \n")
four_comparison = parse_response(gpt_chat_completion(client, four_output))
print(four_comparison)

print("\n Expecting output: 3 \n")
three_comparison = parse_response(gpt_chat_completion(client, three_output))
print(three_comparison)

print("\n Expecting output: 2 \n")
two_comparison = parse_response(gpt_chat_completion(client, two_output))
print(two_comparison)

print("\n Expecting output: 1 \n")
one_comparison = parse_response(gpt_chat_completion(client, one_output))
print(one_comparison)  

Expecting output: 3 

{'reasoning': "First, we must compare the EVALUATION DOCUMENTATION with the GOLD DOCUMENTATION. Upon careful examination, both documentations are identical in their format and content. Each column in both documentations contains a full English sentence that clearly describes the meaning of the column, eliminating ambiguity. For example, all entries specify the context when necessary, such as 'The marketplace that this snapshot belongs to' rather than a less informative 'Market.' Also, they include concise yet comprehensive descriptions, such as explaining that a column represents 'Cumulative trade volume (in ETH).' The queries are syntactically correct, using proper capitalization and punctuation. Therefore, the provided documentation matches the GOLD documentation criteria and lacks any missing information, misleading descriptions, or grammar errors.", 'correctness': 3}

 Expecting output: 4 

{'reasoning': "The evaluation documentation provides detailed explanat

In [45]:
entity_comparison_prompt_template = reload_template(
    template_path="../assets/prompts",
    template_name="entity_comparison_revision.txt",
)

entity_comparison_prompt_modification = entity_comparison_prompt_template.render(
    {
        "original_prompt": {example_prompt_filled}, 

        "four_result_correct": {"True" if four_comparison["correctness"] == 4 else "False"},
        "four_result": {four_comparison["reasoning"]},
        
        "three_result_correct": {"True" if three_comparison["correctness"] == 3 else "False"},
        "three_result": {three_comparison["reasoning"]},
        
        "two_result_correct": {"True" if two_comparison["correctness"] == 2 else "False"},
        "two_result": {two_comparison["reasoning"]},
        
        "one_result_correct": {"True" if one_comparison["correctness"] == 1 else "False"},
        "one_result": {one_comparison["reasoning"]},
    }
)

In [46]:
entity_comparison_prompt_modification

'[TASK]: Your job is to revise the prompt to better meet our goal of determining the quality of documentation. Based upon the results and reasoning provided by the model, update the prompt we are using to score the documentation. \n\nIt is important that the prompt addresses that the comparison should label the EVALUATION DOCUMENTATION in comparison to the GOLD DOCUMENTATION by a scale of 1-4. \n***\n[ORIGINAL PROMPT]: {\'You are evaluating the quality of documentation between two representations of the same schema. One schema will be provided as an example of quality documentation. The other table schema is the one you will be evaluating. \\n[BEGIN DATA]\\n***\\n[TASK]: The task is to determine the quality of documentation for one table schema, given access only to a table definition and two different documentation implementations. \\n\\nThe goal is to create informative descriptions which reduces ambiguity and increases understanding for users of the database .\\n\\n***\\n[EVALUATION

In [47]:
prompt_revision = gpt_chat_completion(client, entity_comparison_prompt_modification).choices[0].message.content

In [21]:
entity_comparison_prompt_modification

'[TASK]: Your job is to revise the prompt to better meet our goal of determining the quality of documentation. Based upon the results and reasoning provided by the model, update the prompt we are using to score the documentation. \n\nIt is important that the prompt addresses that the comparison should label the EVALUATION DOCUMENTATION in comparison to the GOLD DOCUMENTATION by a scale of 1-4. \n***\n[ORIGINAL PROMPT]: {<Template \'entity_comparison_revision.txt\'>}\n***\n[Expected Output of 4]: {\'```json\\n{\\n    "reasoning": "The evaluation documentation provides detailed descriptions for each field in the MarketplaceDailySnapshot schema. Let\\\'s compare each field with the gold documentation to determine how it meets the criterion.\\\\n\\\\n1. \\\'id\\\' field: The evaluation documentation describes the structure of the ID in detail, which exceeds the information provided in the gold documentation. This improves clarity and reduces ambiguity.\\\\n\\\\n2. \\\'marketplace\\\' field

In [48]:
prompt_revision

'{"You are tasked with evaluating the quality of documentation between two representations of the same schema. One schema is provided as the benchmark (GOLD documentation), and you will evaluate the other based on this benchmark.\\n\\n[BEGIN DATA]\\n***\\n[TASK]: Your task is to determine the quality of the EVALUATION DOCUMENTATION for a table schema, using only a table definition and two different documentation implementations. The goal is to create informative descriptions that reduce ambiguity and increase understanding for users of the database.\\n\\n[EVALUATION DOCUMENTATION]: <example prediction goes here>\\n***\\n[GOLD DOCUMENTATION]: <example gold goes here>\\n***\\n[DOCUMENTATION CRITERION]: Evaluation Criteria\\n\\nCorrectness: \\n4: Perfect (Exceeding the GOLD documentation):\\nThe description provides full clarity without ambiguity, describing what each column references. It uses complete English sentences with proper grammar, capitalization, and punctuation.\\n\\n3: Almost