In [None]:
!pip install openai
!pip install python-dotenv

In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import time
import json5
import json

In [2]:
# Load variables from a .env file into the process environment; the return
# value is assigned to `_` because it is not needed.
_ = load_dotenv()
# Client used for every API call in this notebook. No arguments are passed
# here — presumably the API key is picked up from the environment (e.g. via the
# .env file loaded above); confirm against your setup.
openai_client = OpenAI()

In [3]:
# Model identifiers. GPT3_MODEL is the one passed as `model` when the assistant
# is created; GPT4_MODEL is not referenced in the visible cells and looks like
# an alternative to swap in.
GPT3_MODEL = "gpt-3.5-turbo-1106"
GPT4_MODEL = "gpt-4-1106-preview"

In [4]:
# Sample input documents, kept as a JSON string because it is interpolated
# verbatim into the prompt sent to the assistant.
input_data = """
[
  {
    "name": "Sachin",
    "team": "India"
  },
  {
    "name": "Sourav",
    "team": "India"
  },
  {
    "name": "Lara",
    "team": "West Indies"
  }
]
"""

# Expected result of the aggregation: one document per team with its player
# count. The value "India" must be quoted so that this string is valid JSON.
output_data = """
[
 {
   "team": "India",
   "playerCount": 2
 },
 {
   "team": "West Indies",
   "playerCount": 1
 }
]
"""

In [5]:
# System-level instructions passed to the assistant when it is created.
# Written as adjacent string literals so the text is a single line with no
# embedded newline.
assistant_instructions = (
    "You are a MongoDB expert with great expertise in writing MongoDB queries "
    "for any given data to produce an expected output."
)

In [6]:
# Function-tool definition exposed to the assistant. It declares a function
# named "executeQuery" whose arguments are described by a JSON-Schema-style
# object with two required string fields.
execute_query_function_interface = {
    "name": "executeQuery",
    "description": "Execute the MongoDB Query on the given input data to verify the output",
    "parameters": {
        "type": "object",
        "properties": {
            # The pipeline itself, passed as a string (not as a parsed list).
            "mongoDBQuery": {
                "type": "string",
                "description": """The MongoDB aggregation pipeline to produce the expected output for a given input. 
                               This field corresponds to just the list of stages in the aggregation pipeline 
                               and shouldn't contain the 'db.collection.aggregate' prefix."""
            },
            # Free-text rationale for the generated pipeline.
            "queryExplanation": {
                "type": "string",
                "description": "A detailed explanation for the query that was returned."
            } 
        },
        # Both fields are listed as required.
        "required": ["mongoDBQuery", "queryExplanation"]
    }
}

In [7]:
# Create the assistant with the instructions defined earlier and a single
# function tool described by `execute_query_function_interface`.
# Note: every execution of this cell issues a new create call.
assistant = openai_client.beta.assistants.create(
    name="MongoDB SME",
    model=GPT3_MODEL,
    instructions=assistant_instructions,
    tools=[{"type": "function", "function": execute_query_function_interface}],
)

In [8]:
def process_user_input(user_input):
    """Start a new assistant conversation for a single user request.

    Creates a fresh thread, posts ``user_input`` to it as a user message and
    starts a run of the notebook-level ``assistant`` on that thread.

    Args:
        user_input: Text of the user's request; sent as the message content.

    Returns:
        A ``(thread, run)`` tuple: the newly created thread and the run that
        was started on it.
    """
    # Create a new thread
    thread = openai_client.beta.threads.create()

    # Add a message with the user query to the thread. The content comes from
    # the function argument (not a notebook-level global), so the function
    # sends exactly what the caller passes in.
    openai_client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=user_input,
    )

    # Create a run to invoke the assistant
    run = openai_client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id,
    )
    return thread, run

In [9]:
def get_completed_run(thread, run, max_attempts=60, sleep_interval=2):
    """Poll a run until it is completed, needs a tool output, or polling gives up.

    Args:
        thread: Thread the run belongs to (only ``thread.id`` is used).
        run: Run to poll (only ``run.id`` is used). ``None`` is accepted and
            yields ``None`` immediately.
        max_attempts: Maximum number of retrieve calls to make.
        sleep_interval: Seconds to wait between two consecutive attempts.

    Returns:
        The refreshed run once its status is ``"completed"`` or
        ``"requires_action"``. ``None`` if the run ended in a status that can
        no longer progress, if ``max_attempts`` polls were used up, or if no
        run was supplied.
    """
    if run is None:
        # Nothing to poll, e.g. because an earlier call already gave up.
        return None

    ready_statuses = ("completed", "requires_action")
    # Statuses from which the run will never reach a ready status; polling
    # further would only burn the whole timeout.
    dead_statuses = ("failed", "cancelled", "expired", "incomplete")

    for attempt in range(max_attempts):
        try:
            run = openai_client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
        except Exception as e:
            # Best effort: report the error and retry on the next attempt.
            print(f"Error: {str(e)}. Trying again...")
        else:
            if run.status in ready_statuses:
                # Return right away; no extra wait once the run is ready.
                return run
            if run.status in dead_statuses:
                print(f"Error: Run ended with status '{run.status}'.")
                return None

        # Wait only between attempts, not after the final one.
        if attempt < max_attempts - 1:
            time.sleep(sleep_interval)

    # max_attempts reached without completion: the assistant call timed out.
    return None

In [10]:
def executeQuery(mongoDBQuery):
    """Placeholder for running the generated pipeline against the input data.

    Args:
        mongoDBQuery: The aggregation pipeline produced by the assistant. It is
            currently not inspected or executed.

    Returns:
        Always the string ``"success"``.
    """
    # Stub: no query is actually run; every pipeline is reported as successful.
    outcome = "success"
    return outcome

In [11]:
# Prompt sent to the assistant; the sample input and expected output defined
# earlier are interpolated verbatim. The backslash joins the first two lines of
# the sentence, so a space is kept before it to avoid fusing "pipeline" and
# "that" into one word.
user_prompt = f"""
Your task is to write a MongoDB Query, specifically an aggregation pipeline \
that would produce the expected output for the given input.

Important: You will always execute the query to verify that it produces the expected output.

Input data: {input_data} 
Expected output data: {output_data}
"""

In [13]:
# Start a conversation for the prompt, then give the assistant up to three
# rounds to come back with a query that passes verification.
thread, run = process_user_input(user_prompt)

for attempt in range(3):
    run = get_completed_run(thread, run)

    if run is None:
        # Polling gave up, so there is no run object left to poll. Stop here
        # instead of starting another round with a missing run.
        print("Error: Assistant timed out.")
        break

    if run.status == "requires_action":
        # The assistant asked for the function tool to be called. Only the
        # first tool call is handled.
        tool_call = run.required_action.submit_tool_outputs.tool_calls[0]
        function_name = tool_call.function.name
        # The arguments arrive as a JSON string produced by the model.
        arguments = json5.loads(tool_call.function.arguments)
        print(f"Function Name: {function_name}\nArguments: {arguments}")

        response = executeQuery(arguments["mongoDBQuery"])
        if response == "success":
            # NOTE: no tool output is submitted on success, so the run is left
            # waiting in "requires_action" when the loop ends.
            print(f"Assistant Response - MongoDB Query: {arguments['mongoDBQuery']}")
            break

        # Verification failed: report that back so the assistant can retry,
        # then poll the continued run on the next iteration.
        run = openai_client.beta.threads.runs.submit_tool_outputs(
            thread_id=thread.id,
            run_id=run.id,
            tool_outputs=[
                {
                    "tool_call_id": tool_call.id,
                    "output": """The generated MongoDB Query 
                                         didn't produce the expected output.
                                         Please try again"""
                }
            ]
        )
    elif run.status == "completed":
        # The assistant answered with a plain message instead of a tool call;
        # print the first message returned by the listing.
        messages = openai_client.beta.threads.messages.list(thread.id)
        print(f"Assistant Response: {messages.data[0].content[0].text.value}")
        break
else:
    # Every round was used without a verified query or a final answer.
    print("Error: Assistant couldn't produce the query for the given input.")

Function Name: executeQuery
Arguments: {'mongoDBQuery': '[\n  {\n    "$group": {\n      "_id": "$team",\n      "playerCount": {"$sum": 1}\n    }\n  },\n  {\n    "$project": {\n      "team": "$_id",\n      "playerCount": 1,\n      "_id": 0\n    }\n  }\n]', 'queryExplanation': "The query groups the documents by the 'team' field and calculates the count of players for each team. Then it projects the 'team' and 'playerCount' fields, and removes the '_id' field from the output."}
Assistant Response - MongoDB Query: [
  {
    "$group": {
      "_id": "$team",
      "playerCount": {"$sum": 1}
    }
  },
  {
    "$project": {
      "team": "$_id",
      "playerCount": 1,
      "_id": 0
    }
  }
]
