### Optional step: Verify that the notebook is running in the intended virtual environment

In [None]:
# sys.prefix is the root directory of the interpreter running this kernel;
# inside a virtual environment it points at that environment's directory.
import sys
sys.prefix

In [None]:
# Report the pip version and the location it is running from.
# Explicit %pip magic: a bare `pip ...` line relies on IPython automagic,
# which is only applied to single-line cells.
%pip -V

In [None]:
# List the packages installed in the kernel's environment.
# Explicit %pip magic: a bare `pip ...` line relies on IPython automagic,
# which is only applied to single-line cells.
%pip freeze

### Install the required libraries

In [None]:
# %pip installs into the environment of the running kernel, whereas `!pip`
# runs whichever pip executable happens to be first on PATH.
# langchain 0.0.270 targets the pre-1.0 openai SDK, so keep openai below 1.0.
%pip install "openai<1"
%pip install python-dotenv
%pip install langchain==0.0.270

### Import the necessary packages

In [1]:
import openai
import os
import re
from dotenv import find_dotenv, load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

### Setup OpenAI API

In [2]:
# Load the variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

### Instantiate the model

In [3]:
# Chat model shared by every example below.
# temperature=0.0 is presumably chosen to minimise run-to-run variation.
chat = ChatOpenAI(temperature=0.0, model="gpt-3.5-turbo")

### Prompt Template

In [4]:
# Prompt text used for every example. The {input_data}, {expected_output} and
# {format_instructions} placeholders are filled in by
# prompt_template.format_messages(...) inside get_llm_response.
# The backslash after the opening quotes suppresses the leading newline.
prompt_template_str = """\
Your task is to write a MongoDB query to produce the expected output for the given input data.

mongodb_query: The query that produces the expected output for the given input.

query_explanation: Explanation of what the query is doing.

Format the output as JSON with the following keys:
mongodb_query
query_explanation

Input data: {input_data}
Expected output: {expected_output}

{format_instructions}
"""

prompt_template = ChatPromptTemplate.from_template(template=prompt_template_str)

### Response Schema & Output Instructions

In [5]:
# One ResponseSchema per key the model is asked to return; the descriptions
# show up in the format instructions generated from these schemas.
mongodb_query_schema = ResponseSchema(
    name="mongodb_query",
    description="The query that produces the expected output for the given input.",
)
query_explanation_schema = ResponseSchema(
    name="query_explanation",
    description="Explanation of what the query is doing.",
)

response_schemas = [
    mongodb_query_schema,
    query_explanation_schema,
]

In [6]:
# Build a parser from the schemas and derive the matching prompt instructions.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Show the text that will be substituted into the {format_instructions} placeholder.
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"mongodb_query": string  // The query that produces the expected output for the given input.
	"query_explanation": string  // Explanation of what the query is doing.
}
```


### Define a generic function to query the AI assistant with arbitrary inputs

In [29]:
def get_llm_response(input_data, expected_output, format_instructions):
    """Ask the chat model for a MongoDB query and return the parsed reply.

    Args:
        input_data: Text of the sample input documents inserted into the prompt.
        expected_output: Text of the desired result inserted into the prompt.
        format_instructions: Output-format instructions inserted into the prompt.

    Returns:
        Whatever ``output_parser.parse`` produces for the model's reply; the
        calling cells read the ``mongodb_query`` and ``query_explanation``
        keys from it.
    """
    prompt_messages = prompt_template.format_messages(
        input_data=input_data,
        expected_output=expected_output,
        format_instructions=format_instructions,
    )
    reply = chat(prompt_messages)

    # Remove newlines, carriage returns and tabs from the raw reply before parsing it.
    cleaned_reply = pre_process_json_str(reply.content)
    return output_parser.parse(cleaned_reply)

def pre_process_json_str(json_str):
    """Return ``json_str`` with every newline, carriage return and tab removed.

    The characters are deleted outright (not replaced by a space), so text on
    either side of them is joined together.
    """
    cleaned = json_str
    for pattern in (r"\n", r"\r", r"\t"):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

### Example 1: Simple Group By query
The task here is to write a MongoDB aggregation pipeline to group the data based on team and count the players for each team. 

In [12]:
# Example 1 input: one document per player, each with a name and a team.
ex1_input_data = """
[
  {
    "name": "Sachin",
    "team": "India"
  },
  {
    "name": "Sourav",
    "team": "India"
  },
  {
    "name": "Lara",
    "team": "West Indies"
  }
]
"""

# Example 1 expected output: one document per team with its player count.
# "India" is quoted so that the sample is valid JSON.
ex1_expected_output = """
[
 {
   "team": "India",
   "playerCount": 2
 },
 {
   "team": "West Indies",
   "playerCount": 1
 }
]
"""

In [30]:
# Run Example 1 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex1_input_data, ex1_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
db.collection.aggregate([{$group: {_id: '$team', playerCount: {$sum: 1}}}, {$project: {team: '$_id', playerCount: 1, _id: 0}}])

Query Explanation: The query uses the aggregate method to group the documents by the 'team' field. It then calculates the count of players in each group using the $sum operator. Finally, it uses the $project stage to reshape the output and exclude the _id field.


### Example 2: Find the duplicates
The task is to write a MongoDB aggregation pipeline to find the documents that have duplicates in the nested array "courses" and count the number of times those duplicate items are present in the array. 

In [16]:
# Example 2 input: students with a nested "courses" array; Sachin has "batting" twice.
ex2_input_data = """
[
 {
   "student": "Sachin",
   "courses": [
      {
        "courseName": "batting",
        "marks": 100
      },
      {
        "courseName": "batting",
        "marks": 50
      },
      {
        "courseName": "fielding",
        "marks": 60
      }
   ]
 },
 {
   "student": "Sourav",
   "courses": [
      {
        "courseName": "batting",
        "marks": 80
      },
      {
        "courseName": "bowling",
        "marks": 60
      },
      {
        "courseName": "fielding",
        "marks": 40
      }
   ]
 }
]
"""

# Example 2 expected output: only students with repeated courses, with a count
# per repeated course. Commas separate the key/value pairs so the sample is valid JSON.
ex2_expected_output = """
[
  {
    "student": "Sachin",
    "duplicateCourses": [
      {
        "courseName": "batting",
        "duplicateCount": 2
      }
    ]
  }
]
"""

In [31]:
# Run Example 2 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex2_input_data, ex2_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
[{'$unwind': '$courses'}, {'$group': {'_id': {'student': '$student', 'courseName': '$courses.courseName'}, 'duplicateCount': {'$sum': 1}}}, {'$match': {'duplicateCount': {'$gt': 1}}}, {'$group': {'_id': '$_id.student', 'duplicateCourses': {'$push': {'courseName': '$_id.courseName', 'duplicateCount': '$duplicateCount'}}}}, {'$project': {'_id': 0, 'student': '$_id', 'duplicateCourses': 1}}]

Query Explanation: The query first unwinds the 'courses' array to create separate documents for each course. Then, it groups the documents by 'student' and 'courseName', and calculates the count of duplicate courses using the $sum operator. The $match stage filters out the documents where the duplicate count is greater than 1. Next, the query groups the documents by 'student' and creates an array of duplicate courses using the $push operator. Finally, the $project stage reshapes the output to include only the 'student' and 'duplicateCourses' fields.


### Example 3: Group by an attribute inside a nested array
The task is to group by author name to get the list of books written by each author.

In [18]:
# Example 3 input: book categories, each with nested "books" and per-book "authors".
# No trailing comma follows the last book of each category, keeping the sample valid JSON.
ex3_input_data = """
[
  {
    "bookCategory": "Non-Fiction",
    "books": [
      {
        "bookName": "Seven Habits",
        "pages": 200,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          },
          {
            "authorName": "Sourav",
            "authorEmail": "sourav@gmail.com"
          }
        ]
      },
      {
        "bookName": "One thing",
        "pages": 100,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          }
        ]
      }
      
    ]
  },
  {
    "bookCategory": "Fiction",
    "books": [
      {
        "bookName": "Harry Potter",
        "pages": 400,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          },
          {
            "authorName": "Tim",
            "authorEmail": "Tim@gmail.com"
          }
        ]
      },
      {
        "bookName": "Alchemist",
        "pages": 100,
        "authors": [
          {
            "authorName": "Sourav",
            "authorEmail": "sourav@gmail.com"
          }
        ]
      }
      
    ]
  }
]
"""

# Example 3 expected output: one document per author listing that author's book names.
ex3_expected_output = """
[
  {
     "authorName": "Sachin",
     "bookName": [
        "Seven Habits",
        "One thing",
        "Harry Potter"
     ] 
  },
  {
     "authorName": "Sourav",
     "bookName": [
        "Seven Habits",
        "Alchemist"
     ] 
  },
  {
     "authorName": "Tim",
     "bookName": [
        "Harry Potter"
     ] 
  }
]
"""

In [32]:
# Run Example 3 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex3_input_data, ex3_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
[{'$unwind': '$books'}, {'$unwind': '$books.authors'}, {'$group': {'_id': '$books.authors.authorName', 'bookName': {'$addToSet': '$books.bookName'}}}, {'$project': {'_id': 0, 'authorName': '$_id', 'bookName': 1}}]

Query Explanation: The query first unwinds the 'books' array and then unwinds the 'authors' array within each book. Then, it groups the documents by the 'authorName' field and creates an array of unique 'bookName' values for each author using the $addToSet operator. Finally, it projects the 'authorName' and 'bookName' fields while excluding the '_id' field.


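### Example 4: Extract nested array content into one single array
The task is to pull every book out of the nested "books" arrays so that each book becomes its own top-level document.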

In [20]:
# Example 4 input: book categories, each with nested "books" and per-book "authors".
# No trailing comma follows the last book of each category, keeping the sample valid JSON.
ex4_input_data = """
[
  {
    "bookCategory": "Non-Fiction",
    "books": [
      {
        "bookName": "Seven Habits",
        "pages": 200,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          },
          {
            "authorName": "Sourav",
            "authorEmail": "sourav@gmail.com"
          }
        ]
      },
      {
        "bookName": "One thing",
        "pages": 100,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          }
        ]
      }
      
    ]
  },
  {
    "bookCategory": "Fiction",
    "books": [
      {
        "bookName": "Harry Potter",
        "pages": 400,
        "authors": [
          {
            "authorName": "Sachin",
            "authorEmail": "sachin@gmail.com"
          },
          {
            "authorName": "Tim",
            "authorEmail": "Tim@gmail.com"
          }
        ]
      },
      {
        "bookName": "Alchemist",
        "pages": 100,
        "authors": [
          {
            "authorName": "Sourav",
            "authorEmail": "sourav@gmail.com"
          }
        ]
      }
      
    ]
  }
]
"""

# Example 4 expected output: every book as its own document, categories dropped.
ex4_expected_output = """
[
  {
    "authors": [
      {
        "authorEmail": "sachin@gmail.com",
        "authorName": "Sachin"
      },
      {
        "authorEmail": "sourav@gmail.com",
        "authorName": "Sourav"
      }
    ],
    "bookName": "Seven Habits",
    "pages": 200
  },
  {
    "authors": [
      {
        "authorEmail": "sachin@gmail.com",
        "authorName": "Sachin"
      }
    ],
    "bookName": "One thing",
    "pages": 100
  },
  {
    "authors": [
      {
        "authorEmail": "sachin@gmail.com",
        "authorName": "Sachin"
      },
      {
        "authorEmail": "Tim@gmail.com",
        "authorName": "Tim"
      }
    ],
    "bookName": "Harry Potter",
    "pages": 400
  },
  {
    "authors": [
      {
        "authorEmail": "sourav@gmail.com",
        "authorName": "Sourav"
      }
    ],
    "bookName": "Alchemist",
    "pages": 100
  }
]
"""

In [33]:
# Run Example 4 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex4_input_data, ex4_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
db.collection.aggregate([{ $unwind: '$books' },{ $project: {bookName: '$books.bookName',pages: '$books.pages',authors: '$books.authors'} }])

Query Explanation: The query starts by using the $unwind operator to flatten the 'books' array. Then, the $project operator is used to select the desired fields 'bookName', 'pages', and 'authors' from the flattened documents. The result is the expected output.


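### Example 5: Extract nested array elements as separate objects
The task is to emit one document per book/author pair, with the author's fields placed next to the book's fields.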

In [22]:
# Example 5 input: one document per book, each with a nested "authors" array.
ex5_input_data = """
[
  {
    "bookName": "Seven Habits",
    "pages": 200,
    "authors": [
      {
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
      },
      {
        "authorName": "Sourav",
        "authorEmail": "sourav@gmail.com"
      }
    ]
  },
  {
    "bookName": "One thing",
    "pages": 100,
    "authors": [
      {
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
      }
    ]
  },
  {
    "bookName": "Harry Potter",
    "pages": 400,
    "authors": [
      {
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
      },
      {
        "authorName": "Tim",
        "authorEmail": "Tim@gmail.com"
      }
    ]
  },
  {
    "bookName": "Alchemist",
    "pages": 100,
    "authors": [
      {
        "authorName": "Sourav",
        "authorEmail": "sourav@gmail.com"
      }
    ]
  }
]
"""

# Example 5 expected output: one flat document per book/author pair.
ex5_expected_output = """
[
    {
        "bookName": "Seven Habits",
        "pages": 200,
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
    },
    {
        "bookName": "Seven Habits",
        "pages": 200,
        "authorName": "Sourav",
        "authorEmail": "sourav@gmail.com"
    },
    {
        "bookName": "One thing",
        "pages": 100,
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
    },
    {
        "bookName": "Harry Potter",
        "pages": 400,
        "authorName": "Sachin",
        "authorEmail": "sachin@gmail.com"
    },
    {
        "bookName": "Harry Potter",
        "pages": 400,
        "authorName": "Tim",
        "authorEmail": "Tim@gmail.com"
    },
    {
        "bookName": "Alchemist",
        "pages": 100,
        "authorName": "Sourav",
        "authorEmail": "sourav@gmail.com"
    }
]
"""

In [34]:
# Run Example 5 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex5_input_data, ex5_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
db.books.aggregate([        {            $unwind: '$authors'        },        {            $project: {                bookName: 1,                pages: 1,                authorName: '$authors.authorName',                authorEmail: '$authors.authorEmail'            }        }    ])

Query Explanation: The query uses the aggregate method to perform the following operations:

1. $unwind: '$authors' - This operation deconstructs the authors array, creating a separate document for each author.
2. $project: {...} - This operation projects the desired fields from the input documents and renames the fields using the specified expressions. In this case, it selects the bookName, pages, authorName, and authorEmail fields from the input documents and renames them accordingly.

The result is a new set of documents where each document represents a book with its corresponding author.


### Example 6: Aggregate and produce a summary
The task is to count a student's subjects in total and broken down by passed and failed results.

In [24]:
# Example 6 input: a single student with a nested "subjects" array of pass/fail results.
ex6_input_data = """
[
    {
        "studentName": "Pete",
        "subjects": [
            {
                "subjectName": "Math",
                "result": "passed"
            },
            {
                "subjectName": "Physics",
                "result": "passed"
            },
            {
                "subjectName": "Chemistry",
                "result": "failed"
            },
            {
                "subjectName": "Botany",
                "result": "failed"
            },
            {
                "subjectName": "Zoology",
                "result": "failed"
            }
        ]
    }
]
"""

# Example 6 expected output: a single summary object (not an array) with three counts.
ex6_expected_output = """
{
    "totalSubjectCount": 5,
    "totalResultPassedCount": 2,
    "totalResultFailedCount": 3
}
"""

In [35]:
# Run Example 6 through the model, then show the generated query and its explanation.
llm_response = get_llm_response(ex6_input_data, ex6_expected_output, format_instructions)

print(f"MongoDB Query:\n{llm_response.get('mongodb_query')}")
print(f"\nQuery Explanation: {llm_response.get('query_explanation')}")

MongoDB Query:
db.students.aggregate([        {            $project: {                totalSubjectCount: { $size: '$subjects' },                totalResultPassedCount: {                    $size: {                        $filter: {                            input: '$subjects',                            as: 'subject',                            cond: { $eq: ['$$subject.result', 'passed'] }                        }                    }                },                totalResultFailedCount: {                    $size: {                        $filter: {                            input: '$subjects',                            as: 'subject',                            cond: { $eq: ['$$subject.result', 'failed'] }                        }                    }                }            }        }    ])

Query Explanation: The query uses the aggregate method to perform the following operations:

1. $project: This stage is used to create new fields in the output document.

2. totalSubjec

### Unit test for pre_process_json_str function

In [None]:
# Sample model reply whose JSON string values contain raw newlines and tabs.
# `re` comes from the import cell at the top of the notebook.
value = """{
	"mongodb_query": "db.collection.aggregate([
		{ $unwind: '$books' },
		{ $project: {
			bookName: '$books.bookName',
			pages: '$books.pages',
			authors: '$books.authors'
		} }
	])",
	"query_explanation": "The query starts by using the $unwind operator to flatten the 'books' array. Then, the $project operator is used to select the desired fields 'bookName', 'pages', and 'authors' from the flattened documents. The result is the expected output."
}"""

# Exercise the real helper rather than repeating its substitutions here,
# and check the property it is meant to provide.
value = pre_process_json_str(value)
assert not any(ch in value for ch in "\n\r\t"), "newline/carriage-return/tab left in output"

# Extra step kept from the original cell (not part of pre_process_json_str):
# backslash-escape every double quote that is not already escaped.
value = re.sub(r'(?<!\\)"', r"\"", value)
print(value)