## Generating Lucene Query Language queries from free text input

This notebook takes free-text input, uses OpenAI function calling to generate a
Lucene Query Language (LQL) query, and sends that query to the RSpace ELN API to retrieve documents.

To run this you'll need:

* An OpenAI API key
* An account on https://community.researchspace.com, and an API key. (It's free to set up)
* Python RSpace client - `pip install rspace_client`

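Here we import everything we need and check that the RSpace API is reachable. The client reads its URL and API key from the `RSPACE_URL` and `RSPACE_API_KEY` environment variables: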

In [1]:
import os
from rspace_client.eln import eln
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

# RSpace connection details are read from the environment rather than hard-coded.
rspace_url = os.getenv("RSPACE_URL")
rspace_api_key = os.getenv("RSPACE_API_KEY")
eln_cli = eln.ELNClient(rspace_url, rspace_api_key)

# Print the client's status response as a quick connectivity check.
print(eln_cli.get_status())

# Default model name used by chat_completion_request.
GPT_MODEL = "gpt-4"

{'message': 'OK', 'rspaceVersion': '1.91.1'}


In [2]:
def pretty_print_conversation(messages):
    """Print each chat message on its own coloured line, keyed by role.

    Args:
        messages: Iterable of message dicts. Each needs a "role"; system,
            user and function messages are read via "content" (function
            messages also via "name"); assistant messages show their
            "function_call" when it is present and truthy, otherwise their
            "content".

    Messages whose role is not system/user/assistant/function print nothing.
    """
    palette = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "function": "magenta",
    }

    for msg in messages:
        role = msg["role"]
        if role in ("system", "user"):
            line = f"{role}: {msg['content']}\n"
        elif role == "assistant":
            # A function-call reply is shown in place of the text content.
            shown = msg["function_call"] if msg.get("function_call") else msg["content"]
            line = f"assistant: {shown}\n"
        elif role == "function":
            line = f"function ({msg['name']}): {msg['content']}\n"
        else:
            continue
        print(colored(line, palette[role]))

A standard helper function for sending messages to OpenAI's Chat Completions API:

In [3]:
@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3), reraise=True)
def _post_chat_completion(headers, json_data, timeout):
    """POST one chat-completion payload, letting errors propagate so tenacity can retry."""
    return requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=json_data,
        timeout=timeout,
    )


def chat_completion_request(messages, functions=None, function_call=None, model=GPT_MODEL, timeout=120):
    """Send a chat-completion request to the OpenAI HTTP API.

    Args:
        messages: List of chat message dicts sent as the "messages" field.
        functions: Optional list of function schemas; sent only when not None.
        function_call: Optional "function_call" value; sent only when not None.
        model: Model name sent as the "model" field.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the notebook forever.

    Returns:
        The ``requests.Response`` on success. If the request still fails after
        the retries, the exception is printed and *returned* (not raised), so
        callers should check ``isinstance(result, Exception)``.

    Raises:
        ValueError: If no API key is available in ``openai.api_key`` or the
            ``OPENAI_API_KEY`` environment variable.
    """
    # Fail fast with a clear message instead of a TypeError on "Bearer " + None.
    api_key = openai.api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("No OpenAI API key found: set openai.api_key or OPENAI_API_KEY")

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key,
    }
    json_data = {"model": model, "messages": messages}
    if functions is not None:
        json_data.update({"functions": functions})
    if function_call is not None:
        json_data.update({"function_call": function_call})
    try:
        # The retry lives on the helper: catching the exception inside a
        # retry-decorated function would hide every failure from tenacity.
        return _post_chat_completion(headers, json_data, timeout)
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        return e

In [4]:
## Target of the AI-generated function call: it receives the generated
## arguments and queries RSpace's search API with them.
def search_rspace_eln(luceneQuery, sort_order="lastModified desc"):
    """Search RSpace with a Lucene query and return a trimmed summary per document.

    Args:
        luceneQuery: Lucene query text; it is sent with an "l: " prefix.
        sort_order: Passed to the client as ``order_by``.

    Returns:
        A list with one dict per returned document, holding only those of
        globalId, name, tags and created that the document contains.
    """
    summary_fields = ('globalId', 'name', 'tags', 'created')
    response = eln_cli.get_documents(query="l: " + luceneQuery, order_by=sort_order)
    return [
        {field: doc[field] for field in summary_fields if field in doc}
        for doc in response['documents']
    ]

In [5]:
# Dispatch table: function name returned by the model -> Python callable to run.
available_functions = dict(lucene=search_rspace_eln)

# Function schema advertised to the model. The property names must match the
# keyword arguments of search_rspace_eln, because the model's JSON arguments
# are expanded into that call with **.
functions = [
  {
    "name": "lucene",
    "description": """
    A valid Lucene Query Language string generated from user input.
    Document fields are name, docTag, fields.fieldData, and username.
    Don't use wildcards.
    """,
    "parameters": {
        "type":"object",
        "properties": {
            "luceneQuery": {
                "type":"string",
                "description":"Valid Lucene Query Language as plain text"
            },
            "sort_order": {
                "type":"string",
                "description":"How results should be sorted",
                "enum":["name asc", "name desc", "created asc", "created desc"]
            },
        },
        # search_rspace_eln has no default for luceneQuery, so a reply that
        # omitted it would fail with a TypeError; sort_order stays optional.
        "required": ["luceneQuery"]
    }
  }
]

In [6]:
# Instruction that tells the model to produce function arguments only.
system_prompt = "Generate function arguments from user input. Don't show reasoning."

# The free-text search request to be turned into a Lucene query.
user_request = """
         I want to search for documents that are tagged with PCR but not ECL, 
         containing the phrase “DNA replication” but not "RNA"
         List results in reverse alphabetical order
         """

# Conversation sent to the chat completion API: system instruction, then the user request.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_request),
]

In [7]:
# Force the model to answer with a call to the "lucene" function, run that call
# against RSpace, then show the conversation and the matching documents.
resp = chat_completion_request(messages, functions, {'name':'lucene'})

# chat_completion_request returns the exception object instead of raising it.
if isinstance(resp, Exception):
    raise resp
# Surface HTTP errors (bad key, quota, ...) here rather than as a KeyError on 'choices'.
resp.raise_for_status()

# Work on a copy so re-running this cell does not keep appending to `messages`.
active_messages = list(messages)
response_message = resp.json()['choices'][0]['message']
active_messages.append(response_message)

# Stays None when the reply carries no function call, so the final print still works.
result = None
requested_call = response_message.get('function_call')
if requested_call is not None:
    f_name = requested_call['name']
    f_args = json.loads(requested_call['arguments'])
    result = available_functions[f_name](**f_args)
pretty_print_conversation(active_messages)
print(json.dumps(result, indent=2))

[31msystem: Generate function arguments from user input. Don't show reasoning.
[0m
[32muser: 
         I want to search for documents that are tagged with PCR but not ECL, 
         containing the phrase “DNA replication” but not "RNA"
         List results in reverse alphabetical order
         
[0m
[34massistant: {'name': 'lucene', 'arguments': '{\n  "luceneQuery": "docTag:PCR -docTag:ECL AND fields.fieldData:\\"DNA replication\\" -fields.fieldData:\\"RNA\\"",\n  "sort_order": "name desc"\n}'}
[0m
[
  {
    "globalId": "SD1924288",
    "name": "aurora expression exit",
    "tags": "polyclonal,PCR",
    "created": "2023-09-15T23:47:26.853Z"
  }
]
