##  Importing necessary libraries

Ensure that the Python environment running this notebook has every library listed in [requirements.txt](requirements.txt) installed.


In [2]:
import getpass
import json
import os
import time

import langchain
import openai
import pandas as pd
import tiktoken

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

from langchain.prompts import PromptTemplate
from langchain.callbacks import get_openai_callback
from langchain.llms import OpenAI
from langchain.agents import AgentExecutor
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.chat_models import ChatOpenAI

## OpenAI API Key


To set the OPENAI_API_KEY as an environment variable run the following command:
```
export OPENAI_API_KEY="<Your API Key>"
```


In [3]:
# Use the key already exported in the environment when there is one; otherwise
# prompt for it, so the secret is never typed into (or saved with) the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

## Loading the API documentation and usage examples

The required datasets (all `.json` files) are:
- API/Tool Descriptions (api_desc.json)
- Tool Usage Examples (examples.json)
- Queries (PS_queries.json)
  
  These can be loaded either by
  1. Uploading the datasets to Google Drive and downloading them by file id with `gdown` (as shown below), or
  2. Uploading the datasets directly to the runtime.


In [4]:
# Download the three datasets from Google Drive by file id.
# api_desc.json (API/tool descriptions)
!gdown 1gG6Ghpkjxqz7vlPjCs2pTdmOSqjt0EIM
# examples.json (tool usage examples)
!gdown 1nJZpWHRdowbdzdRI7ylb1RrZUV_2E6k9
# PS_queries.json (queries)
!gdown 1JCjW2f0fTsL6W7r7QjQO2FrwKNUj37ao

Downloading...
From: https://drive.google.com/uc?id=1gG6Ghpkjxqz7vlPjCs2pTdmOSqjt0EIM
To: /content/api_desc.json
100% 8.81k/8.81k [00:00<00:00, 25.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1nJZpWHRdowbdzdRI7ylb1RrZUV_2E6k9
To: /content/examples.json
100% 6.71k/6.71k [00:00<00:00, 22.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JCjW2f0fTsL6W7r7QjQO2FrwKNUj37ao
To: /content/PS_queries.json
100% 1.99k/1.99k [00:00<00:00, 9.05MB/s]


In [5]:
# Read the three JSON datasets from the runtime's /content directory.

def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


documentation_file_path = '/content/api_desc.json'
documentation = _read_json(documentation_file_path)

examples_file_path = '/content/examples.json'
examples = _read_json(examples_file_path)

queries_file_path = '/content/PS_queries.json'
queries = _read_json(queries_file_path)

# Echo what was loaded.
print(documentation)
print(examples)
print(queries)

[{'tool': 'works_list', 'description': 'Returns a list of work items matching the request', 'arguments': [{'argument_name': ' applies_to_part', 'argument_description': 'Filters for work belonging to any of the provided parts', 'argument_type': 'array of strings', 'examples': ['FEAT-123', 'ENH-123', 'PROD-123', 'CAPL-123']}, {'argument_name': 'created_by', 'argument_description': 'Filters for work created by any of these users', 'argument_type': 'array of strings', 'examples': ['DEVU-123']}, {'argument_name': 'issue.priority', 'argument_description': 'Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2,p3', 'argument_type': 'array of strings', 'examples': []}, {'argument_name': 'issue.rev_orgs', 'argument_description': 'Filters for issues with any of the provided Rev organizations', 'argument_type': 'array of strings', 'examples': ['REV-123']}, {'argument_name': 'limit', 'argument_description': "The maximum number of works to return. The default is '50'", 

## Retrievers for relevant tools and examples

- The following section implements the retriever to filter out only the relevant tools and examples from the complete tool description bank and usage examples respectively.
- The `get_tools` function returns the set of relevant tools, their arguments, and the cost of embeddings based on the user query; similarly, the `get_examples` function returns the set of relevant tool usage examples along with the cost of embeddings.
- Additionally, it calculates the number of tokens used to create embeddings.




In [6]:
# Inputs a string and returns the number of tokens in the string

def num_tokens_from_string(string: str) -> int:
    """Count the tokens that `string` encodes to under the `cl100k_base` encoding."""
    token_ids = tiktoken.get_encoding("cl100k_base").encode(string)
    return len(token_ids)

In [7]:
# Semantic search over the API descriptions.
# Every tool description becomes one Document; its metadata keeps the tool's
# position in `documentation` so the full entry can be looked up after retrieval.

API_descriptions = [
    Document(
        page_content=tool_entry['description'],
        metadata={"index": position},
    )
    for position, tool_entry in enumerate(documentation)
]
API_descriptions_vector_store = FAISS.from_documents(
    API_descriptions,
    OpenAIEmbeddings(),
)
API_retriever = API_descriptions_vector_store.as_retriever()
def get_tools(query):
    """Retrieve the API entries most relevant to `query`.

    Returns a 3-tuple:
        tools: the 'tool' name of every retrieved entry, in retrieval order
        arguments: the full `documentation` entry of every retrieved entry
        cost: embedding-cost estimate covering the query plus every API
            description (all descriptions are counted on each call)
    """
    def embedding_cost(text):
        # NOTE(review): rate is 0.0001 / 10000 per token - confirm against the
        # pricing of the embedding model actually in use.
        return num_tokens_from_string(text)*0.0001/10000

    total_cost = embedding_cost(query)
    for description_doc in API_descriptions:
        total_cost += embedding_cost(description_doc.page_content)

    matches = API_retriever.get_relevant_documents(query)
    # Map each hit back to its source entry through the stored index.
    entries = [documentation[match.metadata["index"]] for match in matches]
    names = [entry['tool'] for entry in entries]
    return names, entries, total_cost

In [8]:
# Semantic search over the API usage examples.
# Only the example's 'Query' text is embedded; the metadata index points back
# to the complete example in `examples`.

API_usage_examples = [
    Document(
        page_content=usage_example['Query'],
        metadata={"index": position},
    )
    for position, usage_example in enumerate(examples)
]
API_usage_examples_vector_score = FAISS.from_documents(
    API_usage_examples,
    OpenAIEmbeddings(),
)
Examples_retriever = API_usage_examples_vector_score.as_retriever()
def get_examples(query):
    """Retrieve the usage examples most relevant to `query`.

    Returns a 2-tuple:
        examples: the full `examples` entry of every retrieved document, in
            retrieval order
        cost: embedding-cost estimate covering the query plus every example
            query (all examples are counted on each call)
    """
    def embedding_cost(text):
        # NOTE(review): rate is 0.0001 / 10000 per token - confirm against the
        # pricing of the embedding model actually in use.
        return num_tokens_from_string(text)*0.0001/10000

    total_cost = embedding_cost(query)
    for example_doc in API_usage_examples:
        total_cost += embedding_cost(example_doc.page_content)

    matches = Examples_retriever.get_relevant_documents(query)
    matched_examples = [examples[match.metadata["index"]] for match in matches]
    return matched_examples, total_cost

# PIRO Prompts

## Planning

In [9]:
# Prompt text for the planning stage.
# Placeholders filled in when the chain is invoked: {query} and {tools}.
planning_template ="""
Given a query and a list of APIs with their arguments, find whether the APIs can solve the query or not. If not, return empty list. Else, extract values or variables from the query that correspond to each argument of the tools that needs to be called. If some argument value isn't known, use APIs that can retrieve those argument values. Return a JSON file containing the tools in the order they need to be called. Identify cases where an API argument depends on the output of a previous API call. Replace such arguments with the notation $$PREV[i] where 'i' represents the order of the previous API call in the chain.

Query: {query}
APIs: {tools}

Answer:
"""

## Improvement with Examples

**Note:** The same template is also used for the reflection step.

In [10]:

# Prompt text shared by the improvement and reflection stages.
# Placeholders filled in when the chain is invoked: {examples}, {query}, {solution}.
# NOTE(review): the text refers to a "list of available tools", but this template
# has no tools placeholder - confirm whether one was intended.
improvement_template = """
Given a list of examples, each containing a query and the corresponding APIs to be called to solve the query, find whether the JSON present in current solution solves the current query.
If yes, modify its format to match with that of solutions in examples and return the modified JSON.
If not, modify the current JSON solution to include necessary tools from list of available tools.
If no solution is available, return a blank list.
Remember to use double quotes instead of single quotes in JSON output.

Examples : {examples}

Current Query : {query}
Current Solution : {solution}

Answer:
"""

## Optimization

In [11]:
# Prompt text for the optimization stage.
# Placeholders filled in when the chain is invoked: {query}, {solution}, {tools}.
optimization_template = """
Given a query, solution and available APIs, optimize the number of APIs calls in the solution. Do not use API calls unless absolutely necessary. Look for redundancy and check for mistakes in the combination of API calls. Return current solution if no changes are necessary.
return only the json.
Current Query : {query}
Current Solution : {solution}
Available APIs : {tools}

Final solution (same format as current solution):
"""

# Execution

In [17]:
#Set the model you want to generate the solution(s) with
# (passed as `model_name` to every ChatOpenAI instance built in get_result)
model = "gpt-3.5-turbo"

#Creating empty dataframes to store the results
# df  : one row per pipeline stage - the rendered prompt and the model output
# df2 : one row per query - the final output, the time taken and the cost
df = pd.DataFrame(columns=['prompts', 'outputs'])
df2 = pd.DataFrame(columns=['query', 'f_output','time','cost'])

Given a user query, the `get_result` function returns the final predicted output, the total time taken to generate it, and the cost.


> Note: To avoid a `RateLimitError`, keep the sleep time between API calls at 10 seconds or more.

In [18]:
def get_result(Query):
    cost = 0
    start = time.time()

    #Getting the relevant APIs
    _,Tools , c1 = get_tools(Query)
    cost+=c1
    time.sleep(10)

    #Selecting the first 3 relevant examples
    Examples, c2 = get_examples(Query)[:3]
    cost+=c2


    #Planning
    planning_prompt = PromptTemplate.from_template(template=planning_template)
    planning_chain = planning_prompt | ChatOpenAI(temperature=0.1,model_name=model) | StrOutputParser()
    #Calculating the cost of the API calls
    with get_openai_callback() as cb:
      Solution1 = planning_chain.invoke({"query":Query, "tools":Tools})
      cost+=cb.total_cost

    print(f"Output 1: {Solution1}")
    df.loc[len(df.index)] = [planning_prompt.format(query=Query, tools=Tools), Solution1]
    time.sleep(10)


    #Improvement
    improvement_prompt = PromptTemplate.from_template(template=improvement_template)
    improvement_chain = improvement_prompt | ChatOpenAI(temperature=0,model_name=model) | StrOutputParser()
    #Calculating the cost of the API calls
    with get_openai_callback() as cb:
      Solution2 = improvement_chain.invoke({"examples":Examples, "query":Query, "solution":Solution1})
      cost+=cb.total_cost

    print(f"Output 2: {Solution2}")
    df.loc[len(df.index)] = [improvement_prompt.format(examples=Examples, query=Query, solution=Solution1), Solution2]
    time.sleep(10)


    #Reflection
    improvement_prompt = PromptTemplate.from_template(template=improvement_template)
    improvement_chain = improvement_prompt | ChatOpenAI(temperature=0,model_name=model) | StrOutputParser()
    #Calculating the cost of the API calls
    with get_openai_callback() as cb:
      Solution3 = improvement_chain.invoke({"examples":Examples, "query":Query, "solution":Solution2})
      cost+=cb.total_cost

    print(f"Output 2.5: {Solution3}")
    df.loc[len(df.index)] = [improvement_prompt.format(examples=Examples, query=Query, solution=Solution2), Solution3]
    time.sleep(10)


    #Optimization
    optimization_prompt = PromptTemplate.from_template(template=optimization_template)
    optimization_chain = optimization_prompt | ChatOpenAI(temperature=0,model_name=model) | StrOutputParser()
    #Calculating the cost of the API calls
    with get_openai_callback() as cb:
      Solution4 = optimization_chain.invoke({"query":Query, "solution":Solution3, "tools":Tools})
      cost+=cb.total_cost

    print(f"Output 3: {Solution3}")
    df.loc[len(df.index)] = [optimization_prompt.format(query=Query, solution=Solution3, tools=Tools), Solution4]


    end = time.time()
    time_taken = end-start


    return Solution4 , time_taken -40 , cost

In [None]:
#Running the pipeline on the queries
for query in queries:
  # get_result returns exactly three values: (solution, time taken, cost)
  f_output, time_taken, cost = get_result(query)
  # Remove a surrounding Markdown code fence (``` or ```json) from the model
  # output. The "json" tag is dropped as a prefix, not as a set of characters.
  f_output = f_output.strip().strip("`")
  if f_output.startswith("json"):
    f_output = f_output[len("json"):]
  f_output = f_output.strip()
  # DataFrame.append was removed in pandas 2.0; add the row in place instead.
  # Value order follows the df2 columns: query, f_output, time, cost.
  df2.loc[len(df2)] = [query, f_output, time_taken, cost]
  time.sleep(5)

In [None]:
# Per-query results: query, final output, time and cost
df2

In [None]:
# Per-stage log: every prompt sent and the output it produced
df

In [None]:
#Exporting the results to excel files
# Name the files after the configured model so that results are not labelled
# with a model that did not produce them. Characters that are unsafe in file
# names are replaced.
results_prefix = model.replace("/", "_").replace(":", "_")
df.to_excel(f"{results_prefix}_prompts_output.xlsx", index=False)
df2.to_excel(f"{results_prefix}_query_Output_time_cost.xlsx", index=False)