##  Importing necessary libraries

Ensure that the Python environment you are running this in has all the libraries present in [requirements.txt](requirements.txt).


### Setting Up Xterm

In [19]:
%%capture
# %%capture must remain the first line of the cell; it hides the output of
# everything below.
# Install the colab-xterm extension and load it so the %xterm magic is available.
!pip install colab-xterm
%load_ext colabxterm
!pip install lshw
# Downloads the Ollama install script and pipes it straight into sh.
!curl https://ollama.ai/install.sh | sh

- Run `ollama serve` in the xterm terminal below to start the Ollama server. (If you are using the Brave browser, turn off Brave Shields for this page.)

In [None]:
# Launch the terminal provided by the colabxterm extension; run `ollama serve` in it.
%xterm

### Utility Functions
Helpers for downloading the dataset and computing embeddings.

In [3]:
# Download the test archive from Google Drive (by file id) and unpack it into
# the current working directory.
!gdown 1ngGX4bgpXGyH3zh7cWOmmPsiGPPtaz7f -O testing.tar.xz
!tar -xf testing.tar.xz

Downloading...
From: https://drive.google.com/uc?id=1ngGX4bgpXGyH3zh7cWOmmPsiGPPtaz7f
To: /content/testing.tar.xz
  0% 0.00/8.30k [00:00<?, ?B/s]100% 8.30k/8.30k [00:00<00:00, 19.4MB/s]


In [13]:
%%capture
# %%capture must remain the first line of the cell; it hides pip's output.
# Packages backing the imports in the next cell: sentence-transformers, faiss,
# langchain and openai. NOTE(review): versions are unpinned, so a fresh run may
# resolve to different releases than the ones this notebook was written against.
!pip install -U sentence-transformers
!pip install faiss-gpu
!pip install langchain openai tiktoken

In [14]:
import typing
import os
import faiss
import json
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
import langchain
import openai
import json
import os

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.agents import AgentExecutor
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.schema import StrOutputParser
from langchain.chat_models import ChatOpenAI

Add your OpenAI (GPT) and Hugging Face API keys in the cell below.

**Note:** if the OpenAI API reports that the rate limit has been exceeded, add a `time.sleep()` call between requests.

In [15]:
# Fill in both values before running the cells that need them.
# Whatever is written here replaces any value already present in the environment.
os.environ.update({
    "OPENAI_API_KEY": "",
    'HUGGINGFACE_API_KEY': "",
})

In [7]:
def get_testing_data(query_lst : typing.Union[typing.List[typing.Dict], str]):
  """
  Split test records into parallel lists of queries and expected solutions.

  Args:
    query_lst: Either the records themselves (an iterable of dicts, each with
      the keys "query" and "solution") or the path to a JSON file that holds
      such a list.

  Returns:
    A tuple (queries, solutions). Both lists have one entry per record, in
    input order; each solution is returned exactly as stored in the record.

  Raises:
    KeyError: if a record lacks the "query" or "solution" key.
  """
  if isinstance(query_lst, str):
    # Use a context manager so the file handle is closed deterministically.
    # JSON files are read as UTF-8 regardless of the platform's locale.
    with open(query_lst, encoding="utf-8") as fp:
      query_lst = json.load(fp)

  queries = [record["query"] for record in query_lst]
  solutions = [record["solution"] for record in query_lst]
  return queries, solutions

In [8]:
def create_indices(api_desc : typing.Union[typing.List[typing.Dict], str]):
  '''
  Build lookup tables for a list of tool descriptions.

  Args:
    api_desc: Either the tool descriptions themselves (a list of dicts, each
      with a "tool" key holding the tool's name) or the path to a JSON file
      that holds such a list.

  Returns:
    A tuple (IDX_TO_TOOL, TOOL_TO_IDX, documents):
      - IDX_TO_TOOL maps each list position to the tool name at that position.
      - TOOL_TO_IDX is the inverse mapping; if a tool name appears more than
        once, the last position wins.
      - documents holds each description serialized with json.dumps, in
        input order.

  Raises:
    KeyError: if a description lacks the "tool" key.
  '''
  if isinstance(api_desc, str):
    # Use a context manager so the file handle is closed deterministically.
    with open(api_desc, encoding="utf-8") as fp:
      api_desc = json.load(fp)
  IDX_TO_TOOL = {
      position : entry["tool"] for position, entry in enumerate(api_desc)
  }
  TOOL_TO_IDX = {
      tool : position for position, tool in IDX_TO_TOOL.items()
  }
  return IDX_TO_TOOL, TOOL_TO_IDX, [json.dumps(entry) for entry in api_desc]

In [9]:
def get_embedding(documents : typing.List[str], model : str):
  """
  Encode a list of texts with a Sentence-Transformers model.

  `model` is the name passed to SentenceTransformer(...); most models listed on
  https://huggingface.co/sentence-transformers work.
  Returns whatever the model's `encode` produces for `documents`.
  """
  # A new SentenceTransformer is constructed on every call.
  encoder = SentenceTransformer(model)
  return encoder.encode(documents)

### Searching

In [10]:
# Semantic Search
def get_k_nearest(api_embed : typing.List, query_embed : typing.List, k : int = 10):
  """
  Find the k nearest tool embeddings for every query embedding.

  A flat (exhaustive) FAISS L2 index is built over `api_embed` and then
  searched with `query_embed`.

  Args:
    api_embed: 2-D collection (rows = items, columns = embedding dimensions)
      of the vectors to index. A numpy array or nested lists are accepted.
    query_embed: 2-D collection of query vectors with the same number of
      columns as `api_embed`.
    k: number of neighbours to return per query.

  Returns:
    The (distances, indices) pair returned by `index.search`, one row per
    query and k columns.
  """
  # FAISS expects C-contiguous float32 matrices. Coercing here also lets
  # plain nested lists through, which the annotations advertise but reading
  # `.shape` directly would reject. Arrays that already match are not copied.
  api_embed = np.ascontiguousarray(api_embed, dtype=np.float32)
  query_embed = np.ascontiguousarray(query_embed, dtype=np.float32)

  index = faiss.IndexFlatL2(api_embed.shape[1])
  index.add(api_embed)
  return index.search(query_embed, k)

### Formatting Data

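Build the index-to-tool and tool-to-index dictionaries and the serialized tool descriptions (used later to create the embeddings), then load the test queries and their expected solutions.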


In [11]:
# IDX_TOOL: position -> tool name, TOOL_IDX: tool name -> position,
# DESC: one JSON string per tool description (the text that gets embedded).
# Presumably augment_tools.json comes from the archive unpacked above - verify.
IDX_TOOL, TOOL_IDX, DESC = create_indices("augment_tools.json")

In [12]:
# query: the "query" field of every test record; expected: the matching
# "solution" field (evaluate looks each of its entries up as a tool name).
query, expected = get_testing_data("augment_queries.json")

### Evaluate Function

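Returns the retrieved tool indices for every query, together with the average number of expected tools that are missing from the top-$k$ results per query: $\frac{n(\text{missing tools})}{\text{len}(\text{idx})}$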

In [21]:
def evaluate(model : str, mode : str, k : int, idx_tool : typing.Dict, tool_idx : typing.Dict,
             tool_desc : typing.List[str], query : typing.List[str], expected : typing.List):
  """
  Measure how many expected tools a k-nearest-neighbour retrieval misses.

  Tool descriptions and queries are embedded with the chosen backend, the k
  nearest tools are retrieved for every query, and the expected tools that do
  not appear among them are counted.

  Args:
    model: name of the embedding model, passed to the selected backend.
    mode: "ST" for Sentence-Transformers or "openai" for OpenAI embeddings.
      Matched case-insensitively, so "st" and "openAI" are accepted too.
    k: number of tools to retrieve per query.
    idx_tool: index -> tool name mapping. Accepted for interface
      compatibility; not used by this function.
    tool_idx: tool name -> index mapping used to translate `expected`.
    tool_desc: texts describing the tools, in index order.
    query: query texts.
    expected: for each query, the names of the tools that should be retrieved.

  Returns:
    A tuple (idx, missed): `idx` holds the retrieved tool indices for each
    query, and `missed` is the total number of expected tools absent from the
    retrieved ones divided by the number of queries. For an empty `query`
    list the result is ([], 0.0).

  Raises:
    ValueError: if `mode` is not one of the supported backends.
    KeyError: if an expected tool name is missing from `tool_idx`.
  """
  # Validate up front: an unrecognised mode used to fall through and fail
  # later with an UnboundLocalError on `tool_embed`.
  mode_key = mode.lower()
  if mode_key not in ('st', 'openai'):
    raise ValueError(f"Unknown mode {mode!r}; expected 'ST' or 'openai'")

  # Nothing to score; this also avoids dividing by zero below.
  if len(query) == 0:
    return [], 0.0

  if mode_key == 'st':
    tool_embed = get_embedding(tool_desc, model)
    query_embed = get_embedding(query, model)
  else:
    embed_mod = OpenAIEmbeddings(model = model)
    # The API returns plain Python floats; FAISS needs float32 arrays.
    tool_embed = np.array(embed_mod.embed_documents(tool_desc)).astype('float32')
    query_embed = np.array(embed_mod.embed_documents(query)).astype('float32')

  _, idx = get_k_nearest(tool_embed, query_embed, k)
  idx = [list(row) for row in idx]

  expected_idx = [[tool_idx[name] for name in tools] for tools in expected]
  msd = 0
  for i, retrieved in enumerate(idx):
    msd += len([j for j in expected_idx[i] if j not in retrieved])

  return idx, msd/len(idx)


#### MAIN
Change the model, the mode, and the value of `k` as needed.


- For Sentence Transformers:
  - Change model name to desired model name
  - Change mode to "ST"
- For OpenAI embeddings:
  - Change model name to required model name, default - 'text-embedding-ada-002'
  - Change mode to "openai"

In [24]:
# Tunable settings:
#   model - name of the embedding model to evaluate
#   mode  - "ST" for a sentence transformer, "openai" for OpenAI embeddings
#   k     - how many tools to retrieve per query
idx, missed = evaluate(
    model = "multi-qa-mpnet-base-dot-v1",
    mode = "ST",
    k = 10,
    idx_tool = IDX_TOOL,
    tool_idx = TOOL_IDX,
    tool_desc = DESC,
    query = query,
    expected = expected,
)

print(missed)

.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

0.89171974522293
