# Langchain Retriever



## Setting up the environment



##  Importing necessary libraries

Ensure that the Python environment you are running this in has all the libraries present in [requirements.txt](requirements.txt).


In [2]:
#Importing necessary libraries
import json
import langchain
import json
import openai
import os
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings, OllamaEmbeddings, HuggingFaceEmbeddings
from langchain.schema import Document

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.agents import AgentExecutor
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.chat_models import ChatOpenAI

import yaml
import typing

## Data Loading

In [3]:
!gdown 1ngGX4bgpXGyH3zh7cWOmmPsiGPPtaz7f -O testing.tar.xz
!tar -xf testing.tar.xz

Downloading...
From: https://drive.google.com/uc?id=1ngGX4bgpXGyH3zh7cWOmmPsiGPPtaz7f
To: /content/testing.tar.xz
  0% 0.00/8.30k [00:00<?, ?B/s]100% 8.30k/8.30k [00:00<00:00, 34.7MB/s]


In [4]:
#Function to load the testing data
def get_testing_data(query_lst : typing.Union[typing.Dict, str]):
  """
  We expect query_lst to have the keys query and solution
  """
  if isinstance(query_lst, str):
    query_lst = json.load(open(query_lst))


  ################################################### RETURN QUERY LIST AS IS #####################################
  ################################################### EXPECTED must have field "solution" #########################
  return [i["query"] for i in query_lst], [i["solution"] for i in query_lst]

In [5]:
#Loading the API/Tool Description Data
api_desc = json.load(open("augment_tools.json"))

#Dataset of the expected outputs based on the query
queries, expected = get_testing_data("augment_queries.json")

## Setting a YAML document to give easier model configurations in the retriever

> Note: Before executing kindly enter your OpenAI API key
> in `api_key =<YOUR API KEY>`

In [6]:
#Define the YAML document as a multi-line string
model_map = """
llm:
  gpt-3.5-turbo : openAI
  llama2 : ollama

embedding:
  text-embedding-ada-002 : openAI
  llama2 : ollama
  "all-mpnet-base-v2" : sent
  "multi-qa-mpnet-base-dot-v1" : sent
  "all-MiniLM-L6-v2" : sent
  "all-mpnet-base-v2": sent
  "multi-qa-mpnet-base-dot-v1" : sent
  "all-distilroberta-v1" : sent
  "all-MiniLM-L12-v2" : sent
  "multi-qa-distilbert-cos-v1" : sent
  """

#Loading the OpenAI-API Key
api_key = <YOUR_API_KEY>

#Parse the YAML document and convert it into a python object
model_map = yaml.load(model_map, Loader=yaml.FullLoader)


## Implementation

Implemenation of the LangChain retriever consists of -
1. `get_embedding_model` - to return the best embedding model, given a model name
2. `get_retriever` - to return a retriver object
3. `get_tools` - to return the relevant tools from the complete tool bank
4. `evalaute` - to calculate the number to tools missing in the predicted output which are present in the expected output

In [7]:
# Function to return the embedding model based on the given model name

def get_embedding_model(modelname : str, model_map : typing.Dict = model_map):
    try:
        model_type = model_map['embedding'][modelname].lower()
        if model_type == 'openai':
            return OpenAIEmbeddings(model=modelname, api_key=api_key)
        elif model_type == 'ollama':
            return OllamaEmbeddings(model=modelname)
        elif model_type == 'sent':
            return HuggingFaceEmbeddings(model_name=modelname)
        else:
            return KeyError(f"Model Type {model_type} not found")
    except KeyError:
        print(KeyError("Model not found"))
        return None

In [8]:
#Function to retrieve documents based on their embeddings using specified model

def get_retriever(docs : typing.List, modelname : str, model_map : typing.Dict = model_map):
    embedding_model = get_embedding_model(modelname)
    # print(embedding_model)
    assert embedding_model is not None

    #Load the Tool descriptions (CHANGE ACCORDING TO TOOL DESC)
    Descs = [
                Document(page_content=t['description'], metadata={"index": i})
                for i, t in enumerate(docs)
    ]

    #Creating a FAISS Vector Database to store embeddings
    Descs_Store = FAISS.from_documents(Descs, embedding_model)
    Retriever = Descs_Store.as_retriever()
    return Retriever


In [9]:
# Function to retrieve the relevant set of tools with arguments based on the user query from the complete tool bank using semantic search

def get_tools(query : str, documentation : typing.List, retriever : typing.Any):
    docs = retriever.get_relevant_documents(query)
    tools = [documentation[d.metadata["index"]]['tool'] for d in docs]
    arguments = [documentation[d.metadata["index"]] for d in docs]
    return tools, arguments

In [10]:
# Function to count differences between predicted and expected results
def evaluate(expected, actual):
  """
  expected : ground truth
  actual : what we get
  """
  return len([i for i in expected if i not in actual])


## Execution

 Calculating the difference between the expected and predicted outputs

In [11]:
#Function to return a list of {missing tools/total expected tools} for each query
def fin(model : str , queries : typing.List):
  retriever = get_retriever(api_desc, model)
  ans = []
  for i in range(len(queries)):
    tools = get_tools(queries[i],api_desc,retriever)[0]
    ans.append(evaluate(expected[i],tools)/len(expected[i]))

  return ans

In [12]:
#List of all the models used for testing
all_models = ["text-embedding-ada-002",
"all-MiniLM-L6-v2",
"all-mpnet-base-v2",
"multi-qa-mpnet-base-dot-v1",
"all-distilroberta-v1",
"all-MiniLM-L12-v2",
"multi-qa-distilbert-cos-v1"]

In [13]:
#Empty Dataframe to store the results
df = pd.DataFrame(columns=['model name','outputs'])

for model in all_models:
  exp = fin(model,queries)
  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)



  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)
  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)
  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)


.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)


.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.47k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  df = df.append({'model name':model,"outputs":exp}, ignore_index=True)


In [14]:
#Saving the Result to a excel file
df.to_excel('lang_retriever_results7.xlsx')