# Create a Research Agent

This is a concept agent inspired by my own process of researching a subject. <br>
I also took some inspiration from the BabyAGI implementation (without tools).

In [2]:
from dotenv import load_dotenv
load_dotenv('./.env') 

## Basic imports.
import os
import sys
import glob
import time
from pathlib import Path
import pandas as pd
import numpy as np
import uuid
# sys.path.append('')

In [3]:
import os
# from collections import deque
# from typing import Dict, List, Optional, Any

# from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import BaseLLM
# from langchain.vectorstores.base import VectorStore
from pydantic import BaseModel, Field
from langchain.chains.base import Chain
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


# Model identifiers referenced by the rest of the notebook.
gpt3t, gpt4 = "gpt-3.5-turbo", "gpt-4"
text_embedding_model = "text-embedding-ada-002"

# Notebook-wide defaults (the agent cell rebinds `verbose` before its run).
temperature, verbose = 0, True

# Shared clients: the default chat model and the embedding model handed to the vector stores.
llm = ChatOpenAI(temperature=temperature, model_name=gpt3t)
embeddings = OpenAIEmbeddings(model=text_embedding_model)


## Set up the vector store

In [4]:
from urllib.parse import quote

# True -> local Postgres on localhost; False -> the hosted Supabase Postgres.
use_localdb = True


def _postgres_url(user: str, password: str, host: str, database: str) -> str:
    """Build a ``postgresql://`` URL (port 5432) with percent-encoded credentials.

    Interpolating the raw user name / password breaks URL parsing as soon as
    either contains a reserved character such as ``@``, ``:``, ``/`` or ``#``.
    NOTE(review): this assumes the values in ``.env`` are stored un-encoded;
    if they were already percent-encoded by hand, store them raw instead.
    """
    safe_user = quote(user, safe="")
    safe_password = quote(password, safe="")
    return f"postgresql://{safe_user}:{safe_password}@{host}:5432/{database}"


SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
SUPABASE_DBUSER = os.environ['SUPABASE_DBUSER']
SUPABASE_DATABASE = os.environ['SUPABASE_DATABASE']
supabasedb_string = _postgres_url(
    SUPABASE_DBUSER,
    SUPABASE_PASSWORD,
    "db.doxggeyqopdnxfhseufq.supabase.co",
    SUPABASE_DATABASE,
)

PGVECTOR_USER = os.environ['PGVECTOR_USER']
PGVECTOR_PASSWORD = os.environ['PGVECTOR_PASSWORD']
PGVECTOR_DATABASE = os.environ['PGVECTOR_DATABASE']
localdb_string = _postgres_url(
    PGVECTOR_USER,
    PGVECTOR_PASSWORD,
    "localhost",
    PGVECTOR_DATABASE,
)

# The connection string actually handed to the vector store.
connection_string = localdb_string if use_localdb else supabasedb_string

### Main text store

In [5]:
from langchain.vectorstores import PGVector

# PGVector collection that the QA step retrieves its context from.
combined_text_store = PGVector(
    connection_string=connection_string,
    embedding_function=embeddings,
    collection_name='mahabharat_combined_text',
)
# Sanity check that the store object was constructed.
print(type(combined_text_store))

<class 'langchain.vectorstores.pgvector.PGVector'>


### Supabase vector store for storing runs

The Supabase client here is not used as a vector store. 
I am only using it to save the run data. 
You can remove it if you don't need it. 

In [6]:
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore

# Fail fast with a KeyError naming the missing variable (the same convention the
# database settings use) instead of passing None on to create_client.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase: Client = create_client(supabase_url, supabase_key)

# Used at the end of the agent cell to persist each run via add_texts.
runs_store = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    table_name='runs',
    query_name="match_runs",
)


In [7]:
# ## Testing store
# run_id = str(uuid.uuid4())
# runs_store.add_texts(texts=["testing the store"], metadatas=[{"key": "value"}], ids=[run_id])
# matched_docs = runs_store.similarity_search_with_relevance_scores("testing store", 1)
# matched_docs


#### Test Question

In [8]:

# Sample scratchpad for trying out the individual chains in the (commented-out)
# test cells; the agent cell defines its own scratchpad before it runs.
scratchpad = dict(
    original_question="Why was Arjun not present in the battle when Abhimanyu was slain?",
    unanswered_questions={},
    answerpad=[],
    current_question='',
    current_answer="Based on the given context, it is not explicitly mentioned why Arjuna was absent during the battle when Abhimanyu was slain. The context only provides information about Arjuna's grief and anger upon learning about his son's death. Arjuna expresses his disappointment and blames his allies, the Panchalas, for not protecting Abhimanyu. He questions their manliness and prowess, suggesting that they failed to fulfill their promise of protecting his son.\n\nArjuna's absence during the battle could be due to various reasons, but without further information, it is difficult to determine the exact reason.",
    notes=[],
    current_documents=[],
)

### Helper Function for converting question string to list of questions

In [9]:
def split2Dict(question):
    """Turn one ``"<id>. <text>"`` line into a single-entry ``{id: text}`` dict.

    Only the first ``.`` acts as the separator, so periods inside the question
    text are kept. Both parts are stripped of surrounding whitespace. A line
    that contains no ``.`` maps the whole stripped line to itself.
    """
    parts = question.strip().split(".", 1)
    return {parts[0].strip(): parts[-1].strip()}


def result2DictOfQuestions(result: str):
    """Parse a newline-separated numbered list into an ``{id: question}`` dict.

    Blank or whitespace-only lines are skipped so that empty lines in the
    model's reply (or an empty reply) do not add a bogus ``{"": ""}`` entry.
    When the same id appears more than once, the last occurrence wins.
    """
    qdict = {}
    for line in result.split("\n"):
        if not line.strip():
            continue
        qdict.update(split2Dict(line))
    return qdict


## Question creator

Idea: Generate new questions based on 
- `question` - Original question. This is important so that the pertinence to the original question is always maintained. Or else the context can diverge quickly into impertinent results.
- `unanswered_questions`: So that the new questions do not overlap with the old ones.
- `context`: The document found during the current run. So the questions are derived from the fresh information based on the current question being asked. 
- `num_questions`: Configurable hyper parameter.
- `start_id`: This is passed so that the ids of the newly generated questions do not overlap with the current list. 

In [10]:

class QuestionCreationChain(LLMChain):
    """Chain that generates follow-up research questions for the original query."""

    @classmethod
    def from_llm(cls, llm: BaseLLM, verbose: bool = True) -> LLMChain:
        """Build the chain around the question-generation prompt.

        Args:
            llm: Language model the chain runs the prompt against.
            verbose: Forwarded to the chain constructor.

        Returns:
            A chain whose prompt expects the inputs ``question``, ``context``,
            ``unanswered_questions``, ``num_questions`` and ``start_id``.
        """
        # The reply is parsed line by line as "<id>. <question>", so the prompt
        # asks for a numbered list only; the former "comma separated list"
        # instruction contradicted that format and has been dropped.
        task_creation_template = (
            "You are a research agent who is provided with a user query and some context"
            " User query: {question}"
            " Context: {context}"
            " Your task is to ask questions which can help your team research on the users query"
            " These are previously asked unanswered questions: {unanswered_questions}."
            " You can ask only upto {num_questions} new questions"
            " The new questions should have no overlap with the previously unanswered questions."
            " Format your response as a numbered list of questions, like:"
            " #. First question"
            " #. Second question"
            " Start the list with number {start_id}"
        )
        prompt = PromptTemplate(
            template=task_creation_template,
            input_variables=[
                "question",
                "context",
                "unanswered_questions",
                "num_questions",
                "start_id",
            ],
        )
        return cls(prompt=prompt, llm=llm, verbose=verbose)


#### Testing the question creation agent

In [11]:
# question_creation_chain = QuestionCreationChain.from_llm(llm, verbose=False)

# question_response = question_creation_chain.run(
#         question=scratchpad['original_question'],
#         context=scratchpad['current_answer'],
#         unanswered_questions=scratchpad['unanswered_questions'],
#         num_questions = 4,
#         start_id=5,
#     )

# scratchpad['unanswered_questions'] = {**scratchpad['unanswered_questions'], **result2DictOfQuestions(result = question_response)}

# print(scratchpad['original_question'], "\n")
# print(scratchpad['unanswered_questions'], "\n")


## Most Pertinent Question Picker

Pick the most pertinent question out of the given list of questions <br>
I am not using any additional context other than the `original_question` for deciding the pertinence.

In [12]:
class MostPertinentQuestion(LLMChain):
    """Chain that selects, from a list of candidate questions, the single one
    most pertinent to the original question."""

    @classmethod
    def from_llm(cls, llm: BaseLLM, verbose: bool = True) -> LLMChain:
        """Build the chain; its prompt takes ``unanswered_questions`` and ``original_question``."""
        instruction_parts = (
            "You are provided with the following list of questions:",
            " {unanswered_questions} \n",
            " Your task is to find one question from the given list",
            " that is the most pertinent to the following query",
            " {original_question} \n",
            " Respond with one question out of the provided list of questions",
            " Return the questions as it is without any edits",
            " Format your response like:",
            " #. question",
        )
        return cls(
            prompt=PromptTemplate(
                input_variables=["unanswered_questions", "original_question"],
                template="".join(instruction_parts),
            ),
            llm=llm,
            verbose=verbose,
        )


#### Testing the 'most pertinent question' chain

In [13]:

# print("\033[93m\033[1m", ",\n".join(f"'{key}. {value}'" for key, value in scratchpad['unanswered_questions'].items()), "\033[0m\033[0m")

# most_pertinent_question = MostPertinentQuestion.from_llm(llm = ChatOpenAI(model_name=gpt3t, temperature=0.5), verbose = False)
# response = most_pertinent_question.run(
#         original_question=scratchpad['original_question'],
#         unanswered_questions="\n".join(f"'{key}. {value}'," for key, value in scratchpad['unanswered_questions'].items()),
#     )

# next_question = response

# print(scratchpad['original_question'])
# print("\033[92m\033[1m", next_question, "\033[0m\033[0m")


## Retrieval QA

This chain is used to answer the intermediate questions. The idea is to generate succinct answers which can be used as notes to finally answer the original question

In [14]:
def qa(llm, store: PGVector, question: str, verbose: bool = True):
    """Answer one question from documents retrieved out of ``store``.

    Builds a "stuff" RetrievalQA chain with a prompt that restricts the model
    to the retrieved context and to fewer than 200 words, then runs it on
    ``question``.

    Args:
        llm: Language model used to write the answer.
        store: Vector store whose retriever supplies the context documents.
        question: The question to answer.
        verbose: Forwarded to the RetrievalQA chain.

    Returns:
        A ``(answer, source_documents)`` tuple taken from the chain result's
        ``'result'`` and ``'source_documents'`` entries.
    """
    prompt_template = (
    "Use the following pieces of context" 
    " Context:"
    " {context}"
    " Your objective is to answer the following question"
    " Question:"
    " {question}"
    " Answer based only on the context and no other previous knowledge"
    " don't try to make up an answer."
    " If you don't know the answer, just say that you don't know,"
    " Answer in less than 200 words."
    " Answer :")

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm, 
        chain_type="stuff", 
        # Retrieve from the store the caller passed in; the parameter used to be
        # ignored in favour of the global `combined_text_store`.
        retriever=store.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
        verbose = verbose,
    )

    result = qa_chain({"query": question})
    return result['result'], result['source_documents']



#### Test the qa chain

In [15]:
# r, d = qa(
#     llm = ChatOpenAI(model_name=gpt3t, temperature=0.1),
#     store = combined_text_store,
#     question = scratchpad['original_question'], 
#     verbose = verbose)


## Result Analyser

Not using this right now. I am not able to get this piece working well. So currently I will just run the agent for a fixed number of iterations and then compile the answer. 

In [16]:

# def analyser(llm, original_question: str, answered_questions: str, unanswered_questions: str, verbose: bool = True):
#     prompt_template = (
#     "You are a research assistant provided with the users question," 
#     " previously answered questions, and some yet unanswered questions."
#     " users question:"
#     " {original_question} \n"
#     " answered questions:"
#     " {answered_questions} \n"
#     " unanswered questions:"
#     " {unanswered_questions} \n"
#     " Your task is to decide if the users question can be aptly "
#     " answered based on the answered to the previously answered questions."
#     " If you think the answers to the answered questions are enough to answer the users question," 
#     " and unanswered questions are not necessary, then answer with a 'Yes'. \n"
#     " Otherwise answer with a 'No'."
#     " Answer only in a 'Yes' or a 'No'."
#     )

#     PROMPT = PromptTemplate(
#         template=prompt_template, input_variables=["original_question", "answered_questions", "unanswered_questions"]
#     )

#     chain = LLMChain(
#         llm=llm,
#         prompt=PROMPT,
#     )

#     result = chain({
#         "original_question": original_question, 
#         "answered_questions": answered_questions, 
#         "unanswered_questions": unanswered_questions})
        
#     return result

#### Test the Analyser

In [17]:
# result = analyser(
#     llm = ChatOpenAI(model_name=gpt3t, temperature=0.1),
#     original_question = scratchpad['original_question'], 
#     answered_questions = scratchpad['answered_questions'], 
#     unanswered_questions = scratchpad['unanswered_questions'],
#     verbose = verbose)

# result

## Research Compiler

In [18]:
from langchain.chains import LLMChain

def research_compiler(llm, question: str, context: str, prev_answer: str, verbose: bool = True):
    """Compile the collected research into one final answer.

    Args:
        llm: Language model used to write the final answer.
        question: The user's original question.
        context: Research notes interpolated into the prompt as context.
        prev_answer: Earlier (possibly incomplete) answer(s) interpolated into
            the prompt.
        verbose: Forwarded to the LLMChain.

    Returns:
        The result of calling the chain. This notebook reads its ``'text'``
        (the answer) and ``'context'`` entries.
    """
    prompt_template = (
    "Your task is to answer the users question"
    " Question: {question} \n"
    " You can use previously found incomplete answers (only if needed)"
    " Previous Answer: {prev_answer} \n"
    " Use the following context to create a complete and elaborate answer." 
    " Context: {context} \n"
    " The context includes answers to several similar questions"
    " Give an elaborate answer. don't try to make up an answer."
    " If you don't know the answer, just say that you don't know,"
    " Answer only based on the given information, and no other prior knowledge."
    " Answer :")

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question", "prev_answer"]
    )

    chain = LLMChain(
        llm=llm,
        prompt=PROMPT,
        # `verbose` was accepted but never used; pass it on like qa() does.
        verbose=verbose,
    )

    result = chain({"question": question, "context": context, "prev_answer": prev_answer})
    return result

#### Test the research Compiler

In [19]:
# result = research_compiler(
#     llm = ChatOpenAI(model_name=gpt3t, temperature=0.1),
#     question = scratchpad['original_question'], 
#     context = scratchpad['notes'], 
#     prev_answer = scratchpad['answerpad'],
#     verbose = verbose)

# The Research Agent

This is the final research Agent. 

In [22]:
## Identifier under which this run is saved at the end of the cell.
run_id = str(uuid.uuid4())

## Scratchpad: all state accumulated over the run.
scratchpad = dict(
    original_question="Why did the pandavas had to go live in the forest for 12 years?",
    unanswered_questions={},
    answerpad=[],
    notes=[],
    current_documents=[],
    documents=[],
    answered_questions=[],
)

## Loop state
first_run = True
current_iteration = 0
current_question_id = None

## Run settings
verbose = False
max_iterations = 6  # the loop stops once current_iteration reaches this value
num_questions_per_iteration = 3
store = combined_text_store

## Sampling temperatures for the individual steps
question_creation_temperature = question_prioritisation_temperature = 0.5
analyser_temperature = 0
## Chat-model factory
def model(model_name: str = gpt3t, temperature: float = 0):
    """Return a new ChatOpenAI client for the given model name and temperature."""
    return ChatOpenAI(temperature=temperature, model_name=model_name)


## ---- The researcher -----
def _parse_picked_question(response: str):
    """Split the picker's reply into ``(question_id, question_text)``.

    The candidates are sent to the model as ``'<id>. <question>',`` lines and the
    model is told to echo one of them unchanged, so the reply may still carry the
    wrapping quotes and the trailing comma. Those are removed before splitting on
    the first ``.``. A reply without any ``.`` yields the whole text for both parts.
    """
    cleaned = response.strip().rstrip(",").strip()
    if len(cleaned) >= 2 and cleaned[0] == cleaned[-1] and cleaned[0] in "'\"":
        cleaned = cleaned[1:-1]
    parts = cleaned.split('.', 1)
    return parts[0].strip(), parts[-1].strip()


## The LLM client and the helper chains are the same on every pass, so build them once.
answer_llm = model(temperature=0)
question_creation_chain = QuestionCreationChain.from_llm(
    llm = model(temperature=question_creation_temperature),
    verbose = verbose
    )
find_next_question = MostPertinentQuestion.from_llm(
    llm = model(temperature=question_prioritisation_temperature),
    verbose = verbose
    )

while True:
    current_iteration += 1
    # NOTE(review): one-off 10 s pause before the second pass; presumably a
    # rate-limit workaround - confirm whether it is still needed.
    if current_iteration == 2:
        time.sleep(10)

    print("\033[91m\033[1m" + f"\n ------  Iteration {current_iteration} ------- \n" + "\033[0m\033[0m")
    # The check runs after the increment, so at most max_iterations - 1 research passes execute.
    if current_iteration >= max_iterations:
        print("\033[91m\033[1m" + f"\n ------ Max iterations reached." + "\033[0m\033[0m")
        break

    ## 1. Run the qa chain on the current question
    ## (the first pass asks the original question itself)
    if first_run:
        current_question = scratchpad['original_question']

    current_answer, current_documents = qa(
        llm = answer_llm,
        store = combined_text_store,
        question = current_question,
        verbose = verbose)

    if first_run:
        # The direct answer to the original question goes to the answerpad ...
        scratchpad['answerpad'] += [current_answer]
        first_run = False
    else:
        # ... answers to follow-up questions are kept as notes.
        scratchpad['notes'] += [{'question': current_question, 'answer': current_answer}]

    scratchpad['current_documents'] = current_documents
    scratchpad['documents'] += current_documents
    print(
        "\033[36;2m",
        f"\nCurrent Question: {current_question}\n",
        f"Current Answer: {current_answer} \n",
        "\033[0m\033[0m")

    ## 2. Ask more questions, using the documents retrieved for the current question as context
    ## ----
    start_id = (current_iteration-1)*num_questions_per_iteration + 1
    question_creation_response = question_creation_chain.run(
        question=scratchpad['original_question'],
        context="\n".join(doc.page_content for doc in scratchpad['current_documents']),
        unanswered_questions=scratchpad['unanswered_questions'],
        num_questions = num_questions_per_iteration,
        start_id=start_id,
    )
    scratchpad['unanswered_questions'] = {
        **scratchpad['unanswered_questions'],
        **result2DictOfQuestions(result = question_creation_response)
        }

    if current_question_id:
        # Remove the current question only after generating the new questions so that
        # the current question is not regenerated. If the model returned an id that is
        # not in the dict, fall back to the question text instead of raising KeyError.
        answered_question = scratchpad['unanswered_questions'].pop(current_question_id, current_question)
        scratchpad['answered_questions'] += [answered_question]

    print(
        "\033[35;3m",
        "** Unanswered Questions **\n",
        "\n".join(f"'{key}. {value}'" for key, value in scratchpad['unanswered_questions'].items()),
        "\033[32;5m",
        "\n** Answered Questions **\n",
        "\n".join(scratchpad['answered_questions']),
        "\033[0m\033[0m")
    ### --- ###

    ## 3. Find the most pertinent question to ask next
    ## ----
    next_question_response = find_next_question.run(
        original_question=scratchpad['original_question'],
        unanswered_questions="\n".join(f"'{key}. {value}'," for key, value in scratchpad['unanswered_questions'].items()),
    )

    # Keep the question id so the question can be removed from the unanswered
    # list in the next iteration.
    current_question_id, current_question = _parse_picked_question(next_question_response)
    print(
        "\033[93m\033[1m",
        f"\nNext Question I need to ask: {current_question}\nQuestion Id: {current_question_id}",
        "\033[0m\033[0m")
    ### --- ###



## ----- Compile the final answer -----
## The original question, the notes collected for the follow-up questions and the
## first-pass answer from the answerpad are handed to the research compiler.
compiler_inputs = dict(
    question=scratchpad['original_question'],
    context=scratchpad['notes'],
    prev_answer=scratchpad['answerpad'],
)
final_answer = research_compiler(
    llm=model(temperature=analyser_temperature),
    verbose=verbose,
    **compiler_inputs,
)

print("\033[1;37m" + f"\nFinal Answer: \n {final_answer['text']}\n" + "\033[0m\033[0m")

## ----- Persist the run -----
## Optional: comment this out if you do not want the run stored.
run_metadata = {
    'docs_metadata': list(map(lambda document: document.metadata, scratchpad['documents'])),
    'answer': final_answer['text'],
    'context': final_answer['context'],
}
runs_store.add_texts(
    texts=[scratchpad['original_question']],
    metadatas=[run_metadata],
    ids=[run_id],
)
    

[91m[1m
 ------  Iteration 1 ------- 
[0m[0m
[36;2m 
Current Question: Why did the pandavas had to go live in the forest for 12 years?
 Current Answer: The Pandavas had to go live in the forest for 12 years because they lost their kingdom in a gambling match and were exiled as a result. They were forced to spend their time in the forest as part of their punishment. Bhima, one of the Pandavas, found it difficult to tolerate the ascetic life in the forest and contemplated killing all of the Kurus, but his brother Yudhisthira always pacified him and advised against it. During their time in the forest, the Pandavas received knowledge from sages and saintly persons. The great sage Vyasa informed them that many generals had agreed to support their cousin Duryodhana, including Karna, Shakuni, Bhurishravas, Sala, Drona, and Bhishma. Vyasa also told Arjuna that if he were to fight in a future war, he would need to obtain the divine astras of Lord Indra and Lord Shiva. After Vyasa's departu

2023-08-28 10:25:03,485:INFO - HTTP Request: POST https://doxggeyqopdnxfhseufq.supabase.co/rest/v1/runs "HTTP/1.1 201 Created"


['cf1a07da-4174-4a56-aba8-291060845771']