# Create a Research Agent


This is a concept agent inspired by my own process of researching a subject. <br>
I also took some inspiration from the BabyAGI (without tools) implementation.


In [1]:
from dotenv import load_dotenv

load_dotenv("./.env")
import uuid
import os
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from yachalk import chalk

In [2]:
# Name of the OpenAI embedding model; `embeddings` is the shared embedding
# function handed to the vector stores created later in this notebook.
text_embedding_model = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=text_embedding_model)

## Set up the vector store


In [3]:
from urllib.parse import quote

# Toggle between the local pgvector database and the hosted Supabase database.
use_localdb = True


def _url_credential(value: str) -> str:
    """Percent-encode a user name or password for embedding in a database URL.

    Characters such as "@", ":" and "/" would otherwise be read as URL
    delimiters and corrupt the connection string. ``safe=""`` makes sure "/"
    is encoded too, and a space becomes "%20" rather than "+".
    The environment values are expected to be stored raw (not pre-encoded).
    """
    return quote(value, safe="")


# Credentials for the hosted Supabase Postgres instance.
SUPABASE_PASSWORD = os.environ["SUPABASE_PASSWORD"]
SUPABASE_DBUSER = os.environ["SUPABASE_DBUSER"]
SUPABASE_DATABASE = os.environ["SUPABASE_DATABASE"]
supabasedb_string = (
    f"postgresql://{_url_credential(SUPABASE_DBUSER)}:{_url_credential(SUPABASE_PASSWORD)}"
    f"@db.doxggeyqopdnxfhseufq.supabase.co:5432/{SUPABASE_DATABASE}"
)

# Credentials for the local Postgres instance with the pgvector extension.
PGVECTOR_USER = os.environ["PGVECTOR_USER"]
PGVECTOR_PASSWORD = os.environ["PGVECTOR_PASSWORD"]
PGVECTOR_DATABASE = os.environ["PGVECTOR_DATABASE"]
localdb_string = (
    f"postgresql://{_url_credential(PGVECTOR_USER)}:{_url_credential(PGVECTOR_PASSWORD)}"
    f"@localhost:5432/{PGVECTOR_DATABASE}"
)

# The URL actually used by the vector store.
connection_string = localdb_string if use_localdb else supabasedb_string

### Main text store


In [4]:
from langchain.vectorstores import PGVector

# Main text store: a PGVector collection reached through `connection_string`
# and embedded with the shared `embeddings` function.
combined_text_store = PGVector(
    collection_name="mahabharat_combined_text",
    connection_string=connection_string,
    embedding_function=embeddings,
)
# Print the object's type as a quick check that the store was constructed.
print(type(combined_text_store))

<class 'langchain.vectorstores.pgvector.PGVector'>


### Supabase vector store for storing runs

The supabase client here is not used as a vector store.
I am only using it to save the runs data.
You can remove it if you don't need it.


In [5]:
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore

# Read the Supabase settings with `os.environ[...]`, like the database
# credentials earlier in the notebook, so a missing variable fails immediately
# with a KeyError naming it instead of passing None into `create_client`.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase: Client = create_client(supabase_url, supabase_key)

# Store backed by the "runs" table and the "match_runs" query.
runs_store = SupabaseVectorStore(
    embedding=embeddings, client=supabase, table_name="runs", query_name="match_runs"
)

# ## Testing store
# run_id = str(uuid.uuid4())
# runs_store.add_texts(texts=["testing the store"], metadatas=[{"key": "value"}], ids=[run_id])
# matched_docs = runs_store.similarity_search_with_relevance_scores("testing store", 1)
# matched_docs

### Helper Function for converting question string to list of questions


In [6]:
def split2Dict(question):
    """Parse one numbered line such as ``"3. Who is X?"`` into ``{3: "Who is X?"}``.

    Only the first "." separates the number from the text, so dots inside the
    question itself are preserved.

    Raises:
        ValueError: if the text before the first "." is not an integer.
    """
    parts = question.strip().split(".", 1)
    return {int(parts[0]): parts[-1].strip()}


def result2DictOfQuestions(result: str):
    """Convert a newline-separated list of numbered questions into a dict.

    Args:
        result: text with one ``"<number>. <question>"`` entry per line.

    Returns:
        A dict mapping each question number to its text. If a number occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not start with an integer.
    """
    qdict = {}
    for line in result.split("\n"):
        # Blank lines (including the one produced by a trailing newline or an
        # empty response) carry no question; skip them instead of failing in
        # int("").
        if not line.strip():
            continue
        qdict.update(split2Dict(line))
    return qdict

## Import Chains


#### Question creator

Generate new questions based on

-   `question` - Original question. This is important so that the pertinence to the original question is always maintained. Or else the context can diverge quickly into impertinent results.
-   `unanswered_questions`: So that the new questions do not overlap with the old ones.
-   `context`: The document found during the current run. So the questions are derived from the fresh information based on the current question being asked.
-   `num_questions`: Configurable hyper parameter.
-   `start_id`: This is passed so that the ids of the newly generated questions do not overlap with the current list.

---

#### Most pertinent Question chain

Pick the most pertinent question out of the given list of questions <br>
I am not using any additional context other than the `original_question` for deciding the pertinence.

---

#### Retrieval QA

This chain is used to answer the intermediate questions. The idea is to generate succinct answers which can be used as notes to finally answer the original question

---

#### Result Analyser

Not using this right now. I am not able to get this piece working well. So currently I will just run the agent for a fixed number of iterations and then compile the answer.


In [7]:
## Import all the chains.
from chains_v2.create_questions import QuestionCreationChain
from chains_v2.most_pertinent_question import MostPertinentQuestion
from chains_v2.retrieval_qa import retrieval_qa
from chains_v2.research_compiler import research_compiler
from chains_v2.question_atomizer import QuestionAtomizer

## Model with parameters
gpt3t = "gpt-3.5-turbo"


def language_model(
    model_name: str = gpt3t, temperature: float = 0, verbose: bool = False
):
    """Build a ``ChatOpenAI`` instance from the given model name, temperature and verbosity."""
    return ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
        verbose=verbose,
    )


from helpers.response_helpers import result2QuestionsList
from helpers.response_helpers import qStr2Dict

# The Research Agent


In [8]:
def getAnsweredQuestions(questions):
    """Return the questions whose ``status`` is ``"answered"``."""
    return [q for q in questions if q["status"] == "answered"]


def getUnansweredQuestions(questions):
    """Return the questions whose ``status`` is ``"unanswered"``."""
    return [q for q in questions if q["status"] == "unanswered"]


def getSubquestions(questions):
    """Return the questions whose ``type`` is ``"subquestion"``."""
    return [q for q in questions if q["type"] == "subquestion"]


def getHopquestions(questions):
    """Return the questions whose ``type`` is ``"hop"``."""
    return [q for q in questions if q["type"] == "hop"]


def getLastQuestionId(questions):
    """Return the largest ``id`` among *questions*, or 0 when the list is empty.

    Returning 0 for an empty list lets callers compute the next id as
    ``getLastQuestionId(...) + 1`` without a special case.
    """
    return max((q["id"] for q in questions), default=0)


def markAnswered(questions, id: int):
    """Set ``status`` to ``"answered"`` on every question whose ``id`` equals *id*.

    The question dicts are modified in place; nothing is returned.
    """
    matching = (entry for entry in questions if entry["id"] == id)
    for entry in matching:
        entry["status"] = "answered"


def getQuestionById(questions, id: int):
    """Return the first question whose ``id`` equals *id*, or ``None`` if there is none."""
    matches = [entry for entry in questions if entry["id"] == id]
    return matches[0] if matches else None

Question Schema

```
  Question: {
    id: int,
    question: string,
    type: 'subquestion' | 'hop',
    status: 'answered' | 'unanswered',
    answer: string,
    documents: []
  }
```


In [9]:
run_id = str(uuid.uuid4())
scratchpad = {
    # NOTE(review): "hair" looks like a typo for "heir". Left untouched because
    # this string is sent verbatim to the chains; confirm before changing.
    "original_question": "Do you think Duryodhana was the rightful hair of the Hastinapur Throne?",
    "questions": [],  # list of type Question
    "answerpad": [],
}

## ---- Run configuration ----- ##
verbose = False
max_iterations = 4
current_iteration = 0
num_atomistic_questions = 3
num_questions_per_iteration = 3
question_atomizer_temperature = 0
question_creation_temperature = 0.2
question_prioritisation_temperature = 0
qa_temperature = 0
analyser_temperature = 0
intermediate_answers_length = 300
store = combined_text_store


def _answer_with_retrieval(question_text):
    """Run retrieval QA for *question_text* against ``store``.

    Returns the result of ``retrieval_qa`` unchanged; the call sites unpack it
    as an ``(answer, documents)`` pair. Shared by the sub-question step and
    the per-iteration step so both use the same retriever settings and the
    configured answer length.
    """
    return retrieval_qa(
        llm=language_model(temperature=qa_temperature, verbose=verbose),
        retriever=store.as_retriever(
            search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10}
        ),
        question=question_text,
        answer_length=intermediate_answers_length,
        verbose=verbose,
    )


## ---- The researcher ----- ##

## Step 0. Prepare the initial set of questions
question_atomizer = QuestionAtomizer.from_llm(
    llm=language_model(temperature=question_atomizer_temperature), verbose=verbose
)

atomized_questions_response = question_atomizer.run(
    question=scratchpad["original_question"],
    num_questions=num_atomistic_questions,
)

scratchpad["questions"] += result2QuestionsList(
    question_response=atomized_questions_response,
    type="subquestion",
    status="unanswered",
)

# Answer every atomised sub-question up front; the answers seed the context
# used to generate the first batch of follow-up questions.
for q in scratchpad["questions"]:
    q["answer"], q["documents"] = _answer_with_retrieval(q["question"])
    q["status"] = "answered"

current_context = "".join(
    f"\n{q['id']}. {q['question']}\n{q['answer']}\n" for q in scratchpad["questions"]
)

print(
    chalk.blue_bright("** Atomised Sub Questions **"),
    chalk.blue(current_context),
)

# Shallow copy: `scratchpad_temp["questions"]` is the SAME list object as
# `scratchpad["questions"]`, so questions added or answered in the loop are
# visible through both names.
scratchpad_temp = scratchpad.copy()
while current_iteration < max_iterations:
    current_iteration += 1

    # STEP 1: create questions
    start_id = getLastQuestionId(scratchpad_temp["questions"]) + 1
    question_creation_chain = QuestionCreationChain.from_llm(
        language_model(temperature=question_creation_temperature), verbose=verbose
    )
    questions_response = question_creation_chain.run(
        question=scratchpad_temp["original_question"],
        context=current_context,
        previous_questions=[
            f"\n{q['question']}" for q in scratchpad_temp["questions"]
        ],
        # Driven by the configuration value instead of a hard-coded literal.
        num_questions=num_questions_per_iteration,
        start_id=start_id,
    )
    scratchpad_temp["questions"] += result2QuestionsList(
        question_response=questions_response,
        type="hop",
        status="unanswered",
    )

    # STEP 2: Choose question for current iteration
    unanswered = getUnansweredQuestions(scratchpad_temp["questions"])
    unanswered_questions_prompt = (
        "[" + "".join([f"\n{q['id']}. {q['question']}" for q in unanswered]) + "]"
    )
    print(
        chalk.cyan_bright("** Unanswered Questions **"),
        chalk.cyan("".join([f"\n'{q['id']}. {q['question']}'" for q in unanswered])),
    )
    # Use the temperature configured for prioritisation, not the
    # question-creation temperature.
    most_pertinent_question = MostPertinentQuestion.from_llm(
        language_model(temperature=question_prioritisation_temperature),
        verbose=verbose,
    )
    response = most_pertinent_question.run(
        original_question=scratchpad_temp["original_question"],
        unanswered_questions=unanswered_questions_prompt,
    )

    current_question_dict = qStr2Dict(question=response)
    current_question_id = current_question_dict["id"]

    current_question = getQuestionById(
        scratchpad_temp["questions"], current_question_id
    )
    if current_question is None:
        # The model returned an id that is not in the question list; skip this
        # iteration rather than failing on `None["question"]`.
        print(
            chalk.red(
                f"** No question with id {current_question_id!r}; skipping iteration **"
            )
        )
        continue

    print(
        chalk.magenta.bold("** Next Questions **\n"),
        chalk.magenta(current_question_id),
        chalk.magenta(current_question["question"]),
    )

    # STEP 3: Answer the question
    current_question["answer"], current_question["documents"] = _answer_with_retrieval(
        current_question["question"]
    )
    markAnswered(scratchpad_temp["questions"], current_question_id)

    print(
        chalk.yellow_bright.bold("** Answer **\n"),
        chalk.yellow_bright(current_question["answer"]),
    )

    # The next batch of questions is generated from the latest answer only.
    current_context = current_question["answer"]


## STEP 4: Compile the research
answered_questions = getAnsweredQuestions(scratchpad["questions"])
answered_questions_prompt = "".join(
    [f"{q['question']}\n{q['answer']}\n\n" for q in answered_questions]
)
result = research_compiler(
    llm=language_model(temperature=qa_temperature, verbose=verbose),
    question=scratchpad["original_question"],
    notes=answered_questions_prompt,
    verbose=True,
)

print(chalk.white.bold(result["text"]))

# current_question = response
## Collect initial document with semantic search.

qStr2Dict : 1. Who is Duryodhana?
qStr2Dict : 2. What is the Hastinapur Throne?
qStr2Dict : 3. Was Duryodhana the rightful heir of the Hastinapur Throne?
[94m** Atomised Sub Questions **[39m [34m[39m
[34m1. Who is Duryodhana?[39m
[34mDuryodhana is a character in the context provided who is described as a sinful person and a warrior. He is shown to possess skill and is engaged in a battle against the Pandavas. Duryodhana is portrayed as being possessed by greed and having a strong desire to gain the throne and become king. He is depicted as being arrogant and having no appreciation for the spiritual qualities of the Pandavas. Duryodhana is also shown to have committed offenses and sins, such as trying to poison Bhima, stealing away the Pandavas' kingdom, and offending Draupadi. Due to his actions, Duryodhana is cursed by Maitreya Rishi to die by Bhima's mace. Despite appearing to have won sovereignty, it is mentioned that Duryodhana's plant of sin will soon result in suffering wh