# Create a Research Agent

This is a concept agent inspired by my own process for researching a subject. <br>
I also took some inspiration from the BabyAGI (without tools) implementation.

In [1]:
from dotenv import load_dotenv
load_dotenv('./.env') 
import uuid
import os
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate


In [2]:
# OpenAI model identifiers used by this notebook.
# `gpt3t` is the default model of `language_model()`; `gpt4` is kept available
# as an alternative model name.
text_embedding_model = "text-embedding-ada-002"
gpt3t = "gpt-3.5-turbo"
gpt4 = "gpt-4"

# Embedding client configured with `text_embedding_model`; it is handed to the
# vector stores created later in the notebook.
embeddings = OpenAIEmbeddings(model=text_embedding_model)


## Set up the vector store

In [3]:
# Toggle between the local pgvector database and the hosted Supabase database.
use_localdb = True


def _db_setting(name: str, required: bool):
    """Read one database setting from the environment.

    Args:
        name: Name of the environment variable to read.
        required: True when the variable belongs to the backend selected by
            ``use_localdb``.

    Returns:
        The variable's value, or ``None`` when it is unset and not required.

    Raises:
        KeyError: If ``required`` is true and the variable is unset.
    """
    return os.environ[name] if required else os.environ.get(name)


def _pg_url(user, password, host: str, database):
    """Build a ``postgresql://`` URL, or return ``None`` if a credential is missing.

    NOTE: the values are interpolated verbatim, so credentials containing
    URL-reserved characters (e.g. ``@`` or ``/``) must already be URL-encoded
    in the environment.
    """
    if None in (user, password, database):
        return None
    return f"postgresql://{user}:{password}@{host}:5432/{database}"


# Only the selected backend's variables are mandatory, so a local-only setup
# does not need Supabase credentials (and vice versa). All names stay defined.
SUPABASE_PASSWORD = _db_setting('SUPABASE_PASSWORD', required=not use_localdb)
SUPABASE_DBUSER = _db_setting('SUPABASE_DBUSER', required=not use_localdb)
SUPABASE_DATABASE = _db_setting('SUPABASE_DATABASE', required=not use_localdb)
supabasedb_string = _pg_url(
    SUPABASE_DBUSER,
    SUPABASE_PASSWORD,
    "db.doxggeyqopdnxfhseufq.supabase.co",
    SUPABASE_DATABASE,
)

PGVECTOR_USER = _db_setting('PGVECTOR_USER', required=use_localdb)
PGVECTOR_PASSWORD = _db_setting('PGVECTOR_PASSWORD', required=use_localdb)
PGVECTOR_DATABASE = _db_setting('PGVECTOR_DATABASE', required=use_localdb)
localdb_string = _pg_url(PGVECTOR_USER, PGVECTOR_PASSWORD, "localhost", PGVECTOR_DATABASE)

# Connection string consumed by the PGVector store.
connection_string = localdb_string if use_localdb else supabasedb_string

### Main text store

In [4]:
from langchain.vectorstores import PGVector

# Main text store: a PGVector store bound to the 'mahabharat_combined_text'
# collection in the database selected by `connection_string`, using
# `embeddings` as its embedding function.
combined_text_store = PGVector(
    collection_name='mahabharat_combined_text',
    connection_string=connection_string,
    embedding_function=embeddings,
)
# Sanity check: show the concrete class of the store that was created.
print(type(combined_text_store))

<class 'langchain.vectorstores.pgvector.PGVector'>


### Supabase vector store for storing runs

The supabase client here is not used as a vector store. 
I am only using it to save the runs data. 
You can remove it if you don't need it. 

In [5]:
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore

# Read the Supabase endpoint and service key. Indexing `os.environ` (instead of
# `.get`) raises a KeyError naming the missing variable right here, rather than
# passing None on to `create_client`.
supabase_url = os.environ["SUPABASE_URL"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY"]
supabase: Client = create_client(supabase_url, supabase_key)

# Store backed by the 'runs' table; used at the end of a run to save its data.
runs_store = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    table_name='runs',
    query_name="match_runs",
)

# ## Testing store
# run_id = str(uuid.uuid4())
# runs_store.add_texts(texts=["testing the store"], metadatas=[{"key": "value"}], ids=[run_id])
# matched_docs = runs_store.similarity_search_with_relevance_scores("testing store", 1)
# matched_docs



### Helper Function for converting question string to list of questions

In [6]:
def split2Dict(question):
    """Parse one ``"<id>. <question>"`` line into a single-entry dict.

    The line is split on its first ``"."`` only, so dots inside the question
    text are preserved. Surrounding whitespace is stripped from both parts.

    Args:
        question: One line of text, e.g. ``"3. Who were the Pandavas?"``.

    Returns:
        ``{id: question_text}`` for a well-formed line, or an empty dict when
        the line is blank, has no ``"."`` separator, or has an empty id or
        empty question text. Returning ``{}`` lets callers merge the result
        unconditionally without picking up bogus entries such as ``{"": ""}``.
    """
    question_id, separator, question_text = question.strip().partition(".")
    question_id = question_id.strip()
    question_text = question_text.strip()
    if not (separator and question_id and question_text):
        return {}
    return {question_id: question_text}


def result2DictOfQuestions(result: str):
    """Convert a newline-separated list of numbered questions into a dict.

    Each non-blank line is parsed with ``split2Dict`` and merged into the
    result; if the same id appears twice, the later line wins.

    Args:
        result: Text with one ``"<id>. <question>"`` entry per line.

    Returns:
        A dict built from the per-line results of ``split2Dict``.
    """
    qdict = {}
    for line in result.split("\n"):
        # Blank lines (e.g. a trailing newline) carry no question; skip them
        # so they cannot add an empty-key entry.
        if not line.strip():
            continue
        # In-place merge instead of rebuilding the whole dict on every line.
        qdict.update(split2Dict(line))
    return qdict


## Import Chains

#### Question creator
Generate new questions based on 
- `question` - Original question. This is important so that the pertinence to the original question is always maintained. Or else the context can diverge quickly into impertinent results.
- `unanswered_questions`: So that the new questions do not overlap with the old ones.
- `context`: The document found during the current run. So the questions are derived from the fresh information based on the current question being asked. 
- `num_questions`: Configurable hyper parameter.
- `start_id`: This is passed so that the ids of the newly generated questions do not overlap with the current list. 

---
#### Most pertinent Question chain
Pick the most pertinent question out of the given list of questions <br>
I am not using any additional context other than the `original_question` for deciding the pertinence.

---
#### Retrieval QA
This chain is used to answer the intermediate questions. The idea is to generate succinct answers which can be used as notes to finally answer the original question

--- 
#### Result Analyser
Not using this right now. I am not able to get this piece working well. So currently I will just run the agent for a fixed number of iterations and then compile the answer. 

In [7]:
## Import all the chains. 
from chains.create_questions import QuestionCreationChain 
from chains.most_pertinent_question import MostPertinentQuestion
from chains.retrieval_qa import retrieval_qa
from chains.research_compiler import research_compiler

## Model with parameters
def language_model(model_name: str = gpt3t, temperature: float = 0, verbose: bool = False):
    """Create a ChatOpenAI chat model with the given settings.

    Args:
        model_name: OpenAI chat model to use; defaults to ``gpt3t``.
        temperature: Sampling temperature passed to the model.
        verbose: Verbosity flag forwarded to the model. Previously this
            argument was accepted but silently ignored.

    Returns:
        The configured ``ChatOpenAI`` instance.
    """
    return ChatOpenAI(model_name=model_name, temperature=temperature, verbose=verbose)


# The Research Agent

This is the final research Agent. 

In [14]:
## First create a run id. 
run_id = str(uuid.uuid4())

## Define scratchpad for keeping the run data
scratchpad = {
    'original_question': "why did Mahabharata war happen?",
    'unanswered_questions': {},   # question id -> question text, not yet asked
    'answerpad': [],              # answer obtained for the original question
    'notes': [],                  # {'question', 'answer'} dicts for follow-up questions
    'current_documents': [],      # documents returned by the latest retrieval
    'documents': [],              # all documents retrieved during the run
    'answered_questions': [],     # text of follow-up questions already answered
}
verbose = False
first_run = True
max_iterations = 6
current_iteration = 0
num_questions_per_iteration = 3
question_creation_temperature = 0.2
question_prioritisation_temperature = 0.2
analyser_temperature = 0
store = combined_text_store
current_question_id = None
# The first pass answers the original question itself.
current_question = scratchpad['original_question']


## ---- The researcher -----
while True: 
    current_iteration += 1

    print("\033[91m\033[1m" + f"\n ------  Iteration {current_iteration} ------- \n" + "\033[0m\033[0m")
    # NOTE: this check runs before any work is done, so at most
    # `max_iterations - 1` research passes are executed.
    if current_iteration >= max_iterations:
        print("\033[91m\033[1m" + f"\n ------ Max iterations reached." + "\033[0m\033[0m")
        break
    
    ## 1. First run the retrieval_qa chain on current question
    ## The retrieval goes through `store`, so changing that one setting above
    ## is enough to point the agent at a different vector store.
    current_answer, current_documents = retrieval_qa(
        llm = language_model(temperature=0),
        store = store,
        question = current_question,
        verbose = verbose)
    if first_run:
        # The answer to the original question is kept apart from the notes.
        scratchpad['answerpad'] += [current_answer]
        first_run = False
    else:
        scratchpad['notes'] += [{'question': current_question, 'answer': current_answer}]

    scratchpad['current_documents'] = current_documents
    scratchpad['documents'] += current_documents
    print(
        "\033[36;2m",
        f"\nCurrent Question: {current_question}\n", 
        f"Current Answer: {current_answer} \n",
        "\033[0m\033[0m")

    ## 2. Ask more questions based on the documents retrieved for the current question
    ## ----
    # Ids of this pass start after the ids reserved for the previous passes.
    start_id = (current_iteration-1)*num_questions_per_iteration + 1
    question_creation_chain = QuestionCreationChain.from_llm(
        llm = language_model(temperature=question_creation_temperature), 
        verbose=verbose
        )
    question_creation_response = question_creation_chain.run(
        question=scratchpad['original_question'],
        context="\n".join(document.page_content for document in scratchpad['current_documents']),
        unanswered_questions=scratchpad['unanswered_questions'],
        num_questions = num_questions_per_iteration,
        start_id=start_id,
    )
    scratchpad['unanswered_questions'] = {
        **scratchpad['unanswered_questions'], 
        **result2DictOfQuestions(result = question_creation_response)
        }

    if current_question_id:
        # Remove the current question here after generating the new questions so that the 
        # current question is not regenerated. 
        # If the id returned by the model is not a known key, fall back to the
        # question text instead of raising KeyError and aborting the run.
        answered_question = scratchpad['unanswered_questions'].pop(current_question_id, current_question)
        scratchpad['answered_questions'] += [answered_question]
       
    print(
        "\033[35;3m",
        "** Unanswered Questions **\n",
        "\n".join(f"'{key}. {value}'" for key, value in scratchpad['unanswered_questions'].items()), 
        "\033[32;5m",
        "\n** Answered Questions **\n",
        "\n".join(scratchpad['answered_questions']),
        "\033[0m\033[0m")
    ### --- ###

    ## 3. Find the most pertinent question to ask next 
    ## ----
    find_next_question = MostPertinentQuestion.from_llm(
        llm = language_model(temperature=question_prioritisation_temperature),
        verbose = verbose
        )
    next_question_response = find_next_question.run(
        original_question=scratchpad['original_question'],
        unanswered_questions="\n".join(f"'{key}. {value}'," for key, value in scratchpad['unanswered_questions'].items()),
    )

    # Calculate question id so it can be deleted in the next iteration.
    # The response is expected to look like "<id>. <question>". The questions are
    # shown to the model wrapped in quotes, so quotes and whitespace around the
    # id are stripped to make it match the keys of `unanswered_questions`.
    question_id, separator, question_text = next_question_response.partition('.')
    if separator:
        current_question_id = question_id.strip().strip("'\"").strip()
        current_question = question_text.strip()
    else:
        # No "<id>." prefix: use the whole response as the question and leave
        # the id unset so that nothing is removed in the next iteration.
        current_question_id = None
        current_question = next_question_response.strip()
    print(
        "\033[93m\033[1m", 
        f"\nNext Question I need to ask: {current_question}\nQuestion Id: {current_question_id}", 
        "\033[0m\033[0m")
    ### --- ###



## ----- The Compiler -----
## Compile the final answer from the notes and the first answer.
final_answer= research_compiler(
    llm = language_model(temperature = analyser_temperature, verbose=True),
    question = scratchpad['original_question'], 
    context = scratchpad['notes'], 
    prev_answer = scratchpad['answerpad'],
    verbose = verbose)

print("\033[1;37m" + f"\nFinal Answer: \n {final_answer['text']}\n" + "\033[0m\033[0m")  

## ----- Save the run data -----
## If you do not need this, feel free to comment it out. 
_ = runs_store.add_texts(
    texts=[scratchpad['original_question']], 
    metadatas=[{
        'docs_metadata': [document.metadata for document in scratchpad['documents']], 
        'answer': final_answer['text'], 
        'context': final_answer['context']
        }], 
    ids=[run_id])
    

[91m[1m
 ------  Iteration 1 ------- 
[0m[0m
[36;2m 
Current Question: why did Mahabharata war happen?
 Current Answer: The Mahabharata war happened because of a series of events and conflicts between the Pandavas and the Kauravas. The fight began in Dwaraka, with some Yadavas defending the Pandavas and others defending the Kauravas. The quarreling citizens of Dwaraka grabbed sharp reeds from the seashore and destroyed themselves. Satyaki and Kritavarma, who had survived the war, killed each other in that fight. 

Meanwhile, King Duryodhana had collected a large army to fight against the Pandavas. Both sides amassed their armies and arsenals, intending to aid either the Pandavas or the Kauravas. This was all part of Lord Krishna's plan to decrease the unnecessary military burden of the world. The battle between the Pandavas and the Kauravas raged furiously.

After the battle, Yudhishthira, the Pandava king, became filled with fear at the thought of Gandhari's wrath. He sent Krishn

2023-09-17 13:34:00,088:INFO - HTTP Request: POST https://doxggeyqopdnxfhseufq.supabase.co/rest/v1/runs "HTTP/1.1 201 Created"
