# RFP Response Generation

Given an RFP template and a set of context documents, generate a hypothetical RFP response report.

In [None]:
# download JEDI Cloud RFP Template
!wget "https://imlive.s3.amazonaws.com/Federal%20Government/ID151830346965529215587195222610265670631/HQ0034-18-R-0077.pdf" -O data/jedi_cloud_rfp.pdf

In [None]:
# Microsoft annual report (saved as msft_10k_2024.pdf), downloaded from a Dropbox link
!wget "https://www.dropbox.com/scl/fi/4v5dx8dc9yqc8k0yw5g4h/msft_10k_2024.pdf?rlkey=jdyfrsoyb18ztlq5msunmibns&st=9w6bdyvn&dl=1" -O data/msft_10k_2024.pdf
# Alternative download URL for the same output file, currently disabled:
# !wget "https://microsoft.gcs-web.com/static-files/1c864583-06f7-40cc-a94d-d11400c83cc8" -O data/msft_10k_2024.pdf

In [None]:
# Azure Wikipedia page (PDF export), saved as data/azure_wiki.pdf
!wget "https://www.dropbox.com/scl/fi/7waur8ravmve3fe8nej0k/azure_wiki.pdf?rlkey=icru2w64oylx1p76ftt6y9irv&st=fr87vxob&dl=1" -O data/azure_wiki.pdf

In [None]:
# Azure Government slide deck (a conference presentation PDF), saved as data/azure_gov.pdf
!wget "https://cdn.ymaws.com/flclerks.site-ym.com/resource/resmgr/2017_Fall_Conf/Presentations/2018-10-12_FCCC_Microsoft_Az.pdf" -O data/azure_gov.pdf

In [None]:
# Microsoft cybersecurity capabilities document, saved as data/msft_ddr.pdf
!wget "https://www.dropbox.com/scl/fi/qh00xz29rlom4md8ce675/microsoft_ddr.pdf?rlkey=d868nbnsu1ng41y1chw69y64b&st=24iqemb1&dl=1" -O data/msft_ddr.pdf

## Setup

In [9]:
import nest_asyncio

# Patch asyncio so event loops can be nested; the notebook kernel already runs
# a loop, and the parser / workflow calls below drive async code from inside it.
nest_asyncio.apply()

In [None]:
from llama_parse import LlamaParse

# Shared parser for every PDF in this notebook: extraction is delegated to a
# vendor multimodal model and the result is requested as markdown.
parser = LlamaParse(
    vendor_multimodal_model_name="anthropic-sonnet-3.5",
    use_vendor_multimodal_model=True,
    result_type="markdown",
)

In [10]:
from pathlib import Path

data_dir = "data"
# Knowledge-base files only. The RFP itself (jedi_cloud_rfp.pdf) is left out
# here; it is parsed separately as the document to respond to.
files = [
    "azure_gov.pdf",
    "azure_wiki.pdf",
    "msft_10k_2024.pdf",
    "msft_ddr.pdf",
]

# Maps each file name to its on-disk path and the documents parsed from it.
file_dicts = {}

for f in files:
    full_file_path = str(Path(data_dir) / f)
    file_docs = parser.load_data(full_file_path)

    # The per-file retrievers defined later filter on `file_name` metadata
    # being equal to the bare file name, so tag every parsed document with it.
    for doc in file_docs:
        doc.metadata["file_name"] = f

    file_dicts[f] = {
        "file_path": full_file_path,
        "docs": file_docs,
    }

### Build Indexes

Once the text nodes are ready, we feed them into our vector store, which indexes these nodes in Chroma (you're welcome to use any of our other 40+ vector store integrations if you'd like).

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Directory handed to Chroma as its persistence location.
persist_dir = "storage_rfp_chroma"

# Chroma-backed vector store using the "rfp_docs" collection.
vector_store = ChromaVectorStore.from_params(
    persist_dir=persist_dir,
    collection_name="rfp_docs",
)

# Index view over the vector store; nodes are inserted in a later cell.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

**NOTE**: Don't run if you've already inserted the nodes.

In [None]:
# Presumably clears previously inserted nodes so a re-run does not duplicate them.
# NOTE(review): confirm `clear()` is available on this index in the installed version.
index.clear()

# Flatten the per-file document lists into one list of nodes.
all_nodes = [c for d in file_dicts.values() for c in d["docs"]]

# Insert the list built above (the original passed an undefined name `nodes`).
index.insert_nodes(all_nodes)

### Define Retrievers

Define retrievers, one for each file. 

In [None]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# function tools
def chunk_retriever_fn_factory(file: str):
    """Return a function that retrieves only within a given file.

    Args:
        file: Value that a node's ``file_name`` metadata must equal for the
            node to be eligible for retrieval.

    Returns:
        A function taking a query string and returning the top 5 matching
        nodes (with scores) from the module-level ``index``, restricted to
        ``file``.
    """
    # Imported here so the inner function's return annotation resolves even
    # when this cell is run without any other import of these names.
    from typing import List

    from llama_index.core.schema import NodeWithScore

    filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="file_name", operator=FilterOperator.EQ, value=file
            ),
        ]
    )

    def chunk_retriever_fn(query: str) -> List[NodeWithScore]:
        """Retrieves a small set of relevant document chunks from the corpus.

        ONLY use for research questions that want to look up specific facts from the knowledge corpus,
        and don't need entire documents.

        """
        # The retriever is built per call, so it always uses the current `index`.
        retriever = index.as_retriever(similarity_top_k=5, filters=filters)
        nodes = retriever.retrieve(query)
        return nodes

    return chunk_retriever_fn

# generate tools: one retrieval tool per knowledge-base file
import re
from pathlib import Path

from llama_index.core.tools import FunctionTool

tools = []
for f in files:
    retriever_fn = chunk_retriever_fn_factory(f)
    # Every closure shares the same __name__ and docstring, so without an
    # explicit name/description all tools would look identical to the agent.
    # The name is reduced to letters, digits, "_" and "-" because
    # function-calling APIs commonly restrict tool names to that alphabet.
    tool_name = "retrieve_" + re.sub(r"[^a-zA-Z0-9_-]", "_", Path(f).stem)
    tool_description = (
        f"Retrieves document chunks from the file '{f}' only.\n"
        f"{retriever_fn.__doc__}"
    )
    tools.append(
        FunctionTool.from_defaults(
            fn=retriever_fn,
            name=tool_name,
            description=tool_description,
        )
    )

## Build Workflow

The user specifies an RFP document as input. 

Let's build a workflow that can iterate through the extracted keys/questions from the RFP, and fill them out! 

In [None]:
from pathlib import Path

# Parse the RFP document itself. The path is passed as a string, matching how
# every other file in this notebook is handed to the parser.
rfp_docs = parser.load_data(str(Path(data_dir) / "jedi_cloud_rfp.pdf"))

In [None]:
import json
import logging
from pathlib import Path
from typing import List, Optional

from llama_index.core import PromptTemplate
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.bridge.pydantic import BaseModel
from llama_index.core.llms import LLM
from llama_index.core.schema import Document
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Context,
    Workflow,
    step,
)
from llama_index.llms.openai import OpenAI


# System prompt for the research agent, which answers one extracted question at a time.
AGENT_SYSTEM_PROMPT = """\
You are a research agent tasked with filling out a specific form key/question with the appropriate value, given a bank of context.
You are given a specific form key/question. Think step-by-step and use the existing set of tools to help answer the question.

"""

# Prompt for extracting the list of questions from an RFP file.
# The RFP text is substituted into the `{context}` variable; without that
# placeholder the document would never reach the LLM.
EXTRACT_KEYS_PROMPT = """\
You are provided an entire RFP document, or a large subsection from it. 

We wish to generate a response to the RFP in a way that adheres to the instructions within the RFP, \
including the specific sections that an RFP response should contain, and the content that would need to go \
into each section.

Your task is to extract out a list of "questions", where each question corresponds to a specific section that is required in the RFP response.
Put another way, after we extract out the questions we will go through each question and answer each one \
with our downstream research assistant, and the combined
question:answer pairs will constitute the full RFP response.

- Make sure the questions are comprehensive and adheres to the RFP requirements.
- Make sure each question is descriptive - this gives our downstream assistant context to fill out the value for that question 
- Extract out all the questions as a list of strings.

Here is the RFP text:
{context}
"""

# Prompt for writing the final RFP response.
# Variables: `{output_template}` (the parsed RFP text) and `{answers}` (the
# combined question/answer pairs), matching the keyword arguments the workflow
# passes when it formats this prompt.
GENERATE_OUTPUT_PROMPT = """\
You are an expert analyst writing a response to an RFP.
Your task is to generate the RFP response from the RFP document and the question/answer pairs given below.

<rfp_document>
{output_template}
</rfp_document>

<question_answer_pairs>
{answers}
</question_answer_pairs>

Follow the structure and instructions of the RFP as closely as possible, and fill the answers into the appropriate sections.
Output the response in markdown format. Directly output the markdown content, without any additional text.
"""

# Structured-output schema handed to the LLM's `structured_predict` call when
# extracting questions from the RFP.
# NOTE(review): the class docstring is presumably surfaced to the LLM as the
# schema description — confirm before rewording it.
class OutputQuestions(BaseModel):
    """List of keys that make up the sections of the RFP response."""
    # One question per section required in the RFP response.
    questions: List[str]


class OutputTemplateEvent(Event):
    """Carries the parsed RFP documents from the parsing step to question extraction."""
    # Documents returned by the parser for the RFP file.
    doc: List[Document]


class QuestionsExtractedEvent(Event):
    """Carries a list of extracted questions.

    NOTE(review): no workflow step visible in this notebook emits or consumes
    this event — confirm whether it is still needed.
    """
    questions: List[str]


class HandleQuestionEvent(Event):
    """Requests an answer for a single extracted question."""
    # The question text, passed verbatim to the research agent.
    question: str


class QuestionAnsweredEvent(Event):
    """Pairs one question with the research agent's answer to it."""
    question: str
    # String form of the agent's response.
    answer: str

class CollectedAnswersEvent(Event):
    """Carries every question/answer pair once all questions have been answered."""
    # Newline-joined JSON dumps of the individual QuestionAnsweredEvent objects.
    combined_answers: str

class RFPWorkflow(Workflow):
    """Workflow that drafts a response to an RFP document.

    Steps, in order:
      1. ``parse_output_template`` parses the RFP file named by the start
         event's ``rfp_template_path``.
      2. ``extract_questions`` asks the LLM for the questions the response
         must answer and emits one ``HandleQuestionEvent`` per question.
      3. ``handle_question`` answers one question with the research agent.
      4. ``combine_answers`` waits for every answer and joins them.
      5. ``generate_output`` asks the LLM to write the final response.

    Intermediate and final artifacts are written to
    ``<output_dir>/workflow_output``.
    """

    def __init__(
        self,
        tools,
        parser: LlamaParse,
        llm: LLM | None = None,
        similarity_top_k: int = 20,
        output_dir: str = "data_out_rfp",
        agent_system_prompt: str = AGENT_SYSTEM_PROMPT,
        **kwargs,
    ) -> None:
        """Init params.

        Args:
            tools: Tools made available to the research agent.
            parser: Parser used to load the RFP file.
            llm: LLM used for question extraction, final generation and the
                research agent. Defaults to OpenAI ``gpt-4o-mini``.
            similarity_top_k: Stored on the instance; no step in this class
                reads it.
            output_dir: Base directory for the ``workflow_output`` artifacts.
            agent_system_prompt: System prompt for the research agent.
            **kwargs: Forwarded to ``Workflow.__init__``.
        """
        super().__init__(**kwargs)
        self.tools = tools

        self.parser = parser

        self.llm = llm or OpenAI(model="gpt-4o-mini")
        self.similarity_top_k = similarity_top_k

        self.output_dir = output_dir
        # Every step writes into this directory, so make sure it exists up front.
        self._artifact_dir = Path(output_dir) / "workflow_output"
        self._artifact_dir.mkdir(parents=True, exist_ok=True)

        # initialize a Function Calling "research" agent where given a task, it can pull responses from relevant tools
        # Use self.llm (not the raw argument) so the default LLM also applies to the agent.
        self.research_agent = FunctionCallingAgentWorker.from_tools(
            tools, llm=self.llm, verbose=True, system_prompt=agent_system_prompt
        ).as_agent()

    @step
    async def parse_output_template(self, ctx: Context, ev: StartEvent) -> OutputTemplateEvent:
        """Parse the RFP file and store the parsed documents in the context.

        Reads ``ev.rfp_template_path``, dumps each parsed document as one JSON
        line to ``output_template.json`` and saves the documents under the
        ``output_template`` context key.
        """
        # load output template file
        docs = await self.parser.aload_data(ev.rfp_template_path)
        # save output template to file
        with open(self._artifact_dir / "output_template.json", "w", encoding="utf-8") as f:
            for doc in docs:
                f.write(doc.model_dump_json())
                f.write("\n")

        await ctx.set("output_template", docs)
        return OutputTemplateEvent(doc=docs)

    @step
    async def extract_questions(self, ctx: Context, ev: OutputTemplateEvent) -> HandleQuestionEvent:
        """Extract the response questions and fan out one event per question.

        The text of all parsed documents is joined and passed to the LLM as
        the ``context`` prompt variable. The number of questions is stored
        under the ``num_to_collect`` context key for the fan-in step.

        Raises:
            ValueError: If the LLM returns no questions; the fan-in step
                would otherwise wait for answers that never arrive.
        """
        rfp_text = "\n\n".join(doc.text for doc in ev.doc)
        prompt = PromptTemplate(template=EXTRACT_KEYS_PROMPT)
        try:
            output_qs = self.llm.structured_predict(
                OutputQuestions, prompt, context=rfp_text
            )
        except Exception:
            # Log and re-raise: continuing without a result cannot succeed.
            logging.getLogger(__name__).exception(
                "Error extracting questions from the RFP document"
            )
            raise

        # The structured result is a model object; the list lives on `.questions`.
        questions = list(output_qs.questions)
        if not questions:
            raise ValueError("No questions could be extracted from the RFP document.")

        # save all_questions to file
        with open(self._artifact_dir / "all_keys.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(questions))

        await ctx.set("num_to_collect", len(questions))

        for question in questions:
            ctx.send_event(HandleQuestionEvent(question=question))

        # Events were already emitted through ctx.send_event above.
        return None

    @step
    async def handle_question(self, ev: HandleQuestionEvent) -> QuestionAnsweredEvent:
        """Answer a single question with the research agent."""
        question = ev.question

        # ensure the agent's memory is cleared
        self.research_agent.reset()
        # Synchronous call: nothing is awaited between reset() and the answer.
        response = self.research_agent.query(question)

        return QuestionAnsweredEvent(question=question, answer=str(response))

    @step
    async def combine_answers(self, ctx: Context, ev: QuestionAnsweredEvent) -> CollectedAnswersEvent:
        """Wait for all answers, then join them into one string.

        Returns ``None`` until ``num_to_collect`` answers have arrived; after
        that, writes the joined answers to ``combined_answers.json`` and
        returns them.
        """
        num_to_collect = await ctx.get("num_to_collect")
        results = ctx.collect_events(ev, [QuestionAnsweredEvent] * num_to_collect)
        if results is None:
            return None

        combined_answers = "\n".join([result.model_dump_json() for result in results])
        # save combined_answers to file
        with open(self._artifact_dir / "combined_answers.json", "w", encoding="utf-8") as f:
            f.write(combined_answers)

        return CollectedAnswersEvent(combined_answers=combined_answers)

    @step
    async def generate_output(self, ctx: Context, ev: CollectedAnswersEvent) -> StopEvent:
        """Generate the final response from the RFP text and the combined answers.

        NOTE(review): relies on a module-level ``GENERATE_OUTPUT_PROMPT``
        containing ``{output_template}`` and ``{answers}`` variables.
        """
        output_template = await ctx.get("output_template")
        output_template = "\n".join([doc.get_content('none') for doc in output_template])

        prompt = PromptTemplate(
            template=GENERATE_OUTPUT_PROMPT,
        )
        final_output = self.llm.predict(prompt, output_template=output_template, answers=ev.combined_answers)
        # save final_output to file
        with open(self._artifact_dir / "final_output.md", "w", encoding="utf-8") as f:
            f.write(final_output)

        return StopEvent(result=final_output)

In [None]:
from llama_index.llms.openai import OpenAI

# LLM handed to the workflow. No `llm` is defined anywhere earlier in the
# notebook, so it is created here with the same model the workflow would
# otherwise default to.
llm = OpenAI(model="gpt-4o-mini")

# NOTE(review): timeout=60.0 bounds the whole run (parsing plus every
# question); confirm it is long enough for a full RFP.
workflow = RFPWorkflow(
    tools,
    parser=parser,
    llm=llm,
    verbose=True,
    timeout=60.0,
)

In [None]:
# run the workflow
import asyncio
from pathlib import Path


async def _run_rfp_workflow():
    """Run the workflow on the JEDI Cloud RFP and return its final result."""
    # The first step reads `ev.rfp_template_path`, so the path must be passed
    # as that keyword argument; the run itself has to be awaited.
    return await workflow.run(
        rfp_template_path=str(Path(data_dir) / "jedi_cloud_rfp.pdf")
    )


# nest_asyncio.apply() in the Setup section allows asyncio.run() to be called
# from inside the notebook's already-running event loop.
response = asyncio.run(_run_rfp_workflow())
print(str(response))