# EDGAR: Comparative Q&A

In [None]:
import os
import sys
import nest_asyncio
nest_asyncio.apply()
from typing import List, Any
import logging


# Resolve the kit directory (parent of the working directory) and the
# repository root (parent of the kit directory).
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Put both directories on the import path, kit first, so that project-local
# packages (e.g. `utils`, imported below) can be resolved.
sys.path.extend([kit_dir, repo_dir])

# Langchain imports
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.chains import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import (
    PromptTemplate,
    load_prompt
)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.documents.base import Document
from langchain.retrievers.multi_query import MultiQueryRetriever

# Llama index imports
from llama_index.core import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core import Settings
from llama_index.core.llms.callbacks import llm_completion_callback


from utils.model_wrappers.api_gateway import APIGateway


from dotenv import load_dotenv
load_dotenv(os.path.join(repo_dir,'.env'))

# Configure root logging, then raise the multi-query retriever's logger to
# INFO (presumably so its generated queries show up in the cell output).
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

## Get the data

In [None]:
# Define the directory path that will hold the downloaded report PDFs
dir_path = f'{kit_dir}/data/sec-edgar-filings/reports'

# Create the directory (and any missing parents). Attempting the creation and
# handling FileExistsError avoids the race between a separate existence check
# and the makedirs call, while reporting the same two outcomes.
try:
    os.makedirs(dir_path)
except FileExistsError:
    print("Directory already exists:", dir_path)
else:
    print("Directory created:", dir_path)

In [None]:
# Download uber_2021.pdf and lyft_2021.pdf from the llama_index example data into dir_path.
# These are IPython shell escapes (`!`): they need a notebook/IPython session and `wget` on the PATH.
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O '{dir_path}/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf' -O '{dir_path}/lyft_2021.pdf'

## LlamaIndex

### Uber vs Lyft 2021

In [None]:
class SambaNovaLLMWrapper(CustomLLM):
    """LlamaIndex ``CustomLLM`` that delegates completions to an ``APIGateway`` LLM.

    A gateway LLM is loaded on every ``complete`` / ``stream_complete`` call
    and invoked with the raw prompt; the most recent raw response is kept in
    ``dummy_response``.
    """

    # Values reported to LlamaIndex through ``metadata``
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "llama3-8b"  # expert name
    # Last raw response returned by the gateway LLM
    dummy_response: str = ""
    # Backend passed as ``type`` to ``APIGateway.load_llm``: "sncloud" or "sambastudio"
    api_type: str = "sncloud"
    # Generation cap passed to the gateway; set independently of ``num_output``
    max_tokens_to_generate: int = 512

    def _get_sambanova_llm(self):
        """Load and return the gateway LLM configured by this wrapper's fields."""
        # Set gateway
        return APIGateway.load_llm(
            type=self.api_type,
            streaming=False,
            bundle=True,
            max_tokens_to_generate=self.max_tokens_to_generate,
            temperature=0.0,
            select_expert=self.model_name,
        )

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Invoke the gateway LLM once with ``prompt`` and wrap the result.

        ``kwargs`` are accepted for interface compatibility but are not
        forwarded to the gateway LLM.
        """
        llm = self._get_sambanova_llm()
        self.dummy_response = llm.invoke(prompt)
        return CompletionResponse(text=self.dummy_response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield the full response incrementally.

        The gateway LLM is invoked once (non-streaming); the finished response
        is then replayed piece by piece. Each yielded ``CompletionResponse``
        carries the text accumulated so far and the newest piece as ``delta``.
        """
        llm = self._get_sambanova_llm()
        self.dummy_response = llm.invoke(prompt)
        response = ""
        # Iterating the response yields its elements one at a time
        # (single characters when the response is a ``str``).
        for token in self.dummy_response:
            response += token
            yield CompletionResponse(text=response, delta=token)

In [None]:
# Define our LLM: assign an instance of the SambaNova wrapper to llama_index's global `Settings.llm`
Settings.llm = SambaNovaLLMWrapper()

In [None]:
# Define the embedding model: a HuggingFaceInstructEmbeddings instance (imported from
# LangChain) assigned to llama_index's global `Settings.embed_model`.
# NOTE(review): this hands a LangChain embedding class to LlamaIndex — confirm the
# installed llama_index version accepts or adapts it.
Settings.embed_model = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

In [None]:
## Locate the two report PDFs
lyft_pdf_path = os.path.join(kit_dir,"data/sec-edgar-filings/reports/lyft_2021.pdf")
uber_pdf_path = os.path.join(kit_dir,"data/sec-edgar-filings/reports/uber_2021.pdf")

## Load data (one reader per file)
lyft_reader = SimpleDirectoryReader(input_files=[lyft_pdf_path])
lyft_docs = lyft_reader.load_data()
uber_reader = SimpleDirectoryReader(input_files=[uber_pdf_path])
uber_docs = uber_reader.load_data()

## Build one vector index per company
lyft_index = VectorStoreIndex.from_documents(lyft_docs, show_progress=True)
uber_index = VectorStoreIndex.from_documents(uber_docs, show_progress=True)

## Build query engines, both with the same similarity_top_k
top_k = 3
lyft_engine = lyft_index.as_query_engine(similarity_top_k=top_k)
uber_engine = uber_index.as_query_engine(similarity_top_k=top_k)

In [None]:
# One entry per company report: (tool name, query engine, tool description)
tool_specs = [
    ("lyft_10k", lyft_engine, "Provides information about Lyft financials for year 2021"),
    ("uber_10k", uber_engine, "Provides information about Uber financials for year 2021"),
]

# Instantiate query engine tools
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(name=tool_name, description=tool_description),
    )
    for tool_name, engine, tool_description in tool_specs
]

# Instantiate Sub query engine over both tools
s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

In [None]:
## Run queries: send each comparative question to the sub-question engine and print the answer
comparison_queries = (
    "Compare and contrast the customer segments and geographies that grew the fastest",
    "Compare revenue growth of Uber and Lyft from 2020 to 2021",
)

for comparison_query in comparison_queries:
    response = s_engine.query(comparison_query)
    print(response)

## LangChain

### Uber vs Lyft 2021

In [None]:
# Chunk size and overlap passed to the text splitter for both reports
chunk_size, chunk_overlap = 1000, 0

In [None]:
# Load uber data
uber_pdf_path = os.path.join(kit_dir,"data/sec-edgar-filings/reports/uber_2021.pdf")
loader = PyPDFLoader(uber_pdf_path)
data = loader.load()

# Tag every loaded document with its company and year
uber_tags = {'company': 'Uber', 'year': 2021}
for loaded_doc in data:
    loaded_doc.metadata.update(uber_tags)

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
uber_splits = text_splitter.split_documents(data)

In [None]:
# Check uber splits: display the first three chunks
uber_splits[:3]

In [None]:
# Load lyft data
lyft_pdf_path = os.path.join(kit_dir,"data/sec-edgar-filings/reports/lyft_2021.pdf")
loader = PyPDFLoader(lyft_pdf_path)
data = loader.load()

# Tag every loaded document with its company and year
lyft_tags = {'company': 'Lyft', 'year': 2021}
for loaded_doc in data:
    loaded_doc.metadata.update(lyft_tags)

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
lyft_splits = text_splitter.split_documents(data)

In [None]:
# Check lyft splits: display the first three chunks
lyft_splits[:3]

In [None]:
# Combine both companies' chunks into one list, Uber first and Lyft after
splits = list(uber_splits)
splits.extend(lyft_splits)

# Report how many chunks each source contributed and the combined total
for label, docs in (
    ("uber split", uber_splits),
    ("lyft split", lyft_splits),
    ("all", splits),
):
    print(f"{len(docs)} {label} docs")

In [None]:
# Load embeddings, configured with the instruction used when embedding queries
instructor_kwargs = {"query_instruction": "Represent the query for retrieval: "}
embedding = HuggingFaceInstructEmbeddings(**instructor_kwargs)

# Create the vector store from the combined Uber and Lyft chunks
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [None]:
# Using SambaNovaCloud
api_type = "sncloud"
llm_expert = 'llama3-8b'

# Using SambaStudio
# api_type = "sambastudio"
# llm_expert = 'Meta-Llama-3-70B-Instruct-4096'

# Set gateway. The selections above are passed through, so switching backend
# or expert only requires changing the two assignments above.
llm = APIGateway.load_llm(
    type=api_type,
    streaming=False,
    bundle=True,
    max_tokens_to_generate=512,
    temperature=0.0,
    select_expert=llm_expert,
)

# Quick check that the loaded LLM responds
llm.invoke("hi!")

In [None]:
# Output parser will split the LLM result into a list of queries
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Turn raw LLM text into the list of question lines it contains."""

    def parse(self, text: str) -> List[str]:
        """Return the stripped lines of ``text`` that contain a question mark.

        Lines without a ``?`` are dropped, as are lines that are empty once
        surrounding whitespace is removed. Line order is preserved.
        """
        questions: List[str] = []
        for raw_line in text.strip().split("\n"):
            if "?" not in raw_line:
                continue
            cleaned = raw_line.strip()
            if cleaned:
                questions.append(cleaned)
        return questions

output_parser = LineListOutputParser()

# Testing parser: expect a list with the two question lines, stripped of surrounding whitespace
parsing = output_parser.parse("  1. What are the revenue breakdowns for Document 1?\n                       2. What are the revenue breakdowns for Document 2?")
parsing

In [None]:
# Build the query decomposition prompt; `{question}` is its only input variable
query_decomposition_template = """Given the following complex query, decompose the query into a list of questions directly and concisely.
    Complex query: {question}
    List of decomposed questions: """
query_decomposition_prompt = PromptTemplate(
    template=query_decomposition_template,
    input_variables=["question"],
)

# Saving and Loading a prompt template: write it to YAML, then read it back from the same file
query_decomposition_prompt_path = os.path.join(kit_dir,'prompts/edgar_comparative_qna-query_decomposition_prompt.yaml')
query_decomposition_prompt.save(query_decomposition_prompt_path)
query_decomposition_prompt = load_prompt(query_decomposition_prompt_path)

In [None]:
# Testing a Chain including the parser: decompose one sample complex query
sample_complex_query = "What are the key risks mentioned in the risk factors section of both Microsoft and Apple's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?"
llm_chain = LLMChain(
    llm=llm,
    prompt=query_decomposition_prompt,
    output_parser=output_parser,
)
llm_chain.invoke(sample_complex_query)

In [None]:
# Setting up the Chain and MultiqueryRetriever
llm_chain = LLMChain(
    llm=llm,
    prompt=query_decomposition_prompt,
    output_parser=output_parser,
)

# Base retriever: k=3 results per search, limited to chunks whose `company`
# metadata is Uber or Lyft
company_filter = {'$or': [{'company': {'$eq': 'Uber'}}, {'company': {'$eq': 'Lyft'}}]}
base_retriever = vectordb.as_retriever(search_kwargs={'k': 3, 'filter': company_filter})

multiquery_retriever = MultiQueryRetriever(
    retriever=base_retriever,
    llm_chain=llm_chain,
    parser_key="decomposed_questions",
    verbose=True,
)

question = "What are the key risks mentioned in the risk factors section of both Uber and Lyft's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?"

# Testing multiquery results: keep at most the first six retrieved documents
all_retrieved_docs = multiquery_retriever.get_relevant_documents(query=question)
multiquery_retrieved_docs = all_retrieved_docs[:6]
multiquery_retrieved_docs

In [None]:
# Define prompt for answering and summarization; it takes `{original_question}` and `{context}`
summarization_prompt_template = """You're a helpful assistant. Follow these rules:
1. Use only the information provided in the context section.
2. Provide relevant information to answer the question.
Write an answer to the following question based on the following context information and metadata:
Question:
{original_question}
Context:
{context}
Answer: """
summarization_prompt = PromptTemplate.from_template(summarization_prompt_template)

# Write the prompt to YAML, then read it back from the same file
summarization_prompt_path = os.path.join(kit_dir,'prompts/edgar_comparative_qna-answering_and_summarization_prompt.yaml')
summarization_prompt.save(summarization_prompt_path)
summarization_prompt = load_prompt(summarization_prompt_path)

In [None]:
# Transform the retrieved docs to include metadata in page_content.
# Only these metadata keys are surfaced, in the order they appear on each doc.
metadata_keys_to_show = ("company", "year", "page")

docs_for_summary = []
for retrieved_doc in multiquery_retrieved_docs:
    shown_pairs = [
        f"{key}: {value}"
        for key, value in retrieved_doc.metadata.items()
        if key in metadata_keys_to_show
    ]
    metadata_str = ", ".join(shown_pairs)
    # The new document carries only the combined text; its own metadata is left unset
    docs_for_summary.append(
        Document(page_content=f'Metadata: "{metadata_str}", Information: "{retrieved_doc.page_content}"')
    )
docs_for_summary

In [None]:
# Define StuffDocumentsChain for question answering: the documents are
# supplied to the summarization prompt through its `context` variable
llm_chain = LLMChain(llm=llm, prompt=summarization_prompt)
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context",
    verbose=True,
)

# Answer the original question from the metadata-annotated documents
chain_inputs = {"input_documents": docs_for_summary, 'original_question': question}
response = stuff_chain.invoke(chain_inputs)
print(response['output_text'])

In [None]:
# Other questions to try: assign any of these to `question` and re-run the retrieval and answering cells above
questions = [
    "What are the revenue breakdowns for Uber and Lyft in their respective 10-K reports, and how do they compare in terms of total revenue and revenue from different segments?",
    "What are the key risks mentioned in the risk factors section of both Uber and Lyft's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?",
    "How do the corporate governance structures of Uber and Lyft, as outlined in their 10-K filings, compare in terms of board composition, executive compensation, and shareholder rights?",
    "What are the major investments and acquisitions disclosed in the investment section of Uber and Lyft's 10-K reports, and how do they reflect each company's strategic priorities and growth strategies?",
    "How do the research and development expenditures disclosed in Uber and Lyft's 10-K reports compare in terms of absolute spending and percentage of revenue, and what insights can be drawn regarding their innovation efforts?",
    "What are the legal proceedings and regulatory issues disclosed in the legal proceedings section of both Uber and Lyft's 10-K filings, and how do they differ in terms of nature, severity, and potential impact on the companies?",
    "How do the financial performance metrics such as net income, operating margins, and cash flow ratios disclosed in Uber and Lyft's 10-K reports compare, and what factors contribute to any observed differences?",
    "What are the geographical revenue breakdowns provided in the geographic segments section of both Uber and Lyft's 10-K reports, and how do they reflect each company's international presence and market diversification?",
    "How do the sustainability initiatives and environmental disclosures in Uber and Lyft's 10-K filings compare, including information on energy consumption, carbon footprint, and supply chain sustainability efforts?",
    "What are the forward-looking statements and risk factors outlined in the Management's Discussion and Analysis (MD&A) sections of Uber and Lyft's 10-K reports, and how do they reflect each company's outlook, challenges, and opportunities in the market?",
]