# Edgar: Comparative Q&A

In [1]:
import os
import sys
from pprint import pprint
import nest_asyncio
nest_asyncio.apply()
from typing import List, Any
from tqdm.autonotebook import trange
from pydantic import BaseModel, Field
import logging
sys.path.append("../../")

# Langchain imports
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.chains import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import (
    PromptTemplate,
    load_prompt
)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever

# Llama index imports
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback


from utils.sambanova_endpoint import SambaNovaEndpoint

from dotenv import load_dotenv
load_dotenv('../../export.env')

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

  from tqdm.autonotebook import trange


## Llama index

### Uber vs Lift 2021

In [None]:
class SambaNovaLLMWrapper(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "sambanova_llama7b"
    
    def _get_sambanova_llm(self):

        llm = SambaNovaEndpoint(
            base_url=os.getenv('BASE_URL'),
            project_id=os.getenv('PROJECT_ID'),
            endpoint_id=os.getenv('ENDPOINT_ID'),
            api_key=os.getenv('API_KEY'),
            model_kwargs={
                "do_sample": False, 
                "temperature": 0.0,
                "max_tokens_to_generate": 1000
            },
        )
        return llm

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        llm = self._get_sambanova_llm()
        response = llm(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        llm = self._get_sambanova_llm()
        llm_response = llm(prompt)
        for token in llm_response:
            response += token
            yield CompletionResponse(text=response, delta=token)

In [None]:
# Instantiate LLM 
llm = SambaNovaLLMWrapper()

In [None]:
# Instatiate embedding model
embedding_model = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Declare service context
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model)

In [None]:
## Load data 
lyft_docs = SimpleDirectoryReader(
    input_files=["../data/sec-edgar-filings/pdfs/lyft_10k_2021.pdf"]
).load_data()
uber_docs = SimpleDirectoryReader(
    input_files=["../data/sec-edgar-filings/pdfs/uber_10k_2021.pdf"]
).load_data()

## Build indices
lyft_index = VectorStoreIndex.from_documents(lyft_docs, show_progress=True, service_context=service_context)

uber_index = VectorStoreIndex.from_documents(uber_docs, show_progress=True, service_context=service_context)

## Build query engines
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)

uber_engine = uber_index.as_query_engine(similarity_top_k=3)

Parsing nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/344 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/412 [00:00<?, ?it/s]

In [None]:
# Instantiate query engine tools
query_engine_tools = [
    QueryEngineTool(
        query_engine=lyft_engine,
        metadata=ToolMetadata(
            name="lyft_10k",
            description=(
                "Provides information about Lyft financials for year 2021"
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=uber_engine,
        metadata=ToolMetadata(
            name="uber_10k",
            description=(
                "Provides information about Uber financials for year 2021"
            ),
        ),
    ),
]

# Instantiate Sub query engine
s_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context
)

In [None]:
## Run queries
response = s_engine.query(
    "Compare and contrast the customer segments and geographies that grew the fastest"
)

print(response)

response = s_engine.query(
    "Compare revenue growth of Uber and Lyft from 2020 to 2021"
)

print(response)

  warn_deprecated(


Generated 4 sub questions.
[1;3;38;2;237;90;200m[lyft_10k] Q: What are the customer segments that grew the fastest for Lyft
[0m[1;3;38;2;90;149;237m[lyft_10k] Q: What are the geographies that grew the fastest for Lyft
[0m[1;3;38;2;11;159;203m[uber_10k] Q: What are the customer segments that grew the fastest for Uber
[0m[1;3;38;2;155;135;227m[uber_10k] Q: What are the geographies that grew the fastest for Uber
[0m[1;3;38;2;11;159;203m[uber_10k] A: 
Uber's customer segments that grew the fastest are not explicitly stated in the provided context information. However, we can infer that Uber's customer base has expanded across various segments, given the growth in the number of Mobility drivers, Delivery and Grocery orders, and the acquisition of Transplace, which expanded Uber's Freight revenue.

The context information highlights Uber's efforts to expand its platform by offering new services, such as Uber One, Uber Pass, Eats Pass, and Rides Pass subscription memberships, and its

### ADI: max20710 vs max20810

In [14]:
# Instantiate LLM 
llm = SambaNovaLLMWrapper()

# Instatiate embedding model
embedding_model = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

# Declare service context
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model)

load INSTRUCTOR_Transformer
max_seq_length  512


In [15]:
## Load data 
max207_docs = SimpleDirectoryReader(
    input_files=["../data/adi/max20710#_#max20710.pdf.md"]
).load_data()
max208_docs = SimpleDirectoryReader(
    input_files=["../data/adi/max20810#_#max20810.pdf.md"]
).load_data()

In [16]:
## Build indices
max207_index = VectorStoreIndex.from_documents(max207_docs, show_progress=True, service_context=service_context)

max208_index = VectorStoreIndex.from_documents(max208_docs, show_progress=True, service_context=service_context)

## Build query engines
max207_engine = max207_index.as_query_engine(similarity_top_k=3)

max208_engine = max208_index.as_query_engine(similarity_top_k=3)


Parsing nodes:   0%|          | 0/48 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/63 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/32 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/38 [00:00<?, ?it/s]

In [17]:
# Instantiate query engine tools
query_engine_tools = [
    QueryEngineTool(
        query_engine=max207_engine,
        metadata=ToolMetadata(
            name="max207",
            description=(
                "Provides information about max20710 device"
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=max208_engine,
        metadata=ToolMetadata(
            name="max208",
            description=(
                "Provides information about max20810 device"
            ),
        ),
    ),
]

# Instantiate Sub query engine
s_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context
)

In [18]:
## Run queries
response = s_engine.query(
    "Compare and contrast the advantages and disadvantages of max2070 and max20810"
)

print(response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the advantages of max2070
[0m[1;3;38;2;90;149;237m[max207] Q: What are the disadvantages of max2070
[0m[1;3;38;2;11;159;203m[max208] Q: What are the advantages of max20810
[0m[1;3;38;2;155;135;227m[max208] Q: What are the disadvantages of max20810
[0m[1;3;38;2;237;90;200m[max207] A: According to the provided information, the MAX20710 offers several advantages, including:

1. High power density and low component count, making it an extremely compact, high-efficiency regulator solution.
2. Overall solution size, including inductor and output capacitors, is $509 \mathrm{~mm}^{2}$.
3. High peak efficiency of $90.5 \%$ with $\mathrm{V}_{\mathrm{DDH}}=12 \mathrm{~V}$ and $\mathrm{V}_{\mathrm{OUT}}$ $=1 \mathrm{~V}$.
4. Fast transient response, supporting up to 300A/ps load-step transients.
5. Optimized component performance and efficiency with reduced design-in time.
6. PMBus-compliant interface for telemetry and p

In [19]:
response = s_engine.query(
    "Compare and contrast the advantages and disadvantages of both devices"
)

print(response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the advantages of max20710
[0m[1;3;38;2;90;149;237m[max207] Q: What are the disadvantages of max20710
[0m[1;3;38;2;11;159;203m[max208] Q: What are the advantages of max20810
[0m[1;3;38;2;155;135;227m[max208] Q: What are the disadvantages of max20810
[0m[1;3;38;2;11;159;203m[max208] A: 

Relying on the context information provided, the advantages of the max20810 include:

1. High power density with low component count.
2. Compact $4.3 \mathrm{~mm} \times 6.55 \mathrm{~mm}$ FC2QFN package.
3. Internal compensation.
4. Single-supply operation with integrated LDO for bias generation.
5. Wide operating range: $2.7 \mathrm{~V}$ to $16 \mathrm{~V}$ input voltage range and $0.4 \mathrm{~V}$ to $5.8 \mathrm{~V}$ output voltage range.
6. Configurable switching frequency: $500 \mathrm{kHz}$ to $2 \mathrm{MHz}$.
7. Optimized performance and efficiency: 93.8\% peak efficiency with $\mathrm{V}_{\mathrm{DDH}}=12 \mathrm{~V}

In [20]:
response = s_engine.query(
    "The MAX20810 and the MAX20710 look very identical based on their EC Table specifications. Can you tell me about the differences between the two parts?"
)

print(response)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[max208] Q: What are the differences in the electrical characteristics of the MAX20810 and the MAX20710
[0m[1;3;38;2;90;149;237m[max208] Q: What are the differences in the mechanical characteristics of the MAX20810 and the MAX20710
[0m[1;3;38;2;11;159;203m[max208] Q: Are there any differences in the certifications and compliance of the MAX20810 and the MAX20710
[0m[1;3;38;2;11;159;203m[max208] A: 

There are no differences in the certifications and compliance of the MAX20810 and the MAX20710.

Explanation:
The MAX20810 and the MAX20710 are both designed to meet the same industry standards and regulations, such as the PMBus interface, and have similar operating conditions and characteristics. They also share similar packaging and footprint, which suggests that they have similar certifications and compliance.

However, it is important to note that the MAX20810 has additional features and improved specifications compared to the MAX207

In [23]:
response = s_engine.query(
    "Which of the two solutions (MAX20810 and MAX20710) is foot-print compatible to the MAX20730 which I am trying replace/remove from my current design?"
)

print(response)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the footprint dimensions of MAX20730
[0m[1;3;38;2;90;149;237m[max208] Q: What are the footprint dimensions of MAX20810
[0m[1;3;38;2;11;159;203m[max207] Q: What are the differences in footprint dimensions between MAX20730 and MAX20810
[0m[1;3;38;2;90;149;237m[max208] A: 

(Note: The answer should be in the format of a table with two columns: "Dimension" and "Value". The table should have a row for each dimension, and the values should be in the appropriate units (e.g., inches, millimeters).)
[0m[1;3;38;2;237;90;200m[max207] A: 

(Note: The answer should be in the format of a table with two columns: Footprint Dimension and Value. The value should be in the unit of measurement, e.g., mm.)
[0m[1;3;38;2;11;159;203m[max207] A: 

There is not enough information in the given context to determine the differences in footprint dimensions between MAX20730 and MAX20810. The context only mentions that MAX20710 is footpri

In [25]:
response = s_engine.query(
    "Can you compare the LT7101 to the LT8631?"
)

print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the technical specifications of the LT7101
[0m[1;3;38;2;90;149;237m[max208] Q: What are the technical specifications of the LT8631
[0m[1;3;38;2;90;149;237m[max208] A: 
The LT8631 is not mentioned in the given context information. The context information is about the MAX20810, which is a different device. Therefore, I cannot provide any technical specifications for the LT8631.
[0m[1;3;38;2;237;90;200m[max207] A: 

The technical specifications of the LT7101 are not provided in the given context information. The information provided is about the MAX20710, which is a different device. The MAX20710 has the following technical specifications:

* Electrical rating: 10A, 4.5V to 16V
* Thermal rating: 10A, 55°C, 200LFM
* Operating conditions: Refer to the SOA curves in the Typical Operating Characteristics section and OCP settings in the Electrical Characteristics section.
* Allowable pullup voltage: 3.6V
* Status outpu

In [26]:
response = s_engine.query(
    "EMI has been an issue with our designs in the past.  Which of these two solutions (LT7101 vs. LT8631) would you recommend to help avoid this issue or greatly mitigate its effects?"
)

print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the EMI characteristics of LT7101
[0m[1;3;38;2;90;149;237m[max208] Q: What are the EMI characteristics of LT8631
[0m[1;3;38;2;90;149;237m[max208] A: 

The document does not provide information about the EMI characteristics of the LT8631. The document focuses on the MAX20810, which is a different device. The MAX20810 has EMI characteristics that are not discussed in the document. Therefore, I cannot provide an answer to your question.

However, I can suggest that you refer to the datasheet of the LT8631 to learn about its EMI characteristics. The datasheet should provide information on the device's electromagnetic interference (EMI) performance, including its emissions and immunity. Additionally, you may want to consult with an expert in EMI/RFI engineering or a qualified engineer familiar with the LT8631 to get more detailed information about its EMI characteristics.
[0m[1;3;38;2;237;90;200m[max207] A: 
The EMI

In [27]:
response = s_engine.query(
    "The ADP5300 and the ADP5302 are really the same parts with the same features.  Like them both.  So what is the difference between them?"
)

print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[max207] Q: What are the features of ADP5300
[0m[1;3;38;2;90;149;237m[max208] Q: What are the features of ADP5302
[0m[1;3;38;2;90;149;237m[max208] A: 

(Note: The answer should be based on the given context information and not prior knowledge)
[0m[1;3;38;2;237;90;200m[max207] A: 
The ADP5300 features high power density and low component count, overall solution size $509 \mathrm{~mm}^{2}$ including inductor and output capacitors, $90.5 \%$ peak efficiency with $\mathrm{V}_{\mathrm{DDH}}=12 \mathrm{~V}$ and $\mathrm{V}_{\mathrm{OUT}}$ $=1 \mathrm{~V}$, fast transient response, optimized component performance and efficiency, reduced design-in time, PMBus-compliant interface, voltage, current, and temperature reporting, increased power-supply reliability, differential remote sense, hiccup overcurrent protection, programmable thermal shutdown.

The information is from the provided context, which contains details about the ADP5300's bene

## Langchain

### Uber vs Lift 2021

In [2]:
chunk_size = 1000
chunk_overlap = 0

In [3]:
# Load uber data
loader = PyPDFLoader("../data/sec-edgar-filings/pdfs/uber_10k_2021.pdf")
data = loader.load()
for document in data:
    document.metadata['company'] = 'Uber'
    document.metadata['year'] = 2021

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
uber_splits = text_splitter.split_documents(data)

In [4]:
# Load lyft data
loader = PyPDFLoader("../data/sec-edgar-filings/pdfs/lyft_10k_2021.pdf")
data = loader.load()
for document in data:
    document.metadata['company'] = 'Lyft'
    document.metadata['year'] = 2021

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
lyft_splits = text_splitter.split_documents(data)

In [5]:
splits = [*uber_splits,*lyft_splits]

print(f"{len(uber_splits)} uber split docs")
print(f"{len(lyft_splits)} lyft split docs")
print(f"{len(splits)} all docs")

1499 uber split docs
1043 lyft split docs
2542 all docs


In [6]:
# Load embeddings and create vector store
embedding = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

load INSTRUCTOR_Transformer
max_seq_length  512


In [7]:
# create llm object from Sambanova endpoint class
llm = SambaNovaEndpoint(
    base_url=os.getenv('BASE_URL'),
    project_id=os.getenv('PROJECT_ID'),
    endpoint_id=os.getenv('ENDPOINT_ID'),
    api_key=os.getenv('API_KEY'),
    model_kwargs={
        "do_sample": False, 
        "temperature": 0.0,
        "max_tokens_to_generate": 1000
    },
)

In [8]:
# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")

class QuestionListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        questions = [question for question in lines if '?' in question]
        return LineList(lines=questions)

output_parser = QuestionListOutputParser()

query_decomposition_prompt = PromptTemplate(
    input_variables=["question"],
    
    template="""[INST] <<SYS>>Decompose a complex query into a list of questions directly and concisely.<<SYS>>
    Query: {question}
    Output: [/INST]""",
)


In [9]:
query_decomposition_prompt.save('../prompts/llama70b-edgar_comparative_qna-query_decomposition_prompt.yaml')
query_decomposition_prompt = load_prompt('../prompts/llama70b-edgar_comparative_qna-query_decomposition_prompt.yaml')

In [10]:
# Chain
llm_chain = LLMChain(llm=llm, prompt=query_decomposition_prompt, output_parser=output_parser)

# "lines" is the attribute name of the parsed output
multiquery_retriever = MultiQueryRetriever(
    retriever=vectordb.as_retriever(search_kwargs={
        'k': 3,
        'filter': {'$or': [{'company': {'$eq': 'Uber'}}, {'company': {'$eq': 'Lyft'}}]},
    }), 
    llm_chain=llm_chain, 
    parser_key="lines", 
    verbose = True
)  

question = "What are the revenue breakdowns for the two documents?"

# multiquery results
multiquery_retrieved_docs = multiquery_retriever.get_relevant_documents(
    query=question
)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the revenue breakdown for Document A?', '2. What is the revenue breakdown for Document B?']


In [11]:
# Define prompt for answering and summarization
summarization_prompt_template = """[INST] <<SYS>>You're a respectful, helpful assistant. Follow these rules:
1. Use only the information provided in the context section.
2. Provide relevant information to answer the question.<<SYS>>
Write an answer to the following question based on the following context:
Question:
{original_question}
Context:
{context}
Answer: [/INST]"""
summarization_prompt = PromptTemplate.from_template(summarization_prompt_template)

In [12]:
summarization_prompt.save('../prompts/llama70b-edgar_comparative_qna-answering_and_summarization_prompt.yaml')
summarization_prompt = load_prompt('../prompts/llama70b-edgar_comparative_qna-answering_and_summarization_prompt.yaml')

In [13]:
# Define StuffDocumentsChain
llm_chain = LLMChain(llm=llm, prompt=summarization_prompt)
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context")

response = stuff_chain.invoke({"input_documents": multiquery_retrieved_docs, 'original_question': question})
print(response['output_text'])

 Based on the information provided in the context section, the revenue breakdowns for the two documents are as follows:

1. Revenue from contracts with customers (ASC 606):

Year Ended December 31,
2021
2020 2019 (in thousands)
Revenue from contracts with customers (ASC 606) $ 2,957,979 $ 2,208,656 $ 3,465,473

2. Rental revenue (ASC 842):

Year Ended December 31,
2021
2020 2019 (in thousands)
Rental revenue (ASC 842) $ 250,344 $ 156,025 $ 150,487

Total revenue $ 3,208,323 $ 2,364,681 $ 3,615,960

Note that the revenue breakdowns are based on the information provided in the context section and do not include any other sources of revenue that may be mentioned in the two documents.


In [None]:
questions = [
    "What are the revenue breakdowns for Microsoft and Apple in their respective 10-K reports, and how do they compare in terms of total revenue and revenue from different segments?",
    "What are the key risks mentioned in the risk factors section of both Microsoft and Apple's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?",
    "How do the corporate governance structures of Microsoft and Apple, as outlined in their 10-K filings, compare in terms of board composition, executive compensation, and shareholder rights?",
    "What are the major investments and acquisitions disclosed in the investment section of Microsoft and Apple's 10-K reports, and how do they reflect each company's strategic priorities and growth strategies?",
    "How do the research and development expenditures disclosed in Microsoft and Apple's 10-K reports compare in terms of absolute spending and percentage of revenue, and what insights can be drawn regarding their innovation efforts?",
    "What are the legal proceedings and regulatory issues disclosed in the legal proceedings section of both Microsoft and Apple's 10-K filings, and how do they differ in terms of nature, severity, and potential impact on the companies?",
    "How do the financial performance metrics such as net income, operating margins, and cash flow ratios disclosed in Microsoft and Apple's 10-K reports compare, and what factors contribute to any observed differences?",
    "What are the geographical revenue breakdowns provided in the geographic segments section of both Microsoft and Apple's 10-K reports, and how do they reflect each company's international presence and market diversification?",
    "How do the sustainability initiatives and environmental disclosures in Microsoft and Apple's 10-K filings compare, including information on energy consumption, carbon footprint, and supply chain sustainability efforts?",
    "What are the forward-looking statements and risk factors outlined in the Management's Discussion and Analysis (MD&A) sections of Microsoft and Apple's 10-K reports, and how do they reflect each company's outlook, challenges, and opportunities in the market?",
]