In [None]:
!pip install llama-index
!pip install llama-index-core
!pip install llama-index-embeddings-openai
!pip install llama-index-postprocessor-flag-embedding-reranker
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install llama-parse

In [3]:
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10q/uber_10q_march_2022.pdf' -O './uber_10q_march_2022.pdf'

--2024-03-18 12:49:08--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10q/uber_10q_march_2022.pdf
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1260185 (1.2M) [application/octet-stream]
Saving to: ‘./uber_10q_march_2022.pdf’


2024-03-18 12:49:08 (191 MB/s) - ‘./uber_10q_march_2022.pdf’ saved [1260185/1260185]



In [4]:
# llama-parse is async-first; running its async code inside a notebook (which
# already has an event loop running) requires nest_asyncio.
import nest_asyncio
nest_asyncio.apply()

import os
from getpass import getpass

# Credentials are never written into the notebook source. A key that is already
# present in the environment is kept as-is (the cell no longer clobbers it with
# a placeholder); a missing key is requested interactively without echoing it.
_API_KEY_PROMPTS = {
    # API access to llama-cloud (used by LlamaParse)
    "LLAMA_CLOUD_API_KEY": "LlamaCloud API key (llx-...)",
    # OpenAI API, used for embeddings
    "OPENAI_API_KEY": "OpenAI API key (sk-...)",
    # NVIDIA API Playground key, used for the LLM
    "NVIDIA_AI_PLAYGROUND_API_KEY": "NVIDIA AI Playground API key (nvapi-...)",
}

for _env_name, _prompt in _API_KEY_PROMPTS.items():
    # An empty string counts as "not set" so a blank export still triggers the prompt.
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(f"{_prompt}: ")

In [5]:
from llama_index.llms.nvidia_ai_playground import NvidiaAIPlayground
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings, VectorStoreIndex

# Embedding model (OpenAI) and LLM (NVIDIA AI Playground, default arguments).
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = NvidiaAIPlayground()

# Register both on the global llama-index Settings object.
Settings.llm, Settings.embed_model = llm, embed_model

In [None]:
from llama_parse import LlamaParse

# Parse the downloaded PDF with LlamaParse, requesting markdown output.
pdf_parser = LlamaParse(result_type="markdown")
documents = pdf_parser.load_data('./uber_10q_march_2022.pdf')

In [7]:
# Show the first 1000 characters of the first parsed document, followed by an ellipsis.
preview = documents[0].text[:1000]
print(f"{preview}...")

# Document

# UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q

(Mark One)

☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the quarterly period
ended March 31, 2022 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from_____ to _____ Commission File Number: 001-38902 UBER TECHNOLOGIES, INC. (Exact name
of registrant as specified in its charter) Not Applicable (Former name, former address and former fiscal year, if
changed since last report) Delaware 45-2647441 (State or other jurisdiction of incorporation or organization) (I.R.S.
Employer Identification No.) 1515 3rd Street San Francisco, California 94158 (Address of principal executive offices,
including zip code) (415) 612-8582 (Registrant’s telephone number, including area code) Securities registered
pursuant to Section 12(b) of the Act: Title of each class Trading Symbol(s) Name of ea

In [8]:
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI

# The markdown element parser is given its own OpenAI chat model and 8 workers.
element_llm = OpenAI(model="gpt-3.5-turbo-0125")
node_parser = MarkdownElementNodeParser(llm=element_llm, num_workers=8)

In [9]:
# Run the MarkdownElementNodeParser over the LlamaParse documents to obtain nodes.
# NOTE(review): this presumably calls the parser's LLM (the recorded output shows
# a progress bar taking ~22s) — expect API usage on every re-run.
nodes = node_parser.get_nodes_from_documents(documents)

Embeddings have been explicitly disabled. Using MockEmbedding.


75it [00:00, 70579.49it/s]
100%|██████████| 75/75 [00:22<00:00,  3.28it/s]


In [10]:
# Split the parsed nodes into the two collections returned by
# get_nodes_and_objects; both are concatenated again when the recursive index is built.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [11]:
# Index over the element-parsed content: base nodes followed by objects.
recursive_index = VectorStoreIndex(nodes=[*base_nodes, *objects])

# Baseline index built directly from the parsed documents.
raw_index = VectorStoreIndex.from_documents(documents)

In [12]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker that keeps the top 5 of the retrieved candidates.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Both engines retrieve 15 candidates and pass them through the same reranker;
# only the recursive engine is created with verbose output enabled.
recursive_query_engine = recursive_index.as_query_engine(
    verbose=True,
    node_postprocessors=[reranker],
    similarity_top_k=15,
)

raw_query_engine = raw_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
)

In [13]:
# Sanity check: how many nodes the node parser produced.
print(len(nodes))

291


In [14]:
# The same question is put to both engines so their answers can be compared.
query = "how is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information?"

# Engine over the index built directly from the documents.
response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********", response_1, sep="\n")

# Engine over the index built from base nodes plus objects.
response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********", response_2, sep="\n")


***********New LlamaParse+ Basic Query Engine***********
Cash paid for income taxes, net of refunds from Supplemental disclosures of cash flow information:

Income taxes are a significant expense for most businesses, and the amount of cash paid for income taxes can vary significantly from period to period. The supplemental disclosures of cash flow information in the context of the provided financial statements provide important information related to non-cash investing and financing activities, but do not provide any information related to cash paid for income taxes.

To assess the Company's operating performance and ability to generate cash flows, investors and management should focus on the primary measure of cash flow provided by (used in) operating activities, which is a key measure provided in the financial statements. This measure represents the amount of cash that is generated from the Company's core business operations, and is used to fund investments in the business, pay divi