In [33]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

In [34]:
# Access the API through environment variable
import os
from dotenv import load_dotenv
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
llama_cloud_api_key = os.getenv('LLAMA_CLOUD_API_KEY')

In [42]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

embed_model=OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")

Settings.llm = llm
Settings.embed_model = embed_model

In [36]:
from llama_parse import LlamaParse
from pathlib import Path

# This constructs a Path object for the "data" directory.
data_dir = Path('data')

# This constructs the full path to "attention.pdf" within the "data" directory.
file_path = data_dir / 'uber_10q_march_2022.pdf'

# Use the constructed path in your method call
documents = LlamaParse(result_type="text").load_data(file_path)

Started parsing the file under job_id 41f3b6c6-96ba-4639-9660-cea77d93ef1b


In [37]:
#Save the documents
import os

for i, document in enumerate(documents):
    markdown_content = document.text  # Assuming 'text' is the attribute containing the Markdown content
    file_path = os.path.join(data_dir, f"document_{i}.md")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(markdown_content)

In [None]:
#Reload the documents
# Initialize an empty list to hold the reloaded documents
documents_reloaded = []

# List all files in the data directory
files = os.listdir(data_dir)
# Filter out files that match the naming pattern of saved documents
document_files = [file for file in files if file.startswith("document_") and file.endswith(".md")]

# Sort files to maintain the original order based on their index
document_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))

# Reload each document
for file_name in document_files:
    file_path = os.path.join(data_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        documents_reloaded.append(file.read())

# documents_reloaded now contains all the documents loaded from the files

In [38]:
print(documents[0].text[2000:3000] + '...')

__________________________
                                                Delaware                                                                                                     45-2647441
           (State or other jurisdiction of incorporation or organization)                                                                  (I.R.S. Employer Identification No.)
                                                                                                   1515 3rd Street
                                                                                       San Francisco, California 94158
                                                                    (Address of principal executive offices, including zip code)
                                                                                                   (415) 612-8582
                                                                        (Registrant’s telephone number, including area code)
                           

In [39]:
sum(len(documents) for document in documents)

1

In [40]:
num_characters = len(documents[0].text)
print(f"Number of characters in documents[0]: {num_characters}")

Number of characters in documents[0]: 570125


In [43]:
from llama_index.core.node_parser import MarkdownElementNodeParser

In [45]:
node_parser = MarkdownElementNodeParser(llm=OpenAI(model="gpt-3.5-turbo-0125"))

In [46]:
nodes = node_parser.get_nodes_from_documents(documents)

Embeddings have been explicitly disabled. Using MockEmbedding.


0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [None]:
print(nodes)

In [47]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [48]:
recursive_index = VectorStoreIndex(nodes=base_nodes+objects)
raw_index = VectorStoreIndex.from_documents(documents)

In [49]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)

recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=15, 
    node_postprocessors=[reranker], 
    verbose=True
)

raw_query_engine = raw_index.as_query_engine(similarity_top_k=15, node_postprocessors=[reranker])

In [50]:
print(len(nodes))

120


In [51]:
query = "how is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)


***********New LlamaParse+ Basic Query Engine***********
The Cash paid for Income taxes, net of refunds, is calculated by adjusting the provision for (benefit from) income taxes by considering the changes in the income tax payable or refundable during the period. This adjustment reflects the actual cash outflows or inflows related to income taxes after accounting for any refunds received.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information is calculated by adjusting the provision for (benefit from) income taxes by considering the changes in the income tax payable during the period. This adjustment reflects the actual cash payments made for income taxes, taking into account any refunds received.


In [26]:
query = "what is the change of free cash flow and what is the rate from the financial and operational highlights?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)


***********New LlamaParse+ Basic Query Engine***********
The change in free cash flow from the financial and operational highlights is an improvement of $635 million, with the rate being not meaningful.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The change in free cash flow from the financial and operational highlights is an improvement of $635 million, with the rate being a positive 93%.


In [27]:
query = "what is the net loss value attributable to Uber compared to last year?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)


***********New LlamaParse+ Basic Query Engine***********
The net loss attributable to Uber Technologies, Inc. for the current period is $5.9 billion, which is an increase compared to the net loss of $108 million for the same period last year.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The net loss attributable to Uber Technologies, Inc. was $5.9 billion for the first quarter of 2022, compared to a net loss of $108 million for the same period in 2021.


In [28]:
query = "What were cash flows like from investing activities?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)


***********New LlamaParse+ Basic Query Engine***********
Cash flows from investing activities were $135 million for the three months ended March 31, 2022. This primarily consisted of $62 million in purchases of property and equipment and $59 million in acquisition of business, net of cash acquired.

***********New LlamaParse+ Recursive Retriever Query Engine***********
Cash flows from investing activities were $135 million for the three months ended March 31, 2022. This primarily consisted of $62 million in purchases of property and equipment and $59 million in acquisition of business, net of cash acquired.


In [29]:
query = "What were greatest risks identified that could affect next quarter's earnings?"

response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)


***********New LlamaParse+ Basic Query Engine***********
The greatest risks identified that could affect next quarter's earnings include the impact of the COVID-19 pandemic and related restrictions on business operations, potential Driver supply constraints, uncertainties in end-user behavior and demand for Mobility offerings, and the unpredictability of when Driver supply levels will return to pre-pandemic levels.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The greatest risks identified that could affect next quarter's earnings include the impact of the COVID-19 pandemic and related actions to mitigate it, potential fluctuations in operating results due to factors beyond the company's control, such as seasonal fluctuations and competition in the markets, the need to attract and retain platform users, manage growth effectively, and keep pace with technological changes, as well as potential security breaches, cyberattacks, and climate change risks. Additiona