In [None]:
# download files
# !mkdir data
!wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O ../../data/datasets/UBER.zip
!unzip ../../data/datasets/UBER.zip -d ../../data/datasets

In [1]:
# set text wrapping
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Emit a <style> block that makes <pre> output wrap long lines.

  Registered below as a `pre_run_cell` callback, so it runs before each cell.
  IPython may pass an execution-info object to event callbacks; any
  positional or keyword arguments are accepted and ignored so the callback
  works whether or not one is supplied. Calling `set_css()` with no
  arguments behaves as before.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
from llama_index import ServiceContext
from llama_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path

import os
# os.environ['OPENAI_API_KEY'] = ""

# Read API keys (e.g. OPENAI_API_KEY) from a .env file instead of
# hardcoding them in the notebook.
from dotenv import load_dotenv

# The result is bound to `_` so the cell does not display a return value.
_ = load_dotenv()

### Ingest Unstructured Data Through the Unstructured.io Reader

We parse the HTML filings with Unstructured.io's HTML parser,
using the `UnstructuredReader` loader downloaded through LlamaHub.

In [3]:
# Fetch the UnstructuredReader loader class by name via download_loader.
# NOTE(review): refresh_cache=True presumably forces a fresh download on every run — confirm.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

In [4]:
# Load one HTML filing per year and tag every resulting document with its year.
loader = UnstructuredReader()
years = [2022, 2021, 2020, 2019]
path = './../../data/datasets/UBER'

doc_set = {}    # year -> documents loaded from that year's filing
all_docs = []   # every document from every year, in `years` order
for year in years:
    filing_path = Path(f'{path}/UBER_{year}.html')
    filing_docs = loader.load_data(file=filing_path, split_documents=False)
    # Replace each document's extra_info with the filing year.
    for doc in filing_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = filing_docs
    all_docs += filing_docs



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/seanbearden/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/seanbearden/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


### Set Up the Service Context

In [5]:


# Service context passed to the vector indices built and loaded below.
# NOTE(review): chunk_size_limit=512 presumably caps the size of each text chunk — confirm the unit (tokens?).
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

### Set Up a Vector Index for Each SEC Filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into a single vector store.

In [6]:
# Build one vector index per filing year and persist each one to disk.
# Building an index embeds the whole filing, so a year whose index file
# already exists is loaded from disk instead of being rebuilt. Delete the
# JSON file to force a rebuild. This keeps the cell safe to re-run.
indices_path = './../../data/indices/UBER'
# exist_ok=True replaces the separate "does it exist?" check.
os.makedirs(indices_path, exist_ok=True)

index_set = {}
for year in years:
    index_file = f'{indices_path}/index_{year}.json'
    if os.path.exists(index_file):
        cur_index = GPTSimpleVectorIndex.load_from_disk(index_file, service_context=service_context)
    else:
        cur_index = GPTSimpleVectorIndex.from_documents(doc_set[year], service_context=service_context)
        cur_index.save_to_disk(index_file)
    index_set[year] = cur_index
    

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 232797 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 241424 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 257154 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 246480 tokens


In [7]:
# Rebuild the year -> index mapping from the JSON files saved on disk.
index_set = {
    year: GPTSimpleVectorIndex.load_from_disk(
        f'{indices_path}/index_{year}.json', service_context=service_context
    )
    for year in years
}

In [8]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# Building it embeds every filing in one pass, so reuse the saved index when
# it already exists; delete the JSON file to force a rebuild.
global_index_file = f'{indices_path}/index_global.json'
if os.path.exists(global_index_file):
    global_index = GPTSimpleVectorIndex.load_from_disk(global_index_file, service_context=service_context)
else:
    global_index = GPTSimpleVectorIndex.from_documents(all_docs, service_context=service_context)
    global_index.save_to_disk(global_index_file)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 977855 tokens


In [9]:
# Reload the global index from its saved JSON file.
global_index = GPTSimpleVectorIndex.load_from_disk(f'{indices_path}/index_global.json', service_context=service_context)

### Ask Initial Questions over a Given Year (2020)

Let's first ask some questions about the UBER 10-K filing for 2020!

In [10]:
# Ask the 2020 filing's index only.
# NOTE(review): similarity_top_k=3 presumably retrieves the 3 most similar chunks — confirm.
response = index_set[2020].query("What were some of the biggest risk factors in 2020?", similarity_top_k=3)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 2102 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens


In [11]:
# Print the answer to the risk-factor question asked against the 2020 index.
print(response)



Some of the biggest risk factors in 2020 included: privacy and cybersecurity risks associated with increased remote working; legal and regulatory challenges related to the COVID-19 pandemic, including potential fines or other enforcement measures resulting from the launch of new services, features, or health and safety requirements; financial impacts from the COVID-19 pandemic, including workforce reductions and changes to pricing models; the uncertainty of the pandemic's cumulative and ultimate impact on our future business operations, liquidity, financial condition, and results of operations; the duration of the spread of the outbreak and any future “waves” or resurgences of the outbreak or variants of the virus, both globally and within the United States, the impact on capital and financial markets, foreign currencies exchange, governmental or regulatory orders that impact our business and whether the impacts may result in permanent changes to our end-users’ behaviors; the potenti

In [12]:
# Ask the 2020 filing's index only; the prompt text is what gets embedded and
# sent to the model, so its spelling matters ("signifcant" -> "significant").
response = index_set[2020].query("What were some of the significant acquisitions?", similarity_top_k=3)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 2285 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens


In [13]:
# Print the answer to the acquisitions question asked against the 2020 index.
print(response)



Some of the significant acquisitions mentioned in the context information include the divestiture of our ATG business to Aurora, the divestiture of our Uber Elevate business to Joby, the Yandex.Taxi joint venture in Russia/CIS, the agreement to enter into a joint venture with SK Telecom Co., LTD., the acquisition of Careem, the purchase of a controlling interest in Cornershop, the acquisition of Routematch Holdings, Inc., the acquisition of Postmates Inc., the divestiture of Uber Eats India to Zomato, and the divestiture of certain assets of our JUMP business and investment in Lime. These acquisitions resulted in the recognition of $2.5 billion in goodwill in our Mobility segment, $540 million in intangible assets, $384 million in goodwill in our Delivery segment, $122 million in intangible assets, $91 million in goodwill in our Mobility segment, $27 million in intangible assets, and $3.1 billion in goodwill in our Delivery segment and $1.0 billion in intangible assets. Refer to Note

### Can a single vector index answer questions across years?

If we dump all documents to a single vector store, let's test its ability to answer questions across years! 

In [14]:
# # Option 2
# risk_query_str = (
#     "Describe the current risk factors. If the year is provided in the information, "
#     "provide that as well. If the context contains risk factors for multiple years, "
#     "explicitly provide the following:\n"
#     "- A description of the risk factors for each year\n"
#     "- A summary of how these risk factors are changing across years"
# )

In [15]:
# Option 1: a short, single-sentence prompt asking about risk factors per year.
risk_query_str = "What are some of the biggest risk factors in each year?"

In [16]:
# Run the cross-year prompt against the single global index that holds all filings.
response = global_index.query(risk_query_str, similarity_top_k=3)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 3022 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 12 tokens


In [17]:
# print() converts its argument with str() itself.
print(response)



Some of the biggest risk factors in 2020 include:

1. Security breaches that could expose us to liability under various laws and regulations across jurisdictions and increase the risk of litigation and governmental investigation.

2. Degrading services or sabotage systems that our security measures may not detect.

3. Individuals able to circumvent our security measures may misappropriate confidential, proprietary, or personal information held by or on behalf of us, disrupt our operations, damage our computers, or otherwise damage our business.

4. Potential failure to attract and retain Drivers and customers.

5. Potential failure to maintain and expand our brand.

6. Potential failure to protect our intellectual property.

7. Potential failure to comply with applicable laws and regulations.

8. Potential failure to manage our growth effectively.

9. High competition in the mobility, delivery, and logistics industries, with competitors in nearly every major geographic region.

10. L

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [18]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph

In [19]:
# One short summary string per filing year, keyed by year.
summaries = {year: f'UBER 10-k Filing for {year} fiscal year' for year in years}

In [20]:
# Configure the LLM with max_tokens=512 and temperature=0, then wrap it in a
# new service context. This rebinds `service_context`, replacing the
# chunk_size_limit=512 context created earlier in the notebook.
completion_llm = OpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=completion_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [21]:
# Compose a list index over the per-year vector indices. The child indices
# and their summaries are built in the same `years` order so they line up.
child_indices = [index_set[y] for y in years]
child_summaries = [summaries[y] for y in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    child_indices,
    child_summaries,
    service_context=service_context,
)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [22]:
# Persist the composed graph as JSON in the same directory as the per-year indices.
graph.save_to_disk(f'{indices_path}/10k_graph.json')

In [23]:
# Reload the composed graph from its JSON file, attaching the current service context.
graph = ComposableGraph.load_from_disk(f'{indices_path}/10k_graph.json', service_context=service_context)

### Setting Up the Query

We query about the risk factors. We want to synthesize information across each year.

In [24]:
# Multi-line prompt: one instruction paragraph followed by two bullet lines,
# joined with newlines (no trailing newline).
risk_query_str = "\n".join([
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
])

In [25]:
# Settings for index structs of type "dict".
# NOTE(review): presumably these are the per-year vector indices — confirm.
dict_struct_config = {
    "index_struct_type": "dict",
    "query_mode": "default",
    # "include_summary": True could also be added to query_kwargs (left disabled).
    "query_kwargs": {"similarity_top_k": 1},
}

# Settings for index structs of type "list".
# NOTE(review): presumably this is the list index at the top of the graph — confirm.
list_struct_config = {
    "index_struct_type": "list",
    "query_mode": "default",
    "query_kwargs": {"response_mode": "tree_summarize"},
}

query_configs = [dict_struct_config, list_struct_config]

In [26]:
# Run the multi-year prompt through the composed graph, passing the per-struct-type settings in query_configs.
response_summary = graph.query(risk_query_str, query_configs=query_configs)

In [27]:
# Print the answer returned by the composed-graph query.
print(response_summary)


The current risk factors for 2021 include the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors, the highly competitive mobility, delivery, and logistics industries, and the need to lower fares or service fees and offer Driver incentives and consumer discounts and promotions in order to remain competitive in certain markets. 

The risk factors for 2020 are outlined in Item 1A of the context information. These risk factors include unresolved staff comments, properties, legal proceedings, and mine safety disclosures. 

The current risk factors as of December 31, 2019 include interest rate risk, investment risk, and foreign currency risk. Interest rate risk relates to the 2016 Term Loan Facility and 2018 Term Loan Facility, which are floating rate notes and are carried at amortized cost. Investment risk relates to cash and cash equivalents including r

In [28]:
# Print the formatted source listing attached to the graph response.
print(response_summary.get_formatted_sources())

> Source (Doc id: fab5c097-fc40-4134-b988-fe415572c79c): 
The current risk factors for 2022 include: Drivers being classified as employees, workers or qua...

> Source (Doc id: c29c129c-9d93-4ef4-ab45-331dfbbafb56): 
The year provided in the context is 2021. The current risk factors include the COVID-19 pandemic...

> Source (Doc id: 7eaa9915-ec03-4ca9-a662-1c6cb93100d7): 
The risk factors for 2020 are outlined in Item 1A of the context information. These risk factors...

> Source (Doc id: bf6e8524-dd71-4a85-b0a0-1b03b7c1a51e): 
The current risk factors as of December 31, 2019 include interest rate risk, investment risk, an...

> Source (Doc id: a6a1f65d-ae31-4a9e-84ab-2a5e4560d49e): year: 2022

and certain events we participate in or host with members of the investment community...

> Source (Doc id: bfbd1130-2943-4363-9163-144d02f6b052): year: 2021

into this report or in any other report or document we file.

ITEM 1A. RISK FACTORS

...

> Source (Doc id: a1f8fc17-fb73-4f24-871b-6bfb

In [29]:
# query a specific year
# Same prompt as the graph query, but against the 2022 index alone and with no extra query arguments.
response_tmp = index_set[2022].query(risk_query_str)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 932 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 60 tokens


In [30]:
# Print the answer, as the other answer cells do; a bare str(...) expression
# would display the string's repr, with quotes and literal "\n" escapes.
print(response_tmp)

'\nThe current risk factors for 2022 include: Drivers being classified as employees, workers or quasi-employees instead of independent contractors; the mobility, delivery, and logistics industries being highly competitive with well-established and low-cost alternatives in nearly every major geographic region; and the need to remain competitive in certain markets by lowering fares or service fees, which we have in the past done and may continue to do in the future. These risk factors have remained consistent over the years.'

In [31]:
# query a global index
# Same prompt as the graph query, but against the single global index, with similarity_top_k=4.
response = global_index.query(risk_query_str, similarity_top_k=4)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 5001 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 60 tokens


In [32]:
# Print the answer, as the other answer cells do; a bare str(...) expression
# would display the string's repr, with quotes and literal "\n" escapes.
print(response)

'\n\nThe risk factors for 2020 are outlined in Item 1A of the context information. These risk factors include unresolved staff comments, properties, legal proceedings, mine safety disclosures, and certain events we participate in or host with members of the investment community on our investor relations website. Additionally, we provide notifications of news or announcements regarding our financial performance, including SEC filings, investor events, press and earnings releases, as part of our investor relations website. The contents of these websites are not intended to be incorporated by reference into this report or in any other report or document we file. \n\nFor 2021, the risk factors outlined in Item 1A include the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors, as well as the highly competitive nature of the mobility, delivery, and logistics industries with well-established and low-cost alternatives that have be