In [2]:
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

--2023-04-24 00:24:51--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.11.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.11.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/948jr9cfs7fgj99/UBER.zip [following]
--2023-04-24 00:24:51--  https://www.dropbox.com/s/dl/948jr9cfs7fgj99/UBER.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc5f1d6b282d53b6c9f759e65c2e.dl.dropboxusercontent.com/cd/0/get/B6zhzGeG8ZlS0n8qMm1U9jYnfNWcwLwQbL-ut5n5IHfaOqyzOdNPeaQ7AUDSc6aydoB6oug0yoMXwyBhu6TfopPzWb-ELDoXHWCI5rkFXj4vhz-aXZNuzuP4EDXVkFJ8jVn_9dD3ok4hydVH-TQZQY_uPDHCo5Ex9-dVfdpefxKU4Q/file?dl=1# [following]
--2023-04-24 00:24:52--  https://uc5f1d6b282d53b6c9f759e65c2e.dl.dropboxusercontent.com/cd/0/get/B6zhzGeG8ZlS0n8qMm1U9jYnfNWcwLwQbL-ut5n5IHfaOqyzOdNPeaQ7AUDSc6aydoB6oug0yoMXwyBhu6TfopPzWb-ELDoXHWCI5rkFXj4vhz-aXZNu

In [3]:
# Patch asyncio so an event loop can be started from inside the notebook;
# the query cell further down calls asyncio.run() directly.
# NOTE(review): presumably required because the Jupyter kernel already runs
# its own event loop — confirm against the nest_asyncio docs.
import nest_asyncio
nest_asyncio.apply()

In [4]:
from gpt_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


### Ingest Unstructured Data Through the Unstructured.io Reader

We leverage Unstructured.io's HTML parsing capabilities to ingest the filings.
The reader itself is downloaded through LlamaHub.

In [5]:
# fiscal years of the filings to analyze, newest first (2022 down to 2019)
years = list(range(2022, 2018, -1))

In [6]:
# Fetch the "UnstructuredReader" loader class via download_loader (LlamaHub).
# NOTE(review): refresh_cache=True presumably forces a fresh download instead of
# reusing a cached copy, and use_gpt_index_import=True presumably makes the loader
# import from `gpt_index` rather than `llama_index` — confirm against the library docs.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True, use_gpt_index_import=True)

Collecting unstructured
  Downloading unstructured-0.6.1.tar.gz (1.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m
[?25h  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting argilla
  Downloading argilla-1.6.0-py3-none-any.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting lxml
  Downloading lxml-4.9.2-cp39-cp39-macosx_10_15_x86_64.whl (4.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting msg_parser
  Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [7]:
# Parse one HTML filing per year.
#   doc_set  : year -> documents loaded from that year's filing
#   all_docs : flat list of every loaded document, in `years` order
loader = UnstructuredReader()
doc_set, all_docs = {}, []
for year in years:
    filing_path = Path(f'./data/UBER/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # tag every document from this filing with its year
    for doc in year_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)

[nltk_data] Downloading package punkt to /Users/suo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/suo/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


### Set up a Vector Index for each SEC filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into the vector store.

In [9]:
from gpt_index.indices.service_context import ServiceContext

service_context = ServiceContext.from_defaults(chunk_size_limit=512)
# Initialize one simple vector index per filing year.
# Building an index embeds the whole filing (see the embedding token usage
# logged below), so an index already saved to disk is reloaded instead of
# rebuilt. This keeps the cell safe under "Restart & Run All".
# NOTE: delete the index_<year>.json files to force a rebuild (e.g. after
# changing the chunk size or the source documents).
index_set = {}
for year in years:
    index_path = Path(f'index_{year}.json')
    if index_path.exists():
        # reuse the persisted index rather than paying for embeddings again
        cur_index = GPTSimpleVectorIndex.load_from_disk(str(index_path))
    else:
        cur_index = GPTSimpleVectorIndex.from_documents(doc_set[year], service_context=service_context)
        cur_index.save_to_disk(str(index_path))
    index_set[year] = cur_index
    

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 232797 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 241424 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 257154 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 246480 tokens


In [10]:
# Rebuild the year -> vector index mapping from the JSON files saved on disk
index_set = {
    year: GPTSimpleVectorIndex.load_from_disk(f'index_{year}.json')
    for year in years
}

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [11]:
from gpt_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from gpt_index.composability import ComposableGraph

In [12]:
# one short summary string per filing year, keyed by year
index_summaries = {
    year: f"UBER 10-k Filing for {year} fiscal year"
    for year in years
}

In [14]:
# LLM used for answer synthesis: temperature 0, at most 512 output tokens
llm = OpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=llm)
# NOTE: this rebinds `service_context`, replacing the one created for index building
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [18]:
# Define a list index over the per-year vector indices;
# this allows us to synthesize information across each index.
# `index_summaries` is a dict keyed by year, so it is converted to a list of
# summary strings in the same order as `children_indices`. Passing the dict
# itself would iterate over its keys (the integer years), not the summary text.
graph = ComposableGraph.from_indices(
    GPTListIndex,
    children_indices=[index_set[y] for y in years],
    index_summaries=[index_summaries[y] for y in years],
    service_context=service_context
)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [19]:
# Persist the composed graph to a JSON file in the working directory
graph.save_to_disk('10k_graph.json')

In [20]:
# Reload the graph from the JSON file, passing the LLM predictor back in.
# NOTE(review): presumably the predictor is not part of the saved JSON and
# must be supplied again on load — confirm against the library docs.
graph = ComposableGraph.load_from_disk('10k_graph.json', llm_predictor=llm_predictor)

In [23]:
# query engine over the composed graph, using the 'tree_summarize' response mode
query_engine = graph.as_query_engine(response_mode='tree_summarize')

In [24]:
import asyncio
import time

cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."

# Time the async query end to end with a monotonic clock.
# asyncio.run() is called from inside the notebook here; nest_asyncio.apply()
# is run in an earlier cell.
start_time = time.perf_counter()
response = asyncio.run(query_engine.aquery(cross_query_str))
elapsed_time = time.perf_counter() - start_time

INFO:gpt_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 23 tokens
INFO:gpt_index.token_counter.token_counter:> [get_response] Total LLM token usage: 631 tokens
INFO:gpt_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [get_response] Total LLM token usage: 631 tokens
INFO:gpt_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens


In [None]:
# print() converts its arguments with str() itself
print(response)       # synthesized answer
print(elapsed_time)   # wall-clock seconds spent in the query


• 2019: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes
• 2020: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes, impact of the COVID-19 pandemic
• 2021: Risk of Drivers being classified as employees, workers or quasi-employees instead of independent contractors, high competition in mobility, delivery, and logistics industries, risk of COVID-19 pandemic
• 2022: Risk of Drivers being classified as employees, workers or quasi-employees instead of independent contractors, competitive landscape of the mobility, delivery, and logistics industries, risk of having to lower fares or service fees, or offer significant Driver incentives and consumer discounts and promotions in order to remain competitive in certain markets, risk of incurring significant losses

• 2019-2020: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes
• 2021-2022: Risk of Driver