In [2]:
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

--2023-03-22 17:55:19--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 2620:100:601a:18::a27d:712, 162.125.7.18
Connecting to www.dropbox.com (www.dropbox.com)|2620:100:601a:18::a27d:712|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/948jr9cfs7fgj99/UBER.zip [following]
--2023-03-22 17:55:19--  https://www.dropbox.com/s/dl/948jr9cfs7fgj99/UBER.zip
Reusing existing connection to [www.dropbox.com]:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc12f480a811ad66fa4e0167407.dl.dropboxusercontent.com/cd/0/get/B4yVmJruA6tAshAnTEubKx8G6wnJQMzejIhVFEhMm4NAqK5FvHi4ldO2CMkXLZKnBCAYvuMkq6GkyS9_Tzv3WtbtjomXqcMZaFMj-2AV2uLdr9JsgVxCol8hMCCLn1DvOn7A8-HQ2Do0-kdACrhGK8TaZeTdis1ksKUstwraaf1e-w/file?dl=1# [following]
--2023-03-22 17:55:19--  https://ucc12f480a811ad66fa4e0167407.dl.dropboxusercontent.com/cd/0/get/B4yVmJruA6tAshAnTEubKx8G6wnJQMzejIhVFEhMm4NAqK5FvHi4ldO2CMkXLZKnBCAYvuMkq6

In [3]:
# Patch asyncio so that an already-running event loop can be re-entered.
# A later cell in this notebook calls asyncio.run(), which needs this when
# the kernel is already running its own loop.
import nest_asyncio
nest_asyncio.apply()

In [4]:
from gpt_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path

### Ingest Unstructured Data Through the Unstructured.io Reader

We leverage Unstructured.io's HTML parsing capabilities to read each filing.
The reader is downloaded through LlamaHub.

In [5]:
# Fiscal years of the filings to ingest, newest first (2022 down to 2019).
years = list(range(2022, 2018, -1))

In [6]:
# Fetch the UnstructuredReader loader class, refreshing any cached copy.
UnstructuredReader = download_loader(
    "UnstructuredReader",
    refresh_cache=True,
    use_gpt_index_import=True,
)

In [7]:
# Load each year's filing as documents, tag every document with its year,
# and collect the results both per year (doc_set) and flattened (all_docs).
loader = UnstructuredReader()
doc_set = {}
all_docs = []
for year in years:
    filing_path = Path(f'./data/UBER/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)

    # Replace each document's extra_info with the filing year.
    for doc in year_docs:
        doc.extra_info = {"year": year}

    doc_set[year] = year_docs
    all_docs += year_docs

[nltk_data] Downloading package punkt to /Users/jerryliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jerryliu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


### Set Up a Vector Index for Each SEC Filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into the vector store.

In [8]:
# Build one simple vector index per filing year and save each to
# index_<year>.json.
# NOTE: skip this cell if the indices have already been built; the next
# cell loads them from disk instead.
index_set = {}
for year in years:
    index_set[year] = GPTSimpleVectorIndex(doc_set[year], chunk_size_limit=512)
    index_set[year].save_to_disk(f'index_{year}.json')
    

INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 156882 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 162641 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 173288 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 166363 tokens


In [9]:
# Reload the per-year vector indices from their saved JSON files,
# keyed by filing year.
index_set = {
    year: GPTSimpleVectorIndex.load_from_disk(f'index_{year}.json')
    for year in years
}

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [10]:
from gpt_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from gpt_index.composability import ComposableGraph

In [11]:
# Attach a short summary text to each per-year index.
for year in years:
    summary_text = f"UBER 10-k Filing for {year} fiscal year"
    index_set[year].set_text(summary_text)

In [12]:
# LLM wrapper used by the list index and graph: temperature 0 and
# at most 512 output tokens per completion.
llm_predictor = LLMPredictor(
    llm=OpenAI(
        temperature=0,
        max_tokens=512,
    ),
)

In [13]:
# Stack a list index on top of the per-year vector indices (in `years`
# order) so a single query can synthesize information across all of them.
per_year_indices = [index_set[y] for y in years]
list_index = GPTListIndex(per_year_indices, llm_predictor=llm_predictor)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 0 tokens


In [14]:
# Wrap the list index (and the vector indices beneath it) in a composable graph.
graph = ComposableGraph.build_from_index(list_index)

In [15]:
# Persist the graph to 10k_graph.json so later runs can reload it.
graph.save_to_disk('10k_graph.json')

In [16]:
# Reload the graph from 10k_graph.json, passing in the llm_predictor defined earlier.
graph = ComposableGraph.load_from_disk('10k_graph.json', llm_predictor=llm_predictor)

In [17]:
# TMP: prompt helper shared by both query configs below.
from gpt_index import PromptHelper

# NOTE(review): positional args are presumably
# (max_input_size, num_output, max_chunk_overlap) - confirm against the
# installed gpt_index version.
query_prompt_helper = PromptHelper(4096, 256, 0)

# Query settings for the graph, one entry per index struct type:
#   - "simple_dict": the per-year vector indices (top-1 similarity match)
#   - "list": the list index composed on top of them
# Both use the "tree_summarize" response mode.
query_configs = [
    dict(
        index_struct_type="simple_dict",
        query_mode="default",
        query_kwargs=dict(
            similarity_top_k=1,
            # include_summary=True is intentionally left disabled
            response_mode="tree_summarize",
            prompt_helper=query_prompt_helper,
        ),
    ),
    dict(
        index_struct_type="list",
        query_mode="default",
        query_kwargs=dict(
            response_mode="tree_summarize",
            verbose=True,
            prompt_helper=query_prompt_helper,
        ),
    ),
]

In [None]:
# Run the cross-year comparison through the graph's async query path and
# record how long it takes. asyncio.run() is called from inside the
# notebook here; the nest_asyncio.apply() cell near the top is what allows
# that when the kernel already has a running event loop.
import asyncio
import time

cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."

start_time = time.perf_counter()
response = asyncio.run(graph.aquery(cross_query_str, query_configs=query_configs))
elapsed_time = time.perf_counter() - start_time

In [19]:
# Show the synthesized answer followed by the elapsed time in seconds.
print(response)
print(elapsed_time)


• 2019: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes
• 2020: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes, impact of the COVID-19 pandemic
• 2021: Risk of Drivers being classified as employees, workers or quasi-employees instead of independent contractors, high competition in mobility, delivery, and logistics industries, risk of COVID-19 pandemic
• 2022: Risk of Drivers being classified as employees, workers or quasi-employees instead of independent contractors, competitive landscape of the mobility, delivery, and logistics industries, risk of having to lower fares or service fees, or offer significant Driver incentives and consumer discounts and promotions in order to remain competitive in certain markets, risk of incurring significant losses

• 2019-2020: Potential legal proceedings, competitive nature of the industry, potential for regulatory changes
• 2021-2022: Risk of Driver