In [1]:
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

mkdir: cannot create directory ‘data’: File exists
--2023-04-17 19:02:24--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/948jr9cfs7fgj99/UBER.zip [following]
--2023-04-17 19:02:24--  https://www.dropbox.com/s/dl/948jr9cfs7fgj99/UBER.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc1001c276b1921faf30a16a022.dl.dropboxusercontent.com/cd/0/get/B6bcRz_9WwprtoTL16Ml04tUVrdAYD7jxldrY39JEWnrY99aZ58ry_fxNNZxJ9oLBROEWFtIY-Wtu9DoWpzKaiKJTUn-j-1mDIRMXngXOV0deIke4g98TilzGoikCh-7p3bOhNotBLslHnt2b4I0ncRe1PDsv7BnktdUofZ5SrZmig/file?dl=1# [following]
--2023-04-17 19:02:25--  https://ucc1001c276b1921faf30a16a022.dl.dropboxusercontent.com/cd/0/get/B6bcRz_9WwprtoTL16Ml04tUVrdAYD7jxldrY39

In [1]:
# set text wrapping
from IPython.display import HTML, display

def set_css(info=None):
  """Display a <style> block that makes <pre> output wrap long lines.

  Args:
    info: Argument passed by IPython to ``pre_run_cell`` callbacks. It is
      not used here; it defaults to None so the function can still be
      called with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Register the hook so the CSS is emitted before each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
from gpt_index import download_loader, GPTSimpleVectorIndex, ServiceContext
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


### Ingest Unstructured Data Through the Unstructured.io Reader

Leverage the capabilities of Unstructured.io HTML parsing.
Downloaded through LlamaHub.

In [4]:
# fiscal years of the filings to ingest, newest first (2022 down to 2019)
years = list(range(2022, 2018, -1))

In [5]:
# Obtain the "UnstructuredReader" loader class through download_loader.
# NOTE(review): refresh_cache=True presumably bypasses any locally cached copy
# and re-fetches the loader on every run — confirm against the gpt_index docs.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

In [6]:
# Parse each year's filing and keep the documents both per year (doc_set)
# and in one flat list (all_docs).
loader = UnstructuredReader()
doc_set = {}
all_docs = []
for year in years:
    filing_path = Path(f'./data/UBER/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # tag every document with the fiscal year it came from
    for doc in year_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs

[nltk_data] Downloading package punkt to /home/loganm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/loganm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


### Set Up a Vector Index for Each SEC Filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into the vector store.

In [7]:
# Initialize one simple vector index per filing year.
# Building an index embeds the whole filing, so a previously saved
# index_<year>.json is reused when it exists and the index is only built
# (and saved) on a cache miss. This makes the cell safe to re-run.
# Delete index_<year>.json to force a rebuild for that year.
index_set = {}
service_context = ServiceContext.from_defaults(chunk_size_limit=512)
for year in years:
    index_path = Path(f'index_{year}.json')
    if index_path.exists():
        cur_index = GPTSimpleVectorIndex.load_from_disk(str(index_path))
    else:
        cur_index = GPTSimpleVectorIndex.from_documents(doc_set[year], service_context=service_context)
        cur_index.save_to_disk(str(index_path))
    index_set[year] = cur_index
    

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 232797 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 241424 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 257154 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 246480 tokens


In [8]:
# Rebuild index_set from the per-year JSON files saved on disk
index_set = {
    year: GPTSimpleVectorIndex.load_from_disk(f'index_{year}.json')
    for year in years
}

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [9]:
from gpt_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from gpt_index.indices.composability import ComposableGraph

In [10]:
# set summary text for each doc
# one summary string per filing, in the same order as `years`
index_summaries = []
for year in years:
    index_summaries.append(f"UBER 10-k Filing for {year} fiscal year")

In [11]:
# LLM predictor: temperature 0, output capped at 512 tokens
llm_predictor = LLMPredictor(
    llm=OpenAI(temperature=0, max_tokens=512),
)
# This rebinds `service_context` (previously created with chunk_size_limit=512)
# to a context that carries the predictor above.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
)

In [12]:
# Compose a list index on top of the per-year vector indices so a single
# query can draw on every filing.
# Child indices are listed in the same order as `years`.
child_indices = [index_set[y] for y in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    child_indices,
    index_summaries=index_summaries,
    service_context=service_context,
)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [13]:
# Persist the composed graph to 10k_graph.json in the working directory.
graph.save_to_disk('10k_graph.json')

In [14]:
# Reload the graph from 10k_graph.json, passing the current service_context
# (the one built around llm_predictor), and rebind `graph` to the loaded copy.
graph = ComposableGraph.load_from_disk('10k_graph.json', service_context=service_context)

## Setting up the Chatbot Agent

We use LangChain to define the outer chatbot abstraction. We use LlamaIndex as a core Tool within this abstraction.

In [15]:
from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent

from gpt_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, GraphToolConfig

In [16]:
# query transform built on the shared llm_predictor
from gpt_index.indices.query.query_transform.base import DecomposeQueryTransform

decompose_transform = DecomposeQueryTransform(llm_predictor, verbose=True)

# settings for "simple_dict" index structs (presumably the per-year vector
# indices — confirm): top-1 retrieval, with the decompose transform applied
vector_query_config = {
    "index_struct_type": "simple_dict",
    "query_mode": "default",
    "query_kwargs": {
        "similarity_top_k": 1,
        # "include_summary": True
    },
    "query_transform": decompose_transform,
}

# settings for "list" index structs: tree_summarize response mode, verbose
list_query_config = {
    "index_struct_type": "list",
    "query_mode": "default",
    "query_kwargs": {
        "response_mode": "tree_summarize",
        "verbose": True,
    },
}

# query configs handed to the graph tool
query_configs = [vector_query_config, list_query_config]

In [17]:
# define toolkit
# One tool per filing year. The years come from `years` rather than a
# hard-coded range, so the tools always match the keys of index_set.
# sorted() keeps the original ascending order (2019 -> 2022).
index_configs = [
    IndexToolConfig(
        index=index_set[y],
        name=f"Vector Index {y}",
        description=f"useful for when you want to answer queries about the {y} SEC 10-K for Uber",
        index_query_kwargs={"similarity_top_k": 3},
        tool_kwargs={"return_direct": True, "return_sources": True},
    )
    for y in sorted(years)
]

# graph config: a single tool for questions that span several filings
graph_config = GraphToolConfig(
    graph=graph,
    name="Graph Index",
    description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber.",
    query_configs=query_configs,
    tool_kwargs={"return_direct": True, "return_sources": True},
    return_sources=True
)

# bundle the per-year tools and the graph tool for the agent
toolkit = LlamaToolkit(
    index_configs=index_configs,
    graph_configs=[graph_config]
)

In [18]:
# Chat agent over the toolkit: buffer memory stored under the "chat_history"
# key, a temperature-0 OpenAI LLM, and verbose tracing turned on.
memory = ConversationBufferMemory(
    memory_key="chat_history",
)
llm = OpenAI(temperature=0)
agent_chain = create_llama_chat_agent(toolkit, llm, memory=memory, verbose=True)

In [19]:
# Send a plain greeting. In the recorded output below the agent replies
# directly, without using a tool.
agent_chain.run(input="hi, i am bob")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? No
AI: Hi Bob, nice to meet you! How can I help you today?[0m

[1m> Finished chain.[0m


'Hi Bob, nice to meet you! How can I help you today?'

In [20]:
# Ask a single-year question. In the recorded output below the agent picks
# the "Vector Index 2020" tool and returns a dict-formatted string with an
# 'answer' key.
agent_chain.run(input="What were some of the biggest risk factors in 2020 for Uber?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Vector Index 2020
Action Input: Risk Factors[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 2198 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 3 tokens



Observation: [33;1m[1;3m{'answer': '\n\nRisk factors include the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential classification of Drivers as employees, workers or quasi-employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, unresolved staff comments, legal proceedings, mine safety disclosures, market for registrant’s common equity, related stockholder matters and issuer purchases of equity securities, quantitative and qualitative disclosures about market risk, changes in and disagreements with accountants on accounting and financial disclosure, controls and procedures, certain relationships and related transactions, and director independence, and principal accounting fees and services. Additionally, this Annual Report on Form 10-K contains forward-looking statements within the meaning of the Private Securities Litigation Reform Act of 1995, which may affect the accuracy of 

"{'answer': '\\n\\nRisk factors include the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential classification of Drivers as employees, workers or quasi-employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, unresolved staff comments, legal proceedings, mine safety disclosures, market for registrant’s common equity, related stockholder matters and issuer purchases of equity securities, quantitative and qualitative disclosures about market risk, changes in and disagreements with accountants on accounting and financial disclosure, controls and procedures, certain relationships and related transactions, and director independence, and principal accounting fees and services. Additionally, this Annual Report on Form 10-K contains forward-looking statements within the meaning of the Private Securities Litigation Reform Act of 1995, which may affect the accuracy of the information provided

In [21]:
# question that spans every filing year; the agent's reply is kept in `response`
cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."
response = agent_chain.run(
    input=cross_query_str,
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Graph Index
Action Input: Compare/contrast the risk factors described in the Uber 10-K across years.
AI: The Graph Index tool has been used to compare and contrast the risk factors described in the Uber 10-K across years. The following are the key differences: 

• In 2020, the risk factors included the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential classification of Drivers as employees, workers or quasi-employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, unresolved staff comments, legal proceedings, mine safety disclosures, market for registrant’s common equity, related stockholder matters and issuer purchases of equity securities, quantitative and qualitative disclosures about market risk, changes in and disagreements with accountants on accounting and financial dis

In [None]:
# parse the response w/ sources
import ast
import json


def _parse_agent_response(raw):
    """Turn an agent reply string into a dict without raising on plain text.

    The tool observations recorded in this notebook are Python-repr dicts
    (single-quoted keys such as 'answer'), which ``json.loads`` rejects.
    Strict JSON is tried first, then a Python literal. If the reply is
    free-form text, it is wrapped as ``{"answer": raw}``.

    Args:
        raw: The string returned by ``agent_chain.run``.

    Returns:
        A dict parsed from ``raw``, or ``{"answer": raw}`` when ``raw`` is
        not a JSON object or a Python dict literal.
    """
    for parse in (json.loads, ast.literal_eval):
        try:
            parsed = parse(raw)
        except (ValueError, SyntaxError):
            # json.JSONDecodeError is a ValueError; literal_eval raises
            # ValueError or SyntaxError on text that is not a literal
            continue
        if isinstance(parsed, dict):
            return parsed
    return {"answer": raw}


response_json = _parse_agent_response(response)
# show the parsed structure rather than the raw string
print(response_json)

### Set Up Chatbot Loop Within Notebook

We'll keep a running loop so that you can converse with the agent. 

In [39]:
# reinitialize agent
memory = ConversationBufferMemory(memory_key="chat_history")
llm=OpenAI(temperature=0)
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
)

In [None]:
# Interactive chat loop. Type "exit" or "quit" (any case) to stop.
# Interrupting the kernel or closing the input stream while the prompt is
# waiting also ends the loop cleanly, without a traceback.
EXIT_COMMANDS = {"exit", "quit"}
while True:
    try:
        text_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        break
    if text_input.strip().lower() in EXIT_COMMANDS:
        break
    response = agent_chain.run(input=text_input)
    print(f'Agent: {response}')
    

User:  What were some of the legal proceedings against Uber in 2022?


INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 1957 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens


Agent: 

In 2022, legal proceedings against Uber include a motion to compel arbitration, an appeal of a ruling that Proposition 22 is unconstitutional, a complaint alleging that drivers are employees and entitled to protections under the wage and labor laws, a summary judgment motion, allegations of misclassification of drivers and related employment violations in New York, fraud related to certain deductions, class actions in Australia alleging that Uber entities conspired to injure the group members during the period 2014 to 2017 by either directly breaching transport legislation or commissioning offenses against transport legislation by UberX Drivers in Australia, and claims of lost income and decreased value of certain taxi. Additionally, Uber is facing a challenge in California Superior Court alleging that Proposition 22 is unconstitutional, and a preliminary injunction order prohibiting Uber from classifying Drivers as independent contractors and from violating various wage and h