In [1]:
import os
import nest_asyncio
import streamlit as st

# Copy the OpenAI API key from the Streamlit secrets store into the process
# environment, where the OpenAI client libraries look for it.
os.environ.update(OPENAI_API_KEY=st.secrets["OPENAI_API_KEY"])

# Helper for running inside a notebook: nest_asyncio patches asyncio so that
# event loops can be nested (the notebook kernel already runs one).
nest_asyncio.apply()

In [2]:
# 下載 UBER 10-K 文件
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

--2024-05-31 03:48:17--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
正在查找主機 www.dropbox.com (www.dropbox.com)... 162.125.85.18
正在連接 www.dropbox.com (www.dropbox.com)|162.125.85.18|:443... 連上了。
已送出 HTTP 要求，正在等候回應... 302 Found
位置: /scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1 [跟隨至新的 URL]
--2024-05-31 03:48:17--  https://www.dropbox.com/scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1
繼續使用和 www.dropbox.com:443 的連線。
已送出 HTTP 要求，正在等候回應... 302 Found
位置: https://uc09fdca175f3797d1d3ebc3a8eb.dl.dropboxusercontent.com/cd/0/inline/CT4ynsdCUOa4ch2dkXNbkS_tQIgkYeeCJrP9XhFZLyh0qDwkpWWNB87zRKn_-KI9JNSaTnBq_G23jVfQE4BDio7vc_qRaX3-jxOIy5-hgwiszV1cQbr-BehheFulUL0x5kc/file?dl=1# [跟隨至新的 URL]
--2024-05-31 03:48:18--  https://uc09fdca175f3797d1d3ebc3a8eb.dl.dropboxusercontent.com/cd/0/inline/CT4ynsdCUOa4ch2dkXNbkS_tQIgkYeeCJrP9XhFZLyh0qDwkpWWNB87zRKn_-KI9JNSaTnBq_G23jVfQE4BDio7vc_qRaX3-jxOIy5-hgwiszV1cQbr-BehheFulUL0x5kc/file?dl=1
正在查找主機 u

In [3]:
from llama_index.readers.file import UnstructuredReader
from pathlib import Path

# The downloaded archive contains filings for four fiscal years.
years = [2022, 2021, 2020, 2019]

loader = UnstructuredReader()
doc_set = {}    # year -> documents loaded for that year's filing
all_docs = []   # every loaded document, across all years

for year in years:
    filing_path = Path(f"./data/UBER/UBER_{year}.html")
    filing_docs = loader.load_data(file=filing_path, split_documents=False)

    # Replace each document's metadata with just the filing year.
    for doc in filing_docs:
        doc.metadata = {"year": year}

    doc_set[year] = filing_docs
    all_docs += filing_docs

In [4]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Imported inside this cell because the cache branch below needs it and this
# cell must not depend on a later cell having been run first.
from llama_index.core import load_index_from_storage

# Global LlamaIndex settings: chunking parameters plus the LLM and embedding
# model, whose names are read from the Streamlit secrets store.
Settings.chunk_size = 512
Settings.chunk_overlap = 64
Settings.llm = OpenAI(
    model=st.secrets["OPENAI_MODEL"]
)
Settings.embed_model = OpenAIEmbedding(
    model=st.secrets["OPENAI_EMBEDDING_MODEL"]
)

# Build one vector index per filing year and persist it under ./storage/<year>.
# Building an index embeds the whole filing through the OpenAI embedding model,
# so a year whose persist directory already exists is loaded from disk instead
# of being re-embedded. Delete ./storage/<year> to force a rebuild, e.g. after
# changing the chunking settings or the embedding model.
index_set = {}
for year in years:
    persist_dir = f"./storage/{year}"
    if Path(persist_dir).is_dir():
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        cur_index = load_index_from_storage(storage_context)
    else:
        storage_context = StorageContext.from_defaults()
        cur_index = VectorStoreIndex.from_documents(
            doc_set[year],
            storage_context=storage_context
        )
        storage_context.persist(persist_dir=persist_dir)
    index_set[year] = cur_index


In [5]:
from llama_index.core import load_index_from_storage

# Reload every per-year index from its persisted directory under ./storage,
# keyed by filing year.
index_set = {
    year: load_index_from_storage(
        StorageContext.from_defaults(persist_dir=f"./storage/{year}")
    )
    for year in years
}


In [6]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

# One query-engine tool per filing year, each wrapping that year's vector index.
individual_query_engine_tools = []
for year in years:
    year_engine = index_set[year].as_query_engine()
    year_metadata = ToolMetadata(
        name=f"vector_index_{year}",
        description=f"當您想要回答有關 Uber {year} SEC 10-K 的查詢時非常有用",
    )
    individual_query_engine_tools.append(
        QueryEngineTool(query_engine=year_engine, metadata=year_metadata)
    )

# Sub-question engine built on top of all the per-year tools.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=individual_query_engine_tools,
)


In [7]:
from llama_index.agent.openai import OpenAIAgent

# Wrap the sub-question engine as a tool of its own, described as the one to
# use for queries that span several 10-K filings.
multi_filing_metadata = ToolMetadata(
    name="sub_question_query_engine",
    description="當您想要回答需要分析 Uber 的多個 SEC 10-K 文件的查詢時非常有用",
)
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=multi_filing_metadata,
)

# The agent gets every per-year tool plus the multi-filing tool.
tools = [*individual_query_engine_tools, query_engine_tool]

# verbose=True prints the agent's function calls and their outputs.
agent = OpenAIAgent.from_tools(tools, verbose=True)


In [8]:
# Preset cross-year comparison prompt, defined first so that all preset prompts
# can be sent from a single loop.
cross_query_str = (
    "比較/對比多年來 Uber 10-K 中描述的風險因素。以要點的形式給出答案。"
)

# Send the preset prompts to the agent in order (a greeting, a single-year
# question, then the cross-year comparison) and print each answer.
for preset_prompt in (
    "哈囉，我是小柱子",
    "2020 年 Uber 面臨的最大風險因素有哪些？",
    cross_query_str,
):
    response = agent.chat(preset_prompt)
    print(response)


Added user message to memory: 哈囉，我是小柱子
哈囉小柱子，很高興見到你！有什麼我可以幫忙的嗎？
Added user message to memory: 2020 年 Uber 面臨的最大風險因素有哪些？
=== Calling Function ===
Calling function: vector_index_2020 with args: {"input":"What were the major risk factors faced by Uber in 2020?"}
Got output: In 2020, Uber faced several major risk factors including:

1. **Data Privacy and Protection Risks**: Uber was concerned about potential investigations, inquiries, litigation, fines, and negative publicity related to its data handling practices.

2. **Intellectual Property Risks**: There was a risk of significant expenses and adverse effects on the business if Uber failed to protect its intellectual property or if it was accused of infringing on the intellectual property of others.

3. **Stock Volatility**: The market price of Uber’s common stock was subject to volatility and could decline regardless of the company's operating performance, potentially failing to meet investor or analyst expectations.

4. **Impact of COV

In [10]:
# Re-create the agent, this time without verbose tracing, for the interactive
# session below.
agent = OpenAIAgent.from_tools(tools)

# Minimal interactive chat loop. Type "exit" (any case, surrounding whitespace
# ignored) to stop.
# NOTE: this cell blocks on stdin, so it will stall an unattended "Run All".
while True:
    try:
        text_input = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: end the session
        # cleanly instead of surfacing a traceback.
        break
    if text_input.lower() == "exit":
        break
    if not text_input:
        # Do not spend an API request on an empty message.
        continue
    response = agent.chat(text_input)
    print(f"Agent: {response}")

# Example session:
#   User: What were some of the legal proceedings against Uber in 2022?
#   Agent: (example response)


Agent: 2022年Uber的主要利潤來源是大都會區和機場往返的行程，這些行程佔總預訂額的一大部分。
