In [None]:
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

In [2]:
# set text wrapping
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that makes <pre> output wrap long lines.

  Registered below as a ``pre_run_cell`` callback, so the style is
  re-emitted before each cell is executed.

  Args:
    info: Argument supplied by IPython when it invokes ``pre_run_cell``
      callbacks. It is not used here; it is accepted (with a default) so
      the function works both when called with no arguments and when
      IPython passes its execution-info object.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

get_ipython().events.register('pre_run_cell', set_css)

In [3]:
from llama_index import download_loader, GPTVectorStoreIndex, ServiceContext
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Years to process, most recent first: 2022, 2021, 2020, 2019.
years = list(range(2022, 2018, -1))

In [5]:
# Obtain the "UnstructuredReader" loader class via download_loader.
# NOTE(review): refresh_cache=True presumably forces a fresh fetch on every
# run instead of reusing a cached copy — confirm against the llama_index docs.
UnstructuredReader = download_loader(
    "UnstructuredReader",
    refresh_cache=True,
)

In [7]:
# Load one HTML file per year; keep the documents both grouped by year
# (doc_set) and flattened into a single list (all_docs).
loader = UnstructuredReader()
doc_set, all_docs = {}, []
for year in years:
    filing_path = Path(f'./data/UBER/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # tag every document of this file with the year it belongs to
    # (this replaces whatever extra_info the loader may have set)
    for doc in year_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\richa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\richa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [26]:
from langchain.chat_models  import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding
from llama_index import (
    LLMPredictor,
    ServiceContext
)

import os  # cell-local import: only needed to read credentials below

# Azure OpenAI settings. The key and endpoint are read from the environment
# so they never have to be typed into (and saved with) the notebook; when the
# variables are unset they fall back to '' — the previous placeholder value.
azure_api_key = os.environ.get("OPENAI_API_KEY", '')
azure_api_base = os.environ.get("OPENAI_API_BASE", '')
azure_api_version = '2023-03-15-preview'

# Chat model used for answering/synthesis, wrapped for llama_index.
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo",
            openai_api_key=azure_api_key,
            openai_api_base=azure_api_base,
            openai_api_type='azure',
            openai_api_version=azure_api_version,
            temperature=0.0
        )
llm_predictor = LLMPredictor(llm=llm)

# Embedding model pointing at the same Azure resource.
embedding_llm = LangchainEmbedding(
            OpenAIEmbeddings(
                model="text-embedding-ada-002",
                deployment="text-embedding-ada-002",
                openai_api_key=azure_api_key,
                openai_api_base=azure_api_base,
                openai_api_type='azure',
                openai_api_version=azure_api_version,
            ),
            embed_batch_size=1,
        )

# Fix: embedding_llm was previously constructed but never handed to the
# service context, so the Azure embedding configuration above was not used
# when building indices. Pass it explicitly via embed_model.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embedding_llm,
    chunk_size_limit=1024,
)

In [24]:
# One vector-store index per year, keyed by year and built from that
# year's documents with the shared service context.
index_set = {}
for year in years:
    index_set[year] = GPTVectorStoreIndex.from_documents(
        doc_set[year],
        service_context=service_context,
    )

RetryError: RetryError[<Future at 0x2b048825600 state=finished raised InvalidRequestError>]

In [18]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.indices.composability import ComposableGraph

In [19]:
# One human-readable summary string per year, in the same order as `years`.
index_summaries = []
for year in years:
    index_summaries.append(f"UBER 10-k Filing for {year} fiscal year")

In [None]:
# Build a ComposableGraph over the per-year indices (ordered like `years`,
# so each index lines up with its entry in index_summaries), passing
# GPTListIndex as the first argument as before.
per_year_indices = [index_set[year] for year in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    per_year_indices,
    index_summaries=index_summaries,
    service_context=service_context,
)