# Install llama-index and llama-parse dependencies

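Additionally, we'll need a LlamaCloud API key (`LLAMA_CLOUD_API_KEY`) and an OpenAI API key (`OPENAI_API_KEY`) to continue; the notebook loads both from a `.env` file.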

In [None]:
# Install the LlamaIndex framework and the LlamaParse client.
# `%pip` (rather than `!pip`) targets the running kernel's environment;
# -q keeps pip's output quiet and -U upgrades already-installed packages.
%pip install -qU llama-index llama-parse

In [1]:
import os
import sys

from dotenv import load_dotenv

# Load API keys from the `.env` file located in sys.path[0].
# os.path.join (rather than string concatenation) keeps the path relative to
# the working directory when sys.path[0] is the empty string, instead of
# pointing at "/.env" in the filesystem root.
path = os.path.join(sys.path[0], ".env")
load_dotenv(path)

# load_dotenv() exports the file's entries into os.environ, so the keys need
# no re-assignment. Check them explicitly instead: re-assigning a missing key
# with `os.environ[k] = os.getenv(k)` fails with an opaque
# "TypeError: str expected, not NoneType".
required_keys = ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY")
missing_keys = [key for key in required_keys if not os.getenv(key)]
if missing_keys:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_keys)}. "
        f"Define them in {path} or export them before starting the kernel."
    )

# Optional token: referenced only by the commented-out Hugging Face
# configuration, so it is allowed to be None.
inference_api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')



In [2]:
# llama-parse is async-first. A notebook kernel already runs an event loop, so
# calling llama-parse's synchronous API here requires nest_asyncio, which
# patches asyncio to allow nested event loops.
import nest_asyncio

nest_asyncio.apply()

## Initializing `LlamaParse` object

In [3]:
from llama_parse import LlamaParse

# Options for the LlamaParse client, collected in one place so they are easy
# to tweak. No API key is passed explicitly here.
parser_options = {
    "result_type": "markdown",
    "verbose": True,
    "language": "en",
    "num_workers": 2,
}

parser = LlamaParse(**parser_options)

## Load and Parse documents

In [5]:
# PDFs to parse; the resulting documents are indexed below in this order.
pdf_paths = [
    './data/ai-report.pdf',
    './data/goog-10-k-2023.pdf',
]
docs = parser.load_data(pdf_paths)

Started parsing the file under job_id ab409edf-b99b-4af7-b137-441bf8da9e6d
Started parsing the file under job_id 0b94c051-1d1f-4f8b-8f85-b24a368d0bd9


Looking into the parsed documents

In [6]:
# Preview the first 1000 characters of the first parsed document.
first_doc_preview = docs[0].text[:1000]
print(first_doc_preview)

# OFFICE OF Artificial Intelligence Educational Technology and the Future of Teaching and Learning Insights and Recommendations May 2023
---
## Artificial Intelligence and the Future of Teaching and Learning

Miguel A. Cardona, Ed.D.
Secretary, U.S. Department of Education

Roberto J. Rodríguez
Assistant Secretary, Office of Planning, Evaluation, and Policy Development

Kristina Ishmael
Deputy Director, Office of Educational Technology

May 2023

Examples Are Not Endorsements

This document contains examples and resource materials that are provided for the user’s convenience. The inclusion of any material is not intended to reflect its importance nor is it intended to endorse any views expressed or products or services offered. These materials may contain the views and recommendations of various subject matter experts as well as hypertext links, contact addresses, and websites to information created and maintained by other public and private organizations. The opinions expressed in any

In [7]:
# Preview the first 1000 characters of the second parsed document.
second_doc_preview = docs[1].text[:1000]
print(second_doc_preview)

## UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549

## FORM 10-K

(Mark One)

☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31, 2023 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to . Commission file number: 001-37580

Alphabet Inc. (Exact name of registrant as specified in its charter)

Delaware 61-1767919 (State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification No.)

1600 Amphitheatre Parkway Mountain View, CA 94043 (Address of principal executive offices, including zip code) (650) 253-0000 (Registrant's telephone number, including area code)

|Title of each class|Trading Symbol(s)|Name of each exchange on which registered|
|---|---|---|
|Class A Common Stock, $0.001 par value|GOOGL|Nasdaq Stock Market LLC (Nasdaq Global Select Market)|
|Class C Capital Stock, $0.001 par

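## LLM and embedding model
By default we use OpenAI's LLM and embedding models. An open-source alternative, served through the Hugging Face Inference API, is provided in the commented-out cell that follows.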

In [None]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Model names kept as constants so they can be swapped in one place.
LLM_MODEL_NAME = "gpt-3.5-turbo"
EMBED_MODEL_NAME = "text-embedding-3-small"

# Register the LLM and the embedding model on the shared Settings object.
Settings.llm = OpenAI(model=LLM_MODEL_NAME)
Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL_NAME)

In [None]:
# Alternative configuration (disabled): use models served through the
# Hugging Face Inference API instead of OpenAI. To try it, uncomment this cell
# and run it in place of the OpenAI configuration cell. It authenticates with
# `inference_api_key`, which is read from HUGGINGFACEHUB_API_TOKEN.
#
# from llama_index.core import Settings
# from llama_index.llms.huggingface import HuggingFaceInferenceAPI
# from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# from llama_index.embeddings.langchain import LangchainEmbedding
# # from llama_index.llms.openai import OpenAI

# Settings.llm = HuggingFaceInferenceAPI(
#         model_name="HuggingFaceH4/zephyr-7b-alpha", token=inference_api_key
#     )
# Settings.embed_model = LangchainEmbedding(HuggingFaceInferenceAPIEmbeddings(
#     api_key=inference_api_key, model_name="sentence-transformers/all-MiniLM-l6-v2")
# )

#### Using `MarkdownElementNodeParser` to parse markdown objects

In [None]:
from llama_index.core import Settings
from llama_index.core.node_parser import MarkdownElementNodeParser

# Reuse the LLM configured on Settings instead of constructing a separate
# OpenAI('gpt-3.5-turbo') client. This keeps the parser in sync with the
# notebook-wide model choice and removes the hidden dependency on `OpenAI`
# having been imported by an earlier cell (which breaks if the Hugging Face
# configuration is used instead).
node_parser = MarkdownElementNodeParser(llm=Settings.llm, num_workers=8)

Parsing the first document (the AI report) into nodes

In [None]:
# Only the first parsed document is converted into nodes here.
first_doc = docs[0]
nodes = node_parser.get_nodes_from_documents(documents=[first_doc])

In [None]:
# Split the parsed nodes into the two collections returned by the parser;
# both are combined again when the index is built.
nodes_and_objects = node_parser.get_nodes_and_objects(nodes)
base_nodes, objects = nodes_and_objects

In [None]:
from llama_index.core import VectorStoreIndex

# Build the vector index over the base nodes together with the objects
# returned by the node parser.
index_nodes = base_nodes + objects
recursive_index = VectorStoreIndex(nodes=index_nodes)

In [None]:
# Install the FlagEmbedding reranker integration for LlamaIndex, plus the
# FlagEmbedding package itself straight from its GitHub repository.
# NOTE(review): neither install is pinned to a version or commit, so results
# may differ between runs; consider pinning for reproducibility.
%pip install -qU llama-index-postprocessor-flag-embedding-reranker git+https://github.com/FlagOpen/FlagEmbedding.git

In [None]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Retrieval settings, named so the values are easy to find and tune.
RERANKER_MODEL = "BAAI/bge-reranker-large"
SIMILARITY_TOP_K = 15  # passed to the query engine as similarity_top_k
RERANK_TOP_N = 5       # passed to the reranker as top_n

reranker = FlagEmbeddingReranker(model=RERANKER_MODEL, top_n=RERANK_TOP_N)

# Query engine over the index built above, with the reranker attached as a
# node postprocessor.
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=SIMILARITY_TOP_K,
    verbose=True,
)

In [None]:
# Placeholder question: replace it with a question about the indexed document.
query = "your question based on context"
response = recursive_query_engine.query(query)

# Show the answer; without this the cell computes a response but displays nothing.
print(response)