In [7]:
# install dependencies
%pip install jedi>=0.16
%pip install -Uq llama-index

In [4]:
# import api key secret from Colab
from google.colab import userdata

# verify that key can be loaded
print(userdata.get('OPENAI_API_KEY')[-4:])

HMUA


In [5]:
# make sure we are inside of /content/
%ls

'=0.16'   [0m[01;34msample_data[0m/


In [10]:
# importing llamaindex libraries for RAG agent setupimport os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# set the OpenAI API key as an environment variable
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# set openai language and embedding models
llm_model = OpenAI(model="gpt-5-nano-2025-08-07", temperature=0.1)
embedding_model = OpenAIEmbedding(model="text-embedding-3-small")

# check that models were set correctly
print(f"GPT model: {llm_model.model}")
print(f"embedding model: {embedding_model.model_name}")

GPT model: gpt-5-nano-2025-08-07
embedding model: text-embedding-3-small


In [13]:
# load documents
documents = SimpleDirectoryReader("data").load_data()

# create vector store index from documents
index = VectorStoreIndex.from_documents(documents, llm_model=llm_model, embed_model=embedding_model)

# create query engine that can answer questions about indexed documents
query_engine = index.as_query_engine()

response = query_engine.query("When did she become a superstar?")
print(response)

She became a superstar after executing one of the most successful comebacks in history with her sixth studio album, "reputation."


In [14]:
'''
we can persist the indexed data into disk
so that we won't have to re-index it later
'''

import os.path
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)

# check if storage already exists
PERSIST_DIR = "./storage"

if not os.path.exists(PERSIST_DIR):
    # load the documents and create the index
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# Either way we can now query the index
query_engine = index.as_query_engine()
response = query_engine.query("When did she become a superstar?")
print(response)


Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/index_store.json.
She became a superstar after executing one of the most successful comebacks in history with her sixth studio album, "reputation."


In [15]:
# setup llamacloud api key for parsing
import getpass
import os

os.environ["LLAMA_CLOUD_API_KEY"] = getpass.getpass()

··········


In [16]:
import nest_asyncio

nest_asyncio.apply()

In [17]:
from llama_parse import LlamaParse

# parse document and return plain text version
documents = LlamaParse(result_type="text").load_data("data/taylor_swift_biography.html")

# create new index from parsed documents
index = VectorStoreIndex.from_documents(documents)

# same as before
query_engine = index.as_query_engine()
response = query_engine.query("why is she so stuck up on her exes?")
print(response)

Started parsing the file under job_id 5536cb7a-787b-478c-9b1e-377c0d4c39f5
She often draws inspiration from her personal experiences, including past relationships, to create music that resonates with her audience.
