In [32]:
import getpass
import os

import datasets
import faiss
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document, load_index_from_storage, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

In [33]:
# SECURITY: an OpenAI API key used to be hardcoded in this cell. Anything that was
# committed or shared with this notebook must be treated as leaked -- revoke that key.
# The key is now taken from the environment; if it is not set, prompt for it without
# echoing so the secret never lands in the notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

In [11]:
# Root directory holding the on-disk datasets. Override with the HOTPOT_QA_DATA_DIR
# environment variable; the default is the original local path, so existing setups
# keep working unchanged.
DATA_DIR = os.environ.get("HOTPOT_QA_DATA_DIR", "/Users/haochen/workspace/data")
# Number of Wikipedia documents to keep for indexing.
NUM_WIKI_DOCS = 100

hotpot_qa = datasets.load_from_disk(os.path.join(DATA_DIR, "hotpot_qa_filtered"))
wiki_docs_all = datasets.load_from_disk(os.path.join(DATA_DIR, "hotpot_qa_wiki_docs"))
# Clamp to the dataset size so a corpus with fewer than NUM_WIKI_DOCS rows does not
# trigger an out-of-range error in select().
wiki_docs = wiki_docs_all.select(range(min(NUM_WIKI_DOCS, len(wiki_docs_all))))

In [12]:
# Display the loaded question-answering dataset (its splits, features and row counts).
hotpot_qa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 76715
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 6260
    })
})

In [13]:
# Display the selected subset of Wikipedia documents (features and row count).
wiki_docs

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 100
})

In [14]:
# Wrap every row of wiki_docs in a llama_index Document: the row's text becomes the
# document body, its id becomes the document id, and url/title travel as metadata.
documents = []
for wiki_row in wiki_docs:
    row_metadata = {"url": wiki_row["url"], "title": wiki_row["title"]}
    documents.append(
        Document(text=wiki_row["text"], doc_id=wiki_row["id"], metadata=row_metadata)
    )

## Create index

In [15]:
# Chunking parameters handed to SentenceSplitter; named so they can be tuned in one place.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Split the documents into nodes (chunks) that will be embedded and indexed.
chunk_splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Backward-compatible alias: the splitter was originally bound to this misspelled name.
chuck_splitter = chunk_splitter
nodes = chunk_splitter.get_nodes_from_documents(documents)

In [18]:
# Hugging Face model used to embed both the indexed chunks and the queries.
EMBED_MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"

# trust_remote_code=True allows code files shipped with the model repository to be
# downloaded and executed (see the configuration.py / modeling.py notices in this
# cell's output). NOTE(review): the notice recommends pinning a revision -- consider it.
embed_model = HuggingFaceEmbedding(
    model_name=EMBED_MODEL_NAME,
    trust_remote_code=True,
)

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [19]:
# Embed a short probe sentence as a quick check that the embedding model works.
probe_text = "Hello World!"
embeddings = embed_model.get_text_embedding(probe_text)

In [21]:
# Length of the probe embedding, i.e. the vector dimension produced by embed_model.
len(embeddings)

1024

In [26]:
# Derive the vector dimension from the embedding model rather than hardcoding 1024,
# so the FAISS index cannot silently disagree with the model if the model is changed.
d = len(embed_model.get_text_embedding("dimension probe"))
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)

# The storage context ties the FAISS vector store to a docstore holding the nodes.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(nodes)

# Building the index embeds every node with embed_model; show_progress renders the
# progress bar visible in this cell's output.
index = VectorStoreIndex(
    nodes, storage_context=storage_context, embed_model=embed_model, show_progress=True
)

Generating embeddings: 100%|██████████| 252/252 [01:08<00:00,  3.68it/s]


### Persist index

In [27]:
# Write the index's storage context to a local directory so it can be reloaded later.
storage_dir = "./storage"
index.storage_context.persist(persist_dir=storage_dir)

### Load index

In [29]:
# Reload the index from the directory it was persisted to. The FAISS vector store is
# restored first and then handed to the storage context, which reads from the same directory.
persist_dir = "./storage"
vector_store = FaissVectorStore.from_persist_dir(persist_dir)
storage_context = StorageContext.from_defaults(
    persist_dir=persist_dir,
    vector_store=vector_store,
)
index = load_index_from_storage(
    storage_context=storage_context,
    embed_model=embed_model,
)

## Query index

In [34]:
# Ask one question against the index using a query engine with default settings.
# NOTE(review): no LLM is passed here; presumably the default one is what needs the
# OPENAI_API_KEY set earlier -- confirm.
question = "What did the author do growing up?"
query_engine = index.as_query_engine()
response = query_engine.query(question)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
# Display the full Response object: the generated answer plus the retrieved source nodes.
# NOTE(review): the bare repr prints every source node in full and produces a very large
# output; consider showing only the answer text and a short summary of the sources.
response

Response(response="The author would often go up to his father's office to draw while his father was working, and he recalls hearing his father doing therapy with patients upstairs.", source_nodes=[NodeWithScore(node=TextNode(id_='00ba2020-40ad-4abf-93f2-64b5fa7b60bb', embedding=None, metadata={'url': 'https://en.wikipedia.org/wiki/Edward%20Sharpe%20and%20the%20Magnetic%20Zeros', 'title': 'Edward Sharpe and the Magnetic Zeros'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='15824631', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'url': 'https://en.wikipedia.org/wiki/Edward%20Sharpe%20and%20the%20Magnetic%20Zeros', 'title': 'Edward Sharpe and the Magnetic Zeros'}, hash='5f7deac381edeefd84f856c65b69056c51abad26df842234f1260b70b778d6a1'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='f4a761e2-de6d-4c89-ba53-20670f1a8584', node_type=<ObjectType.TEXT: '1'>, metadata={'url': 'https://en.w