Add support for Vespa Nodes and indexing with additional fields #13356

Closed
wants to merge 9 commits into from
@@ -0,0 +1,5 @@
from llama_index.core.schema import TextNode


class VespaNode(TextNode):
    vespa_fields: dict
Collaborator:
Did we need a new object class for this? I wonder if this could just be a metadata field?
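For reference, a minimal sketch of what the metadata-based alternative could look like, assuming the extra Vespa document fields are carried under a reserved metadata key; the "vespa_fields" key name and the pop-based handling below are hypothetical, not part of this PR:

from llama_index.core.schema import TextNode

# Hypothetical convention: stash the extra Vespa document fields under a
# reserved metadata key instead of introducing a VespaNode subclass.
node = TextNode(
    text="The Shawshank Redemption",
    metadata={
        "vespa_fields": {
            "title": "The Shawshank Redemption",
            "author": "Stephen King",
            "year": 1994,
        }
    },
)

# The vector store could then pop the reserved key while building the feed entry.
extra_fields = node.metadata.pop("vespa_fields", {})
entry = {"id": node.node_id, "fields": {**extra_fields, "text": node.get_content()}}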

@@ -15,6 +15,7 @@
    metadata_dict_to_node,
)

from llama_index.schema.vespa.vespa_node import VespaNode
from llama_index.vector_stores.vespa.templates import hybrid_template

import asyncio
@@ -157,7 +158,14 @@ def client(self) -> Vespa:

    def _try_get_running_app(self) -> Vespa:
        app = Vespa(url=f"{self.url}:{self.port}")

        status = app.get_application_status()
        try:
            # status may be None when nothing is reachable at this URL/port;
            # accessing status_code then raises AttributeError, which we turn
            # into a clearer ConnectionError below.
            status.status_code
        except AttributeError:
            raise ConnectionError(
                f"Vespa application not running on url {self.url} and port {self.port}. Please start the Vespa application first."
            )
        if status.status_code == 200:
            return app
        else:
@@ -213,14 +221,11 @@ def add(
                node, remove_text=False, flat_metadata=self.flat_metadata
            )
            logger.debug(f"Metadata: {metadata}")
            entry = {
                "id": node.node_id,
                "fields": {
                    "id": node.node_id,
                    "text": node.get_content(metadata_mode=MetadataMode.NONE) or "",
                    "metadata": json.dumps(metadata),
                },
            }

            if isinstance(node, VespaNode):
                entry = self._get_vespa_node_entry(node)
            else:
                entry = self._get_base_node_entry(node, metadata)
            if self.embeddings_outside_vespa:
                entry["fields"]["embedding"] = node.get_embedding()
            data_to_insert.append(entry)
@@ -235,6 +240,28 @@ def add(
        )
        return ids

    def _get_base_node_entry(self, node: BaseNode, metadata: dict):
        entry = {
            "id": node.node_id,
            "fields": {
                "id": node.node_id,
                "text": node.get_content(metadata_mode=MetadataMode.NONE) or "",
                "metadata": json.dumps(metadata),
            },
        }
        return entry

    def _get_vespa_node_entry(self, node: VespaNode):
        vespa_fields = node.vespa_fields
        metadata = node_to_metadata_dict(
            node, remove_text=False, flat_metadata=self.flat_metadata
        )
        vespa_fields.update(
            {
                "text": node.get_content(metadata_mode=MetadataMode.NONE) or "",
                "metadata": json.dumps(metadata),
            }
        )
        entry = {"id": node.node_id, "fields": vespa_fields}
        return entry

    async def async_add(
        self,
        nodes: List[BaseNode],
@@ -465,18 +492,38 @@ def query(
        ids: List[str] = []
        similarities: List[float] = []
        for hit in response.hits:
            response_fields: dict = hit.get("fields", {})
            metadata = response_fields.get("metadata", {})
            metadata = json.loads(metadata)
            logger.debug(f"Metadata: {metadata}")
            node = metadata_dict_to_node(metadata)
            text = response_fields.get("body", "")
            node.set_content(text)
            node = self._vespa_hit_to_node(hit)
Contributor:
This was a nice refactor! 🥇

            nodes.append(node)
            ids.append(response_fields.get("id"))
            id = hit["fields"].get("id")
            ids.append(id)
            similarities.append(hit["relevance"])
        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=similarities)

    def _vespa_hit_to_node(self, hit: dict) -> BaseNode | VespaNode:
        response_fields = hit.get("fields", {})
        if self._is_vespa_node(response_fields):
            node = self._get_vespa_node_from_fields(response_fields)
        else:
            node = self._get_base_node_from_fields(response_fields)
        text = response_fields.get("body", "")
        node.set_content(text)
        return node

    def _is_vespa_node(self, fields: dict) -> bool:
        # Check if there are more fields than text and metadata
        base_node_fields = ["text", "metadata"]
        return len([f for f in fields if f not in base_node_fields]) > 0

    def _get_vespa_node_from_fields(self, fields: dict):
        vespa_fields = fields
        return VespaNode(vespa_fields=vespa_fields)

    def _get_base_node_from_fields(self, fields: dict):
        metadata = fields.get("metadata", {})
        # The stored metadata field is a JSON string (see json.dumps in add()),
        # so decode it before reconstructing the node.
        metadata = json.loads(metadata)
        node = metadata_dict_to_node(metadata)
        return node


    async def aquery(
        self,
        query: VectorStoreQuery,
@@ -94,3 +94,87 @@
        )
    ],
)

# I am not sure what to name this, so I chose a name that highlights the difference from hybrid_template.
# It should be renamed to something more meaningful.
with_fields_template = ApplicationPackage(
Comment on lines +98 to +100
Contributor:
I think this is good for now, and we can improve template naming once we get a better feel for use cases.

name="withfields",
schema=[
Schema(
name="doc",
document=Document(
fields=[
Field(name="id", type="string", indexing=["summary"]),
Field(name="title", type="string", indexing=["summary"]),
Field(name="author", type="string", indexing=["summary"]),
Field(name="theme", type="string", indexing=["summary"]),
Field(name="year", type="int", indexing=["summary"]),
Field(name="metadata", type="string", indexing=["summary"]),
Field(
name="text",
type="string",
indexing=["index", "summary"],
index="enable-bm25",
bolding=True,
),
Field(
name="embedding",
type="tensor<float>(x[384])",
indexing=[
"input text",
"embed",
"index",
"attribute",
],
ann=HNSW(distance_metric="angular"),
is_document_field=False,
),
]
),
fieldsets=[FieldSet(name="default", fields=["text", "metadata"])],
rank_profiles=[
RankProfile(
name="bm25",
inputs=[("query(q)", "tensor<float>(x[384])")],
functions=[Function(name="bm25sum", expression="bm25(text)")],
first_phase="bm25sum",
),
RankProfile(
name="semantic",
inputs=[("query(q)", "tensor<float>(x[384])")],
first_phase="closeness(field, embedding)",
),
RankProfile(
name="fusion",
inherits="bm25",
inputs=[("query(q)", "tensor<float>(x[384])")],
first_phase="closeness(field, embedding)",
global_phase=GlobalPhaseRanking(
expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
rerank_count=1000,
),
),
],
)
],
components=[
Component(
id="e5",
type="hugging-face-embedder",
parameters=[
Parameter(
"transformer-model",
{
"url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"
},
),
Parameter(
"tokenizer-model",
{
"url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"
},
),
],
)
],
)
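For readers trying the branch out, a rough usage sketch combining the new template with VespaNode. This is not from the PR: the VespaVectorStore constructor arguments are assumptions based on how hybrid_template is used today, and the VespaNode import path simply follows the import added in this diff.

from llama_index.schema.vespa.vespa_node import VespaNode  # import path as used in this diff
from llama_index.vector_stores.vespa import VespaVectorStore
from llama_index.vector_stores.vespa.templates import with_fields_template

# Assumed: the store accepts any ApplicationPackage the same way it accepts hybrid_template.
vector_store = VespaVectorStore(
    application_package=with_fields_template,
    deployment_target="local",  # assumed local deployment, as with hybrid_template
)

nodes = [
    VespaNode(
        text="The Shawshank Redemption",
        vespa_fields={
            "title": "The Shawshank Redemption",
            "author": "Stephen King",
            "theme": "prison escape",
            "year": 1994,
        },
    )
]
vector_store.add(nodes)  # each extra field should land in the matching schema field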