From 9f046c3e6116a95c4d56595af851713a191370b2 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Wed, 24 Jan 2024 15:51:22 -0500 Subject: [PATCH 01/10] wip --- redisvl/schema/fields.py | 69 +++++++++++++++++++++++++++------------ redisvl/schema/schema.py | 16 +++++---- tests/unit/test_fields.py | 12 +++---- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index be629aec..4930ba9c 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from pydantic.v1 import BaseModel, Field, validator from redis.commands.search.field import Field as RedisField @@ -12,8 +12,15 @@ class BaseField(BaseModel): name: str = Field(...) - sortable: Optional[bool] = False - as_name: Optional[str] = None + path: Optional[str] = None + + def _handle_path_name(self) -> Tuple[str, str]: + # In the case of JSON path / objects + # We pass the path as the name field in the Redis API + # We pass the true name as the as_name (alias) in the Redis API + if self.path: + return self.path, self.name + return self.name, self.path class TextField(BaseField): @@ -21,52 +28,66 @@ class TextField(BaseField): no_stem: Optional[bool] = False phonetic_matcher: Optional[str] = None withsuffixtrie: Optional[bool] = False + sortable: Optional[bool] = False - def as_field(self) -> RedisField: + def as_redis_field(self) -> RedisField: + name, as_name = self._handle_path_name() return RedisTextField( - self.name, + name, + as_name=as_name, weight=self.weight, # type: ignore no_stem=self.no_stem, # type: ignore phonetic_matcher=self.phonetic_matcher, # type: ignore sortable=self.sortable, - as_name=self.as_name, ) class TagField(BaseField): separator: Optional[str] = "," case_sensitive: Optional[bool] = False + sortable: Optional[bool] = False - def as_field(self) -> RedisField: + def as_redis_field(self) -> RedisField: + 
name, as_name = self._handle_path_name() return RedisTagField( - self.name, + name, + as_name=as_name, separator=self.separator, # type: ignore case_sensitive=self.case_sensitive, # type: ignore sortable=self.sortable, - as_name=self.as_name, ) class NumericField(BaseField): - def as_field(self) -> RedisField: + sortable: Optional[bool] = False + + def as_redis_field(self) -> RedisField: + name, as_name = self._handle_path_name() return RedisNumericField( - self.name, sortable=self.sortable, as_name=self.as_name + name, + as_name=as_name, + sortable=self.sortable, ) class GeoField(BaseField): - def as_field(self) -> RedisField: - return RedisGeoField(self.name, sortable=self.sortable, as_name=self.as_name) + sortable: Optional[bool] = False + + def as_redis_field(self) -> RedisField: + name, as_name = self._handle_path_name() + return RedisGeoField( + name, + as_name=as_name, + sortable=self.sortable, + ) -class BaseVectorField(BaseModel): - name: str = Field(...) +class BaseVectorField(BaseField): dims: int = Field(...) algorithm: object = Field(...) 
datatype: str = Field(default="FLOAT32") distance_metric: str = Field(default="COSINE") initial_cap: Optional[int] = None - as_name: Optional[str] = None @validator("algorithm", "datatype", "distance_metric", pre=True) @classmethod @@ -89,13 +110,17 @@ class FlatVectorField(BaseVectorField): algorithm: Literal["FLAT"] = "FLAT" block_size: Optional[int] = None - def as_field(self) -> RedisField: + def as_redis_field(self) -> RedisField: # grab base field params and augment with flat-specific fields + name, as_name = self._handle_path_name() field_data = super().field_data if self.block_size is not None: field_data["BLOCK_SIZE"] = self.block_size return RedisVectorField( - self.name, self.algorithm, field_data, as_name=self.as_name + name, + self.algorithm, + field_data, + as_name=as_name ) @@ -106,8 +131,9 @@ class HNSWVectorField(BaseVectorField): ef_runtime: int = Field(default=10) epsilon: float = Field(default=0.01) - def as_field(self) -> RedisField: + def as_redis_field(self) -> RedisField: # grab base field params and augment with hnsw-specific fields + name, as_name = self._handle_path_name() field_data = super().field_data field_data.update( { @@ -118,7 +144,10 @@ def as_field(self) -> RedisField: } ) return RedisVectorField( - self.name, self.algorithm, field_data, as_name=self.as_name + name, + self.algorithm, + field_data, + as_name=as_name ) diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index af3e0106..70da4a22 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -15,6 +15,13 @@ class StorageType(Enum): JSON = "json" +class IndexInfo(BaseModel): + name: str + prefix: str = "rvl" + key_separator: str = ":" + storage_type: StorageType = StorageType.HASH + + class IndexSchema(BaseModel): """Represents a schema definition for an index in Redis, used in RedisVL for organizing and querying vector and metadata fields. 
@@ -58,11 +65,8 @@ class IndexSchema(BaseModel): }) """ - - name: str - prefix: str = "rvl" - key_separator: str = ":" - storage_type: StorageType = StorageType.HASH + version: str = "0.1" + index: IndexInfo fields: Dict[str, List[Union[BaseField, BaseVectorField]]] = {} @validator("fields", pre=True) @@ -115,7 +119,7 @@ def redis_fields(self) -> List[RedisField]: """ redis_fields: List[RedisField] = [] for field_list in self.fields.values(): - redis_fields.extend(field.as_field() for field in field_list) # type: ignore + redis_fields.extend(field.as_redis_field() for field in field_list) # type: ignore return redis_fields def add_fields(self, fields: Dict[str, List[Dict[str, Any]]]): diff --git a/tests/unit/test_fields.py b/tests/unit/test_fields.py index bf19da4e..bc0dcc58 100644 --- a/tests/unit/test_fields.py +++ b/tests/unit/test_fields.py @@ -73,19 +73,19 @@ def create_hnsw_vector_field(**kwargs): ) def test_field_schema_as_field(schema_func, field_class): schema = schema_func() - field = schema.as_field() + field = schema.as_redis_field() assert isinstance(field, field_class) assert field.name == f"example_{field_class.__name__.lower()}" def test_vector_fields_as_field(): flat_vector_schema = create_flat_vector_field() - flat_vector_field = flat_vector_schema.as_field() + flat_vector_field = flat_vector_schema.as_redis_field() assert isinstance(flat_vector_field, RedisVectorField) assert flat_vector_field.name == "example_flatvectorfield" hnsw_vector_schema = create_hnsw_vector_field() - hnsw_vector_field = hnsw_vector_schema.as_field() + hnsw_vector_field = hnsw_vector_schema.as_redis_field() assert isinstance(hnsw_vector_field, RedisVectorField) assert hnsw_vector_field.name == "example_hnswvectorfield" @@ -100,7 +100,7 @@ def test_vector_fields_as_field(): def test_vector_fields_with_optional_params(vector_schema_func, extra_params): # Create a vector schema with additional parameters set. 
vector_schema = vector_schema_func(**extra_params) - vector_field = vector_schema.as_field() + vector_field = vector_schema.as_redis_field() # Assert that the field is correctly created and the optional parameters are set. assert isinstance(vector_field, RedisVectorField) @@ -119,7 +119,7 @@ def test_hnsw_vector_field_optional_params_not_set(): assert hnsw_field.ef_runtime == 10 # default value assert hnsw_field.epsilon == 0.01 # default value - field_exported = hnsw_field.as_field() + field_exported = hnsw_field.as_redis_field() # Check the default values are correctly applied in the exported object assert field_exported.args[field_exported.args.index("M") + 1] == 16 @@ -131,7 +131,7 @@ def test_hnsw_vector_field_optional_params_not_set(): def test_flat_vector_field_block_size_not_set(): # Create Flat vector field without setting block_size flat_field = FlatVectorField(name="example_vector", dims=128, algorithm="FLAT") - field_exported = flat_field.as_field() + field_exported = flat_field.as_redis_field() # block_size and initial_cap should not be in the exported field if it was not set assert "BLOCK_SIZE" not in field_exported.args From fb2067ca21cdc534a1adac4d676457f2f8507e36 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 26 Jan 2024 11:04:46 -0500 Subject: [PATCH 02/10] WIP --- redisvl/cli/index.py | 2 +- redisvl/cli/main.py | 2 +- redisvl/cli/stats.py | 2 +- redisvl/cli/utils.py | 2 +- redisvl/cli/version.py | 2 +- redisvl/index.py | 15 +- redisvl/schema/schema.py | 437 ++++++++++++++++++---------------- redisvl/{cli => utils}/log.py | 0 schemas/schema.yaml | 40 ++++ 9 files changed, 296 insertions(+), 206 deletions(-) rename redisvl/{cli => utils}/log.py (100%) create mode 100644 schemas/schema.yaml diff --git a/redisvl/cli/index.py b/redisvl/cli/index.py index 50befd85..51506bc3 100644 --- a/redisvl/cli/index.py +++ b/redisvl/cli/index.py @@ -4,7 +4,7 @@ from tabulate import tabulate -from redisvl.cli.log import get_logger +from redisvl.utils.log 
import get_logger from redisvl.cli.utils import add_index_parsing_options, create_redis_url from redisvl.index import SearchIndex from redisvl.schema import IndexSchema diff --git a/redisvl/cli/main.py b/redisvl/cli/main.py index 2dcd16af..007fe75e 100644 --- a/redisvl/cli/main.py +++ b/redisvl/cli/main.py @@ -2,7 +2,7 @@ import sys from redisvl.cli.index import Index -from redisvl.cli.log import get_logger +from redisvl.utils.log import get_logger from redisvl.cli.stats import Stats from redisvl.cli.version import Version diff --git a/redisvl/cli/stats.py b/redisvl/cli/stats.py index dcff6834..a0b05fc7 100644 --- a/redisvl/cli/stats.py +++ b/redisvl/cli/stats.py @@ -4,7 +4,7 @@ from tabulate import tabulate -from redisvl.cli.log import get_logger +from redisvl.utils.log import get_logger from redisvl.cli.utils import add_index_parsing_options, create_redis_url from redisvl.index import SearchIndex from redisvl.schema import IndexSchema diff --git a/redisvl/cli/utils.py b/redisvl/cli/utils.py index 109f43ea..5d76a184 100644 --- a/redisvl/cli/utils.py +++ b/redisvl/cli/utils.py @@ -1,7 +1,7 @@ import os from argparse import ArgumentParser, Namespace -from redisvl.cli.log import get_logger +from redisvl.utils.log import get_logger logger = get_logger("[RedisVL]") diff --git a/redisvl/cli/version.py b/redisvl/cli/version.py index 845e3584..13facbcd 100644 --- a/redisvl/cli/version.py +++ b/redisvl/cli/version.py @@ -3,7 +3,7 @@ from argparse import Namespace from redisvl import __version__ -from redisvl.cli.log import get_logger +from redisvl.utils.log import get_logger logger = get_logger("[RedisVL]") diff --git a/redisvl/index.py b/redisvl/index.py index 7e9c4309..5c1e0e76 100644 --- a/redisvl/index.py +++ b/redisvl/index.py @@ -180,8 +180,19 @@ def __init__( connection_args: Dict[str, Any] = {}, **kwargs, ): - """Initialize the RedisVL search index class with a schema, redis_url, - connection_args, and other kwargs.""" + """Initialize the RedisVL search index with 
a schema, Redis client + (or URL string with other connection args), connection_args, and other + kwargs. + + Args: + schema (IndexSchema): Index schema object. + redis_url (str, optional): The URL of the Redis server to + connect to. + redis_client(Union[redis.Redis, aredis.Redis], optional): An + instantiated redis client. + connection_args (Dict[str, Any], optional): Redis client connection + args. + """ # final validation on schema object if not schema or not isinstance(schema, IndexSchema): raise ValueError("Must provide a valid schema object") diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 70da4a22..14f2195f 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -1,100 +1,211 @@ import re + from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional import yaml -from pydantic.v1 import BaseModel, validator +from pydantic.v1 import BaseModel, root_validator from redis.commands.search.field import Field as RedisField -from redisvl.schema.fields import BaseField, BaseVectorField, FieldFactory +from redisvl.schema.fields import BaseField, FieldFactory +from redisvl.utils.log import get_logger + + +logger = get_logger(__name__) class StorageType(Enum): + """ + Enumeration for the storage types supported in Redis. + + Attributes: + HASH (str): Represents the 'hash' storage type in Redis. + JSON (str): Represents the 'json' storage type in Redis. + """ HASH = "hash" JSON = "json" class IndexInfo(BaseModel): + """ + Represents the basic configuration information for an index in Redis. + + This class includes the essential details required to define an index, such as + its name, prefix, key separator, and storage type. 
+ """ name: str + """The unique name of the index.""" prefix: str = "rvl" + """The prefix used for Redis keys associated with this index.""" key_separator: str = ":" + """The separator character used in Redis keys.""" storage_type: StorageType = StorageType.HASH + """The storage type used in Redis (e.g., 'hash' or 'json').""" class IndexSchema(BaseModel): - """Represents a schema definition for an index in Redis, used in RedisVL for - organizing and querying vector and metadata fields. - - This schema defines the structure of data stored in Redis, including - information about the storage type, field definitions, and key formatting - conventions used in the Redis database. Use the convenience class - constructor methods `from_dict` and `from_yaml` to load and create an index - schema from your definitions. + """Represents a schema definition for a search index in Redis, primarily + used in RedisVL for organizing and querying vector and metadata fields. - Note: All field names MUST be unique in the index schema. + This schema provides a structured format to define the layout and types of + fields stored in Redis, including details such as storage type, field + definitions, and key formatting conventions. - Attributes: - name (str): Unique name of the index. - prefix (str): Prefix used for Redis keys. Defaults to "rvl". - key_separator (str): Separator character used in Redis keys. Defaults - to ":". - storage_type (StorageType): Enum representing the underlying Redis data - structure (e.g. hash or json). Defaults to hash. - fields (Dict[str, List[Union[BaseField, BaseVectorField]]]): A dict - mapping field types to lists of redisvl field definitions. + The class offers methods to create an index schema from a YAML file or a + Python dictionary, supporting flexible schema definitions and easy + integration into various workflows. .. 
code-block:: python from redisvl.schema import IndexSchema + # From YAML schema = IndexSchema.from_yaml("schema.yaml") + # From Dict schema = IndexSchema.from_dict({ "index": { - "name": "my-index", + "name": "docs-index", "prefix": "docs", "storage_type": "hash", }, - "fields": { - "tag": [{"name": "doc-id"}], - "vector": [ - {"name": "doc-embedding", "algorithm": "flat", "dims": 1536} - ] - } + "fields": [ + { + "name": "doc-id", + "type": "tag" + }, + { + "name": "doc-embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 1536 + } + } + ] }) + Note: + The `fields` attribute in the schema must contain unique field names to ensure + correct and unambiguous field references. + """ - version: str = "0.1" index: IndexInfo - fields: Dict[str, List[Union[BaseField, BaseVectorField]]] = {} + """Details of the basic index configurations.""" + fields: Dict[str, BaseField] = {} + """Fields associated with the search index and their properties""" + version: str = "0.1.0" + """Version of the underlying index schema.""" - @validator("fields", pre=True) - @classmethod - def check_unique_field_names(cls, fields): - """Validate that field names are all unique.""" - all_names = cls._get_field_names(fields) - print(all_names, flush=True) - if len(set(all_names)) != len(all_names): - raise ValueError( - f"Field names {all_names} must be unique across all fields." + def _make_field(self, **field_inputs) -> BaseField: + """ + Parse raw field inputs derived from YAML or dict. + + Validates and sets the 'path' attribute for fields when using JSON storage type. 
+ """ + # Parse raw field inputs + field_name = field_inputs.get("name") + field_type = field_inputs.get("type") + field_attrs = field_inputs.get("attrs", {}) + field_path = field_inputs.get("path") + + if not field_name or not field_type: + raise ValueError("Fields must include a 'type' and 'name'.") + + if storage_type == StorageType.JSON: + field_path = field_path if field_path else f"$.{field_name}" + # Otherwise we are using HASH + if path is not None: + logger.warning( + f"Path attribute for field '{field_name}' will be ignored for HASH storage type." ) - return fields - @staticmethod - def _get_field_names( - fields: Dict[str, List[Union[BaseField, BaseVectorField]]] - ) -> List[str]: - """Returns a list of field names from a fields object. + return FieldFactory.create_field(field_type, name=field_name, **field_attrs) + + @root_validator(pre=True) + def validate_and_create_fields(cls, values): + """ + Validate uniqueness of field names and create valid field instances. + """ + raw_fields = values.get('fields', []) + field_definitions = {} + + for field_input in raw_fields: + field = cls._make_field(**field_input) + if field.name in field_definitions: + raise ValueError( + f"Duplicate field name: {field.name}. Field names must be unique across all fields." + ) + field_definitions[field.name] = field + + values['fields'] = field_definitions + return values + + @classmethod + def from_yaml(cls, file_path: str) -> "IndexSchema": + """Create an IndexSchema from a YAML file. + + Args: + file_path (str): The path to the YAML file. Returns: - List[str]: A list of field names from the fields object. + IndexSchema: The index schema. + + .. 
code-block:: python + + from redisvl.schema import IndexSchema + schema = IndexSchema.from_yaml("schema.yaml") """ - all_names: List[str] = [] - for field_list in fields.values(): - for field in field_list: - all_names.append(field.name) - return all_names + try: + fp = Path(file_path).resolve() + except OSError as e: + raise ValueError(f"Invalid file path: {file_path}") from e + + if not fp.exists(): + raise FileNotFoundError(f"Schema file {file_path} does not exist") + + with open(fp, "r") as f: + yaml_data = yaml.safe_load(f) + return cls(**yaml_data) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "IndexSchema": + """Create an IndexSchema from a dictionary. + + Args: + data (Dict[str, Any]): The index schema data. + + Returns: + IndexSchema: The index schema. + + .. code-block:: python + + from redisvl.schema import IndexSchema + + schema = IndexSchema.from_dict({ + "index": { + "name": "docs-index", + "prefix": "docs", + "storage_type": "hash", + }, + "fields": [ + { + "name": "doc-id", + "type": "tag" + }, + { + "name": "doc-embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 1536 + } + } + ] + }) + """ + return cls(**data) @property def field_names(self) -> List[str]: @@ -103,7 +214,7 @@ def field_names(self) -> List[str]: Returns: List[str]: A list of field names from the schema. """ - return self._get_field_names(self.fields) + return list(self.fields.keys()) @property def redis_fields(self) -> List[RedisField]: @@ -117,94 +228,96 @@ def redis_fields(self) -> List[RedisField]: Returns: List[RedisField]: A list of redis-py field definitions. 
""" - redis_fields: List[RedisField] = [] - for field_list in self.fields.values(): - redis_fields.extend(field.as_redis_field() for field in field_list) # type: ignore + redis_fields: List[RedisField] = [ + field.as_redis_field() for _, field in self.fields.items() + ] return redis_fields - def add_fields(self, fields: Dict[str, List[Dict[str, Any]]]): - """Extends the schema with additional fields. + def add_field(self, field_inputs: Dict[str, Any]): + """Adds a single field to the index schema based on the specified field + type and attributes. - This method allows dynamically adding new fields to the index schema. It - processes a dictionary where each key represents a field type, and the - corresponding value is a list of field definitions to add. + This method allows for the addition of individual fields to the schema, + providing flexibility in defining the structure of the index. Args: - fields (Dict[str, List[Dict[str, Any]]]): A dictionary mapping field - types to lists of field attributes. + field_inputs (Dict[str, Any]): A field to add. - .. code-block:: python - - schema.add_fields({}) - # From Dict - schema = IndexSchema.from_dict({ + Raises: + ValueError: If the field name or type are not provided or if the name + already exists within the schema. + .. code-block:: python - Raises: - ValueError: If a field with the same name already exists in the - schema. + # Add a tag field + schema.add_field({"name": "user", "type": "tag}) + + # Add a vector field + schema.add_field({ + "name": "user-embedding", + "type": "vector", + "attrs": { + "dims": 1024, + "algorithm": "flat", + "datatype": "float32" + } + }) """ - for field_type, field_list in fields.items(): - for field_data in field_list: - self.add_field(field_type, **field_data) + # Parse field inputs + field = self._make_field(**field_inputs) + # Check for duplicates + if field.name in self.fields: + raise ValueError( + f"Duplicate field name: {field.name}. 
Field names must be unique across all fields for this index." + ) + # Add field + self.fields[field.name] = field - def add_field(self, field_type: str, **kwargs): - """Adds a single field to the index schema based on the specified field - type and attributes. + def add_fields(self, fields: List[Dict[str, Any]]): + """Extends the schema with additional fields. - This method allows for the addition of individual fields to the schema, - providing flexibility in defining the structure of the index. + This method allows dynamically adding new fields to the index schema. It + processes a list of field definitions. Args: - field_type (str): Type of the field to be added - (e.g., 'text', 'numeric', 'tag', 'vector', 'geo'). - **kwargs: A dictionary of attributes for the field, including the - required 'name'. + fields (List[Dict[str, Any]]): A list of fields to add. Raises: - ValueError: If the field name is either not provided or already - exists within the schema. - """ - name = kwargs.get("name", None) - if name is None: - raise ValueError("Field name is required.") + ValueError: If a field with the same name already exists in the + schema. - new_field = FieldFactory.create_field(field_type, **kwargs) - if name in self.field_names: - raise ValueError( - f"Duplicate field '{name}' already present in index schema." - ) + .. code-block:: python - self.fields.setdefault(field_type, []).append(new_field) + schema.add_fields([ + {"name": "user", "type": "tag"}, + {"name": "bio", "type": "text"}, + { + "name": "user-embedding", + "type": "vector", + "attrs": { + "dims": 1024, + "algorithm": "flat", + "datatype": "float32" + } + } + ]) + """ + for field in fields: + self.add_field(**field) - def remove_field(self, field_type: str, field_name: str): - """Removes a field from the schema based on the specified field type and - name. + def remove_field(self, field_name: str): + """Removes a field from the schema based on the specified name. 
This method is useful for dynamically altering the schema by removing existing fields. Args: - field_type (str): The type of the field to be removed. field_name (str): The name of the field to be removed. - - Raises: - ValueError: If the field type or the specified field name does not - exist in the schema. """ - fields = self.fields.get(field_type) - - if fields is None: - raise ValueError(f"Field type '{field_type}' does not exist.") - - filtered_fields = [field for field in fields if field.name != field_name] - - if len(filtered_fields) == len(fields): - # field not found, raise Error - raise ValueError( - f"Field '{field_name}' does not exist in {field_type} fields." - ) - self.fields[field_type] = filtered_fields + if field_name not in self.fields: + logger.warning(f"Field '{field_name}' does not exist in the schema") + return + del self.fields[field_name] def generate_fields( self, @@ -235,107 +348,33 @@ def generate_fields( - This method employs heuristics and may not always correctly infer field types. """ - fields: Dict[str, List[Dict[str, Any]]] = {} + fields: List[Dict[str, Any]] for field_name, value in data.items(): if field_name in ignore_fields: continue try: field_type = TypeInferrer.infer(value) - new_field = FieldFactory.create_field( - field_type, - field_name, - ) - fields.setdefault(field_type, []).append( - new_field.dict(exclude_unset=True) - ) + fields.append({ + "type": field_type, + "attrs": FieldFactory.create_field( + field_type, + field_name, + ).dict(exclude_unset=True) + }) except ValueError as e: if strict: raise else: - print(f"Error inferring field type for {field_name}: {e}") + logger.warning(f"Error inferring field type for {field_name}: {e}") return fields - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "IndexSchema": - """Create an IndexSchema from a dictionary. - - Args: - data (Dict[str, Any]): The index schema data. - - Returns: - IndexSchema: The index schema. - - - .. 
code-block:: python - - from redisvl.schema import IndexSchema - schema = IndexSchema.from_dict({ - "index": { - "name": "my-index", - "prefix": "docs", - "storage_type": "hash", - }, - "fields": { - "tag": [{"name": "doc-id"}], - "vector": [ - {"name": "doc-embedding", "algorithm": "flat", "dims": 1536} - ] - } - }) - - """ - schema = cls(**data["index"]) - for field_type, field_list in data["fields"].items(): - for field_data in field_list: - schema.add_field(field_type, **field_data) - return schema - def to_dict(self) -> Dict[str, Any]: """Convert the index schema to a dictionary. Returns: Dict[str, Any]: The index schema as a dictionary. """ - index_data = { - "name": self.name, - "prefix": self.prefix, - "key_separator": self.key_separator, - "storage_type": self.storage_type.value, - } - formatted_fields = {} - for field_type, fields in self.fields.items(): - formatted_fields[field_type] = [ - field.dict(exclude_unset=True) for field in fields - ] - return {"index": index_data, "fields": formatted_fields} - - @classmethod - def from_yaml(cls, file_path: str) -> "IndexSchema": - """Create an IndexSchema from a YAML file. - - Args: - file_path (str): The path to the YAML file. - - Returns: - IndexSchema: The index schema. - - .. code-block:: python - - from redisvl.schema import IndexSchema - schema = IndexSchema.from_yaml("schema.yaml") - - """ - try: - fp = Path(file_path).resolve() - except OSError as e: - raise ValueError(f"Invalid file path: {file_path}") from e - - if not fp.exists(): - raise FileNotFoundError(f"Schema file {file_path} does not exist") - - with open(fp, "r") as f: - yaml_data = yaml.safe_load(f) - return cls.from_dict(yaml_data) + return self.dict(exclude_unset=True) def to_yaml(self, file_path: str, overwrite: bool = True) -> None: """Write the index schema to a YAML file. 
diff --git a/redisvl/cli/log.py b/redisvl/utils/log.py similarity index 100% rename from redisvl/cli/log.py rename to redisvl/utils/log.py diff --git a/schemas/schema.yaml b/schemas/schema.yaml new file mode 100644 index 00000000..52413242 --- /dev/null +++ b/schemas/schema.yaml @@ -0,0 +1,40 @@ +# NEW: add version.. ideally this will mirror the version of redisvl (0.1.0 for example) for change management +version: '0.1.0' + +index: + name: user-index-v1 + prefix: user + key_separator: ':' + storage_type: json + +fields: + + - name: user + type: tag + path: '$.user' + + - name: credit_score + type: tag + path: '$.credit_score' + + - name: embedding + type: vector + path: '$.embedding' + attrs: + - algorithm: flat + dims: 3 + distance_metric: cosine + datatype: float32 + + + +# To discuss +# - fields list +# - versioning...? +# - backwards compat +# - text/str data + + + + + From cfb87b2d9b64b8c2bfeb577fcd5595242b6d6a54 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 26 Jan 2024 13:26:34 -0500 Subject: [PATCH 03/10] handle old format temporarily --- redisvl/index.py | 46 +++++++++++----------- redisvl/schema/schema.py | 84 ++++++++++++++++++++++++++++++++-------- schemas/schema.yaml | 47 +++++++--------------- 3 files changed, 105 insertions(+), 72 deletions(-) diff --git a/redisvl/index.py b/redisvl/index.py index 5c1e0e76..52bd6447 100644 --- a/redisvl/index.py +++ b/redisvl/index.py @@ -175,8 +175,8 @@ class SearchIndex: def __init__( self, schema: IndexSchema, - redis_url: Optional[str] = None, redis_client: Optional[Union[redis.Redis, aredis.Redis]] = None, + redis_url: Optional[str] = None, connection_args: Dict[str, Any] = {}, **kwargs, ): @@ -186,10 +186,10 @@ def __init__( Args: schema (IndexSchema): Index schema object. - redis_url (str, optional): The URL of the Redis server to - connect to. redis_client(Union[redis.Redis, aredis.Redis], optional): An instantiated redis client. 
+ redis_url (str, optional): The URL of the Redis server to + connect to. connection_args (Dict[str, Any], optional): Redis client connection args. """ @@ -206,31 +206,31 @@ def __init__( self.schema = schema - self._storage = self._STORAGE_MAP[self.schema.storage_type]( - prefix=self.schema.prefix, key_separator=self.schema.key_separator + self._storage = self._STORAGE_MAP[self.schema.index.storage_type]( + prefix=self.schema.index.prefix, key_separator=self.schema.index.key_separator ) @property def name(self) -> str: """The name of the Redis search index.""" - return self.schema.name + return self.schema.index.name @property def prefix(self) -> str: """The optional key prefix that comes before a unique key value in forming a Redis key.""" - return self.schema.prefix + return self.schema.index.prefix @property def key_separator(self) -> str: """The optional separator between a defined prefix and key value in forming a Redis key.""" - return self.schema.key_separator + return self.schema.index.key_separator @property def storage_type(self) -> StorageType: """The underlying storage type for the search index: hash or json.""" - return self.schema.storage_type + return self.schema.index.storage_type @property def client(self) -> Optional[Union[redis.Redis, aredis.Redis]]: @@ -385,7 +385,7 @@ def key(self, id: str) -> str: Returns: str: The full Redis key including key prefix and value as a string. 
""" - return self._storage._key(id, self.schema.prefix, self.schema.key_separator) + return self._storage._key(id, self.schema.index.prefix, self.schema.index.key_separator) @check_modules_present("_redis_conn") def create(self, overwrite: bool = False) -> None: @@ -417,7 +417,7 @@ def create(self, overwrite: bool = False) -> None: self._redis_conn.client.ft(self.name).create_index( # type: ignore fields=redis_fields, definition=IndexDefinition( - prefix=[self.prefix], index_type=self._storage.type + prefix=[self.schema.index.prefix], index_type=self._storage.type ), ) @@ -434,7 +434,7 @@ def delete(self, drop: bool = True): redis.exceptions.ResponseError: If the index does not exist. """ # Delete the search index - self._redis_conn.client.ft(self.name).dropindex(delete_documents=drop) # type: ignore + self._redis_conn.client.ft(self.schema.index.name).dropindex(delete_documents=drop) # type: ignore @check_modules_present("_redis_conn") def load( @@ -512,7 +512,7 @@ def search(self, *args, **kwargs) -> Union["Result", Any]: Returns: Union["Result", Any]: Search results. """ - results = self._redis_conn.client.ft(self.name).search( # type: ignore + results = self._redis_conn.client.ft(self.schema.index.name).search( # type: ignore *args, **kwargs ) return results @@ -522,7 +522,7 @@ def _query(self, query: BaseQuery) -> List[Dict[str, Any]]: results = self.search(query.query, query_params=query.params) # post process the results return process_results( - results, query=query, storage_type=self.schema.storage_type + results, query=query, storage_type=self.schema.index.storage_type ) @check_modules_present("_redis_conn") @@ -604,7 +604,7 @@ def exists(self) -> bool: Returns: bool: True if the index exists, False otherwise. 
""" - return self.name in self.listall() + return self.schema.index.name in self.listall() @check_modules_present("_redis_conn") @check_index_exists() @@ -615,7 +615,7 @@ def info(self) -> Dict[str, Any]: dict: A dictionary containing the information about the index. """ return convert_bytes( - self._redis_conn.client.ft(self.name).info() # type: ignore + self._redis_conn.client.ft(self.schema.index.name).info() # type: ignore ) @check_async_modules_present("_redis_conn") @@ -643,10 +643,10 @@ async def acreate(self, overwrite: bool = False) -> None: await self.adelete() # Create Index with proper IndexType - await self._redis_conn.client.ft(self.name).create_index( # type: ignore + await self._redis_conn.client.ft(self.schema.index.name).create_index( # type: ignore fields=redis_fields, definition=IndexDefinition( - prefix=[self.prefix], index_type=self._storage.type + prefix=[self.schema.index.prefix], index_type=self._storage.type ), ) @@ -663,7 +663,7 @@ async def adelete(self, drop: bool = True): redis.exceptions.ResponseError: If the index does not exist. """ # Delete the search index - await self._redis_conn.client.ft(self.name).dropindex(delete_documents=drop) # type: ignore + await self._redis_conn.client.ft(self.schema.index.name).dropindex(delete_documents=drop) # type: ignore @check_async_modules_present("_redis_conn") async def aload( @@ -740,7 +740,7 @@ async def asearch(self, *args, **kwargs) -> Union["Result", Any]: Returns: Union["Result", Any]: Search results. 
""" - results = await self._redis_conn.client.ft(self.name).search( # type: ignore + results = await self._redis_conn.client.ft(self.schema.index.name).search( # type: ignore *args, **kwargs ) return results @@ -750,7 +750,7 @@ async def _aquery(self, query: BaseQuery) -> List[Dict[str, Any]]: results = await self.asearch(query.query, query_params=query.params) # post process the results return process_results( - results, query=query, storage_type=self.schema.storage_type + results, query=query, storage_type=self.schema.index.storage_type ) @check_async_modules_present("_redis_conn") @@ -834,7 +834,7 @@ async def aexists(self) -> bool: Returns: bool: True if the index exists, False otherwise. """ - return self.name in await self.alistall() + return self.schema.index.name in await self.alistall() @check_async_modules_present("_redis_conn") @check_async_index_exists() @@ -845,5 +845,5 @@ async def ainfo(self) -> Dict[str, Any]: dict: A dictionary containing the information about the index. 
""" return convert_bytes( - await self._redis_conn.client.ft(self.name).info() # type: ignore + await self._redis_conn.client.ft(self.schema.index.name).info() # type: ignore ) diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 14f2195f..7ef16f1e 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -2,10 +2,10 @@ from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List import yaml -from pydantic.v1 import BaseModel, root_validator +from pydantic.v1 import BaseModel, root_validator, validator from redis.commands.search.field import Field as RedisField from redisvl.schema.fields import BaseField, FieldFactory @@ -13,6 +13,7 @@ logger = get_logger(__name__) +SCHEMA_VERSION = "0.1.0" class StorageType(Enum): @@ -95,10 +96,11 @@ class IndexSchema(BaseModel): """Details of the basic index configurations.""" fields: Dict[str, BaseField] = {} """Fields associated with the search index and their properties""" - version: str = "0.1.0" + version: str = SCHEMA_VERSION """Version of the underlying index schema.""" - def _make_field(self, **field_inputs) -> BaseField: + @staticmethod + def _make_field(storage_type, **field_inputs) -> BaseField: """ Parse raw field inputs derived from YAML or dict. @@ -113,35 +115,82 @@ def _make_field(self, **field_inputs) -> BaseField: if not field_name or not field_type: raise ValueError("Fields must include a 'type' and 'name'.") + # Handle field path and storage type if storage_type == StorageType.JSON: field_path = field_path if field_path else f"$.{field_name}" - # Otherwise we are using HASH - if path is not None: - logger.warning( - f"Path attribute for field '{field_name}' will be ignored for HASH storage type." - ) + else: + if field_path is not None: + logger.warning( + f"Path attribute for field '{field_name}' will be ignored for HASH storage type." 
+ ) + field_path = None - return FieldFactory.create_field(field_type, name=field_name, **field_attrs) + # Update attrs and create field instance + field_attrs.update({ + "name": field_name, + "path": field_path + }) + return FieldFactory.create_field(field_type=field_type, **field_attrs) + + @staticmethod + def _convert_old_format(storage_type: StorageType, raw_fields: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: + updated_fields: List[Dict[str, Any]] = [] + for field_type, fields_list in raw_fields.items(): + for field in fields_list: + if storage_type == StorageType.HASH: + field.pop("path", None) + updated_fields.append({ + "name": field.pop("name", None), + "path": None, + "type": field_type, + "attrs": field + }) + else: + updated_fields.append({ + "name": field.pop("as_name", None), + "path": field.pop("path", field.pop("name", None)), + "type": field_type, + "attrs": field + }) + return updated_fields @root_validator(pre=True) + @classmethod def validate_and_create_fields(cls, values): """ Validate uniqueness of field names and create valid field instances. """ + index = IndexInfo(**values.get('index')) raw_fields = values.get('fields', []) - field_definitions = {} - + prepared_fields: Dict[str, BaseField] = {} + # Process raw fields + if isinstance(raw_fields, dict): + # Need to handle backwards compat for the moment + # TODO -- will remove this when 0.1.0 lands + logger.warning("New schema format introduced; please update schema specs prior to 0.1.0") + raw_fields = cls._convert_old_format(index.storage_type, raw_fields) for field_input in raw_fields: - field = cls._make_field(**field_input) - if field.name in field_definitions: + field = cls._make_field(index.storage_type, **field_input) + if field.name in prepared_fields: raise ValueError( f"Duplicate field name: {field.name}. Field names must be unique across all fields." 
) - field_definitions[field.name] = field + prepared_fields[field.name] = field - values['fields'] = field_definitions + values['fields'] = prepared_fields + values['index'] = index return values + @validator("version", pre=True) + @classmethod + def validate_version(cls, version: str): + """Validate IndexSchema version.""" + if version != SCHEMA_VERSION: + raise ValueError( + f"RedisVL IndexSchema version must be {SCHEMA_VERSION} but got {version}" + ) + return version + @classmethod def from_yaml(cls, file_path: str) -> "IndexSchema": """Create an IndexSchema from a YAML file. @@ -264,7 +313,7 @@ def add_field(self, field_inputs: Dict[str, Any]): }) """ # Parse field inputs - field = self._make_field(**field_inputs) + field = self._make_field(self.index.storage_type, **field_inputs) # Check for duplicates if field.name in self.fields: raise ValueError( @@ -355,6 +404,7 @@ def generate_fields( try: field_type = TypeInferrer.infer(value) fields.append({ + "name": field_name, "type": field_type, "attrs": FieldFactory.create_field( field_type, diff --git a/schemas/schema.yaml b/schemas/schema.yaml index 52413242..8f9a071f 100644 --- a/schemas/schema.yaml +++ b/schemas/schema.yaml @@ -1,5 +1,3 @@ -# NEW: add version.. ideally this will mirror the version of redisvl (0.1.0 for example) for change management -version: '0.1.0' index: name: user-index-v1 @@ -8,33 +6,18 @@ index: storage_type: json fields: - - - name: user - type: tag - path: '$.user' - - - name: credit_score - type: tag - path: '$.credit_score' - - - name: embedding - type: vector - path: '$.embedding' - attrs: - - algorithm: flat - dims: 3 - distance_metric: cosine - datatype: float32 - - - -# To discuss -# - fields list -# - versioning...? 
-# - backwards compat -# - text/str data - - - - - + - name: user + type: tag + path: '.user' + other: test + - name: credit_score + type: tag + path: '$.credit_score' + - name: embedding + type: vector + path: '$.embedding' + attrs: + algorithm: flat + dims: 3 + distance_metric: cosine + datatype: float32 From f7d9da1a6f7a60808ed81fd9d5388bdef14494bc Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 29 Jan 2024 12:52:33 -0500 Subject: [PATCH 04/10] refactor schema to 0.1.0 --- redisvl/cli/index.py | 2 +- redisvl/cli/main.py | 2 +- redisvl/cli/stats.py | 2 +- redisvl/index.py | 7 +- redisvl/llmcache/semantic.py | 36 +++-- redisvl/schema/fields.py | 231 ++++++++++++++++++------------- redisvl/schema/schema.py | 108 ++++----------- schemas/schema.yaml | 2 +- schemas/test_hash_schema.yaml | 16 +++ schemas/test_json_schema.yaml | 16 +++ tests/integration/test_query.py | 23 +-- tests/integration/test_simple.py | 53 +++---- tests/sample_hash_schema.yaml | 13 -- tests/sample_json_schema.yaml | 15 -- tests/unit/test_fields.py | 65 ++++++--- tests/unit/test_index.py | 8 +- tests/unit/test_schema.py | 143 ++++++++++--------- 17 files changed, 381 insertions(+), 361 deletions(-) create mode 100644 schemas/test_hash_schema.yaml create mode 100644 schemas/test_json_schema.yaml delete mode 100644 tests/sample_hash_schema.yaml delete mode 100644 tests/sample_json_schema.yaml diff --git a/redisvl/cli/index.py b/redisvl/cli/index.py index 51506bc3..64228081 100644 --- a/redisvl/cli/index.py +++ b/redisvl/cli/index.py @@ -4,11 +4,11 @@ from tabulate import tabulate -from redisvl.utils.log import get_logger from redisvl.cli.utils import add_index_parsing_options, create_redis_url from redisvl.index import SearchIndex from redisvl.schema import IndexSchema from redisvl.utils.connection import RedisConnection +from redisvl.utils.log import get_logger from redisvl.utils.utils import convert_bytes, make_dict logger = get_logger("[RedisVL]") diff --git a/redisvl/cli/main.py 
b/redisvl/cli/main.py index 007fe75e..1353192f 100644 --- a/redisvl/cli/main.py +++ b/redisvl/cli/main.py @@ -2,9 +2,9 @@ import sys from redisvl.cli.index import Index -from redisvl.utils.log import get_logger from redisvl.cli.stats import Stats from redisvl.cli.version import Version +from redisvl.utils.log import get_logger logger = get_logger(__name__) diff --git a/redisvl/cli/stats.py b/redisvl/cli/stats.py index a0b05fc7..db1cfbeb 100644 --- a/redisvl/cli/stats.py +++ b/redisvl/cli/stats.py @@ -4,10 +4,10 @@ from tabulate import tabulate -from redisvl.utils.log import get_logger from redisvl.cli.utils import add_index_parsing_options, create_redis_url from redisvl.index import SearchIndex from redisvl.schema import IndexSchema +from redisvl.utils.log import get_logger logger = get_logger("[RedisVL]") diff --git a/redisvl/index.py b/redisvl/index.py index 52bd6447..588dafb7 100644 --- a/redisvl/index.py +++ b/redisvl/index.py @@ -207,7 +207,8 @@ def __init__( self.schema = schema self._storage = self._STORAGE_MAP[self.schema.index.storage_type]( - prefix=self.schema.index.prefix, key_separator=self.schema.index.key_separator + prefix=self.schema.index.prefix, + key_separator=self.schema.index.key_separator, ) @property @@ -385,7 +386,9 @@ def key(self, id: str) -> str: Returns: str: The full Redis key including key prefix and value as a string. 
""" - return self._storage._key(id, self.schema.index.prefix, self.schema.index.key_separator) + return self._storage._key( + id, self.schema.index.prefix, self.schema.index.key_separator + ) @check_modules_present("_redis_conn") def create(self, overwrite: bool = False) -> None: diff --git a/redisvl/llmcache/semantic.py b/redisvl/llmcache/semantic.py index 7f1c4a14..113bedf1 100644 --- a/redisvl/llmcache/semantic.py +++ b/redisvl/llmcache/semantic.py @@ -4,8 +4,8 @@ from redisvl.index import SearchIndex from redisvl.llmcache.base import BaseLLMCache from redisvl.query import RangeQuery -from redisvl.schema import IndexSchema -from redisvl.schema.fields import BaseVectorField +from redisvl.schema.fields import BaseField +from redisvl.schema.schema import IndexInfo, IndexSchema from redisvl.utils.utils import array_to_buffer from redisvl.vectorize.base import BaseVectorizer from redisvl.vectorize.text import HFTextVectorizer @@ -32,20 +32,26 @@ def __init__( raise ValueError("Must provide vectorizer dimensions") # Construct the base base index schema - super().__init__(name=name, prefix=prefix, **kwargs) + super().__init__(index={"name": name, "prefix": prefix}) # other schema kwargs will get consumed here # otherwise fall back to index schema defaults # Add fields specific to the LLMCacheSchema - self.add_field("text", name=self.prompt_field_name) - self.add_field("text", name=self.response_field_name) - self.add_field( - "vector", - name=self.vector_field_name, - dims=vector_dims, - datatype="float32", - distance_metric="cosine", - algorithm="flat", + self.add_fields( + [ + {"name": self.prompt_field_name, "type": "text"}, + {"name": self.response_field_name, "type": "text"}, + { + "name": self.vector_field_name, + "type": "vector", + "attrs": { + "dims": vector_dims, + "datatype": "float32", + "distance_metric": "cosine", + "algorithm": "flat", + }, + }, + ] ) class Config: @@ -53,8 +59,8 @@ class Config: ignore_extra = True @property - def vector_field(self) -> 
BaseVectorField: - return self.fields["vector"][0] # type: ignore + def vector_field(self) -> BaseField: + return self.fields[self.vector_field_name] class SemanticCache(BaseLLMCache): @@ -199,7 +205,7 @@ def set_vectorizer(self, vectorizer: BaseVectorizer) -> None: if not isinstance(vectorizer, BaseVectorizer): raise TypeError("Must provide a valid redisvl.vectorizer class.") - schema_vector_dims = self._schema.vector_field.dims + schema_vector_dims = self._schema.vector_field.attrs.dims if schema_vector_dims != vectorizer.dims: raise ValueError( diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index 4930ba9c..ac45bc73 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -10,145 +10,175 @@ from typing_extensions import Literal +class BaseFieldAttributes(BaseModel): + sortable: Optional[bool] = False + + +class TextFieldAttributes(BaseFieldAttributes): + weight: Optional[float] = 1 + no_stem: Optional[bool] = False + phonetic_matcher: Optional[str] = None + withsuffixtrie: Optional[bool] = False + + +class TagFieldAttributes(BaseFieldAttributes): + separator: Optional[str] = "," + case_sensitive: Optional[bool] = False + + +class NumericFieldAttributes(BaseFieldAttributes): + pass + + +class GeoFieldAttributes(BaseFieldAttributes): + pass + + +class BaseVectorFieldAttributes(BaseModel): + dims: int = Field(...) + algorithm: object = Field(...) 
+ datatype: str = Field(default="FLOAT32") + distance_metric: str = Field(default="COSINE") + initial_cap: Optional[int] = None + + @validator("algorithm", "datatype", "distance_metric", pre=True) + @classmethod + def uppercase_strings(cls, v): + return v.upper() + + @property + def field_data(self) -> Dict[str, Any]: + field_data = { + "TYPE": self.datatype, + "DIM": self.dims, + "DISTANCE_METRIC": self.distance_metric, + } + if self.initial_cap is not None: # Only include it if it's set + field_data["INITIAL_CAP"] = self.initial_cap + return field_data + + +class HNSWVectorFieldAttributes(BaseVectorFieldAttributes): + algorithm: Literal["HNSW"] = "HNSW" + m: int = Field(default=16) + ef_construction: int = Field(default=200) + ef_runtime: int = Field(default=10) + epsilon: float = Field(default=0.01) + + +class FlatVectorFieldAttributes(BaseVectorFieldAttributes): + algorithm: Literal["FLAT"] = "FLAT" + block_size: Optional[int] = None + + +### Field Classes ### + + class BaseField(BaseModel): - name: str = Field(...) 
+ name: str + type: str path: Optional[str] = None + attrs: Optional[Union[BaseFieldAttributes, BaseVectorFieldAttributes]] = None - def _handle_path_name(self) -> Tuple[str, str]: - # In the case of JSON path / objects - # We pass the path as the name field in the Redis API - # We pass the true name as the as_name (alias) in the Redis API + def _handle_names(self) -> Tuple[str, Optional[str]]: if self.path: return self.path, self.name - return self.name, self.path + return self.name, None class TextField(BaseField): - weight: Optional[float] = 1 - no_stem: Optional[bool] = False - phonetic_matcher: Optional[str] = None - withsuffixtrie: Optional[bool] = False - sortable: Optional[bool] = False + type: str = Field(default="text", const=True) + attrs: Optional[TextFieldAttributes] = Field(default_factory=TextFieldAttributes) def as_redis_field(self) -> RedisField: - name, as_name = self._handle_path_name() + name, as_name = self._handle_names() return RedisTextField( name, as_name=as_name, - weight=self.weight, # type: ignore - no_stem=self.no_stem, # type: ignore - phonetic_matcher=self.phonetic_matcher, # type: ignore - sortable=self.sortable, + weight=self.attrs.weight, # type: ignore + no_stem=self.attrs.no_stem, # type: ignore + phonetic_matcher=self.attrs.phonetic_matcher, # type: ignore + sortable=self.attrs.sortable, ) class TagField(BaseField): - separator: Optional[str] = "," - case_sensitive: Optional[bool] = False - sortable: Optional[bool] = False + type: str = Field(default="tag", const=True) + attrs: Optional[TagFieldAttributes] = Field(default_factory=TagFieldAttributes) def as_redis_field(self) -> RedisField: - name, as_name = self._handle_path_name() + name, as_name = self._handle_names() return RedisTagField( name, as_name=as_name, - separator=self.separator, # type: ignore - case_sensitive=self.case_sensitive, # type: ignore - sortable=self.sortable, + separator=self.attrs.separator, # type: ignore + case_sensitive=self.attrs.case_sensitive, # 
type: ignore + sortable=self.attrs.sortable, ) class NumericField(BaseField): - sortable: Optional[bool] = False + type: str = Field(default="numeric", const=True) + attrs: Optional[NumericFieldAttributes] = Field( + default_factory=NumericFieldAttributes + ) def as_redis_field(self) -> RedisField: - name, as_name = self._handle_path_name() + name, as_name = self._handle_names() return RedisNumericField( name, as_name=as_name, - sortable=self.sortable, + sortable=self.attrs.sortable, ) class GeoField(BaseField): - sortable: Optional[bool] = False + type: str = Field(default="geo", const=True) + attrs: Optional[GeoFieldAttributes] = Field(default_factory=GeoFieldAttributes) def as_redis_field(self) -> RedisField: - name, as_name = self._handle_path_name() + name, as_name = self._handle_names() return RedisGeoField( name, as_name=as_name, - sortable=self.sortable, + sortable=self.attrs.sortable, ) -class BaseVectorField(BaseField): - dims: int = Field(...) - algorithm: object = Field(...) - datatype: str = Field(default="FLOAT32") - distance_metric: str = Field(default="COSINE") - initial_cap: Optional[int] = None - - @validator("algorithm", "datatype", "distance_metric", pre=True) - @classmethod - def uppercase_strings(cls, v): - return v.upper() - - @property - def field_data(self) -> Dict[str, Any]: - field_data = { - "TYPE": self.datatype, - "DIM": self.dims, - "DISTANCE_METRIC": self.distance_metric, - } - if self.initial_cap is not None: # Only include it if it's set - field_data["INITIAL_CAP"] = self.initial_cap - return field_data - - -class FlatVectorField(BaseVectorField): - algorithm: Literal["FLAT"] = "FLAT" - block_size: Optional[int] = None +class FlatVectorField(BaseField): + type: str = Field(default="vector", const=True) + attrs: Optional[FlatVectorFieldAttributes] = Field( + default_factory=FlatVectorFieldAttributes + ) def as_redis_field(self) -> RedisField: # grab base field params and augment with flat-specific fields - name, as_name = 
self._handle_path_name() - field_data = super().field_data - if self.block_size is not None: - field_data["BLOCK_SIZE"] = self.block_size - return RedisVectorField( - name, - self.algorithm, - field_data, - as_name=as_name - ) + name, as_name = self._handle_names() + field_data = self.attrs.field_data + if self.attrs.block_size is not None: + field_data["BLOCK_SIZE"] = self.attrs.block_size + return RedisVectorField(name, self.attrs.algorithm, field_data, as_name=as_name) -class HNSWVectorField(BaseVectorField): - algorithm: Literal["HNSW"] = "HNSW" - m: int = Field(default=16) - ef_construction: int = Field(default=200) - ef_runtime: int = Field(default=10) - epsilon: float = Field(default=0.01) +class HNSWVectorField(BaseField): + type: str = Field(default="vector", const=True) + attrs: Optional[HNSWVectorFieldAttributes] = Field( + default_factory=HNSWVectorFieldAttributes + ) def as_redis_field(self) -> RedisField: # grab base field params and augment with hnsw-specific fields - name, as_name = self._handle_path_name() - field_data = super().field_data + name, as_name = self._handle_names() + field_data = self.attrs.field_data field_data.update( { - "M": self.m, - "EF_CONSTRUCTION": self.ef_construction, - "EF_RUNTIME": self.ef_runtime, - "EPSILON": self.epsilon, + "M": self.attrs.m, + "EF_CONSTRUCTION": self.attrs.ef_construction, + "EF_RUNTIME": self.attrs.ef_runtime, + "EPSILON": self.attrs.epsilon, } ) - return RedisVectorField( - name, - self.algorithm, - field_data, - as_name=as_name - ) + return RedisVectorField(name, self.attrs.algorithm, field_data, as_name=as_name) class FieldFactory: @@ -167,32 +197,35 @@ class FieldFactory: } @classmethod - def _get_vector_type(cls, **field_data) -> BaseVectorField: + def pick_vector_field_type(cls, attrs: Dict[str, Any]) -> BaseField: """Get the vector field type from the field data.""" - if "algorithm" not in field_data: + if "algorithm" not in attrs: raise ValueError("Must provide algorithm param for the vector 
field.") - if "dims" not in field_data: + if "dims" not in attrs: raise ValueError("Must provide dims param for the vector field.") - algorithm = field_data["algorithm"].lower() + algorithm = attrs["algorithm"].lower() if algorithm not in cls.VECTOR_FIELD_TYPE_MAP: raise ValueError(f"Unknown vector field algorithm: {algorithm}") - # default to FLAT - return cls.VECTOR_FIELD_TYPE_MAP.get(algorithm, FlatVectorField)(**field_data) + return cls.VECTOR_FIELD_TYPE_MAP[algorithm] @classmethod def create_field( - cls, field_type: str, name: str, **kwargs - ) -> Union[BaseField, BaseVectorField]: + cls, + type: str, + name: str, + attrs: Dict[str, Any] = {}, + path: Optional[str] = None, + ) -> BaseField: """Create a field of a given type with provided attributes.""" - if field_type == "vector": - return cls._get_vector_type(name=name, **kwargs) - - if field_type not in cls.FIELD_TYPE_MAP: - raise ValueError(f"Unknown field type: {field_type}") + if type == "vector": + field_class = cls.pick_vector_field_type(attrs) + else: + if type not in cls.FIELD_TYPE_MAP: + raise ValueError(f"Unknown field type: {type}") + field_class = cls.FIELD_TYPE_MAP[type] - field_class = cls.FIELD_TYPE_MAP[field_type] - return field_class(name=name, **kwargs) + return field_class(name=name, path=path, attrs=attrs) diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 7ef16f1e..4dd887a4 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -1,17 +1,15 @@ import re - from enum import Enum from pathlib import Path from typing import Any, Dict, List import yaml -from pydantic.v1 import BaseModel, root_validator, validator +from pydantic.v1 import BaseModel, Field, root_validator, validator from redis.commands.search.field import Field as RedisField from redisvl.schema.fields import BaseField, FieldFactory from redisvl.utils.log import get_logger - logger = get_logger(__name__) SCHEMA_VERSION = "0.1.0" @@ -24,6 +22,7 @@ class StorageType(Enum): HASH (str): Represents 
the 'hash' storage type in Redis. JSON (str): Represents the 'json' storage type in Redis. """ + HASH = "hash" JSON = "json" @@ -35,6 +34,7 @@ class IndexInfo(BaseModel): This class includes the essential details required to define an index, such as its name, prefix, key separator, and storage type. """ + name: str """The unique name of the index.""" prefix: str = "rvl" @@ -92,11 +92,12 @@ class IndexSchema(BaseModel): correct and unambiguous field references. """ + index: IndexInfo """Details of the basic index configurations.""" fields: Dict[str, BaseField] = {} """Fields associated with the search index and their properties""" - version: str = SCHEMA_VERSION + version: str = Field(default=SCHEMA_VERSION, const=True) """Version of the underlying index schema.""" @staticmethod @@ -106,53 +107,18 @@ def _make_field(storage_type, **field_inputs) -> BaseField: Validates and sets the 'path' attribute for fields when using JSON storage type. """ - # Parse raw field inputs - field_name = field_inputs.get("name") - field_type = field_inputs.get("type") - field_attrs = field_inputs.get("attrs", {}) - field_path = field_inputs.get("path") - - if not field_name or not field_type: - raise ValueError("Fields must include a 'type' and 'name'.") - + # Create field from inputs + field = FieldFactory.create_field(**field_inputs) # Handle field path and storage type if storage_type == StorageType.JSON: - field_path = field_path if field_path else f"$.{field_name}" + field.path = field.path if field.path else f"$.{field.name}" else: - if field_path is not None: + if field.path is not None: logger.warning( - f"Path attribute for field '{field_name}' will be ignored for HASH storage type." + f"Path attribute for field '{field.name}' will be ignored for HASH storage type." 
) - field_path = None - - # Update attrs and create field instance - field_attrs.update({ - "name": field_name, - "path": field_path - }) - return FieldFactory.create_field(field_type=field_type, **field_attrs) - - @staticmethod - def _convert_old_format(storage_type: StorageType, raw_fields: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: - updated_fields: List[Dict[str, Any]] = [] - for field_type, fields_list in raw_fields.items(): - for field in fields_list: - if storage_type == StorageType.HASH: - field.pop("path", None) - updated_fields.append({ - "name": field.pop("name", None), - "path": None, - "type": field_type, - "attrs": field - }) - else: - updated_fields.append({ - "name": field.pop("as_name", None), - "path": field.pop("path", field.pop("name", None)), - "type": field_type, - "attrs": field - }) - return updated_fields + field.path = None + return field @root_validator(pre=True) @classmethod @@ -160,16 +126,14 @@ def validate_and_create_fields(cls, values): """ Validate uniqueness of field names and create valid field instances. 
""" - index = IndexInfo(**values.get('index')) - raw_fields = values.get('fields', []) + index = IndexInfo(**values.get("index")) + input_fields = values.get("fields", []) prepared_fields: Dict[str, BaseField] = {} - # Process raw fields - if isinstance(raw_fields, dict): - # Need to handle backwards compat for the moment - # TODO -- will remove this when 0.1.0 lands - logger.warning("New schema format introduced; please update schema specs prior to 0.1.0") - raw_fields = cls._convert_old_format(index.storage_type, raw_fields) - for field_input in raw_fields: + # Handle old fields format temporarily + if isinstance(input_fields, dict): + raise ValueError("New schema format introduced; please update schema spec.") + # Process and create fields + for field_input in input_fields: field = cls._make_field(index.storage_type, **field_input) if field.name in prepared_fields: raise ValueError( @@ -177,20 +141,10 @@ def validate_and_create_fields(cls, values): ) prepared_fields[field.name] = field - values['fields'] = prepared_fields - values['index'] = index + values["fields"] = prepared_fields + values["index"] = index return values - @validator("version", pre=True) - @classmethod - def validate_version(cls, version: str): - """Validate IndexSchema version.""" - if version != SCHEMA_VERSION: - raise ValueError( - f"RedisVL IndexSchema version must be {SCHEMA_VERSION} but got {version}" - ) - return version - @classmethod def from_yaml(cls, file_path: str) -> "IndexSchema": """Create an IndexSchema from a YAML file. @@ -258,7 +212,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "IndexSchema": @property def field_names(self) -> List[str]: - """Returns a list of field names associated with the index schema. + """A list of field names associated with the index schema. Returns: List[str]: A list of field names from the schema. 
@@ -267,10 +221,10 @@ def field_names(self) -> List[str]: @property def redis_fields(self) -> List[RedisField]: - """Returns a list of core redis-py field definitions based on the + """A list of core redis-py field definitions based on the current schema fields. - Converts field definitions into a format suitable for use with + Converts RedisVL field definitions into a format suitable for use with redis-py, facilitating the creation and management of index structures in the Redis database. @@ -352,7 +306,7 @@ def add_fields(self, fields: List[Dict[str, Any]]): ]) """ for field in fields: - self.add_field(**field) + self.add_field(field) def remove_field(self, field_name: str): """Removes a field from the schema based on the specified name. @@ -397,20 +351,18 @@ def generate_fields( - This method employs heuristics and may not always correctly infer field types. """ - fields: List[Dict[str, Any]] + fields: List[Dict[str, Any]] = [] for field_name, value in data.items(): if field_name in ignore_fields: continue try: field_type = TypeInferrer.infer(value) - fields.append({ - "name": field_name, - "type": field_type, - "attrs": FieldFactory.create_field( + fields.append( + FieldFactory.create_field( field_type, field_name, - ).dict(exclude_unset=True) - }) + ).dict() + ) except ValueError as e: if strict: raise diff --git a/schemas/schema.yaml b/schemas/schema.yaml index 8f9a071f..cc1ad287 100644 --- a/schemas/schema.yaml +++ b/schemas/schema.yaml @@ -1,3 +1,4 @@ +version: '0.1.0' index: name: user-index-v1 @@ -9,7 +10,6 @@ fields: - name: user type: tag path: '.user' - other: test - name: credit_score type: tag path: '$.credit_score' diff --git a/schemas/test_hash_schema.yaml b/schemas/test_hash_schema.yaml new file mode 100644 index 00000000..7244e423 --- /dev/null +++ b/schemas/test_hash_schema.yaml @@ -0,0 +1,16 @@ +version: '0.1.0' + +index: + name: hash-test + prefix: hash + storage_type: hash + +fields: + - name: sentence + type: text + - name: embedding + type: 
vector + attrs: + dims: 768 + algorithm: flat + distance_metric: cosine \ No newline at end of file diff --git a/schemas/test_json_schema.yaml b/schemas/test_json_schema.yaml new file mode 100644 index 00000000..f8fb3871 --- /dev/null +++ b/schemas/test_json_schema.yaml @@ -0,0 +1,16 @@ +version: '0.1.0' + +index: + name: json-test + prefix: json + storage_type: json + +fields: + - name: sentence + type: text + - name: embedding + type: vector + attrs: + dims: 768 + algorithm: flat + distance_metric: cosine \ No newline at end of file diff --git a/tests/integration/test_query.py b/tests/integration/test_query.py index 3b654d16..c0d902ca 100644 --- a/tests/integration/test_query.py +++ b/tests/integration/test_query.py @@ -76,21 +76,22 @@ def index(): "prefix": "v1", "storage_type": "hash", }, - "fields": { - "tag": [{"name": "credit_score"}], - "text": [{"name": "job"}], - "numeric": [{"name": "age"}], - "geo": [{"name": "location"}], - "vector": [ - { - "name": "user_embedding", + "fields": [ + {"name": "credit_score", "type": "tag"}, + {"name": "job", "type": "text"}, + {"name": "age", "type": "numeric"}, + {"name": "location", "type": "geo"}, + { + "name": "user_embedding", + "type": "vector", + "attrs": { "dims": 3, "distance_metric": "cosine", "algorithm": "flat", "datatype": "float32", - } - ], - }, + }, + }, + ], } ) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index 437519e4..39919f56 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -1,6 +1,5 @@ from pprint import pprint -import numpy as np import pytest from redisvl.index import SearchIndex @@ -35,26 +34,31 @@ }, ] +fields_spec = [ + {"name": "credit_score", "type": "tag"}, + {"name": "user", "type": "tag"}, + {"name": "job", "type": "text"}, + {"name": "age", "type": "numeric"}, + { + "name": "user_embedding", + "type": "vector", + "attrs": { + "dims": 3, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": 
"float32", + }, + }, +] + + hash_schema = { "index": { "name": "user_index_hash", "prefix": "users_hash", "storage_type": "hash", }, - "fields": { - "tag": [{"name": "credit_score"}], - "text": [{"name": "job"}], - "numeric": [{"name": "age"}], - "vector": [ - { - "name": "user_embedding", - "dims": 3, - "distance_metric": "cosine", - "algorithm": "flat", - "datatype": "float32", - } - ], - }, + "fields": fields_spec, } json_schema = { @@ -63,24 +67,7 @@ "prefix": "users_json", "storage_type": "json", }, - "fields": { - "tag": [ - {"name": "$.credit_score", "as_name": "credit_score"}, - {"name": "$.user", "as_name": "user"}, - ], - "text": [{"name": "$.job", "as_name": "job"}], - "numeric": [{"name": "$.age", "as_name": "age"}], - "vector": [ - { - "name": "$.user_embedding", - "as_name": "user_embedding", - "dims": 3, - "distance_metric": "cosine", - "algorithm": "flat", - "datatype": "float32", - } - ], - }, + "fields": fields_spec, } diff --git a/tests/sample_hash_schema.yaml b/tests/sample_hash_schema.yaml deleted file mode 100644 index 0e7fba49..00000000 --- a/tests/sample_hash_schema.yaml +++ /dev/null @@ -1,13 +0,0 @@ -index: - name: hash-test - prefix: hash - storage_type: hash - -fields: - text: - - name: sentence - vector: - - name: embedding - dims: 768 - algorithm: flat - distance_metric: cosine \ No newline at end of file diff --git a/tests/sample_json_schema.yaml b/tests/sample_json_schema.yaml deleted file mode 100644 index 4ca5b393..00000000 --- a/tests/sample_json_schema.yaml +++ /dev/null @@ -1,15 +0,0 @@ -index: - name: json-test - prefix: json - storage_type: json - -fields: - text: - - name: '$.sentence' - as_name: sentence - vector: - - name: '$.embedding' - as_name: embedding - dims: 768 - algorithm: flat - distance_metric: cosine \ No newline at end of file diff --git a/tests/unit/test_fields.py b/tests/unit/test_fields.py index bc0dcc58..f420afff 100644 --- a/tests/unit/test_fields.py +++ b/tests/unit/test_fields.py @@ -18,46 +18,57 @@ # 
Utility functions to create schema instances with default values def create_text_field_schema(**kwargs): - defaults = {"name": "example_textfield", "sortable": False, "weight": 1.0} + defaults = { + "name": "example_textfield", + "attrs": {"sortable": False, "weight": 1.0}, + } defaults.update(kwargs) return TextField(**defaults) def create_tag_field_schema(**kwargs): - defaults = {"name": "example_tagfield", "sortable": False, "separator": ","} + defaults = { + "name": "example_tagfield", + "attrs": {"sortable": False, "separator": ","}, + } defaults.update(kwargs) return TagField(**defaults) def create_numeric_field_schema(**kwargs): - defaults = {"name": "example_numericfield", "sortable": False} + defaults = {"name": "example_numericfield", "attrs": {"sortable": False}} defaults.update(kwargs) return NumericField(**defaults) def create_geo_field_schema(**kwargs): - defaults = {"name": "example_geofield", "sortable": False} + defaults = {"name": "example_geofield", "attrs": {"sortable": False}} defaults.update(kwargs) return GeoField(**defaults) def create_flat_vector_field(**kwargs): - defaults = {"name": "example_flatvectorfield", "dims": 128, "algorithm": "FLAT"} - defaults.update(kwargs) + defaults = { + "name": "example_flatvectorfield", + "attrs": {"dims": 128, "algorithm": "FLAT"}, + } + defaults["attrs"].update(kwargs) return FlatVectorField(**defaults) def create_hnsw_vector_field(**kwargs): defaults = { "name": "example_hnswvectorfield", - "dims": 128, - "algorithm": "HNSW", - "m": 16, - "ef_construction": 200, - "ef_runtime": 10, - "epsilon": 0.01, + "attrs": { + "dims": 128, + "algorithm": "HNSW", + "m": 16, + "ef_construction": 200, + "ef_runtime": 10, + "epsilon": 0.01, + }, } - defaults.update(kwargs) + defaults["attrs"].update(kwargs) return HNSWVectorField(**defaults) @@ -112,12 +123,14 @@ def test_vector_fields_with_optional_params(vector_schema_func, extra_params): def test_hnsw_vector_field_optional_params_not_set(): # Create HNSW vector 
field without setting optional params - hnsw_field = HNSWVectorField(name="example_vector", dims=128, algorithm="HNSW") + hnsw_field = HNSWVectorField( + name="example_vector", attrs={"dims": 128, "algorithm": "hnsw"} + ) - assert hnsw_field.m == 16 # default value - assert hnsw_field.ef_construction == 200 # default value - assert hnsw_field.ef_runtime == 10 # default value - assert hnsw_field.epsilon == 0.01 # default value + assert hnsw_field.attrs.m == 16 # default value + assert hnsw_field.attrs.ef_construction == 200 # default value + assert hnsw_field.attrs.ef_runtime == 10 # default value + assert hnsw_field.attrs.epsilon == 0.01 # default value field_exported = hnsw_field.as_redis_field() @@ -130,7 +143,9 @@ def test_hnsw_vector_field_optional_params_not_set(): def test_flat_vector_field_block_size_not_set(): # Create Flat vector field without setting block_size - flat_field = FlatVectorField(name="example_vector", dims=128, algorithm="FLAT") + flat_field = FlatVectorField( + name="example_vector", attrs={"dims": 128, "algorithm": "flat"} + ) field_exported = flat_field.as_redis_field() # block_size and initial_cap should not be in the exported field if it was not set @@ -164,7 +179,7 @@ def test_create_standard_field(field_type, expected_class): ) def test_create_vector_field(algorithm, expected_class): field = FieldFactory.create_field( - "vector", "example_vector_field", algorithm=algorithm, dims=128 + "vector", "example_vector_field", attrs={"algorithm": algorithm, "dims": 128} ) assert isinstance(field, expected_class) assert field.name == "example_vector_field" @@ -174,7 +189,9 @@ def test_create_vector_field_with_unknown_algorithm(): """Test for unknown vector field algorithm.""" with pytest.raises(ValueError) as e: FieldFactory.create_field( - "vector", "example_vector_field", algorithm="unknown", dims=128 + "vector", + "example_vector_field", + attrs={"algorithm": "unknown", "dims": 128}, ) assert "Unknown vector field algorithm" in str(e.value) 
@@ -182,14 +199,16 @@ def test_create_vector_field_with_unknown_algorithm(): def test_missing_vector_field_algorithm(): """Test for missing vector field algorithm.""" with pytest.raises(ValueError) as e: - FieldFactory.create_field("vector", "example_vector_field", dims=128) + FieldFactory.create_field("vector", "example_vector_field", attrs={"dims": 128}) assert "Must provide algorithm param" in str(e.value) def test_missing_vector_field_dims(): """Test for missing vector field algorithm.""" with pytest.raises(ValueError) as e: - FieldFactory.create_field("vector", "example_vector_field", algorithm="flat") + FieldFactory.create_field( + "vector", "example_vector_field", attrs={"algorithm": "flat"} + ) assert "Must provide dims param" in str(e.value) diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 36ef15d3..7a693752 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -5,12 +5,12 @@ from redisvl.schema.fields import TagField from redisvl.utils.utils import convert_bytes -fields = {"tag": [TagField(name="test")]} +fields = [{"name": "test", "type": "tag"}] @pytest.fixture def index_schema(): - return IndexSchema(name="my_index", fields=fields) + return IndexSchema.from_dict({"index": {"name": "my_index"}, "fields": fields}) @pytest.fixture @@ -30,7 +30,7 @@ def test_search_index_get_key(index): def test_search_index_no_prefix(index_schema): # specify None as the prefix... 
- si = index_schema.prefix = "" + si = index_schema.index.prefix = "" si = SearchIndex(schema=index_schema) key = si.key("foo") assert not si.prefix @@ -38,7 +38,7 @@ def test_search_index_no_prefix(index_schema): def test_search_index_client(client, index_schema): - si = index_schema.prefix = "" + si = index_schema.index.prefix = "" si = SearchIndex(schema=index_schema) si.set_client(client) diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 7c1d47d9..62bae245 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -2,7 +2,7 @@ import pytest -from redisvl.schema.fields import NumericField, TextField +from redisvl.schema.fields import NumericField, TagField, TextField from redisvl.schema.schema import IndexSchema, StorageType @@ -12,11 +12,11 @@ def get_base_path(): # Sample data for testing def create_sample_index_schema(): - sample_fields = { - "text": [TextField(name="example_text", sortable=False)], - "numeric": [NumericField(name="example_numeric", sortable=True)], - } - return IndexSchema(name="test", fields=sample_fields) + sample_fields = [ + {"name": "example_text", "type": "text", "attrs": {"sortable": False}}, + {"name": "example_numeric", "type": "numeric", "attrs": {"sortable": True}}, + ] + return IndexSchema.from_dict({"index": {"name": "test"}, "fields": sample_fields}) # Tests for IndexSchema @@ -24,93 +24,99 @@ def create_sample_index_schema(): def test_initialization_with_default_params(): """Test basic schema init with defaults.""" - default_schema = IndexSchema(name="test") - assert default_schema.name == "test" - assert default_schema.prefix == "rvl" # Default value - assert default_schema.key_separator == ":" # Default value - assert default_schema.storage_type == StorageType.HASH # Default value + default_schema = IndexSchema.from_dict({"index": {"name": "test"}}) + assert default_schema.version == "0.1.0" + assert default_schema.index.name == "test" + assert default_schema.index.prefix == "rvl" # 
Default value + assert default_schema.index.key_separator == ":" # Default value + assert default_schema.index.storage_type == StorageType.HASH # Default value assert default_schema.fields == {} # Default value def test_initialization_with_custom_params(): """Test custom schema params.""" - custom_schema = IndexSchema( - name="custom_schema", - prefix="custom", - key_separator="|", - storage_type=StorageType.JSON, + custom_schema = IndexSchema.from_dict( + { + "index": { + "name": "custom_schema", + "prefix": "custom", + "key_separator": "|", + "storage_type": "json", + } + } ) - assert custom_schema.name == "custom_schema" - assert custom_schema.prefix == "custom" - assert custom_schema.key_separator == "|" - assert custom_schema.storage_type == StorageType.JSON + assert custom_schema.index.name == "custom_schema" + assert custom_schema.index.prefix == "custom" + assert custom_schema.index.key_separator == "|" + assert custom_schema.index.storage_type == StorageType.JSON def test_add_field(): """Test field addition.""" index_schema = create_sample_index_schema() - index_schema.add_field("text", name="new_text_field") - assert "new_text_field" in index_schema.field_names + index_schema.add_field({"name": "new_text_field", "type": "text"}) + assert "new_text_field" in index_schema.fields + assert isinstance(index_schema.fields["new_text_field"], TextField) def test_add_fields(): """Test multiple field addition.""" index_schema = create_sample_index_schema() index_schema.add_fields( - {"text": [{"name": "new_text_field"}], "tag": [{"name": "new_tag_field"}]} + [ + {"name": "new_text_field", "type": "text"}, + {"name": "new_tag_field", "type": "tag"}, + ] ) - assert "new_text_field" in index_schema.field_names - assert "new_tag_field" in index_schema.field_names + assert "new_text_field" in index_schema.fields + assert isinstance(index_schema.fields["new_text_field"], TextField) + assert "new_tag_field" in index_schema.fields + assert 
isinstance(index_schema.fields["new_tag_field"], TagField) def test_add_duplicate_field(): """Test adding a duplicate field.""" index_schema = create_sample_index_schema() with pytest.raises(ValueError): - index_schema.add_field("text", name="example_text") + index_schema.add_field({"name": "example_text", "type": "text"}) def test_remove_field(): """Test field removal.""" index_schema = create_sample_index_schema() - index_schema.remove_field("text", "example_text") + index_schema.remove_field("example_text") assert "example_text" not in index_schema.field_names -def test_remove_nonexistent_field(): - """Test failed remove of nonexistent field.""" - index_schema = create_sample_index_schema() - with pytest.raises(ValueError): - index_schema.remove_field("text", "nonexistent") - - def test_schema_compare(): """Test schema comparisons.""" - schema_1 = IndexSchema(name="test") + schema_1 = IndexSchema.from_dict({"index": {"name": "test"}}) # manually add the same fields as the helper method provides below schema_1.add_fields( - { - "text": [{"name": "example_text", "sortable": False}], - "numeric": [{"name": "example_numeric", "sortable": True}], - } + [ + {"name": "example_text", "type": "text", "attrs": {"sortable": False}}, + {"name": "example_numeric", "type": "numeric", "attrs": {"sortable": True}}, + ] ) - assert "example_text" in schema_1.field_names - assert "example_numeric" in schema_1.field_names + + assert "example_text" in schema_1.fields + assert "example_numeric" in schema_1.fields schema_2 = create_sample_index_schema() - assert schema_1.field_names == schema_2.field_names - assert schema_1.name == schema_2.name + assert schema_1.fields == schema_2.fields + assert schema_1.index.name == schema_2.index.name assert schema_1.to_dict() == schema_2.to_dict() def test_generate_fields(): """Test field generation.""" - data = {"name": "John", "age": 30, "tags": ["test", "test2"]} - index_schema = IndexSchema(name="test") - generated_fields = 
index_schema.generate_fields(data) - assert "text" in generated_fields - assert "numeric" in generated_fields - assert "tag" in generated_fields + sample = {"name": "John", "age": 30, "tags": ["test", "test2"]} + index_schema = IndexSchema.from_dict({"index": {"name": "test"}}) + generated_fields = index_schema.generate_fields(sample) + expected_field_names = sample.keys() + for field in generated_fields: + assert field["name"] in expected_field_names + assert field["path"] == None def test_to_dict(): @@ -118,14 +124,16 @@ def test_to_dict(): index_schema = create_sample_index_schema() index_dict = index_schema.to_dict() assert index_dict["index"]["name"] == "test" + assert isinstance(index_dict["fields"], dict) + assert len(index_dict["fields"]) == 2 == len(index_schema.fields) def test_from_dict(): """Test loading schema from a dictionary.""" - sample_fields = { - "text": [{"name": "example_text", "sortable": False}], - "numeric": [{"name": "example_numeric", "sortable": True}], - } + sample_fields = [ + {"name": "example_text", "type": "text", "attrs": {"sortable": False}}, + {"name": "example_numeric", "type": "tag", "attrs": {"sortable": True}}, + ] index_schema = IndexSchema.from_dict( { "index": { @@ -137,25 +145,32 @@ def test_from_dict(): "fields": sample_fields, } ) - assert index_schema.name == "example_index" - assert index_schema.key_separator == "|" - assert index_schema.prefix == "ex" - assert index_schema.storage_type == StorageType.JSON + assert index_schema.index.name == "example_index" + assert index_schema.index.key_separator == "|" + assert index_schema.index.prefix == "ex" + assert index_schema.index.storage_type == StorageType.JSON assert len(index_schema.fields) == 2 -def test_from_yaml(): +def test_hash_index_from_yaml(): """Test loading from yaml.""" index_schema = IndexSchema.from_yaml( - str(get_base_path().joinpath("../sample_hash_schema.yaml")) + str(get_base_path().joinpath("../../schemas/test_hash_schema.yaml")) ) - assert 
index_schema.name == "hash-test" - assert index_schema.prefix == "hash" - assert index_schema.storage_type == StorageType.HASH + assert index_schema.index.name == "hash-test" + assert index_schema.index.prefix == "hash" + assert index_schema.index.storage_type == StorageType.HASH assert len(index_schema.fields) == 2 - assert "vector" in index_schema.fields - assert "text" in index_schema.fields +def test_json_index_from_yaml(): + """Test loading from yaml.""" + index_schema = IndexSchema.from_yaml( + str(get_base_path().joinpath("../../schemas/test_json_schema.yaml")) + ) + assert index_schema.index.name == "json-test" + assert index_schema.index.prefix == "json" + assert index_schema.index.storage_type == StorageType.JSON + assert len(index_schema.fields) == 2 def test_from_yaml_file_not_found(): """Test loading from yaml with file not found.""" From 4bc78594faf96b98afa171af74cb39f30f947d4e Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 29 Jan 2024 20:31:38 -0500 Subject: [PATCH 05/10] finish schema work and tests --- docs/_static/js/sidebar.js | 7 +- docs/api/searchindex.rst | 2 + docs/examples/openai_qna.ipynb | 39 ++- docs/user_guide/cli.ipynb | 64 ++-- docs/user_guide/getting_started_01.ipynb | 358 +++++++++++++---------- docs/user_guide/hash_vs_json_05.ipynb | 115 ++++---- docs/user_guide/hybrid_queries_02.ipynb | 73 ++--- docs/user_guide/llmcache_03.ipynb | 47 ++- docs/user_guide/schema.yaml | 21 +- docs/user_guide/vectorizers_04.ipynb | 86 ++++-- redisvl/cli/index.py | 4 +- redisvl/cli/stats.py | 4 +- redisvl/llmcache/semantic.py | 69 ++--- redisvl/schema/fields.py | 31 +- redisvl/schema/schema.py | 19 +- tests/integration/test_simple_async.py | 37 ++- tests/unit/test_schema.py | 18 +- 17 files changed, 594 insertions(+), 400 deletions(-) diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js index f6046f60..5995b7b6 100644 --- a/docs/_static/js/sidebar.js +++ b/docs/_static/js/sidebar.js @@ -4,20 +4,21 @@ const toc = [ { title: 
"Install", path: "/overview/installation.html" }, { title: "CLI", path: "/user_guide/cli.html" }, ]}, - { header: "User Guide", toc: [ + { header: "User Guides", toc: [ { title: "Getting Started", path: "/user_guide/getting_started_01.html" }, { title: "Query and Filter", path: "/user_guide/hybrid_queries_02.html" }, + { title: "Customizable Storage (JSON and Hash)", path: "/user_guide/hash_vs_json_05.html" }, { title: "Vectorizers", path: "/user_guide/vectorizers_04.html" }, { title: "Semantic Caching", path: "/user_guide/llmcache_03.html" }, - { title: "JSON Storage", path: "/user_guide/hash_vs_json_05.html" } ]}, { header: "API", toc: [ + { title: "Schema", path: "/api/indexschema.html"}, { title: "Index", path: "/api/searchindex.html" }, { title: "Query", path: "/api/query.html" }, { title: "Filter", path: "/api/filter.html" }, { title: "Vectorizers", path: "/api/vectorizer.html" }, - { title: "LLMCache", path: "/api/cache.html" } + { title: "SemanticCache", path: "/api/cache.html" } ]} ]; diff --git a/docs/api/searchindex.rst b/docs/api/searchindex.rst index 77c83bae..71cd781b 100644 --- a/docs/api/searchindex.rst +++ b/docs/api/searchindex.rst @@ -32,6 +32,8 @@ SearchIndex SearchIndex.asearch SearchIndex.query SearchIndex.aquery + SearchIndex.query_batch + SearchIndex.aquery_batch SearchIndex.delete SearchIndex.adelete SearchIndex.info diff --git a/docs/examples/openai_qna.ipynb b/docs/examples/openai_qna.ipynb index 634bfd45..da2ac347 100644 --- a/docs/examples/openai_qna.ipynb +++ b/docs/examples/openai_qna.ipynb @@ -81,15 +81,6 @@ "%pip install pandas wget tenacity tiktoken openai==0.28.1" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import redisvl" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -653,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -667,18 +658,22 @@ "source": [ "%%writefile wiki_schema.yaml\n", 
"\n", + "version: '0.1.0'\n", + "\n", "index:\n", - " name: wiki\n", - " prefix: oaiWiki\n", + " name: wikipedia\n", + " prefix: chunk\n", "\n", "fields:\n", - " text:\n", - " - name: content\n", - " - name: title\n", - " tag:\n", - " - name: id\n", - " vector:\n", - " - name: embedding\n", + " - name: content\n", + " type: text\n", + " - name: title\n", + " type: text\n", + " - name: id\n", + " type: tag\n", + " - name: embedding\n", + " type: vector\n", + " attrs:\n", " dims: 1536\n", " distance_metric: cosine\n", " algorithm: flat" @@ -690,10 +685,14 @@ "metadata": {}, "outputs": [], "source": [ + "import redis.asyncio as redis\n", + "\n", "from redisvl.index import SearchIndex\n", "\n", + "client = redis.Redis.from_url(\"redis://localhost:6379\")\n", + "\n", "index = SearchIndex.from_yaml(\"wiki_schema.yaml\")\n", - "index.connect(\"redis://localhost:6379\", use_async=True)\n", + "index.set_client(client)\n", "\n", "await index.acreate()" ] diff --git a/docs/user_guide/cli.ipynb b/docs/user_guide/cli.ipynb index ff7a64db..3a8245ec 100644 --- a/docs/user_guide/cli.ipynb +++ b/docs/user_guide/cli.ipynb @@ -24,7 +24,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:13:52\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m RedisVL version 0.0.5\n" + "\u001b[32m20:09:02\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m RedisVL version 0.0.7\n" ] } ], @@ -44,19 +44,22 @@ "first, we will create an index from a yaml schema that looks like the following\n", "\n", "```yaml\n", + "version: '0.1.0'\n", + "\n", "index:\n", - " name: providers\n", - " prefix: rvl\n", + " name: vectorizers\n", + " prefix: doc\n", " storage_type: hash\n", "\n", "fields:\n", - " text:\n", - " - name: sentence\n", - " vector:\n", - " - name: embedding\n", - " dims: 768\n", - " algorithm: flat\n", - " distance_metric: cosine\n", + " - name: sentence\n", + " type: text\n", + " - name: embedding\n", + " type: vector\n", + " attrs:\n", + " dims: 
768\n", + " algorithm: flat\n", + " distance_metric: cosine\n", "```" ] }, @@ -69,7 +72,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:13:54\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index created successfully\n" + "\u001b[32m20:09:04\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index created successfully\n" ] } ], @@ -87,8 +90,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:13:56\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m11:13:56\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. providers\n" + "\u001b[32m20:09:05\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:09:05\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. vectorizers\n" ] } ], @@ -112,7 +115,7 @@ "╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮\n", "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", "├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤\n", - "│ providers │ HASH │ ['rvl'] │ [] │ 0 │\n", + "│ vectorizers │ HASH │ ['doc'] │ [] │ 0 │\n", "╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯\n", "Index Fields:\n", "╭───────────┬─────────────┬────────┬────────────────┬────────────────╮\n", @@ -126,7 +129,7 @@ ], "source": [ "# inspect the index fields\n", - "!rvl index info -i providers" + "!rvl index info -i vectorizers" ] }, { @@ -138,13 +141,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:13:59\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index deleted successfully\n" + "\u001b[32m20:09:09\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index deleted successfully\n" ] } ], "source": [ "# delete an index without deleting the data within it\n", - "!rvl index delete -i providers" + "!rvl index delete -i vectorizers" 
] }, { @@ -156,7 +159,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:14:00\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n" + "\u001b[32m20:09:11\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n" ] } ], @@ -183,7 +186,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:14:02\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index created successfully\n" + "\u001b[32m20:09:12\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index created successfully\n" ] } ], @@ -201,8 +204,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:14:03\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m11:14:03\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. providers\n" + "\u001b[32m20:09:14\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:09:14\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
vectorizers\n" ] } ], @@ -250,7 +253,24 @@ ], "source": [ "# see all the stats for the index\n", - "!rvl stats -i providers" + "!rvl stats -i vectorizers" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m20:09:33\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Index deleted successfully\n" + ] + } + ], + "source": [ + "!rvl index destroy -i vectorizers" + ] + } ], diff --git a/docs/user_guide/getting_started_01.ipynb b/docs/user_guide/getting_started_01.ipynb index 218fe794..44f6bc69 100644 --- a/docs/user_guide/getting_started_01.ipynb +++ b/docs/user_guide/getting_started_01.ipynb @@ -46,28 +46,28 @@ "**YAML Definition:**\n", "\n", "```yaml\n", + "version: '0.1.0'\n", + "\n", "index:\n", - " name: user_index\n", - " prefix: user\n", + " name: user_simple\n", + " prefix: user_simple_docs\n", "\n", "fields:\n", - " # define tag fields\n", - " tag:\n", - " - name: user\n", - " - name: credit_store\n", - " # define text fields\n", - " text:\n", - " - name: job\n", - " # define numeric fields\n", - " numeric:\n", - " - name: age\n", - " # define vector fields\n", - " vector:\n", - " - name: user_embedding\n", - " algorithm: flat\n", - " dims: 3\n", - " distance_metric: cosine\n", - " datatype: float32\n", + " - name: user\n", + " type: tag\n", + " - name: credit_score\n", + " type: tag\n", + " - name: job\n", + " type: text\n", + " - name: age\n", + " type: numeric\n", + " - name: user_embedding\n", + " type: vector\n", + " attrs:\n", + " algorithm: flat\n", + " dims: 3\n", + " distance_metric: cosine\n", + " datatype: float32\n", "```\n", "> Store this in a local file, such as `schema.yaml`, for RedisVL usage." 
] @@ -87,22 +87,25 @@ "source": [ "schema = {\n", " \"index\": {\n", - " \"name\": \"user_index\",\n", - " \"prefix\": \"user\",\n", - " \"storage_type\": \"hash\",\n", + " \"name\": \"user_simple\",\n", + " \"prefix\": \"user_simple_docs\",\n", " },\n", - " \"fields\": {\n", - " \"tag\": [{\"name\": \"user\"}, {\"name\": \"credit_score\"}],\n", - " \"text\": [{\"name\": \"job\"}],\n", - " \"numeric\": [{\"name\": \"age\"}],\n", - " \"vector\": [{\n", + " \"fields\": [\n", + " {\"name\": \"user\", \"type\": \"tag\"},\n", + " {\"name\": \"credit_score\", \"type\": \"tag\"},\n", + " {\"name\": \"job\", \"type\": \"text\"},\n", + " {\"name\": \"age\", \"type\": \"numeric\"},\n", + " {\n", " \"name\": \"user_embedding\",\n", - " \"dims\": 3,\n", - " \"distance_metric\": \"cosine\",\n", - " \"algorithm\": \"flat\",\n", - " \"datatype\": \"float32\"\n", - " }]\n", - " },\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", + " \"dims\": 3,\n", + " \"distance_metric\": \"cosine\",\n", + " \"algorithm\": \"flat\",\n", + " \"datatype\": \"float32\"\n", + " }\n", + " }\n", + " ]\n", "}" ] }, @@ -177,20 +180,99 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.index import SearchIndex\n", + "\n", + "index = SearchIndex.from_dict(schema)\n", + "# or use .from_yaml('schema_file.yaml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we also need to facilitate a Redis connection. 
There are a few ways to do this:\n", + "\n", + "- Create & manage your own client connection (recommended)\n", + "- Provide a simple Redis URL and let RedisVL connect on your behalf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bring your own Redis connection instance" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index already exists, overwriting.\n" - ] + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from redisvl.index import SearchIndex\n", + "from redis import Redis\n", "\n", - "index = SearchIndex.from_dict(schema) # or use .from_yaml(...)\n", + "client = Redis.from_url(\"redis://localhost:6379\")\n", + "\n", + "index.set_client(client)\n", + "# optionally provide an async Redis client object to enable async index operations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let the index manage the connection instance" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "index.connect(\"redis://localhost:6379\")\n", + "# optionally use an async client by passing use_async=True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the underlying index\n", + "\n", + "Now that we are connected to Redis, we need to run the create command." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ "index.create(overwrite=True)" ] }, @@ -211,15 +293,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m18:25:34\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m18:25:34\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user_index\n" + "\u001b[32m20:11:48\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:11:48\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user_simple\n" ] } ], @@ -229,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -239,11 +321,11 @@ "\n", "\n", "Index Information:\n", - "╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮\n", - "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", - "├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤\n", - "│ user_index │ HASH │ ['user'] │ [] │ 0 │\n", - "╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯\n", + "╭──────────────┬────────────────┬──────────────────────┬─────────────────┬────────────╮\n", + "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", + "├──────────────┼────────────────┼──────────────────────┼─────────────────┼────────────┤\n", + "│ user_simple │ HASH │ ['user_simple_docs'] │ [] │ 0 │\n", + "╰──────────────┴────────────────┴──────────────────────┴─────────────────┴────────────╯\n", "Index Fields:\n", "╭────────────────┬────────────────┬─────────┬────────────────┬────────────────╮\n", "│ Name │ Attribute │ Type │ Field Option │ Option Value │\n", @@ -258,7 +340,7 @@ } ], "source": [ - "!rvl index info -i user_index" + "!rvl index info -i user_simple" ] }, 
{ @@ -272,14 +354,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['user:31d4f3c73f1a4c26b41cf0e2b8e0248a', 'user:c9ff740437064b919245e49ef585484d', 'user:6db5f2e09f08438785b73d8048d5350b']\n" + "['user_simple_docs:71cb417a3675404889a8a22255f482d0', 'user_simple_docs:3eda3f6f640144a086149ad36d2e8419', 'user_simple_docs:aa9195acd07f41a485477eb3cb333bb8']\n" ] } ], @@ -293,7 +375,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - ">By default, `load` will create a unique Redis \"key\" as a combination of the index key `prefix` and a UUID." + ">By default, `load` will create a unique Redis \"key\" as a combination of the index key `prefix` and a UUID. You can also customize the key by providing direct keys or pointing to a specified key_field on load." ] }, { @@ -307,9 +389,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching data for user 71cb417a3675404889a8a22255f482d0\n" + ] + }, { "data": { "text/plain": [ @@ -320,13 +409,16 @@ " 'user_embedding': b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?'}" ] }, - "execution_count": 7, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "index.fetch(id=keys[0].strip(\"user:\"))" + "_id = keys[0].strip(\"user_simple_docs:\")\n", + "\n", + "print(f\"Fetching data for user {_id}\")\n", + "index.fetch(id=_id)" ] }, { @@ -334,19 +426,19 @@ "metadata": {}, "source": [ "### Upsert the index with new data\n", - "Upsert data by using the `load` method:= again:" + "Upsert data by using the `load` method again:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['user:054a3091bd99457f826f3278f4a0dc98']\n" + 
"['user_simple_docs:2747d2e5355d4b0fbb994a9b37518bcc']\n" ] } ], @@ -376,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -401,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -437,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -475,37 +567,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "Index Information:\n", - "╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮\n", - "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", - "├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤\n", - "│ user_index │ HASH │ ['user'] │ [] │ 0 │\n", - "╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯\n", - "Index Fields:\n", - "╭────────────────┬────────────────┬─────────┬────────────────┬────────────────╮\n", - "│ Name │ Attribute │ Type │ Field Option │ Option Value │\n", - "├────────────────┼────────────────┼─────────┼────────────────┼────────────────┤\n", - "│ user │ user │ TAG │ SEPARATOR │ , │\n", - "│ credit_score │ credit_score │ TAG │ SEPARATOR │ , │\n", - "│ job │ job │ TEXT │ WEIGHT │ 1 │\n", - "│ age │ age │ NUMERIC │ │ │\n", - "│ user_embedding │ user_embedding │ VECTOR │ │ │\n", - "╰────────────────┴────────────────┴─────────┴────────────────┴────────────────╯\n" - ] - } - ], + "outputs": [], "source": [ - "# First we will inspect the index we already have...\n", - "!rvl index info -i user_index" + "# First we will clean up the existing index yet keep docs in place\n", + "await index.adelete(drop=False)" ] }, { @@ -519,24 +586,26 @@ }, { "cell_type": "code", - "execution_count": 13, + 
"execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'index': {'name': 'user_index', 'prefix': 'user', 'storage_type': 'hash'},\n", - " 'fields': {'tag': [{'name': 'user'}, {'name': 'credit_score'}],\n", - " 'text': [{'name': 'job'}],\n", - " 'numeric': [{'name': 'age'}],\n", - " 'vector': [{'name': 'user_embedding',\n", - " 'dims': 3,\n", + "{'index': {'name': 'user_simple', 'prefix': 'user_simple_docs'},\n", + " 'fields': [{'name': 'user', 'type': 'tag'},\n", + " {'name': 'credit_score', 'type': 'tag'},\n", + " {'name': 'job', 'type': 'text'},\n", + " {'name': 'age', 'type': 'numeric'},\n", + " {'name': 'user_embedding',\n", + " 'type': 'vector',\n", + " 'attrs': {'dims': 3,\n", " 'distance_metric': 'cosine',\n", " 'algorithm': 'flat',\n", - " 'datatype': 'float32'}]}}" + " 'datatype': 'float32'}}]}" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -548,68 +617,43 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'index': {'name': 'user_index', 'prefix': 'user', 'storage_type': 'hash'},\n", - " 'fields': {'tag': [{'name': 'credit_score'}, {'name': 'job'}],\n", - " 'text': [],\n", - " 'numeric': [{'name': 'age'}],\n", - " 'vector': [{'name': 'user_embedding',\n", - " 'dims': 3,\n", - " 'distance_metric': 'cosine',\n", - " 'algorithm': 'hnsw',\n", - " 'datatype': 'float32'}]}}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# We need to modify this schema dict to have what we want\n", - "schema['fields'].update({\n", - " 'text': [],\n", - " 'tag': [{'name': 'credit_score'}, {'name': 'job'}],\n", - " 'vector': [{\n", - " 'name': 'user_embedding',\n", - " 'dims': 3,\n", - " 'distance_metric': 'cosine',\n", - " 'algorithm': 'hnsw',\n", - " 'datatype': 'float32'\n", - " }]\n", - "})\n", + "# We need to 
modify this schema to have what we want\n", "\n", - "schema" + "index.schema.remove_field(\"job\")\n", + "index.schema.remove_field(\"user_embedding\")\n", + "\n", + "index.schema.add_fields([\n", + " {\"name\": \"job\", \"type\": \"tag\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", + " \"dims\": 3,\n", + " \"distance_metric\": \"cosine\",\n", + " \"algorithm\": \"flat\",\n", + " \"datatype\": \"float32\"\n", + " }\n", + " }\n", + "])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "# Delete existing index without clearing out the underlying data\n", - "await index.adelete(drop=False)\n", - "\n", - "# Build the new index interface with updated schema\n", - "index = (\n", - " SearchIndex\n", - " .from_dict(schema)\n", - " .connect(\"redis://localhost:6379\", use_async=True)\n", - ")\n", - "\n", "# Run the index update\n", "await index.acreate()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -641,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -656,28 +700,28 @@ "│ num_docs │ 4 │\n", "│ num_terms │ 0 │\n", "│ max_doc_id │ 4 │\n", - "│ num_records │ 16 │\n", + "│ num_records │ 20 │\n", "│ percent_indexed │ 1 │\n", "│ hash_indexing_failures │ 0 │\n", "│ number_of_uses │ 2 │\n", "│ bytes_per_record_avg │ 1 │\n", - "│ doc_table_size_mb │ 0.000400543 │\n", - "│ inverted_sz_mb │ 1.52588e-05 │\n", + "│ doc_table_size_mb │ 0.00044632 │\n", + "│ inverted_sz_mb │ 1.90735e-05 │\n", "│ key_table_size_mb │ 0.000138283 │\n", "│ offset_bits_per_record_avg │ nan │\n", "│ offset_vectors_sz_mb │ 0 │\n", "│ offsets_per_term_avg │ 0 │\n", - "│ records_per_doc_avg │ 4 │\n", + "│ records_per_doc_avg │ 5 │\n", "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.664 │\n", - "│ total_inverted_index_blocks │ 
7 │\n", - "│ vector_index_sz_mb │ 0.17852 │\n", + "│ total_indexing_time │ 0.624 │\n", + "│ total_inverted_index_blocks │ 11 │\n", + "│ vector_index_sz_mb │ 0.0201416 │\n", "╰─────────────────────────────┴─────────────╯\n" ] } ], "source": [ - "!rvl stats -i user_index" + "!rvl stats -i user_simple" ] }, { @@ -689,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -714,7 +758,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.12" }, "orig_nbformat": 4, "vscode": { diff --git a/docs/user_guide/hash_vs_json_05.ipynb b/docs/user_guide/hash_vs_json_05.ipynb index f9d65dd7..c486d4a9 100644 --- a/docs/user_guide/hash_vs_json_05.ipynb +++ b/docs/user_guide/hash_vs_json_05.ipynb @@ -16,15 +16,13 @@ "1. Have installed ``redisvl`` and have that environment active for this notebook.\n", "2. Have a running Redis Stack or Redis Enterprise instance with RediSearch > 2.4 activated.\n", "\n", - "For example, you can run Redis Stack locally with Docker:\n", + "For example, you can run [Redis Stack](https://redis.io/docs/install/install-stack/) locally with Docker:\n", "\n", "```bash\n", "docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n", "```\n", "\n", - "Or create a [FREE Redis Enterprise instance.](https://redis.com/try-free).\n", - "\n", - "This example will assume a local Redis is running on port 6379 and RedisInsight at 8001." + "Or create a [FREE Redis Cloud](https://redis.com/try-free)." 
] }, { @@ -108,23 +106,28 @@ "# define the hash index schema\n", "hash_schema = {\n", " \"index\": {\n", - " \"name\": \"user-hashes\",\n", - " \"storage_type\": \"hash\", # default setting\n", - " \"prefix\": \"hash\",\n", + " \"name\": \"user-hash\",\n", + " \"prefix\": \"user-hash-docs\",\n", + " \"storage_type\": \"hash\", # default setting -- HASH\n", " },\n", - " \"fields\": {\n", - " \"tag\": [{\"name\": \"credit_score\"}, {\"name\": \"user\"}],\n", - " \"text\": [{\"name\": \"job\"}],\n", - " \"numeric\": [{\"name\": \"age\"}],\n", - " \"geo\": [{\"name\": \"office_location\"}],\n", - " \"vector\": [{\n", - " \"name\": \"user_embedding\",\n", + " \"fields\": [\n", + " {\"name\": \"user\", \"type\": \"tag\"},\n", + " {\"name\": \"credit_score\", \"type\": \"tag\"},\n", + " {\"name\": \"job\", \"type\": \"text\"},\n", + " {\"name\": \"age\", \"type\": \"numeric\"},\n", + " {\"name\": \"office_location\", \"type\": \"geo\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", " \"dims\": 3,\n", " \"distance_metric\": \"cosine\",\n", " \"algorithm\": \"flat\",\n", - " \"datatype\": \"float32\"}\n", - " ]\n", - " },\n", + " \"datatype\": \"float32\"\n", + " }\n", + "\n", + " }\n", + " ],\n", "}" ] }, @@ -231,7 +234,7 @@ "│ hash_indexing_failures │ 0 │\n", "│ number_of_uses │ 1 │\n", "│ bytes_per_record_avg │ 3.40909 │\n", - "│ doc_table_size_mb │ 0.000700951 │\n", + "│ doc_table_size_mb │ 0.000767708 │\n", "│ inverted_sz_mb │ 0.000143051 │\n", "│ key_table_size_mb │ 0.000248909 │\n", "│ offset_bits_per_record_avg │ 8 │\n", @@ -239,15 +242,15 @@ "│ offsets_per_term_avg │ 0.204545 │\n", "│ records_per_doc_avg │ 6.28571 │\n", "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.121 │\n", + "│ total_indexing_time │ 1.168 │\n", "│ total_inverted_index_blocks │ 18 │\n", - "│ vector_index_sz_mb │ 0.0200424 │\n", + "│ vector_index_sz_mb │ 0.0202332 │\n", "╰─────────────────────────────┴─────────────╯\n" ] } 
], "source": [ - "!rvl stats -i user-hashes" + "!rvl stats -i user-hash" ] }, { @@ -292,6 +295,16 @@ "result_print(results)\n" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# clean up\n", + "hindex.delete()\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -322,12 +335,14 @@ "metadata": {}, "source": [ "#### Full JSON Path support\n", - "Because RedisJSON enables full path support, when creating an index schema, elements need to be indexed and selected by their path with the `name` param and aliased using the `as_name` param as shown below." + "Because Redis enables full JSON path support, when creating an index schema, elements need to be indexed and selected by their path with the desired `name` AND `path` that points to where the data is located within the objects.\n", + "\n", + "> By default, RedisVL will assume the path as `$.{name}` if not provided in JSON fields schema." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -335,29 +350,33 @@ "json_schema = {\n", " \"index\": {\n", " \"name\": \"user-json\",\n", - " \"storage_type\": \"json\", # updated storage_type option\n", - " \"prefix\": \"json\",\n", + " \"prefix\": \"user-json-docs\",\n", + " \"storage_type\": \"json\", # JSON storage type\n", " },\n", - " \"fields\": {\n", - " \"tag\": [{\"name\": \"$.credit_score\", \"as_name\": \"credit_score\"}, {\"name\": \"$.user\", \"as_name\": \"user\"}],\n", - " \"text\": [{\"name\": \"$.job\", \"as_name\": \"job\"}],\n", - " \"numeric\": [{\"name\": \"$.age\", \"as_name\": \"age\"}],\n", - " \"geo\": [{\"name\": \"$.office_location\", \"as_name\": \"office_location\"}],\n", - " \"vector\": [{\n", - " \"name\": \"$.user_embedding\",\n", - " \"as_name\": \"user_embedding\",\n", + " \"fields\": [\n", + " {\"name\": \"user\", \"type\": \"tag\"},\n", + " {\"name\": \"credit_score\", \"type\": \"tag\"},\n", + " {\"name\": 
\"job\", \"type\": \"text\"},\n", + " {\"name\": \"age\", \"type\": \"numeric\"},\n", + " {\"name\": \"office_location\", \"type\": \"geo\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", " \"dims\": 3,\n", " \"distance_metric\": \"cosine\",\n", " \"algorithm\": \"flat\",\n", - " \"datatype\": \"float32\"}\n", - " ]\n", - " },\n", + " \"datatype\": \"float32\"\n", + " }\n", + "\n", + " }\n", + " ],\n", "}" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -373,16 +392,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m18:29:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m18:29:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user-hashes\n", - "\u001b[32m18:29:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 2. user-json\n" + "\u001b[32m20:23:08\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:23:08\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
user-json\n" ] } ], @@ -401,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -415,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -429,7 +447,7 @@ " 'user_embedding': [0.10000000149011612, 0.10000000149011612, 0.5]}" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -441,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -450,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -480,11 +498,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "hindex.delete()\n", "jindex.delete()" ] } @@ -505,7 +522,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.12" }, "orig_nbformat": 4, "vscode": { diff --git a/docs/user_guide/hybrid_queries_02.ipynb b/docs/user_guide/hybrid_queries_02.ipynb index 9f2f3766..d4c48867 100644 --- a/docs/user_guide/hybrid_queries_02.ipynb +++ b/docs/user_guide/hybrid_queries_02.ipynb @@ -49,24 +49,29 @@ "source": [ "schema = {\n", " \"index\": {\n", - " \"name\": \"user_index\",\n", - " \"prefix\": \"v1\",\n", - " \"storage_type\": \"hash\",\n", + " \"name\": \"user_queries\",\n", + " \"prefix\": \"user_queries_docs\",\n", + " \"storage_type\": \"hash\", # default setting -- HASH\n", " },\n", - " \"fields\": {\n", - " \"tag\": [{\"name\": \"credit_score\"}],\n", - " \"text\": [{\"name\": \"job\"}],\n", - " \"numeric\": [{\"name\": \"age\"}],\n", - " \"geo\": [{\"name\": \"office_location\"}],\n", - " \"vector\": [{\n", - " \"name\": \"user_embedding\",\n", + " \"fields\": [\n", + " {\"name\": \"user\", \"type\": \"tag\"},\n", + " 
{\"name\": \"credit_score\", \"type\": \"tag\"},\n", + " {\"name\": \"job\", \"type\": \"text\"},\n", + " {\"name\": \"age\", \"type\": \"numeric\"},\n", + " {\"name\": \"office_location\", \"type\": \"geo\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", " \"dims\": 3,\n", " \"distance_metric\": \"cosine\",\n", " \"algorithm\": \"flat\",\n", - " \"datatype\": \"float32\"}\n", - " ]\n", - " },\n", - "}\n" + " \"datatype\": \"float32\"\n", + " }\n", + "\n", + " }\n", + " ],\n", + "}" ] }, { @@ -96,8 +101,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m18:26:34\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m18:26:34\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user_index\n" + "\u001b[32m20:12:24\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:12:24\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user_queries\n" ] } ], @@ -142,7 +147,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
" ], "text/plain": [ "" @@ -202,7 +207,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -228,7 +233,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -265,7 +270,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -353,7 +358,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -416,7 +421,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -520,7 +525,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -583,7 +588,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -609,7 +614,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -689,7 +694,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
" + "
vector_distanceusercredit_scoreagejoboffice_location
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
" ], "text/plain": [ "" @@ -823,7 +828,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158809006214timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" + "
vector_distanceusercredit_scoreagejoboffice_location
0johnhigh18engineer-122.4194,37.7749
0derricklow14doctor-122.4194,37.7749
0.109129190445tylerhigh100engineer-122.0839,37.3861
0.158808946609timhigh12dermatologist-122.0839,37.3861
0.217882037163taimurlow15CEO-122.0839,37.3861
0.266666650772nancyhigh94doctor-122.4194,37.7749
0.653301358223joemedium35dentist-122.0839,37.3861
" ], "text/plain": [ "" @@ -933,7 +938,7 @@ { "data": { "text/html": [ - "
vector_distanceusercredit_scoreagejob
0johnhigh18engineer
0derricklow14doctor
0.109129190445tylerhigh100engineer
0.158809006214timhigh12dermatologist
" + "
vector_distanceusercredit_scoreagejob
0johnhigh18engineer
0derricklow14doctor
0.109129190445tylerhigh100engineer
0.158808946609timhigh12dermatologist
" ], "text/plain": [ "" @@ -1044,7 +1049,7 @@ { "data": { "text/html": [ - "
vector_distanceageusercredit_scorejoboffice_location
0.109129190445100tylerhighengineer-122.0839,37.3861
0.26666665077294nancyhighdoctor-122.4194,37.7749
0.65330135822335joemediumdentist-122.0839,37.3861
018johnhighengineer-122.4194,37.7749
0.21788203716315taimurlowCEO-122.0839,37.3861
014derricklowdoctor-122.4194,37.7749
0.15880900621412timhighdermatologist-122.0839,37.3861
" + "
vector_distanceageusercredit_scorejoboffice_location
0.109129190445100tylerhighengineer-122.0839,37.3861
0.26666665077294nancyhighdoctor-122.4194,37.7749
0.65330135822335joemediumdentist-122.0839,37.3861
018johnhighengineer-122.4194,37.7749
0.21788203716315taimurlowCEO-122.0839,37.3861
014derricklowdoctor-122.4194,37.7749
0.15880894660912timhighdermatologist-122.0839,37.3861
" ], "text/plain": [ "" @@ -1106,10 +1111,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'v1:fcb5f3ea23034dab9f09eebaa9f8ecbb', 'payload': None, 'user': 'john', 'age': '18', 'job': 'engineer', 'credit_score': 'high', 'office_location': '-122.4194,37.7749', 'user_embedding': '==\\x00\\x00\\x00?'}\n", - "{'id': 'v1:5d63fd6921364e76802582651ce1f681', 'payload': None, 'user': 'nancy', 'age': '94', 'job': 'doctor', 'credit_score': 'high', 'office_location': '-122.4194,37.7749', 'user_embedding': '333?=\\x00\\x00\\x00?'}\n", - "{'id': 'v1:0a1e7b02ae0449b18293eca52833528b', 'payload': None, 'user': 'tyler', 'age': '100', 'job': 'engineer', 'credit_score': 'high', 'office_location': '-122.0839,37.3861', 'user_embedding': '=>\\x00\\x00\\x00?'}\n", - "{'id': 'v1:0f9e0ba47e0b462681a3256fabaedfbc', 'payload': None, 'user': 'tim', 'age': '12', 'job': 'dermatologist', 'credit_score': 'high', 'office_location': '-122.0839,37.3861', 'user_embedding': '>>\\x00\\x00\\x00?'}\n" + "{'id': 'user_queries_docs:9f8ae1d270e642d89e41b5f512e35cc7', 'payload': None, 'user': 'john', 'age': '18', 'job': 'engineer', 'credit_score': 'high', 'office_location': '-122.4194,37.7749', 'user_embedding': '==\\x00\\x00\\x00?'}\n", + "{'id': 'user_queries_docs:45ab38080206444f994d59ee11d13a9c', 'payload': None, 'user': 'nancy', 'age': '94', 'job': 'doctor', 'credit_score': 'high', 'office_location': '-122.4194,37.7749', 'user_embedding': '333?=\\x00\\x00\\x00?'}\n", + "{'id': 'user_queries_docs:5b4b0b33e88447108eabd3b0f54a1fb2', 'payload': None, 'user': 'tyler', 'age': '100', 'job': 'engineer', 'credit_score': 'high', 'office_location': '-122.0839,37.3861', 'user_embedding': '=>\\x00\\x00\\x00?'}\n", + "{'id': 'user_queries_docs:7bf2ecb23e314a3f98245f2c07418f64', 'payload': None, 'user': 'tim', 'age': '12', 'job': 'dermatologist', 'credit_score': 'high', 'office_location': '-122.0839,37.3861', 'user_embedding': '>>\\x00\\x00\\x00?'}\n" ] } ], @@ -1176,7 +1181,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.12" }, "orig_nbformat": 4, "vscode": { diff --git a/docs/user_guide/llmcache_03.ipynb b/docs/user_guide/llmcache_03.ipynb index 0b3bc7a1..288c7b0b 100644 --- a/docs/user_guide/llmcache_03.ipynb +++ b/docs/user_guide/llmcache_03.ipynb @@ -74,7 +74,16 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tyler.hutcherson/RedisVentures/redisvl/.venv/lib/python3.9/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n" + ] + } + ], "source": [ "from redisvl.llmcache import SemanticCache\n", "\n", @@ -168,7 +177,18 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'llmcache:115049a298532be2f181edb03f766770c0db84c22aff39003fec340deaec7545'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Cache the question, answer, and arbitrary metadata\n", "llmcache.store(\n", @@ -323,7 +343,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Without caching, a call to openAI to answer this simple question took 0.5017588138580322 seconds.\n" + "Without caching, a call to openAI to answer this simple question took 0.7188658714294434 seconds.\n" ] } ], @@ -340,7 +360,18 @@ "cell_type": "code", "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'llmcache:115049a298532be2f181edb03f766770c0db84c22aff39003fec340deaec7545'" + ] + }, + "execution_count": 15, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "llmcache.store(prompt=\"What is the capital of France?\", response=\"Paris\")" ] @@ -354,8 +385,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time Taken with cache enabled: 0.327639102935791\n", - "Percentage of time saved: 34.7%\n" + "Time Taken with cache enabled: 0.4411182403564453\n", + "Percentage of time saved: 38.64%\n" ] } ], @@ -397,8 +428,8 @@ "│ offsets_per_term_avg │ 0.875 │\n", "│ records_per_doc_avg │ 16 │\n", "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.548 │\n", - "│ total_inverted_index_blocks │ 7 │\n", + "│ total_indexing_time │ 0.996 │\n", + "│ total_inverted_index_blocks │ 25 │\n", "│ vector_index_sz_mb │ 3.0161 │\n", "╰─────────────────────────────┴─────────────╯\n" ] diff --git a/docs/user_guide/schema.yaml b/docs/user_guide/schema.yaml index 5bedeab9..32cf085d 100644 --- a/docs/user_guide/schema.yaml +++ b/docs/user_guide/schema.yaml @@ -1,13 +1,16 @@ +version: '0.1.0' + index: - name: providers - prefix: rvl + name: vectorizers + prefix: doc storage_type: hash fields: - text: - - name: sentence - vector: - - name: embedding - dims: 768 - algorithm: flat - distance_metric: cosine \ No newline at end of file + - name: sentence + type: text + - name: embedding + type: vector + attrs: + dims: 768 + algorithm: flat + distance_metric: cosine \ No newline at end of file diff --git a/docs/user_guide/vectorizers_04.ipynb b/docs/user_guide/vectorizers_04.ipynb index 1fbd7b01..79186bae 100644 --- a/docs/user_guide/vectorizers_04.ipynb +++ b/docs/user_guide/vectorizers_04.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -194,6 +194,14 @@ "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tyler.hutcherson/RedisVentures/redisvl/.venv/lib/python3.9/site-packages/torch/_utils.py:831: 
UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n" + ] + }, { "data": { "text/plain": [ @@ -255,6 +263,7 @@ "\n", "\n", "**Make sure the following env vars are set:**\n", + "\n", "```\n", "GOOGLE_APPLICATION_CREDENTIALS=\n", "GCP_PROJECT_ID=\n", @@ -264,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -282,7 +291,7 @@ " 0.013746795244514942]" ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -292,7 +301,11 @@ "\n", "\n", "# create a vectorizer\n", - "vtx = VertexAITextVectorizer()\n", + "vtx = VertexAITextVectorizer(api_config={\n", + " \"project_id\": os.environ.get(\"GCP_PROJECT_ID\") or getpass.getpass(\"Enter your GCP Project ID: \"),\n", + " \"location\": os.environ.get(\"GCP_LOCATION\") or getpass.getpass(\"Enter your GCP Location: \"),\n", + " \"google_application_credentials\": os.environ.get(\"GOOGLE_APPLICATION_CREDENTIALS\") or getpass.getpass(\"Enter your Google App Credentials path: \")\n", + "})\n", "\n", "# embed a sentence\n", "test = vtx.embed(\"This is a test sentence.\")\n", @@ -314,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -335,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -345,7 +358,7 @@ "Vector dimensions: 1024\n", "[-0.010856628, -0.019683838, -0.0062179565, 0.003545761, -0.047943115, 0.0009365082, -0.005924225, 0.016174316, -0.03289795, 0.049194336]\n", "Vector dimensions: 1024\n", - "[-0.010108948, -0.016693115, -0.0002310276, -0.022644043, -0.04147339, 
0.0021324158, -0.033477783, -0.0005378723, -0.02619934, 0.058013916]\n" + "[-0.009712219, -0.016036987, 2.8073788e-05, -0.022491455, -0.041259766, 0.002281189, -0.033294678, -0.00057029724, -0.026260376, 0.0579834]\n" ] } ], @@ -382,26 +395,28 @@ "Here's what the schema for the example looks like in yaml for the HuggingFace vectorizer:\n", "\n", "```yaml\n", + "version: '0.1.0'\n", + "\n", "index:\n", - " name: providers\n", - " prefix: rvl\n", + " name: vectorizers\n", + " prefix: doc\n", " storage_type: hash\n", - " key_separator: ':'\n", "\n", "fields:\n", - " text:\n", - " - name: sentence\n", - " vector:\n", - " - name: embedding\n", - " dims: 768\n", - " algorithm: flat\n", - " distance_metric: cosine\n", + " - name: sentence\n", + " type: text\n", + " - name: embedding\n", + " type: vector\n", + " attrs:\n", + " dims: 768\n", + " algorithm: flat\n", + " distance_metric: cosine\n", "```" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -419,15 +434,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m22:02:27\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m22:02:27\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. providers\n" + "\u001b[32m20:22:42\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m20:22:42\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
vectorizers\n" ] } ], @@ -438,9 +453,22 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['doc:17c401b679ce43cb82f3ab2280ad02f2',\n", + " 'doc:3fc0502bec434b17a3f06e20824b2e59',\n", + " 'doc:199f17b0e5d24dcaa1fd4fb41558150c']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# load expects an iterable of dictionaries where\n", "# the vector is stored as a bytes buffer\n", @@ -454,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -484,6 +512,16 @@ "for doc in results:\n", " print(doc[\"text\"], doc[\"vector_distance\"])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup\n", + "index.delete()" + ] } ], "metadata": { diff --git a/redisvl/cli/index.py b/redisvl/cli/index.py index 64228081..c607ce1e 100644 --- a/redisvl/cli/index.py +++ b/redisvl/cli/index.py @@ -6,7 +6,7 @@ from redisvl.cli.utils import add_index_parsing_options, create_redis_url from redisvl.index import SearchIndex -from redisvl.schema import IndexSchema +from redisvl.schema.schema import IndexSchema from redisvl.utils.connection import RedisConnection from redisvl.utils.log import get_logger from redisvl.utils.utils import convert_bytes, make_dict @@ -117,7 +117,7 @@ def _connect_to_index(self, args: Namespace) -> SearchIndex: exit(0) if args.index: - schema = IndexSchema(name=args.index) + schema = IndexSchema(index={"name": args.index}) index = SearchIndex(schema=schema, redis_url=redis_url) elif args.schema: index = SearchIndex.from_yaml(args.schema) diff --git a/redisvl/cli/stats.py b/redisvl/cli/stats.py index db1cfbeb..e1000996 100644 --- a/redisvl/cli/stats.py +++ b/redisvl/cli/stats.py @@ -6,7 +6,7 @@ from redisvl.cli.utils import add_index_parsing_options, create_redis_url from 
redisvl.index import SearchIndex -from redisvl.schema import IndexSchema +from redisvl.schema.schema import IndexSchema from redisvl.utils.log import get_logger logger = get_logger("[RedisVL]") @@ -75,7 +75,7 @@ def _connect_to_index(self, args: Namespace) -> SearchIndex: exit(0) if args.index: - schema = IndexSchema(name=args.index) + schema = IndexSchema(index={"name": args.index}) index = SearchIndex(schema=schema, redis_url=redis_url) elif args.schema: index = SearchIndex.from_yaml(args.schema, redis_url=redis_url) diff --git a/redisvl/llmcache/semantic.py b/redisvl/llmcache/semantic.py index 113bedf1..de77b082 100644 --- a/redisvl/llmcache/semantic.py +++ b/redisvl/llmcache/semantic.py @@ -1,5 +1,6 @@ import warnings from typing import Any, Dict, List, Optional +from redis import Redis from redisvl.index import SearchIndex from redisvl.llmcache.base import BaseLLMCache @@ -13,6 +14,7 @@ class SemanticCacheSchema(IndexSchema): """RedisVL index schema for the SemanticCache.""" + # TODO - refactor this to be somewhat flexible to the client user # User should not be able to change these for the default LLMCache entry_id_field_name: str = "id" @@ -23,8 +25,8 @@ class SemanticCacheSchema(IndexSchema): def __init__( self, - name: str = "cache", - prefix: str = "llmcache", + name: str, + prefix: str, vector_dims: Optional[int] = 768, **kwargs, ): @@ -33,12 +35,8 @@ def __init__( # Construct the base base index schema super().__init__(index={"name": name, "prefix": prefix}) - # other schema kwargs will get consumed here - # otherwise fall back to index schema defaults - # Add fields specific to the LLMCacheSchema - self.add_fields( - [ + self.add_fields([ {"name": self.prompt_field_name, "type": "text"}, {"name": self.response_field_name, "type": "text"}, { @@ -54,10 +52,6 @@ def __init__( ] ) - class Config: - # Ignore extra fields passed in kwargs - ignore_extra = True - @property def vector_field(self) -> BaseField: return self.fields[self.vector_field_name] @@ 
-75,6 +69,7 @@ def __init__( vectorizer: BaseVectorizer = HFTextVectorizer( model="sentence-transformers/all-mpnet-base-v2" ), + redis_client: Optional[Redis] = None, redis_url: str = "redis://localhost:6379", connection_args: Dict[str, Any] = {}, **kwargs, @@ -93,6 +88,8 @@ def __init__( in Redis. Defaults to None. vectorizer (BaseVectorizer, optional): The vectorizer for the cache. Defaults to HFTextVectorizer. + redis_client(Redis, optional): A redis client connection instance. + Defaults to None. redis_url (str, optional): The redis url. Defaults to "redis://localhost:6379". connection_args (Dict[str, Any], optional): The connection arguments @@ -133,25 +130,31 @@ def __init__( prefix = name # build cache index schema - self._schema = SemanticCacheSchema( - name=name, prefix=prefix, vector_dims=vectorizer.dims, **kwargs + schema = SemanticCacheSchema( + name=name, + prefix=prefix, + vector_dims=vectorizer.dims, + **kwargs, ) + # build search index and connect + self._index = SearchIndex(schema=schema) + if redis_client: + self._index.set_client(redis_client) + else: + self._index.connect(redis_url=redis_url, **connection_args) + # initialize other components self.default_return_fields = [ - self._schema.entry_id_field_name, - self._schema.prompt_field_name, - self._schema.response_field_name, - self._schema.vector_field_name, - self._schema.metadata_field_name, + schema.entry_id_field_name, + schema.prompt_field_name, + schema.response_field_name, + schema.vector_field_name, + schema.metadata_field_name, ] self.set_vectorizer(vectorizer) self.set_threshold(distance_threshold) - # build search index - self._index = SearchIndex( - schema=self._schema, redis_url=redis_url, connection_args=connection_args - ) self._index.create(overwrite=False) @property @@ -205,7 +208,7 @@ def set_vectorizer(self, vectorizer: BaseVectorizer) -> None: if not isinstance(vectorizer, BaseVectorizer): raise TypeError("Must provide a valid redisvl.vectorizer class.") - 
schema_vector_dims = self._schema.vector_field.attrs.dims + schema_vector_dims = self._index.schema.vector_field.attrs.dims # type: ignore if schema_vector_dims != vectorizer.dims: raise ValueError( @@ -257,7 +260,7 @@ def _search_cache( # Construct vector RangeQuery for the cache check query = RangeQuery( vector=vector, - vector_field_name=self._schema.vector_field_name, + vector_field_name=self._index.schema.vector_field_name, return_fields=return_fields, distance_threshold=self._distance_threshold, num_results=num_results, @@ -268,11 +271,11 @@ def _search_cache( cache_hits: List[Dict[str, Any]] = self._index.query(query) # Process cache hits for hit in cache_hits: - self._refresh_ttl(hit[self._schema.entry_id_field_name]) + self._refresh_ttl(hit[self._index.schema.entry_id_field_name]) # Check for metadata and deserialize - if self._schema.metadata_field_name in hit: - hit[self._schema.metadata_field_name] = self.deserialize( - hit[self._schema.metadata_field_name] + if self._index.schema.metadata_field_name in hit: + hit[self._index.schema.metadata_field_name] = self.deserialize( + hit[self._index.schema.metadata_field_name] ) return cache_hits @@ -362,18 +365,18 @@ def store( # Vectorize prompt if necessary and create cache payload vector = vector or self._vectorize_prompt(prompt) # Construct semantic cache payload - id_field = self._schema.entry_id_field_name + id_field = self._index.schema.entry_id_field_name payload = { id_field: self.hash_input(prompt), - self._schema.prompt_field_name: prompt, - self._schema.response_field_name: response, - self._schema.vector_field_name: array_to_buffer(vector), + self._index.schema.prompt_field_name: prompt, + self._index.schema.response_field_name: response, + self._index.schema.vector_field_name: array_to_buffer(vector), } if metadata is not None: if not isinstance(metadata, dict): raise TypeError("If specified, cached metadata must be a dictionary.") # Serialize the metadata dict and add to cache payload - 
payload[self._schema.metadata_field_name] = self.serialize(metadata) + payload[self._index.schema.metadata_field_name] = self.serialize(metadata) # Load LLMCache entry with TTL keys = self._index.load(data=[payload], ttl=self._ttl, key_field=id_field) diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index ac45bc73..1a0ae6b3 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Type, Union from pydantic.v1 import BaseModel, Field, validator from redis.commands.search.field import Field as RedisField @@ -85,10 +85,13 @@ def _handle_names(self) -> Tuple[str, Optional[str]]: return self.path, self.name return self.name, None + def as_redis_field(self) -> RedisField: + raise NotImplementedError + class TextField(BaseField): type: str = Field(default="text", const=True) - attrs: Optional[TextFieldAttributes] = Field(default_factory=TextFieldAttributes) + attrs: TextFieldAttributes = Field(default_factory=TextFieldAttributes) def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() @@ -104,7 +107,7 @@ def as_redis_field(self) -> RedisField: class TagField(BaseField): type: str = Field(default="tag", const=True) - attrs: Optional[TagFieldAttributes] = Field(default_factory=TagFieldAttributes) + attrs: TagFieldAttributes = Field(default_factory=TagFieldAttributes) def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() @@ -119,9 +122,7 @@ def as_redis_field(self) -> RedisField: class NumericField(BaseField): type: str = Field(default="numeric", const=True) - attrs: Optional[NumericFieldAttributes] = Field( - default_factory=NumericFieldAttributes - ) + attrs: NumericFieldAttributes = Field(default_factory=NumericFieldAttributes) def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() @@ -134,7 +135,7 @@ def as_redis_field(self) -> RedisField: class 
GeoField(BaseField): type: str = Field(default="geo", const=True) - attrs: Optional[GeoFieldAttributes] = Field(default_factory=GeoFieldAttributes) + attrs: GeoFieldAttributes = Field(default_factory=GeoFieldAttributes) def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() @@ -147,9 +148,7 @@ def as_redis_field(self) -> RedisField: class FlatVectorField(BaseField): type: str = Field(default="vector", const=True) - attrs: Optional[FlatVectorFieldAttributes] = Field( - default_factory=FlatVectorFieldAttributes - ) + attrs: FlatVectorFieldAttributes def as_redis_field(self) -> RedisField: # grab base field params and augment with flat-specific fields @@ -162,9 +161,7 @@ def as_redis_field(self) -> RedisField: class HNSWVectorField(BaseField): type: str = Field(default="vector", const=True) - attrs: Optional[HNSWVectorFieldAttributes] = Field( - default_factory=HNSWVectorFieldAttributes - ) + attrs: HNSWVectorFieldAttributes def as_redis_field(self) -> RedisField: # grab base field params and augment with hnsw-specific fields @@ -197,7 +194,7 @@ class FieldFactory: } @classmethod - def pick_vector_field_type(cls, attrs: Dict[str, Any]) -> BaseField: + def pick_vector_field_type(cls, attrs: Dict[str, Any]) -> Type[BaseField]: """Get the vector field type from the field data.""" if "algorithm" not in attrs: raise ValueError("Must provide algorithm param for the vector field.") @@ -209,7 +206,7 @@ def pick_vector_field_type(cls, attrs: Dict[str, Any]) -> BaseField: if algorithm not in cls.VECTOR_FIELD_TYPE_MAP: raise ValueError(f"Unknown vector field algorithm: {algorithm}") - return cls.VECTOR_FIELD_TYPE_MAP[algorithm] + return cls.VECTOR_FIELD_TYPE_MAP[algorithm] # type: ignore @classmethod def create_field( @@ -226,6 +223,6 @@ def create_field( else: if type not in cls.FIELD_TYPE_MAP: raise ValueError(f"Unknown field type: {type}") - field_class = cls.FIELD_TYPE_MAP[type] + field_class = cls.FIELD_TYPE_MAP[type] # type: ignore - return 
field_class(name=name, path=path, attrs=attrs) + return field_class(name=name, path=path, attrs=attrs) # type: ignore diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 4dd887a4..182e7495 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -44,6 +44,14 @@ class IndexInfo(BaseModel): storage_type: StorageType = StorageType.HASH """The storage type used in Redis (e.g., 'hash' or 'json').""" + def dict(self, *args, **kwargs) -> Dict[str, Any]: + return { + "name": self.name, + "prefix": self.prefix, + "key_separator": self.key_separator, + "storage_type": self.storage_type.value, + } + class IndexSchema(BaseModel): """Represents a schema definition for a search index in Redis, primarily @@ -327,8 +335,8 @@ def generate_fields( data: Dict[str, Any], strict: bool = False, ignore_fields: List[str] = [], - ) -> Dict[str, List[Dict[str, Any]]]: - """Generates a set of field definitions from a sample data dictionary. + ) -> List[Dict[str, Any]]: + """Generates a list of extracted field specs from a sample data point. This method simplifies the process of creating a schema by inferring field types and attributes from sample data. It's particularly useful @@ -376,7 +384,12 @@ def to_dict(self) -> Dict[str, Any]: Returns: Dict[str, Any]: The index schema as a dictionary. """ - return self.dict(exclude_unset=True) + dict_schema = self.dict(exclude_none=True) + # cast fields back to a pure list + dict_schema["fields"] = [ + field for field_name, field in dict_schema["fields"].items() + ] + return dict_schema def to_yaml(self, file_path: str, overwrite: bool = True) -> None: """Write the index schema to a YAML file. 
diff --git a/tests/integration/test_simple_async.py b/tests/integration/test_simple_async.py index 267e8493..d35b9252 100644 --- a/tests/integration/test_simple_async.py +++ b/tests/integration/test_simple_async.py @@ -33,34 +33,39 @@ "user_embedding": np.array([0.9, 0.9, 0.1], dtype=np.float32).tobytes(), }, ] + query_vector = np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes() -schema = { +fields_spec = [ + {"name": "credit_score", "type": "tag"}, + {"name": "user", "type": "tag"}, + {"name": "job", "type": "text"}, + {"name": "age", "type": "numeric"}, + { + "name": "user_embedding", + "type": "vector", + "attrs": { + "dims": 3, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + }, + }, +] + +hash_schema = { "index": { "name": "user_index", "prefix": "users", "storage_type": "hash", }, - "fields": { - "tag": [{"name": "credit_score"}], - "text": [{"name": "job"}], - "numeric": [{"name": "age"}], - "vector": [ - { - "name": "user_embedding", - "dims": 3, - "distance_metric": "cosine", - "algorithm": "flat", - "datatype": "float32", - } - ], - }, + "fields": fields_spec, } @pytest.mark.asyncio async def test_simple(async_client): - index = SearchIndex.from_dict(schema) + index = SearchIndex.from_dict(hash_schema) # assign client (only for testing) index.set_client(async_client) # create the index diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 62bae245..a85b0763 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -1,3 +1,4 @@ +import os import pathlib import pytest @@ -124,7 +125,7 @@ def test_to_dict(): index_schema = create_sample_index_schema() index_dict = index_schema.to_dict() assert index_dict["index"]["name"] == "test" - assert isinstance(index_dict["fields"], dict) + assert isinstance(index_dict["fields"], list) assert len(index_dict["fields"]) == 2 == len(index_schema.fields) @@ -162,6 +163,7 @@ def test_hash_index_from_yaml(): assert index_schema.index.storage_type == 
StorageType.HASH assert len(index_schema.fields) == 2 + def test_json_index_from_yaml(): """Test loading from yaml.""" index_schema = IndexSchema.from_yaml( @@ -172,6 +174,20 @@ def test_json_index_from_yaml(): assert index_schema.index.storage_type == StorageType.JSON assert len(index_schema.fields) == 2 + +def test_to_yaml_and_reload(): + index_schema = create_sample_index_schema() + index_schema.to_yaml("temp_test.yaml") + + assert os.path.exists("temp_test.yaml") + + new_schema = IndexSchema.from_yaml("temp_test.yaml") + assert new_schema == index_schema + assert new_schema.to_dict() == index_schema.to_dict() + + os.remove("temp_test.yaml") + + def test_from_yaml_file_not_found(): """Test loading from yaml with file not found.""" with pytest.raises(FileNotFoundError): From a68a8f6c27023f54e5c99a153afbf6f4aa1a4778 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 29 Jan 2024 22:00:05 -0500 Subject: [PATCH 06/10] work on docs update --- docs/_static/js/sidebar.js | 2 +- docs/api/indexschema.rst | 7 ++----- redisvl/index.py | 9 ++++++--- redisvl/schema/fields.py | 11 +++++++++++ 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js index 5995b7b6..0a8d1617 100644 --- a/docs/_static/js/sidebar.js +++ b/docs/_static/js/sidebar.js @@ -7,7 +7,7 @@ const toc = [ { header: "User Guides", toc: [ { title: "Getting Started", path: "/user_guide/getting_started_01.html" }, { title: "Query and Filter", path: "/user_guide/hybrid_queries_02.html" }, - { title: "Customizable Storage (JSON and Hash)", path: "/user_guide/hash_vs_json_05.html" }, + { title: "JSON vs Hash Storage", path: "/user_guide/hash_vs_json_05.html" }, { title: "Vectorizers", path: "/user_guide/vectorizers_04.html" }, { title: "Semantic Caching", path: "/user_guide/llmcache_03.html" }, diff --git a/docs/api/indexschema.rst b/docs/api/indexschema.rst index 6c7f3d0b..248900a5 100644 --- a/docs/api/indexschema.rst +++ 
b/docs/api/indexschema.rst @@ -13,16 +13,13 @@ IndexSchema .. autosummary:: IndexSchema.__init__ - IndexSchema.name - IndexSchema.prefix - IndexSchema.key_separator - IndexSchema.storage_type IndexSchema.fields + IndexSchema.version + IndexSchema.field_names IndexSchema.redis_fields IndexSchema.add_field IndexSchema.add_fields IndexSchema.remove_field - IndexSchema.generate_fields IndexSchema.from_yaml IndexSchema.to_yaml IndexSchema.from_dict diff --git a/redisvl/index.py b/redisvl/index.py index 588dafb7..54bb1449 100644 --- a/redisvl/index.py +++ b/redisvl/index.py @@ -160,6 +160,9 @@ class SearchIndex: # data is an iterable of dictionaries index.load(data) + # delete index and data + index.delete(drop=True) + # Do the same with an an async connection index = SearchIndex.from_yaml("schema.yaml", redis_url="redis://localhost:6379", use_async=True) await index.acreate(overwrite=True) @@ -291,9 +294,9 @@ def from_dict( "prefix": "rvl", "storage_type": "hash", }, - "fields": { - "tag": [{"name": "doc-id"}] - } + "fields": [ + {"name": "doc-id", "type": "tag"} + ] }, redis_url="redis://localhost:6379") index.create(overwrite=True) diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index 1a0ae6b3..fb07d26b 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -75,10 +75,15 @@ class FlatVectorFieldAttributes(BaseVectorFieldAttributes): class BaseField(BaseModel): + """Base field""" name: str + """Field name""" type: str + """Field type""" path: Optional[str] = None + """Field path (within JSON object)""" attrs: Optional[Union[BaseFieldAttributes, BaseVectorFieldAttributes]] = None + """Specified field attributes""" def _handle_names(self) -> Tuple[str, Optional[str]]: if self.path: @@ -90,6 +95,7 @@ def as_redis_field(self) -> RedisField: class TextField(BaseField): + """Text field supporting a full text search index""" type: str = Field(default="text", const=True) attrs: TextFieldAttributes = Field(default_factory=TextFieldAttributes) 
@@ -106,6 +112,7 @@ def as_redis_field(self) -> RedisField: class TagField(BaseField): + """Tag field for simple boolean filtering""" type: str = Field(default="tag", const=True) attrs: TagFieldAttributes = Field(default_factory=TagFieldAttributes) @@ -121,6 +128,7 @@ def as_redis_field(self) -> RedisField: class NumericField(BaseField): + """Numeric field for numeric range filtering""" type: str = Field(default="numeric", const=True) attrs: NumericFieldAttributes = Field(default_factory=NumericFieldAttributes) @@ -134,6 +142,7 @@ def as_redis_field(self) -> RedisField: class GeoField(BaseField): + """Geo field with a geo-spatial index for location based search""" type: str = Field(default="geo", const=True) attrs: GeoFieldAttributes = Field(default_factory=GeoFieldAttributes) @@ -147,6 +156,7 @@ def as_redis_field(self) -> RedisField: class FlatVectorField(BaseField): + "Vector field with a FLAT index (brute force nearest neighbors search)" type: str = Field(default="vector", const=True) attrs: FlatVectorFieldAttributes @@ -160,6 +170,7 @@ def as_redis_field(self) -> RedisField: class HNSWVectorField(BaseField): + """Vector field with an HNSW index (approximate nearest neighbors search)""" type: str = Field(default="vector", const=True) attrs: HNSWVectorFieldAttributes From 20c4075402c7520ff9c9d721b7fed0fe67265eed Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 29 Jan 2024 23:57:25 -0500 Subject: [PATCH 07/10] doc tweaks --- docs/_static/js/sidebar.js | 2 +- docs/api/indexschema.rst | 21 ++++++++++++++++++++- redisvl/schema/__init__.py | 5 +++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js index 0a8d1617..33c004ab 100644 --- a/docs/_static/js/sidebar.js +++ b/docs/_static/js/sidebar.js @@ -18,7 +18,7 @@ const toc = [ { title: "Query", path: "/api/query.html" }, { title: "Filter", path: "/api/filter.html" }, { title: "Vectorizers", path: "/api/vectorizer.html" }, - { title: 
"SemanticCache", path: "/api/cache.html" } + { title: "LLMCache", path: "/api/cache.html" } ]} ]; diff --git a/docs/api/indexschema.rst b/docs/api/indexschema.rst index 248900a5..bb6a1853 100644 --- a/docs/api/indexschema.rst +++ b/docs/api/indexschema.rst @@ -12,7 +12,7 @@ IndexSchema .. autosummary:: - IndexSchema.__init__ + IndexSchema.index IndexSchema.fields IndexSchema.version IndexSchema.field_names @@ -29,3 +29,22 @@ IndexSchema :show-inheritance: :inherited-members: :members: + + +IndexInfo +=========== + +.. currentmodule:: redisvl.schema + +.. autosummary:: + + IndexInfo.name + IndexInfo.prefix + IndexInfo.key_separator + IndexInfo.storage_type + + +.. autoclass:: IndexInfo + :show-inheritance: + :inherited-members: + :members: \ No newline at end of file diff --git a/redisvl/schema/__init__.py b/redisvl/schema/__init__.py index 7ab6597a..e11727c8 100644 --- a/redisvl/schema/__init__.py +++ b/redisvl/schema/__init__.py @@ -1,6 +1,7 @@ -from redisvl.schema.schema import IndexSchema, StorageType +from redisvl.schema.schema import IndexSchema, IndexInfo, StorageType __all__ = [ "StorageType", - "Schema", + "IndexSchema", + "IndexInfo" ] From bee50a9ac4bbd8abca7adc61a0cec7df3a820316 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 30 Jan 2024 10:56:31 -0500 Subject: [PATCH 08/10] update notebook --- docs/api/indexschema.rst | 6 +- docs/api/searchindex.rst | 1 - docs/user_guide/getting_started_01.ipynb | 10 ++- redisvl/cli/index.py | 2 +- redisvl/cli/stats.py | 2 +- redisvl/llmcache/semantic.py | 101 +++++++++-------------- redisvl/schema/__init__.py | 8 +- redisvl/schema/fields.py | 6 ++ redisvl/schema/schema.py | 46 ++++++++--- 9 files changed, 93 insertions(+), 89 deletions(-) diff --git a/docs/api/indexschema.rst b/docs/api/indexschema.rst index bb6a1853..78fd948b 100644 --- a/docs/api/indexschema.rst +++ b/docs/api/indexschema.rst @@ -1,6 +1,6 @@ *********** -IndexSchema +Schema *********** IndexSchema @@ -32,7 +32,7 @@ IndexSchema IndexInfo 
-=========== +========= .. currentmodule:: redisvl.schema @@ -47,4 +47,4 @@ IndexInfo .. autoclass:: IndexInfo :show-inheritance: :inherited-members: - :members: \ No newline at end of file + :members: diff --git a/docs/api/searchindex.rst b/docs/api/searchindex.rst index 71cd781b..b01235da 100644 --- a/docs/api/searchindex.rst +++ b/docs/api/searchindex.rst @@ -11,7 +11,6 @@ SearchIndex .. autosummary:: - SearchIndex.__init__ SearchIndex.from_yaml SearchIndex.from_dict SearchIndex.client diff --git a/docs/user_guide/getting_started_01.ipynb b/docs/user_guide/getting_started_01.ipynb index 44f6bc69..37464af8 100644 --- a/docs/user_guide/getting_started_01.ipynb +++ b/docs/user_guide/getting_started_01.ipynb @@ -202,7 +202,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Bring your own Redis connection instance" + "### Bring your own Redis connection instance\n", + "\n", + "This is ideal in scenarios where you have custom settings on the connection instance or if your application will share a connection pool:" ] }, { @@ -234,7 +236,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Let the index manage the connection instance" + "### Let the index manage the connection instance\n", + "\n", + "This is ideal for simple cases:" ] }, { @@ -415,7 +419,7 @@ } ], "source": [ - "_id = keys[0].strip(\"user_simple_docs:\")\n", + "_id = keys[0].strip(f\"{index.prefix}:\") # strip the key prefix\n", "\n", "print(f\"Fetching data for user {_id}\")\n", "index.fetch(id=_id)" diff --git a/redisvl/cli/index.py b/redisvl/cli/index.py index c607ce1e..9dd633af 100644 --- a/redisvl/cli/index.py +++ b/redisvl/cli/index.py @@ -117,7 +117,7 @@ def _connect_to_index(self, args: Namespace) -> SearchIndex: exit(0) if args.index: - schema = IndexSchema(index={"name": args.index}) + schema = IndexSchema.from_dict({"index": {"name": args.index}}) index = SearchIndex(schema=schema, redis_url=redis_url) elif args.schema: index = SearchIndex.from_yaml(args.schema) diff
--git a/redisvl/cli/stats.py b/redisvl/cli/stats.py index e1000996..0d0e54ae 100644 --- a/redisvl/cli/stats.py +++ b/redisvl/cli/stats.py @@ -75,7 +75,7 @@ def _connect_to_index(self, args: Namespace) -> SearchIndex: exit(0) if args.index: - schema = IndexSchema(index={"name": args.index}) + schema = IndexSchema.from_dict({"index": {"name": args.index}}) index = SearchIndex(schema=schema, redis_url=redis_url) elif args.schema: index = SearchIndex.from_yaml(args.schema, redis_url=redis_url) diff --git a/redisvl/llmcache/semantic.py b/redisvl/llmcache/semantic.py index de77b082..f3da56b2 100644 --- a/redisvl/llmcache/semantic.py +++ b/redisvl/llmcache/semantic.py @@ -1,65 +1,26 @@ import warnings from typing import Any, Dict, List, Optional + from redis import Redis from redisvl.index import SearchIndex from redisvl.llmcache.base import BaseLLMCache from redisvl.query import RangeQuery -from redisvl.schema.fields import BaseField -from redisvl.schema.schema import IndexInfo, IndexSchema +from redisvl.schema.schema import IndexSchema from redisvl.utils.utils import array_to_buffer from redisvl.vectorize.base import BaseVectorizer from redisvl.vectorize.text import HFTextVectorizer -class SemanticCacheSchema(IndexSchema): - """RedisVL index schema for the SemanticCache.""" - # TODO - refactor this to be somewhat flexible to the client user +class SemanticCache(BaseLLMCache): + """Semantic Cache for Large Language Models.""" - # User should not be able to change these for the default LLMCache entry_id_field_name: str = "id" prompt_field_name: str = "prompt" vector_field_name: str = "prompt_vector" response_field_name: str = "response" metadata_field_name: str = "metadata" - def __init__( - self, - name: str, - prefix: str, - vector_dims: Optional[int] = 768, - **kwargs, - ): - if not vector_dims: - raise ValueError("Must provide vectorizer dimensions") - - # Construct the base base index schema - super().__init__(index={"name": name, "prefix": prefix}) - - 
self.add_fields([ - {"name": self.prompt_field_name, "type": "text"}, - {"name": self.response_field_name, "type": "text"}, - { - "name": self.vector_field_name, - "type": "vector", - "attrs": { - "dims": vector_dims, - "datatype": "float32", - "distance_metric": "cosine", - "algorithm": "flat", - }, - }, - ] - ) - - @property - def vector_field(self) -> BaseField: - return self.fields[self.vector_field_name] - - -class SemanticCache(BaseLLMCache): - """Semantic Cache for Large Language Models.""" - def __init__( self, name: str = "llmcache", @@ -130,11 +91,23 @@ def __init__( prefix = name # build cache index schema - schema = SemanticCacheSchema( - name=name, - prefix=prefix, - vector_dims=vectorizer.dims, - **kwargs, + schema = IndexSchema.from_dict({"index": {"name": name, "prefix": prefix}}) + # add fields + schema.add_fields( + [ + {"name": self.prompt_field_name, "type": "text"}, + {"name": self.response_field_name, "type": "text"}, + { + "name": self.vector_field_name, + "type": "vector", + "attrs": { + "dims": vectorizer.dims, + "datatype": "float32", + "distance_metric": "cosine", + "algorithm": "flat", + }, + }, + ] ) # build search index and connect @@ -146,11 +119,11 @@ def __init__( # initialize other components self.default_return_fields = [ - schema.entry_id_field_name, - schema.prompt_field_name, - schema.response_field_name, - schema.vector_field_name, - schema.metadata_field_name, + self.entry_id_field_name, + self.prompt_field_name, + self.response_field_name, + self.vector_field_name, + self.metadata_field_name, ] self.set_vectorizer(vectorizer) self.set_threshold(distance_threshold) @@ -208,7 +181,7 @@ def set_vectorizer(self, vectorizer: BaseVectorizer) -> None: if not isinstance(vectorizer, BaseVectorizer): raise TypeError("Must provide a valid redisvl.vectorizer class.") - schema_vector_dims = self._index.schema.vector_field.attrs.dims # type: ignore + schema_vector_dims = self._index.schema.fields[self.vector_field_name].attrs.dims # type: 
ignore if schema_vector_dims != vectorizer.dims: raise ValueError( @@ -260,7 +233,7 @@ def _search_cache( # Construct vector RangeQuery for the cache check query = RangeQuery( vector=vector, - vector_field_name=self._index.schema.vector_field_name, + vector_field_name=self.vector_field_name, return_fields=return_fields, distance_threshold=self._distance_threshold, num_results=num_results, @@ -271,11 +244,11 @@ def _search_cache( cache_hits: List[Dict[str, Any]] = self._index.query(query) # Process cache hits for hit in cache_hits: - self._refresh_ttl(hit[self._index.schema.entry_id_field_name]) + self._refresh_ttl(hit[self.entry_id_field_name]) # Check for metadata and deserialize - if self._index.schema.metadata_field_name in hit: - hit[self._index.schema.metadata_field_name] = self.deserialize( - hit[self._index.schema.metadata_field_name] + if self.metadata_field_name in hit: + hit[self.metadata_field_name] = self.deserialize( + hit[self.metadata_field_name] ) return cache_hits @@ -365,18 +338,18 @@ def store( # Vectorize prompt if necessary and create cache payload vector = vector or self._vectorize_prompt(prompt) # Construct semantic cache payload - id_field = self._index.schema.entry_id_field_name + id_field = self.entry_id_field_name payload = { id_field: self.hash_input(prompt), - self._index.schema.prompt_field_name: prompt, - self._index.schema.response_field_name: response, - self._index.schema.vector_field_name: array_to_buffer(vector), + self.prompt_field_name: prompt, + self.response_field_name: response, + self.vector_field_name: array_to_buffer(vector), } if metadata is not None: if not isinstance(metadata, dict): raise TypeError("If specified, cached metadata must be a dictionary.") # Serialize the metadata dict and add to cache payload - payload[self._index.schema.metadata_field_name] = self.serialize(metadata) + payload[self.metadata_field_name] = self.serialize(metadata) # Load LLMCache entry with TTL keys = self._index.load(data=[payload], 
ttl=self._ttl, key_field=id_field) diff --git a/redisvl/schema/__init__.py b/redisvl/schema/__init__.py index e11727c8..24f6b821 100644 --- a/redisvl/schema/__init__.py +++ b/redisvl/schema/__init__.py @@ -1,7 +1,3 @@ -from redisvl.schema.schema import IndexSchema, IndexInfo, StorageType +from redisvl.schema.schema import IndexInfo, IndexSchema, StorageType -__all__ = [ - "StorageType", - "IndexSchema", - "IndexInfo" -] +__all__ = ["StorageType", "IndexSchema", "IndexInfo"] diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index fb07d26b..b01c3d56 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -76,6 +76,7 @@ class FlatVectorFieldAttributes(BaseVectorFieldAttributes): class BaseField(BaseModel): """Base field""" + name: str """Field name""" type: str @@ -96,6 +97,7 @@ def as_redis_field(self) -> RedisField: class TextField(BaseField): """Text field supporting a full text search index""" + type: str = Field(default="text", const=True) attrs: TextFieldAttributes = Field(default_factory=TextFieldAttributes) @@ -113,6 +115,7 @@ def as_redis_field(self) -> RedisField: class TagField(BaseField): """Tag field for simple boolean filtering""" + type: str = Field(default="tag", const=True) attrs: TagFieldAttributes = Field(default_factory=TagFieldAttributes) @@ -129,6 +132,7 @@ def as_redis_field(self) -> RedisField: class NumericField(BaseField): """Numeric field for numeric range filtering""" + type: str = Field(default="numeric", const=True) attrs: NumericFieldAttributes = Field(default_factory=NumericFieldAttributes) @@ -143,6 +147,7 @@ def as_redis_field(self) -> RedisField: class GeoField(BaseField): """Geo field with a geo-spatial index for location based search""" + type: str = Field(default="geo", const=True) attrs: GeoFieldAttributes = Field(default_factory=GeoFieldAttributes) @@ -171,6 +176,7 @@ def as_redis_field(self) -> RedisField: class HNSWVectorField(BaseField): """Vector field with an HNSW index (approximate nearest 
neighbors search)""" + type: str = Field(default="vector", const=True) attrs: HNSWVectorFieldAttributes diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 182e7495..8479d0b7 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -40,7 +40,7 @@ class IndexInfo(BaseModel): prefix: str = "rvl" """The prefix used for Redis keys associated with this index.""" key_separator: str = ":" - """The separator character used in Redis keys.""" + """The separator character used in designing Redis keys.""" storage_type: StorageType = StorageType.HASH """The storage type used in Redis (e.g., 'hash' or 'json').""" @@ -65,6 +65,32 @@ class IndexSchema(BaseModel): Python dictionary, supporting flexible schema definitions and easy integration into various workflows. + An example `schema.yaml` file might look like this: + + .. code-block:: yaml + + version: '0.1.0' + + index: + name: user-index + prefix: user + storage_type: json + + fields: + - name: user + type: tag + - name: credit_score + type: tag + - name: embedding + type: vector + attrs: + algorithm: flat + dims: 3 + distance_metric: cosine + datatype: float32 + + Loading the schema with RedisVL using yaml or dict format: + .. 
code-block:: python from redisvl.schema import IndexSchema @@ -75,21 +101,21 @@ class IndexSchema(BaseModel): # From Dict schema = IndexSchema.from_dict({ "index": { - "name": "docs-index", - "prefix": "docs", - "storage_type": "hash", + "name": "user-index", + "prefix": "user", + "storage_type": "json", }, "fields": [ + {"name": "user", "type": "tag"}, + {"name": "credit_score", "type": "tag"}, { - "name": "doc-id", - "type": "tag" - }, - { - "name": "doc-embedding", + "name": "embedding", "type": "vector", "attrs": { "algorithm": "flat", - "dims": 1536 + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32" } } ] From 33a928736d28be2a5d05d11e056a7bdfb1f517d3 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 30 Jan 2024 11:08:15 -0500 Subject: [PATCH 09/10] remove logger for now --- redisvl/schema/schema.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 8479d0b7..866b4c32 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -1,16 +1,17 @@ import re +import warnings +import yaml + from enum import Enum from pathlib import Path from typing import Any, Dict, List -import yaml -from pydantic.v1 import BaseModel, Field, root_validator, validator +from pydantic.v1 import BaseModel, Field, root_validator from redis.commands.search.field import Field as RedisField from redisvl.schema.fields import BaseField, FieldFactory -from redisvl.utils.log import get_logger -logger = get_logger(__name__) + SCHEMA_VERSION = "0.1.0" @@ -148,8 +149,8 @@ def _make_field(storage_type, **field_inputs) -> BaseField: field.path = field.path if field.path else f"$.{field.name}" else: if field.path is not None: - logger.warning( - f"Path attribute for field '{field.name}' will be ignored for HASH storage type." + warnings.warn( + message=f"Path attribute for field '{field.name}' will be ignored for HASH storage type."
) field.path = None return field @@ -352,7 +353,7 @@ def remove_field(self, field_name: str): field_name (str): The name of the field to be removed. """ if field_name not in self.fields: - logger.warning(f"Field '{field_name}' does not exist in the schema") + warnings.warn(message=f"Field '{field_name}' does not exist in the schema") return del self.fields[field_name] @@ -401,7 +402,7 @@ def generate_fields( if strict: raise else: - logger.warning(f"Error inferring field type for {field_name}: {e}") + warnings.warn(message=f"Error inferring field type for {field_name}: {e}") return fields def to_dict(self) -> Dict[str, Any]: From 212083b69278fe5aee11f478aed420c92916ec97 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 30 Jan 2024 12:22:22 -0500 Subject: [PATCH 10/10] fix formatting again --- redisvl/schema/schema.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 866b4c32..e69b36f2 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -1,17 +1,15 @@ import re import warnings -import yaml - from enum import Enum from pathlib import Path from typing import Any, Dict, List +import yaml from pydantic.v1 import BaseModel, Field, root_validator from redis.commands.search.field import Field as RedisField from redisvl.schema.fields import BaseField, FieldFactory - SCHEMA_VERSION = "0.1.0" @@ -402,7 +400,9 @@ def generate_fields( if strict: raise else: - warnings.warn(message=f"Error inferring field type for {field_name}: {e}") + warnings.warn( + message=f"Error inferring field type for {field_name}: {e}" + ) return fields def to_dict(self) -> Dict[str, Any]: