From 0f9ef9ee144ab4ab72a8506a103cd5cd65f6f3a5 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 31 Jul 2023 14:49:15 -0400 Subject: [PATCH 1/2] add text embeddings module --- .github/workflows/run_tests.yml | 2 +- README.md | 6 +- conftest.py | 2 +- docs/examples/openai_qna.ipynb | 6 +- docs/index.md | 2 +- docs/user_guide/embedding_creation.rst | 11 +-- docs/user_guide/index.md | 4 +- docs/user_guide/providers_03.ipynb | 21 ++---- redisvl/llmcache/semantic.py | 16 ++--- redisvl/providers/__init__.py | 4 -- redisvl/vectorize/__init__.py | 0 redisvl/{providers => vectorize}/base.py | 2 +- redisvl/vectorize/text/__init__.py | 7 ++ .../text}/huggingface.py | 6 +- .../{providers => vectorize/text}/openai.py | 6 +- tests/integration/test_llmcache.py | 22 +++--- tests/integration/test_providers.py | 63 ----------------- tests/integration/test_vectorizers.py | 67 +++++++++++++++++++ 18 files changed, 122 insertions(+), 125 deletions(-) delete mode 100644 redisvl/providers/__init__.py create mode 100644 redisvl/vectorize/__init__.py rename redisvl/{providers => vectorize}/base.py (98%) create mode 100644 redisvl/vectorize/text/__init__.py rename redisvl/{providers => vectorize/text}/huggingface.py (83%) rename redisvl/{providers => vectorize/text}/openai.py (91%) delete mode 100644 tests/integration/test_providers.py create mode 100644 tests/integration/test_vectorizers.py diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f48d7919..8dc079a3 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -37,7 +37,7 @@ jobs: echo REDIS_ADDRESS=$REDIS_ADDRESS >> $GITHUB_ENV - name: Run tests env: - OPENAI_KEY: ${{ secrets.OPENAI_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }} run: | make test-cov - name: Publish coverage results diff --git a/README.md b/README.md index 34ad8a0f..f8108bb7 100644 --- a/README.md +++ b/README.md @@ -42,11 +42,12 @@ RedisVL has a host of powerful features designed to streamline 
your vector datab 1. **Index Management**: RedisVL allows for indices to be created, updated, and deleted with ease. A schema for each index can be defined in yaml or directly in python code and used throughout the lifetime of the index. -2. **Vector Creation**: RedisVL integrates with OpenAI and other embedding providers to make the process of creating vectors straightforward. +2. **Embedding Creation**: RedisVL integrates with OpenAI and other text embedding providers to simplify the process of vectorizing unstructured data. *Image support coming soon.* 3. **Vector Search**: RedisVL provides robust search capabilities that enable you to query vectors synchronously and asynchronously. Hybrid queries that utilize tag, geographic, numeric, and other filters like full-text search are also supported. -4. **Semantic Caching**: ``LLMCache`` is a semantic caching interface built directly into RedisVL. It allows for the caching of generated output from LLM models like GPT-3 and others. As semantic search is used to check the cache, a threshold can be set to determine if the cached result is relevant enough to be returned. If not, the model is called and the result is cached for future use. This can increase the QPS and reduce the cost of using LLM models. +4. **Powerful Abstractions** + - **Semantic Caching**: `LLMCache` is a semantic caching interface built directly into RedisVL. It allows for the caching of generated output from LLMs like GPT-3 and others. As semantic search is used to check the cache, a threshold can be set to determine if the cached result is relevant enough to be returned. If not, the model is called and the result is cached for future use. This can increase the QPS and reduce the cost of using LLM models in production. ## 😊 Quick Start @@ -125,6 +126,7 @@ The ``LLMCache`` Interface in RedisVL can be used as follows. 
```python from redisvl.llmcache.semantic import SemanticCache + cache = SemanticCache( redis_url="redis://localhost:6379", threshold=0.9, # semantic similarity threshold diff --git a/conftest.py b/conftest.py index 619e308b..6ffe27b2 100644 --- a/conftest.py +++ b/conftest.py @@ -23,7 +23,7 @@ def client(): @pytest.fixture def openai_key(): - return os.getenv("OPENAI_KEY") + return os.getenv("OPENAI_API_KEY") @pytest.fixture(scope="session") diff --git a/docs/examples/openai_qna.ipynb b/docs/examples/openai_qna.ipynb index 041ad5c8..e3a0c1b4 100644 --- a/docs/examples/openai_qna.ipynb +++ b/docs/examples/openai_qna.ipynb @@ -504,7 +504,7 @@ "source": [ "### Embedding Creation\n", "\n", - "With the text broken up into chunks, we can create embedding with the RedisVL OpenAIProvider. This provider uses the OpenAI API to create embeddings for the text. The code below shows how to create embeddings for the text chunks." + "With the text broken up into chunks, we can create embeddings with the RedisVL `OpenAITextVectorizer`. This vectorizer uses the OpenAI API to create embeddings for the text. The code below shows how to create embeddings for the text chunks." ] }, { @@ -709,11 +709,11 @@ ], "source": [ "import os\n", - "from redisvl.providers.openai import OpenAIProvider\n", + "from redisvl.vectorize.text import OpenAITextVectorizer\n", "from redisvl.utils.utils import array_to_buffer\n", "\n", "api_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n", - "oaip = OpenAIProvider(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n", + "oaip = OpenAITextVectorizer(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n", "\n", "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist())\n", "chunked_data[\"embedding\"] = chunked_data[\"embedding\"].apply(lambda x: array_to_buffer(x))\n", diff --git a/docs/index.md b/docs/index.md index c9d13776..2f31179e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ to supercharge your application!
- header: "{fas}`bolt;pst-color-primary` Vector Search" content: "Simple vector search capabilities supporting synchronous and asyncronous search." - header: "{fas}`circle-half-stroke;pst-color-primary` Embedding Creation" - content: "User OpenAI or any of the other embedding providers to create embeddings" + content: "Use OpenAI or any of the other supported vectorizers to create embeddings" - header: "{fas}`palette;pst-color-primary` CLI" content: "Command line interface for RedisVL makes interacting with Redis as a vector database easy." - header: "{fab}`python;pst-color-primary` Semantic Caching" diff --git a/docs/user_guide/embedding_creation.rst b/docs/user_guide/embedding_creation.rst index 6efef5ff..a436eb00 100644 --- a/docs/user_guide/embedding_creation.rst +++ b/docs/user_guide/embedding_creation.rst @@ -4,22 +4,17 @@ Embedding Providers =================== -RedisVL enables you to call out to embedding providers +RedisVL enables you to vectorize unstructured data by calling out to embedding providers: OpenAI ====== -OpenAI is a commercial service that provides access to a number of models +`OpenAI <https://platform.openai.com>`_ is a commercial service that provides access to a number of models. HuggingFace =========== -HuggingFace is a commercial service that provides access to a number of models +`HuggingFace <https://huggingface.co>`_ is a commercial service that provides access to a number of models.
- -Cohere -====== - -Cohere is a commercial service that provides access to a number of models diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index bddfa55d..5afe2de7 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -17,10 +17,10 @@ hybrid_queries_02 ``` ```{toctree} -:caption: Providers +:caption: Vectorizers :maxdepth: 3 -providers_03 +vectorizers_03 ``` ```{toctree} diff --git a/docs/user_guide/providers_03.ipynb b/docs/user_guide/providers_03.ipynb index c83fa752..5f271ee0 100644 --- a/docs/user_guide/providers_03.ipynb +++ b/docs/user_guide/providers_03.ipynb @@ -5,9 +5,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Embedding Providers\n", + "# Vectorizers\n", "\n", - "In this notebook, we will show how to use RedisVL to create embeddings using the built-in Providers. Today RedisVL supports:\n", + "In this notebook, we will show how to use RedisVL to create embeddings using the built-in text embedding vectorizers. Today RedisVL supports:\n", "1. OpenAI\n", "2. 
HuggingFace\n", "\n", @@ -33,7 +33,7 @@ "source": [ "## Creating Embeddings\n", "\n", - "This example will show how to create an embedding from 3 simple sentences with a number of different providers\n", + "This example will show how to create an embedding from 3 simple sentences with a number of different vectorizers\n", "\n", "- \"That is a happy dog\"\n", "- \"That is a happy person\"\n", @@ -80,11 +80,11 @@ ], "source": [ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", - "from redisvl.providers import HuggingfaceProvider\n", + "from redisvl.vectorize.text import HFTextVectorizer\n", "\n", "\n", "# create a provider\n", - "hf = HuggingfaceProvider(model=\"sentence-transformers/all-mpnet-base-v2\")\n", + "hf = HFTextVectorizer(model=\"sentence-transformers/all-mpnet-base-v2\")\n", "\n", "# embed a sentence\n", "test = hf.embed(\"This is a test sentence.\")\n", @@ -118,7 +118,7 @@ "\n", "First, we need to create the schema for our index.\n", "\n", - "Here's what the schema for the example looks like in yaml for the HuggingFace Provider\n", + "Here's what the schema for the example looks like in yaml for the HuggingFace vectorizer:\n", "\n", "```yaml\n", "index:\n", @@ -211,7 +211,7 @@ "source": [ "from redisvl.query import VectorQuery\n", "\n", - "# use the HuggingFace Provider again to create a query embedding\n", + "# use the HuggingFace vectorizer again to create a query embedding\n", "query_embedding = hf.embed(\"That is a happy cat\")\n", "\n", "query = VectorQuery(\n", @@ -226,13 +226,6 @@ " print(doc.text)\n", " print(doc.vector_distance)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/redisvl/llmcache/semantic.py b/redisvl/llmcache/semantic.py index 390ad7bb..7239d116 100644 --- a/redisvl/llmcache/semantic.py +++ b/redisvl/llmcache/semantic.py @@ -4,8 +4,8 @@ from redisvl.index import SearchIndex from redisvl.llmcache.base import BaseLLMCache -from 
redisvl.providers import HuggingfaceProvider -from redisvl.providers.base import BaseProvider +from redisvl.vectorize.text import HFTextVectorizer +from redisvl.vectorize.base import BaseVectorizer from redisvl.query import VectorQuery from redisvl.utils.utils import array_to_buffer @@ -28,7 +28,7 @@ def __init__( prefix: str = "llmcache", threshold: float = 0.9, ttl: Optional[int] = None, - provider: Optional[BaseProvider] = HuggingfaceProvider( + vectorizer: Optional[BaseVectorizer] = HFTextVectorizer( "sentence-transformers/all-mpnet-base-v2" ), redis_url: Optional[str] = "redis://localhost:6379", @@ -41,8 +41,8 @@ def __init__( prefix (str, optional): The prefix for the index. Defaults to "llmcache". threshold (float, optional): Semantic threshold for the cache. Defaults to 0.9. ttl (Optional[int], optional): The TTL for the cache. Defaults to None. - provider (Optional[BaseProvider], optional): The provider for the cache. - Defaults to HuggingfaceProvider("sentence-transformers/all-mpnet-base-v2"). + vectorizer (Optional[BaseVectorizer], optional): The vectorizer for the cache. + Defaults to HFTextVectorizer("sentence-transformers/all-mpnet-base-v2"). redis_url (Optional[str], optional): The redis url. Defaults to "redis://localhost:6379". connection_args (Optional[dict], optional): The connection arguments for the redis client. Defaults to None. 
@@ -51,7 +51,7 @@ def __init__( """ self._ttl = ttl - self._provider = provider + self._vectorizer = vectorizer self.set_threshold(threshold) index = SearchIndex(name=index_name, prefix=prefix, fields=self._default_fields) @@ -150,7 +150,7 @@ def check( raise ValueError("Either prompt or vector must be specified.") if not vector: - vector = self._provider.embed(prompt) # type: ignore + vector = self._vectorizer.embed(prompt) # type: ignore v = VectorQuery( vector=vector, @@ -195,7 +195,7 @@ def store( key = self.hash_input(prompt) if not vector: - vector = self._provider.embed(prompt) # type: ignore + vector = self._vectorizer.embed(prompt) # type: ignore payload = { "id": key, diff --git a/redisvl/providers/__init__.py b/redisvl/providers/__init__.py deleted file mode 100644 index 062766d0..00000000 --- a/redisvl/providers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from redisvl.providers.huggingface import HuggingfaceProvider -from redisvl.providers.openai import OpenAIProvider - -__all__ = ["OpenAIProvider", "HuggingfaceProvider"] diff --git a/redisvl/vectorize/__init__.py b/redisvl/vectorize/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/redisvl/providers/base.py b/redisvl/vectorize/base.py similarity index 98% rename from redisvl/providers/base.py rename to redisvl/vectorize/base.py index 8c41017d..64f32568 100644 --- a/redisvl/providers/base.py +++ b/redisvl/vectorize/base.py @@ -1,7 +1,7 @@ from typing import Callable, Dict, List, Optional -class BaseProvider: +class BaseVectorizer: def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None): self._dims = dims self._model = model diff --git a/redisvl/vectorize/text/__init__.py b/redisvl/vectorize/text/__init__.py new file mode 100644 index 00000000..bd5ee36e --- /dev/null +++ b/redisvl/vectorize/text/__init__.py @@ -0,0 +1,7 @@ +from redisvl.vectorize.text.huggingface import HFTextVectorizer +from redisvl.vectorize.text.openai import OpenAITextVectorizer + +__all__ = [ 
+ "OpenAITextVectorizer", + "HFTextVectorizer", +] \ No newline at end of file diff --git a/redisvl/providers/huggingface.py b/redisvl/vectorize/text/huggingface.py similarity index 83% rename from redisvl/providers/huggingface.py rename to redisvl/vectorize/text/huggingface.py index 36885cc2..b46bfe0a 100644 --- a/redisvl/providers/huggingface.py +++ b/redisvl/vectorize/text/huggingface.py @@ -1,9 +1,9 @@ from typing import Callable, Dict, List, Optional -from redisvl.providers.base import BaseProvider +from redisvl.vectorize.base import BaseVectorizer -class HuggingfaceProvider(BaseProvider): +class HFTextVectorizer(BaseVectorizer): def __init__(self, model: str, api_config: Optional[Dict] = None): # TODO set dims based on model dims = 768 @@ -12,7 +12,7 @@ def __init__(self, model: str, api_config: Optional[Dict] = None): from sentence_transformers import SentenceTransformer except ImportError: raise ImportError( - "Huggingface provider requires sentence-transformers library. Please install with pip install sentence-transformers" + "HFTextVectorizer requires sentence-transformers library. 
Please install with pip install sentence-transformers" ) self._model_client = SentenceTransformer(model) diff --git a/redisvl/providers/openai.py b/redisvl/vectorize/text/openai.py similarity index 91% rename from redisvl/providers/openai.py rename to redisvl/vectorize/text/openai.py index 2dd63328..ac24a05b 100644 --- a/redisvl/providers/openai.py +++ b/redisvl/vectorize/text/openai.py @@ -1,9 +1,9 @@ from typing import Callable, Dict, List, Optional -from redisvl.providers.base import BaseProvider +from redisvl.vectorize.base import BaseVectorizer -class OpenAIProvider(BaseProvider): +class OpenAITextVectorizer(BaseVectorizer): def __init__(self, model: str, api_config: Optional[Dict] = None): dims = 1536 super().__init__(model, dims, api_config) @@ -13,7 +13,7 @@ def __init__(self, model: str, api_config: Optional[Dict] = None): import openai except ImportError: raise ImportError( - "OpenAI provider requires openai library. Please install with pip install openai" + "OpenAI vectorizer requires openai library. 
Please install with pip install openai" ) openai.api_key = api_config.get("api_key", None) self._model_client = openai.Embedding diff --git a/tests/integration/test_llmcache.py b/tests/integration/test_llmcache.py index f5656bc3..aa1759a3 100644 --- a/tests/integration/test_llmcache.py +++ b/tests/integration/test_llmcache.py @@ -2,24 +2,24 @@ from time import sleep from redisvl.llmcache.semantic import SemanticCache -from redisvl.providers import HuggingfaceProvider +from redisvl.vectorize.text import HFTextVectorizer @pytest.fixture -def provider(): - return HuggingfaceProvider("sentence-transformers/all-mpnet-base-v2") +def vectorizer(): + return HFTextVectorizer("sentence-transformers/all-mpnet-base-v2") @pytest.fixture -def cache(provider): - return SemanticCache(provider=provider, threshold=0.8) +def cache(vectorizer): + return SemanticCache(vectorizer=vectorizer, threshold=0.8) @pytest.fixture -def cache_with_ttl(provider): - return SemanticCache(provider=provider, threshold=0.8, ttl=2) +def cache_with_ttl(vectorizer): + return SemanticCache(vectorizer=vectorizer, threshold=0.8, ttl=2) @pytest.fixture -def vector(provider): - return provider.embed("This is a test sentence.") +def vector(vectorizer): + return vectorizer.embed("This is a test sentence.") def test_store_and_check_and_clear(cache, vector): @@ -70,13 +70,13 @@ def test_set_threshold(cache): assert cache.threshold == 0.9 cache._index.delete(True) -def test_from_existing(cache, vector, provider): +def test_from_existing(cache, vector, vectorizer): prompt = "This is another test prompt." response = "This is another test response." metadata = {"source": "test"} cache.store(prompt, response, vector=vector, metadata=metadata) # connect from existing? 
- new_cache = SemanticCache(provider=provider, threshold=0.8) + new_cache = SemanticCache(vectorizer=vectorizer, threshold=0.8) check_result = new_cache.check(vector=vector) assert len(check_result) >= 1 assert response in check_result diff --git a/tests/integration/test_providers.py b/tests/integration/test_providers.py deleted file mode 100644 index 720f8cb2..00000000 --- a/tests/integration/test_providers.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest - -from redisvl.providers import HuggingfaceProvider, OpenAIProvider - - -@pytest.fixture(params=[HuggingfaceProvider, OpenAIProvider]) -def provider(request, openai_key): - # Here we use actual models for integration test - if request.param == HuggingfaceProvider: - return request.param(model="sentence-transformers/all-mpnet-base-v2") - elif request.param == OpenAIProvider: - return request.param( - model="text-embedding-ada-002", api_config={"api_key": openai_key} - ) - - -def test_provider_embed(provider): - text = "This is a test sentence." - embedding = provider.embed(text) - - assert isinstance(embedding, list) - assert len(embedding) == provider.dims - - -def test_provider_embed_many(provider): - texts = ["This is the first test sentence.", "This is the second test sentence."] - embeddings = provider.embed_many(texts) - - assert isinstance(embeddings, list) - assert len(embeddings) == len(texts) - assert all( - isinstance(emb, list) and len(emb) == provider.dims for emb in embeddings - ) - - -@pytest.fixture(params=[OpenAIProvider]) -def aprovider(request, openai_key): - # Here we use actual models for integration test - if request.param == OpenAIProvider: - return request.param( - model="text-embedding-ada-002", api_config={"api_key": openai_key} - ) - - -@pytest.mark.asyncio -async def test_provider_aembed(aprovider): - text = "This is a test sentence." 
- embedding = await aprovider.aembed(text) - - assert isinstance(embedding, list) - assert len(embedding) == aprovider.dims - - -@pytest.mark.asyncio -async def test_provider_aembed_many(aprovider): - texts = ["This is the first test sentence.", "This is the second test sentence."] - embeddings = await aprovider.aembed_many(texts) - - assert isinstance(embeddings, list) - assert len(embeddings) == len(texts) - assert all( - isinstance(emb, list) and len(emb) == aprovider.dims for emb in embeddings - ) diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py new file mode 100644 index 00000000..a666162d --- /dev/null +++ b/tests/integration/test_vectorizers.py @@ -0,0 +1,67 @@ +import os +import pytest + +from redisvl.vectorize.text import HFTextVectorizer, OpenAITextVectorizer + +@pytest.fixture +def openai_key(): + return os.getenv("OPENAI_API_KEY") + +@pytest.fixture(params=[HFTextVectorizer, OpenAITextVectorizer]) +def vectorizer(request, openai_key): + # Here we use actual models for integration test + if request.param == HFTextVectorizer: + return request.param(model="sentence-transformers/all-mpnet-base-v2") + elif request.param == OpenAITextVectorizer: + return request.param( + model="text-embedding-ada-002", api_config={"api_key": openai_key} + ) + + +def test_vectorizer_embed(vectorizer): + text = "This is a test sentence." 
+ embedding = vectorizer.embed(text) + + assert isinstance(embedding, list) + assert len(embedding) == vectorizer.dims + + +def test_vectorizer_embed_many(vectorizer): + texts = ["This is the first test sentence.", "This is the second test sentence."] + embeddings = vectorizer.embed_many(texts) + + assert isinstance(embeddings, list) + assert len(embeddings) == len(texts) + assert all( + isinstance(emb, list) and len(emb) == vectorizer.dims for emb in embeddings + ) + + +@pytest.fixture(params=[OpenAITextVectorizer]) +def avectorizer(request, openai_key): + # Here we use actual models for integration test + if request.param == OpenAITextVectorizer: + return request.param( + model="text-embedding-ada-002", api_config={"api_key": openai_key} + ) + + +@pytest.mark.asyncio +async def test_vectorizer_aembed(avectorizer): + text = "This is a test sentence." + embedding = await avectorizer.aembed(text) + + assert isinstance(embedding, list) + assert len(embedding) == avectorizer.dims + + +@pytest.mark.asyncio +async def test_vectorizer_aembed_many(avectorizer): + texts = ["This is the first test sentence.", "This is the second test sentence."] + embeddings = await avectorizer.aembed_many(texts) + + assert isinstance(embeddings, list) + assert len(embeddings) == len(texts) + assert all( + isinstance(emb, list) and len(emb) == avectorizer.dims for emb in embeddings + ) From 45e32c6169a76e41d2831f5e7324c02fa4b54573 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Wed, 2 Aug 2023 16:08:23 -0400 Subject: [PATCH 2/2] remove unnecessary fixture --- tests/integration/test_vectorizers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py index a666162d..58ec6a65 100644 --- a/tests/integration/test_vectorizers.py +++ b/tests/integration/test_vectorizers.py @@ -3,9 +3,7 @@ from redisvl.vectorize.text import HFTextVectorizer, OpenAITextVectorizer -@pytest.fixture -def openai_key(): - 
return os.getenv("OPENAI_API_KEY") + @pytest.fixture(params=[HFTextVectorizer, OpenAITextVectorizer]) def vectorizer(request, openai_key):