
Commit 0f9ef9e

add text embeddings module
1 parent 4c84ed4 commit 0f9ef9e

File tree

18 files changed: +122 additions, -125 deletions


.github/workflows/run_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
           echo REDIS_ADDRESS=$REDIS_ADDRESS >> $GITHUB_ENV
       - name: Run tests
         env:
-          OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }}
         run: |
           make test-cov
       - name: Publish coverage results

README.md

Lines changed: 4 additions & 2 deletions
@@ -42,11 +42,12 @@ RedisVL has a host of powerful features designed to streamline your vector datab
 
 1. **Index Management**: RedisVL allows for indices to be created, updated, and deleted with ease. A schema for each index can be defined in yaml or directly in python code and used throughout the lifetime of the index.
 
-2. **Vector Creation**: RedisVL integrates with OpenAI and other embedding providers to make the process of creating vectors straightforward.
+2. **Embedding Creation**: RedisVL integrates with OpenAI and other text embedding providers to simplify the process of vectorizing unstructured data. *Image support coming soon.*
 
 3. **Vector Search**: RedisVL provides robust search capabilities that enable you to query vectors synchronously and asynchronously. Hybrid queries that utilize tag, geographic, numeric, and other filters like full-text search are also supported.
 
-4. **Semantic Caching**: ``LLMCache`` is a semantic caching interface built directly into RedisVL. It allows for the caching of generated output from LLM models like GPT-3 and others. As semantic search is used to check the cache, a threshold can be set to determine if the cached result is relevant enough to be returned. If not, the model is called and the result is cached for future use. This can increase the QPS and reduce the cost of using LLM models.
+4. **Powerful Abstractions**
+   - **Semantic Caching**: `LLMCache` is a semantic caching interface built directly into RedisVL. It allows for the caching of generated output from LLMs like GPT-3 and others. As semantic search is used to check the cache, a threshold can be set to determine if the cached result is relevant enough to be returned. If not, the model is called and the result is cached for future use. This can increase the QPS and reduce the cost of using LLM models in production.
 
 
 ## 😊 Quick Start
@@ -125,6 +126,7 @@ The ``LLMCache`` Interface in RedisVL can be used as follows.
 
 ```python
 from redisvl.llmcache.semantic import SemanticCache
+
 cache = SemanticCache(
     redis_url="redis://localhost:6379",
     threshold=0.9, # semantic similarity threshold
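
Read alongside the `SemanticCache` changes in redisvl/llmcache/semantic.py below, the cached-LLM flow this README bullet describes looks roughly like the following sketch. `SemanticCache`, `check`, and `store` all appear in this commit; the `response` keyword, the list-shaped return of `check`, and the stub LLM call are illustrative assumptions.

```python
from redisvl.llmcache.semantic import SemanticCache


def call_llm(prompt: str) -> str:
    """Stand-in for a real LLM call (e.g. an OpenAI completion)."""
    return "Paris"


cache = SemanticCache(
    redis_url="redis://localhost:6379",
    threshold=0.9,  # semantic similarity threshold
)

prompt = "What is the capital of France?"
cached = cache.check(prompt=prompt)  # semantic lookup, not an exact-match lookup
if cached:
    answer = cached[0]  # assumes check() returns a list of cached responses
else:
    answer = call_llm(prompt)
    cache.store(prompt=prompt, response=answer)  # `response` kwarg is assumed
```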

conftest.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def client():
 
 @pytest.fixture
 def openai_key():
-    return os.getenv("OPENAI_KEY")
+    return os.getenv("OPENAI_API_KEY")
 
 
 @pytest.fixture(scope="session")
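
A hypothetical test consuming the renamed fixture might look like the sketch below; neither the test nor the model name is part of this commit.

```python
import pytest

from redisvl.vectorize.text import OpenAITextVectorizer


def test_openai_embed(openai_key):
    # Skip rather than fail when the environment variable is absent.
    if not openai_key:
        pytest.skip("OPENAI_API_KEY is not set")
    oai = OpenAITextVectorizer(
        "text-embedding-ada-002",  # model name is an assumption
        api_config={"api_key": openai_key},
    )
    embedding = oai.embed("That is a happy dog")
    assert len(embedding) > 0
```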

docs/examples/openai_qna.ipynb

Lines changed: 3 additions & 3 deletions
@@ -504,7 +504,7 @@
 "source": [
  "### Embedding Creation\n",
  "\n",
- "With the text broken up into chunks, we can create embedding with the RedisVL OpenAIProvider. This provider uses the OpenAI API to create embeddings for the text. The code below shows how to create embeddings for the text chunks."
+ "With the text broken up into chunks, we can create embeddings with the RedisVL `OpenAITextVectorizer`. This provider uses the OpenAI API to create embeddings for the text. The code below shows how to create embeddings for the text chunks."
 ]
 },
 {
@@ -709,11 +709,11 @@
 ],
 "source": [
  "import os\n",
- "from redisvl.providers.openai import OpenAIProvider\n",
+ "from redisvl.vectorize.text import OpenAITextVectorizer\n",
  "from redisvl.utils.utils import array_to_buffer\n",
  "\n",
  "api_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n",
- "oaip = OpenAIProvider(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n",
+ "oaip = OpenAITextVectorizer(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n",
  "\n",
  "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist())\n",
  "chunked_data[\"embedding\"] = chunked_data[\"embedding\"].apply(lambda x: array_to_buffer(x))\n",

docs/index.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ to supercharge your application!
 - header: "{fas}`bolt;pst-color-primary` Vector Search"
   content: "Simple vector search capabilities supporting synchronous and asyncronous search."
 - header: "{fas}`circle-half-stroke;pst-color-primary` Embedding Creation"
-  content: "User OpenAI or any of the other embedding providers to create embeddings"
+  content: "User OpenAI or any of the other supported vectorizers to create embeddings"
 - header: "{fas}`palette;pst-color-primary` CLI"
   content: "Command line interface for RedisVL makes interacting with Redis as a vector database easy."
 - header: "{fab}`python;pst-color-primary` Semantic Caching"

docs/user_guide/embedding_creation.rst

Lines changed: 3 additions & 8 deletions
@@ -4,22 +4,17 @@
 Embedding Providers
 ===================
 
-RedisVL enables you to call out to embedding providers
+RedisVL enables you to vectorize unstructured data by calling out to embedding providers:
 
 
 OpenAI
 ======
 
-OpenAI is a commercial service that provides access to a number of models
+[OpenAI](https://platform.openai.com) is a commercial service that provides access to a number of models.
 
 
 HuggingFace
 ===========
 
-HuggingFace is a commercial service that provides access to a number of models
+[HuggingFace](https://huggingface.co) is a commercial service that provides access to a number of models.
 
-
-Cohere
-======
-
-Cohere is a commercial service that provides access to a number of models
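
As a companion to this stub page, the two vectorizers it documents can be constructed side by side. A minimal sketch using only names that appear in this commit, except the OpenAI model name, which is assumed:

```python
import os

from redisvl.vectorize.text import HFTextVectorizer, OpenAITextVectorizer

# Local HuggingFace model (model name taken from the notebook diff).
hf = HFTextVectorizer(model="sentence-transformers/all-mpnet-base-v2")

# Hosted OpenAI model; "text-embedding-ada-002" is an assumed stand-in.
oai = OpenAITextVectorizer(
    "text-embedding-ada-002",
    api_config={"api_key": os.environ.get("OPENAI_API_KEY", "")},
)

print(len(hf.embed("That is a happy dog")))
print(len(oai.embed("That is a happy dog")))
```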

docs/user_guide/index.md

Lines changed: 2 additions & 2 deletions
@@ -17,10 +17,10 @@ hybrid_queries_02
 ```
 
 ```{toctree}
-:caption: Providers
+:caption: Vectorizers
 :maxdepth: 3
 
-providers_03
+vectorizers_03
 ```
 
 ```{toctree}

docs/user_guide/providers_03.ipynb

Lines changed: 7 additions & 14 deletions
@@ -5,9 +5,9 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "# Embedding Providers\n",
+ "# Vectorizers\n",
  "\n",
- "In this notebook, we will show how to use RedisVL to create embeddings using the built-in Providers. Today RedisVL supports:\n",
+ "In this notebook, we will show how to use RedisVL to create embeddings using the built-in text embedding vectorizers. Today RedisVL supports:\n",
  "1. OpenAI\n",
  "2. HuggingFace\n",
  "\n",
@@ -33,7 +33,7 @@
 "source": [
  "## Creating Embeddings\n",
  "\n",
- "This example will show how to create an embedding from 3 simple sentences with a number of different providers\n",
+ "This example will show how to create an embedding from 3 simple sentences with a number of different vectorizers\n",
  "\n",
  "- \"That is a happy dog\"\n",
  "- \"That is a happy person\"\n",
@@ -80,11 +80,11 @@
 ],
 "source": [
  "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
- "from redisvl.providers import HuggingfaceProvider\n",
+ "from redisvl.vectorize.text import HFTextVectorizer\n",
  "\n",
  "\n",
  "# create a provider\n",
- "hf = HuggingfaceProvider(model=\"sentence-transformers/all-mpnet-base-v2\")\n",
+ "hf = HFTextVectorizer(model=\"sentence-transformers/all-mpnet-base-v2\")\n",
  "\n",
  "# embed a sentence\n",
  "test = hf.embed(\"This is a test sentence.\")\n",
@@ -118,7 +118,7 @@
 "\n",
  "First, we need to create the schema for our index.\n",
  "\n",
- "Here's what the schema for the example looks like in yaml for the HuggingFace Provider\n",
+ "Here's what the schema for the example looks like in yaml for the HuggingFace vectorizer:\n",
  "\n",
  "```yaml\n",
  "index:\n",
@@ -211,7 +211,7 @@
 "source": [
  "from redisvl.query import VectorQuery\n",
  "\n",
- "# use the HuggingFace Provider again to create a query embedding\n",
+ "# use the HuggingFace vectorizer again to create a query embedding\n",
  "query_embedding = hf.embed(\"That is a happy cat\")\n",
  "\n",
  "query = VectorQuery(\n",
@@ -226,13 +226,6 @@
  "    print(doc.text)\n",
  "    print(doc.vector_distance)"
 ]
-},
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
 }
 ],
 "metadata": {

redisvl/llmcache/semantic.py

Lines changed: 8 additions & 8 deletions
@@ -4,8 +4,8 @@
 
 from redisvl.index import SearchIndex
 from redisvl.llmcache.base import BaseLLMCache
-from redisvl.providers import HuggingfaceProvider
-from redisvl.providers.base import BaseProvider
+from redisvl.vectorize.text import HFTextVectorizer
+from redisvl.vectorize.base import BaseVectorizer
 from redisvl.query import VectorQuery
 from redisvl.utils.utils import array_to_buffer
 
@@ -28,7 +28,7 @@ def __init__(
         prefix: str = "llmcache",
         threshold: float = 0.9,
         ttl: Optional[int] = None,
-        provider: Optional[BaseProvider] = HuggingfaceProvider(
+        vectorizer: Optional[BaseVectorizer] = HFTextVectorizer(
             "sentence-transformers/all-mpnet-base-v2"
         ),
         redis_url: Optional[str] = "redis://localhost:6379",
@@ -41,8 +41,8 @@ def __init__(
             prefix (str, optional): The prefix for the index. Defaults to "llmcache".
             threshold (float, optional): Semantic threshold for the cache. Defaults to 0.9.
             ttl (Optional[int], optional): The TTL for the cache. Defaults to None.
-            provider (Optional[BaseProvider], optional): The provider for the cache.
-                Defaults to HuggingfaceProvider("sentence-transformers/all-mpnet-base-v2").
+            vectorizer (Optional[BaseVectorizer], optional): The vectorizer for the cache.
+                Defaults to HFTextVectorizer("sentence-transformers/all-mpnet-base-v2").
             redis_url (Optional[str], optional): The redis url. Defaults to "redis://localhost:6379".
             connection_args (Optional[dict], optional): The connection arguments for the redis client. Defaults to None.
 
@@ -51,7 +51,7 @@ def __init__(
 
         """
         self._ttl = ttl
-        self._provider = provider
+        self._vectorizer = vectorizer
         self.set_threshold(threshold)
 
         index = SearchIndex(name=index_name, prefix=prefix, fields=self._default_fields)
@@ -150,7 +150,7 @@ def check(
             raise ValueError("Either prompt or vector must be specified.")
 
         if not vector:
-            vector = self._provider.embed(prompt)  # type: ignore
+            vector = self._vectorizer.embed(prompt)  # type: ignore
 
         v = VectorQuery(
             vector=vector,
@@ -195,7 +195,7 @@ def store(
         key = self.hash_input(prompt)
 
         if not vector:
-            vector = self._provider.embed(prompt)  # type: ignore
+            vector = self._vectorizer.embed(prompt)  # type: ignore
 
         payload = {
             "id": key,

redisvl/providers/__init__.py

Lines changed: 0 additions & 4 deletions
This file was deleted.
