From ed2d235bf5fb697358d5eff33ccc17ff362252e1 Mon Sep 17 00:00:00 2001 From: Anibal Date: Thu, 21 Mar 2024 11:41:48 -0600 Subject: [PATCH 1/5] Added support for AzureOpenAI --- redisvl/utils/vectorize/__init__.py | 2 + redisvl/utils/vectorize/text/azureopenai.py | 288 ++++++++++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 redisvl/utils/vectorize/text/azureopenai.py diff --git a/redisvl/utils/vectorize/__init__.py b/redisvl/utils/vectorize/__init__.py index c0dc7dd8..9bd763d0 100644 --- a/redisvl/utils/vectorize/__init__.py +++ b/redisvl/utils/vectorize/__init__.py @@ -3,6 +3,7 @@ from redisvl.utils.vectorize.text.huggingface import HFTextVectorizer from redisvl.utils.vectorize.text.openai import OpenAITextVectorizer from redisvl.utils.vectorize.text.vertexai import VertexAITextVectorizer +from redisvl.utils.vectorize.text.azureopenai import AzureOpenAITextVectorizer __all__ = [ "BaseVectrorizer", @@ -10,4 +11,5 @@ "HFTextVectorizer", "OpenAITextVectorizer", "VertexAITextVectorizer", + "AzureOpenAITextVectorizer" ] diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py new file mode 100644 index 00000000..511856b5 --- /dev/null +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -0,0 +1,288 @@ +import os +from typing import Any, Callable, Dict, List, Optional + +from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity.retry import retry_if_not_exception_type + +from redisvl.utils.vectorize.base import BaseVectorizer + +# ignore that openai isn't imported +# mypy: disable-error-code="name-defined" + + +class AzureOpenAITextVectorizer(BaseVectorizer): + """The AzureOpenAITextVectorizer class utilizes AzureOpenAI's API to generate + embeddings for text data. + + This vectorizer is designed to interact with AzureOpenAI's embeddings API, + requiring an API key, an AzureOpenAI deployment endpoint and API version. + These values can be provided directly in the `api_config` dictionary with + the parameters 'azure_endpoint'. 'api_version' and 'api_key' or through the + environment variables 'AZURE_OPENAI_ENDPOINT', 'OPENAI_API_VERSION', and 'AZURE_OPENAI_API_KEY'. + Users must obtain these values from the 'Keys and Endpoints' section in their Azure OpenAI service. + Additionally, the `openai` python client must be installed with `pip install openai>=1.13.0`. + + The vectorizer supports both synchronous and asynchronous operations, + allowing for batch processing of texts and flexibility in handling + preprocessing tasks. + + .. code-block:: python + + # Synchronous embedding of a single text + vectorizer = AzureOpenAITextVectorizer( + model="text-embedding-ada-002", + api_config={ + "api_key": "your_api_key", # OR set OPENAI_API_KEY in your env + "api_version": "your_api_version", # OR set OPENAI_API_VERSION in your env + "azure_endpoint": "your_azure_endpoint", # OR set AZURE_OPENAI_ENDPOINT in your env + } + ) + embedding = vectorizer.embed("Hello, world!") + + # Asynchronous batch embedding of multiple texts + embeddings = await vectorizer.aembed_many( + ["Hello, world!", "How are you?"], + batch_size=2 + ) + + """ + + aclient: Any # Since the OpenAI module is loaded dynamically + + def __init__( + self, model: str = "text-embedding-ada-002", api_config: Optional[Dict] = None + ): + """Initialize the AzureOpenAI vectorizer. + + Args: + model (str): Deployment to use for embedding. Must be the + 'Deployment name' not the 'Model name'. Defaults to + 'text-embedding-ada-002'. 
+ api_config (Optional[Dict], optional): Dictionary containing the + API key, API version and Azure endpoint. Defaults to None. + + Raises: + ImportError: If the openai library is not installed. + ValueError: If the AzureOpenAI API key, version, or endpoint are not provided. + """ + # Dynamic import of the openai module + try: + from openai import AsyncAzureOpenAI, AzureOpenAI + except ImportError: + raise ImportError( + "AzureOpenAI vectorizer requires the openai library. \ + Please install with `pip install openai`" + ) + + # Fetch the API key, version and endpoint from api_config or environment variable + azure_endpoint = ( + api_config.get("azure_endpoint") if api_config else os.getenv("AZURE_OPENAI_ENDPOINT") + ) + + if not azure_endpoint: + raise ValueError( + "AzureOpenAI API endpoint is required. " + "Provide it in api_config or set the AZURE_OPENAI_ENDPOINT\ + environment variable." + ) + + api_version = ( + api_config.get("api_version") if api_config else os.getenv("OPENAI_API_VERSION") + ) + + if not api_version: + raise ValueError( + "AzureOpenAI API version is required. " + "Provide it in api_config or set the OPENAI_API_VERSION\ + environment variable." + ) + + api_key = ( + api_config.get("api_key") if api_config else os.getenv("AZURE_OPENAI_API_KEY") + ) + + if not api_key: + raise ValueError( + "AzureOpenAI API key is required. " + "Provide it in api_config or set the AZURE_OPENAI_API_KEY\ + environment variable." + ) + + + client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint) + dims = self._set_model_dims(client, model) + super().__init__(model=model, dims=dims, client=client) + self.aclient = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint) + + @staticmethod + def _set_model_dims(client, model) -> int: + try: + embedding = ( + client.embeddings.create(input=["dimension test"], model=model) + .data[0] + .embedding + ) + except (KeyError, IndexError) as ke: + raise ValueError(f"Unexpected response from the AzureOpenAI API: {str(ke)}") + except Exception as e: # pylint: disable=broad-except + # fall back (TODO get more specific) + raise ValueError(f"Error setting embedding model dimensions: {str(e)}") + return len(embedding) + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def embed_many( + self, + texts: List[str], + preprocess: Optional[Callable] = None, + batch_size: int = 10, + as_buffer: bool = False, + **kwargs, + ) -> List[List[float]]: + """Embed many chunks of texts using the AzureOpenAI API. + + Args: + texts (List[str]): List of text chunks to embed. + preprocess (Optional[Callable], optional): Optional preprocessing + callable to perform before vectorization. Defaults to None. + batch_size (int, optional): Batch size of texts to use when creating + embeddings. Defaults to 10. + as_buffer (bool, optional): Whether to convert the raw embedding + to a byte string. Defaults to False. + + Returns: + List[List[float]]: List of embeddings. + + Raises: + TypeError: If the wrong input type is passed in for the test. 
+ """ + if not isinstance(texts, list): + raise TypeError("Must pass in a list of str values to embed.") + if len(texts) > 0 and not isinstance(texts[0], str): + raise TypeError("Must pass in a list of str values to embed.") + + embeddings: List = [] + for batch in self.batchify(texts, batch_size, preprocess): + response = self.client.embeddings.create(input=batch, model=self.model) + embeddings += [ + self._process_embedding(r.embedding, as_buffer) for r in response.data + ] + return embeddings + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + def embed( + self, + text: str, + preprocess: Optional[Callable] = None, + as_buffer: bool = False, + **kwargs, + ) -> List[float]: + """Embed a chunk of text using the AzureOpenAI API. + + Args: + text (str): Chunk of text to embed. + preprocess (Optional[Callable], optional): Optional preprocessing callable to + perform before vectorization. Defaults to None. + as_buffer (bool, optional): Whether to convert the raw embedding + to a byte string. Defaults to False. + + Returns: + List[float]: Embedding. + + Raises: + TypeError: If the wrong input type is passed in for the test. + """ + if not isinstance(text, str): + raise TypeError("Must pass in a str value to embed.") + + if preprocess: + text = preprocess(text) + result = self.client.embeddings.create(input=[text], model=self.model) + return self._process_embedding(result.data[0].embedding, as_buffer) + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + async def aembed_many( + self, + texts: List[str], + preprocess: Optional[Callable] = None, + batch_size: int = 1000, + as_buffer: bool = False, + **kwargs, + ) -> List[List[float]]: + """Asynchronously embed many chunks of texts using the AzureOpenAI API. + + Args: + texts (List[str]): List of text chunks to embed. + preprocess (Optional[Callable], optional): Optional preprocessing callable to + perform before vectorization. Defaults to None. + batch_size (int, optional): Batch size of texts to use when creating + embeddings. Defaults to 10. + as_buffer (bool, optional): Whether to convert the raw embedding + to a byte string. Defaults to False. + + Returns: + List[List[float]]: List of embeddings. + + Raises: + TypeError: If the wrong input type is passed in for the test. + """ + if not isinstance(texts, list): + raise TypeError("Must pass in a list of str values to embed.") + if len(texts) > 0 and not isinstance(texts[0], str): + raise TypeError("Must pass in a list of str values to embed.") + + embeddings: List = [] + for batch in self.batchify(texts, batch_size, preprocess): + response = await self.aclient.embeddings.create( + input=batch, model=self.model + ) + embeddings += [ + self._process_embedding(r.embedding, as_buffer) for r in response.data + ] + return embeddings + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_not_exception_type(TypeError), + ) + async def aembed( + self, + text: str, + preprocess: Optional[Callable] = None, + as_buffer: bool = False, + **kwargs, + ) -> List[float]: + """Asynchronously embed a chunk of text using the OpenAI API. + + Args: + text (str): Chunk of text to embed. + preprocess (Optional[Callable], optional): Optional preprocessing callable to + perform before vectorization. Defaults to None. 
+ as_buffer (bool, optional): Whether to convert the raw embedding + to a byte string. Defaults to False. + + Returns: + List[float]: Embedding. + + Raises: + TypeError: If the wrong input type is passed in for the test. + """ + if not isinstance(text, str): + raise TypeError("Must pass in a str value to embed.") + + if preprocess: + text = preprocess(text) + result = await self.aclient.embeddings.create(input=[text], model=self.model) + return self._process_embedding(result.data[0].embedding, as_buffer) From e1302ee8d82c7fc5474c83a8d99a6a46114cf1a6 Mon Sep 17 00:00:00 2001 From: Anibal Angulo Date: Thu, 11 Apr 2024 19:30:15 -0700 Subject: [PATCH 2/5] Added tests for AzureOpenAI vectorizer --- conftest.py | 8 ++++++++ tests/integration/test_vectorizers.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/conftest.py b/conftest.py index 2db79ec0..0f20ab54 100644 --- a/conftest.py +++ b/conftest.py @@ -21,6 +21,14 @@ def client(redis_url): def openai_key(): return os.getenv("OPENAI_API_KEY") +@pytest.fixture +def openai_version(): + return os.getenv("OPENAI_API_VERSION") + +@pytest.fixture +def azure_endpoint(): + return os.getenv("AZURE_OPENAI_ENDPOINT") + @pytest.fixture def cohere_key(): return os.getenv("COHERE_API_KEY") diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py index c8766171..bd435980 100644 --- a/tests/integration/test_vectorizers.py +++ b/tests/integration/test_vectorizers.py @@ -7,6 +7,7 @@ HFTextVectorizer, OpenAITextVectorizer, VertexAITextVectorizer, + AzureOpenAITextVectorizer ) @@ -24,6 +25,7 @@ def skip_vectorizer() -> bool: OpenAITextVectorizer, VertexAITextVectorizer, CohereTextVectorizer, + AzureOpenAITextVectorizer ] ) def vectorizer(request, skip_vectorizer): @@ -38,6 +40,8 @@ def vectorizer(request, skip_vectorizer): return request.param() elif request.param == CohereTextVectorizer: return request.param() + elif request.param == AzureOpenAITextVectorizer: + return request.param() def test_vectorizer_embed(vectorizer, skip_vectorizer): From 97da5ecb8f1792497eb68615aafe2a555c945e3a Mon Sep 17 00:00:00 2001 From: Anibal Angulo Date: Thu, 11 Apr 2024 19:30:50 -0700 Subject: [PATCH 3/5] Added AzureOpenAI vectorizer user guide --- docs/user_guide/vectorizers_04.ipynb | 113 ++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/vectorizers_04.ipynb b/docs/user_guide/vectorizers_04.ipynb index 6201a97f..19972c03 100644 --- a/docs/user_guide/vectorizers_04.ipynb +++ b/docs/user_guide/vectorizers_04.ipynb @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -176,6 +176,115 @@ "print(\"Number of Embeddings:\", len(embeddings))\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Azure OpenAI\n", + "\n", + "The ``AzureOpenAITextVectorizer`` is a variation of the OpenAI vectorizer that calls OpenAI models within Azure. If you've already installed ``openai``, then you're ready to use Azure OpenAI.\n", + "\n", + "The only practical difference between OpenAI and Azure OpenAI is the variables required to call the API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# additionally to the API Key, setup the API endpoint and version\n", + "api_version = os.environ.get(\"OPENAI_API_VERSION\") or getpass.getpass(\"Enter your AzureOpenAI API version: \")\n", + "azure_endpoint = os.environ.get(\"AZURE_OPENAI_ENDPOINT\") or getpass.getpass(\"Enter your AzureOpenAI API endpoint: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vector dimensions: 1536\n" + ] + }, + { + "data": { + "text/plain": [ + "[-0.0010088568087667227,\n", + " -0.003142790636047721,\n", + " 0.0024922797456383705,\n", + " -0.004522906616330147,\n", + " -0.010369433090090752,\n", + " 0.012739036232233047,\n", + " -0.005365503951907158,\n", + " -0.0029668458737432957,\n", + " -0.007141091860830784,\n", + " -0.03383301943540573]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from redisvl.utils.vectorize import AzureOpenAITextVectorizer\n", + "\n", + "# create a vectorizer\n", + "az_oai = AzureOpenAITextVectorizer(\n", + " model=\"text-embedding-ada-002\", # Must be your custom deployment name\n", + " api_config={\n", + " \"api_key\": api_key,\n", + " \"api_version\": api_version,\n", + " \"azure_endpoint\": azure_endpoint\n", + " },\n", + ")\n", + "\n", + "test = az_oai.embed(\"This is a test sentence.\")\n", + "print(\"Vector dimensions: \", len(test))\n", + "test[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-0.017460526898503304,\n", + " -6.895032856846228e-05,\n", + " 0.0013909287517890334,\n", + " -0.025688467547297478,\n", + " -0.019813183695077896,\n", + " 0.016087085008621216,\n", + " -0.003729278687387705,\n", + " 0.0009211922879330814,\n", + " 0.006606514099985361,\n", + " -0.025128915905952454]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Just like OpenAI, AzureOpenAI supports batching embeddings and asynchronous requests.\n", + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a happy person\",\n", + " \"Today is a sunny day\"\n", + "]\n", + "\n", + "embeddings = await az_oai.aembed_many(sentences)\n", + "embeddings[0][:10]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -547,7 +656,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.5" }, "orig_nbformat": 4, "vscode": { From 7f4efa9595d9ec22b891a3729c37d67a4dea70e8 Mon Sep 17 00:00:00 2001 From: Anibal Date: Fri, 19 Apr 2024 17:19:05 -0600 Subject: [PATCH 4/5] ran make format --- redisvl/utils/vectorize/__init__.py | 4 +-- redisvl/utils/vectorize/text/azureopenai.py | 27 ++++++++++++++------- tests/integration/test_vectorizers.py | 4 +-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/redisvl/utils/vectorize/__init__.py b/redisvl/utils/vectorize/__init__.py index 9bd763d0..ea9d7bee 100644 --- a/redisvl/utils/vectorize/__init__.py +++ b/redisvl/utils/vectorize/__init__.py @@ -1,9 +1,9 @@ from redisvl.utils.vectorize.base import BaseVectorizer +from redisvl.utils.vectorize.text.azureopenai import AzureOpenAITextVectorizer from redisvl.utils.vectorize.text.cohere import CohereTextVectorizer from redisvl.utils.vectorize.text.huggingface import HFTextVectorizer from 
redisvl.utils.vectorize.text.openai import OpenAITextVectorizer from redisvl.utils.vectorize.text.vertexai import VertexAITextVectorizer -from redisvl.utils.vectorize.text.azureopenai import AzureOpenAITextVectorizer __all__ = [ "BaseVectrorizer", @@ -11,5 +11,5 @@ "HFTextVectorizer", "OpenAITextVectorizer", "VertexAITextVectorizer", - "AzureOpenAITextVectorizer" + "AzureOpenAITextVectorizer", ] diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py index 511856b5..64349ef7 100644 --- a/redisvl/utils/vectorize/text/azureopenai.py +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -16,8 +16,8 @@ class AzureOpenAITextVectorizer(BaseVectorizer): This vectorizer is designed to interact with AzureOpenAI's embeddings API, requiring an API key, an AzureOpenAI deployment endpoint and API version. - These values can be provided directly in the `api_config` dictionary with - the parameters 'azure_endpoint'. 'api_version' and 'api_key' or through the + These values can be provided directly in the `api_config` dictionary with + the parameters 'azure_endpoint', 'api_version' and 'api_key' or through the environment variables 'AZURE_OPENAI_ENDPOINT', 'OPENAI_API_VERSION', and 'AZURE_OPENAI_API_KEY'. Users must obtain these values from the 'Keys and Endpoints' section in their Azure OpenAI service. Additionally, the `openai` python client must be installed with `pip install openai>=1.13.0`. @@ -35,7 +35,7 @@ class AzureOpenAITextVectorizer(BaseVectorizer): "api_key": "your_api_key", # OR set OPENAI_API_KEY in your env "api_version": "your_api_version", # OR set OPENAI_API_VERSION in your env "azure_endpoint": "your_azure_endpoint", # OR set AZURE_OPENAI_ENDPOINT in your env - } + } ) embedding = vectorizer.embed("Hello, world!") @@ -76,7 +76,9 @@ def __init__( # Fetch the API key, version and endpoint from api_config or environment variable azure_endpoint = ( - api_config.get("azure_endpoint") if api_config else os.getenv("AZURE_OPENAI_ENDPOINT") + api_config.get("azure_endpoint") + if api_config + else os.getenv("AZURE_OPENAI_ENDPOINT") ) if not azure_endpoint: @@ -87,7 +89,9 @@ def __init__( ) api_version = ( - api_config.get("api_version") if api_config else os.getenv("OPENAI_API_VERSION") + api_config.get("api_version") + if api_config + else os.getenv("OPENAI_API_VERSION") ) if not api_version: @@ -98,7 +102,9 @@ def __init__( ) api_key = ( - api_config.get("api_key") if api_config else os.getenv("AZURE_OPENAI_API_KEY") + api_config.get("api_key") + if api_config + else os.getenv("AZURE_OPENAI_API_KEY") ) if not api_key: @@ -108,11 +114,14 @@ def __init__( environment variable." 
) - - client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint) + client = AzureOpenAI( + api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint + ) dims = self._set_model_dims(client, model) super().__init__(model=model, dims=dims, client=client) - self.aclient = AsyncAzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint) + self.aclient = AsyncAzureOpenAI( + api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint + ) @staticmethod def _set_model_dims(client, model) -> int: diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py index bd435980..8da824df 100644 --- a/tests/integration/test_vectorizers.py +++ b/tests/integration/test_vectorizers.py @@ -3,11 +3,11 @@ import pytest from redisvl.utils.vectorize import ( + AzureOpenAITextVectorizer, CohereTextVectorizer, HFTextVectorizer, OpenAITextVectorizer, VertexAITextVectorizer, - AzureOpenAITextVectorizer ) @@ -25,7 +25,7 @@ def skip_vectorizer() -> bool: OpenAITextVectorizer, VertexAITextVectorizer, CohereTextVectorizer, - AzureOpenAITextVectorizer + AzureOpenAITextVectorizer, ] ) def vectorizer(request, skip_vectorizer): From c1953367edd5f2a8f219752383b2ec02eae6c328 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 25 Apr 2024 16:45:42 -0400 Subject: [PATCH 5/5] Update redisvl/utils/vectorize/text/azureopenai.py --- redisvl/utils/vectorize/text/azureopenai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py index 64349ef7..5ac527fa 100644 --- a/redisvl/utils/vectorize/text/azureopenai.py +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -32,7 +32,7 @@ class AzureOpenAITextVectorizer(BaseVectorizer): vectorizer = AzureOpenAITextVectorizer( model="text-embedding-ada-002", api_config={ - "api_key": "your_api_key", # OR set OPENAI_API_KEY in your env + "api_key": "your_api_key", # OR set AZURE_OPENAI_API_KEY in your env "api_version": "your_api_version", # OR set OPENAI_API_VERSION in your env "azure_endpoint": "your_azure_endpoint", # OR set AZURE_OPENAI_ENDPOINT in your env }
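
---

For reference, below is a minimal usage sketch of the `AzureOpenAITextVectorizer` added in this series. It only exercises the constructor, `embed`, and `aembed_many` as defined in patch 1; the credentials, endpoint, and deployment name are placeholders you must replace with real values from the 'Keys and Endpoints' section of your Azure OpenAI resource, and it assumes `openai>=1.13.0` is installed.

```python
# Hedged usage sketch of the new AzureOpenAITextVectorizer.
# Assumptions (not part of the patch): "<your-api-key>", "<your-api-version>",
# "https://<your-resource>.openai.azure.com/", and "my-ada-002-deployment"
# are placeholders for your own Azure OpenAI values.
import asyncio
import os

from redisvl.utils.vectorize import AzureOpenAITextVectorizer

# The vectorizer reads these if no api_config dict is passed in.
os.environ.setdefault("AZURE_OPENAI_API_KEY", "<your-api-key>")
os.environ.setdefault("OPENAI_API_VERSION", "<your-api-version>")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://<your-resource>.openai.azure.com/")

# `model` must be the Azure *deployment name*, not the underlying model name.
vectorizer = AzureOpenAITextVectorizer(model="my-ada-002-deployment")

# Synchronous embedding of a single text.
embedding = vectorizer.embed("Hello, world!")
print("Vector dimensions:", len(embedding))  # e.g. 1536 for text-embedding-ada-002

# Asynchronous batch embedding of multiple texts.
async def embed_batch():
    return await vectorizer.aembed_many(
        ["That is a happy dog", "Today is a sunny day"],
        batch_size=2,
    )

embeddings = asyncio.run(embed_batch())
print("Number of embeddings:", len(embeddings))
```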