diff --git a/conftest.py b/conftest.py
index 2db79ec0..0f20ab54 100644
--- a/conftest.py
+++ b/conftest.py
@@ -21,6 +21,14 @@ def client(redis_url):
 def openai_key():
     return os.getenv("OPENAI_API_KEY")
 
+@pytest.fixture
+def openai_version():
+    return os.getenv("OPENAI_API_VERSION")
+
+@pytest.fixture
+def azure_endpoint():
+    return os.getenv("AZURE_OPENAI_ENDPOINT")
+
 @pytest.fixture
 def cohere_key():
     return os.getenv("COHERE_API_KEY")
diff --git a/docs/user_guide/vectorizers_04.ipynb b/docs/user_guide/vectorizers_04.ipynb
index 6201a97f..19972c03 100644
--- a/docs/user_guide/vectorizers_04.ipynb
+++ b/docs/user_guide/vectorizers_04.ipynb
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -176,6 +176,115 @@
     "print(\"Number of Embeddings:\", len(embeddings))\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Azure OpenAI\n",
+    "\n",
+    "The ``AzureOpenAITextVectorizer`` is a variation of the OpenAI vectorizer that calls OpenAI models within Azure. If you've already installed ``openai``, then you're ready to use Azure OpenAI.\n",
+    "\n",
+    "The only practical difference between OpenAI and Azure OpenAI is the variables required to call the API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# in addition to the API key, set up the API endpoint and version\n",
+    "api_version = os.environ.get(\"OPENAI_API_VERSION\") or getpass.getpass(\"Enter your AzureOpenAI API version: \")\n",
+    "azure_endpoint = os.environ.get(\"AZURE_OPENAI_ENDPOINT\") or getpass.getpass(\"Enter your AzureOpenAI API endpoint: \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vector dimensions:  1536\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[-0.0010088568087667227,\n",
+       " -0.003142790636047721,\n",
+       " 0.0024922797456383705,\n",
+       " -0.004522906616330147,\n",
+       " -0.010369433090090752,\n",
+       " 0.012739036232233047,\n",
+       " -0.005365503951907158,\n",
+       " -0.0029668458737432957,\n",
+       " -0.007141091860830784,\n",
+       " -0.03383301943540573]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from redisvl.utils.vectorize import AzureOpenAITextVectorizer\n",
+    "\n",
+    "# create a vectorizer\n",
+    "az_oai = AzureOpenAITextVectorizer(\n",
+    "    model=\"text-embedding-ada-002\", # Must be your custom deployment name\n",
+    "    api_config={\n",
+    "        \"api_key\": api_key,\n",
+    "        \"api_version\": api_version,\n",
+    "        \"azure_endpoint\": azure_endpoint\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "test = az_oai.embed(\"This is a test sentence.\")\n",
+    "print(\"Vector dimensions: \", len(test))\n",
+    "test[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[-0.017460526898503304,\n",
+       " -6.895032856846228e-05,\n",
+       " 0.0013909287517890334,\n",
+       " -0.025688467547297478,\n",
+       " -0.019813183695077896,\n",
+       " 0.016087085008621216,\n",
+       " -0.003729278687387705,\n",
+       " 0.0009211922879330814,\n",
+       " 0.006606514099985361,\n",
+       " -0.025128915905952454]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Just like OpenAI, AzureOpenAI supports batching embeddings and asynchronous requests.\n",
+    "sentences = [\n",
+    "    \"That is a happy dog\",\n",
+    "    \"That is a happy person\",\n",
+    "    \"Today is a sunny day\"\n",
+    "]\n",
+    "\n",
+    "embeddings = await az_oai.aembed_many(sentences)\n",
+    "embeddings[0][:10]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -547,7 +656,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.11.5"
   },
   "orig_nbformat": 4,
   "vscode": {
diff --git a/redisvl/utils/vectorize/__init__.py b/redisvl/utils/vectorize/__init__.py
index c0dc7dd8..ea9d7bee 100644
--- a/redisvl/utils/vectorize/__init__.py
+++ b/redisvl/utils/vectorize/__init__.py
@@ -1,4 +1,5 @@
 from redisvl.utils.vectorize.base import BaseVectorizer
+from redisvl.utils.vectorize.text.azureopenai import AzureOpenAITextVectorizer
 from redisvl.utils.vectorize.text.cohere import CohereTextVectorizer
 from redisvl.utils.vectorize.text.huggingface import HFTextVectorizer
 from redisvl.utils.vectorize.text.openai import OpenAITextVectorizer
@@ -10,4 +11,5 @@
     "HFTextVectorizer",
     "OpenAITextVectorizer",
     "VertexAITextVectorizer",
+    "AzureOpenAITextVectorizer",
 ]
diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py
new file mode 100644
index 00000000..5ac527fa
--- /dev/null
+++ b/redisvl/utils/vectorize/text/azureopenai.py
@@ -0,0 +1,297 @@
+import os
+from typing import Any, Callable, Dict, List, Optional
+
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+from tenacity.retry import retry_if_not_exception_type
+
+from redisvl.utils.vectorize.base import BaseVectorizer
+
+# ignore that openai isn't imported
+# mypy: disable-error-code="name-defined"
+
+
+class AzureOpenAITextVectorizer(BaseVectorizer):
+    """The AzureOpenAITextVectorizer class utilizes AzureOpenAI's API to generate
+    embeddings for text data.
+
+    This vectorizer is designed to interact with AzureOpenAI's embeddings API,
+    requiring an API key, an AzureOpenAI deployment endpoint and API version.
+    These values can be provided directly in the `api_config` dictionary with
+    the parameters 'azure_endpoint', 'api_version' and 'api_key' or through the
+    environment variables 'AZURE_OPENAI_ENDPOINT', 'OPENAI_API_VERSION', and 'AZURE_OPENAI_API_KEY'.
+    Users must obtain these values from the 'Keys and Endpoints' section in their Azure OpenAI service.
+    Additionally, the `openai` python client must be installed with `pip install "openai>=1.13.0"`.
+
+    The vectorizer supports both synchronous and asynchronous operations,
+    allowing for batch processing of texts and flexibility in handling
+    preprocessing tasks.
+
+    .. code-block:: python
+
+        # Synchronous embedding of a single text
+        vectorizer = AzureOpenAITextVectorizer(
+            model="text-embedding-ada-002",
+            api_config={
+                "api_key": "your_api_key",  # OR set AZURE_OPENAI_API_KEY in your env
+                "api_version": "your_api_version",  # OR set OPENAI_API_VERSION in your env
+                "azure_endpoint": "your_azure_endpoint",  # OR set AZURE_OPENAI_ENDPOINT in your env
+            }
+        )
+        embedding = vectorizer.embed("Hello, world!")
+
+        # Asynchronous batch embedding of multiple texts
+        embeddings = await vectorizer.aembed_many(
+            ["Hello, world!", "How are you?"],
+            batch_size=2
+        )
+
+    """
+
+    aclient: Any  # Since the OpenAI module is loaded dynamically
+
+    def __init__(
+        self, model: str = "text-embedding-ada-002", api_config: Optional[Dict] = None
+    ):
+        """Initialize the AzureOpenAI vectorizer.
+
+        Args:
+            model (str): Deployment to use for embedding. Must be the
+                'Deployment name' not the 'Model name'. Defaults to
+                'text-embedding-ada-002'.
+            api_config (Optional[Dict], optional): Dictionary containing the
+                API key, API version and Azure endpoint. Defaults to None.
+
+        Raises:
+            ImportError: If the openai library is not installed.
+            ValueError: If the AzureOpenAI API key, version, or endpoint are not provided.
+        """
+        # Dynamic import of the openai module
+        try:
+            from openai import AsyncAzureOpenAI, AzureOpenAI
+        except ImportError as exc:
+            raise ImportError(
+                "AzureOpenAI vectorizer requires the openai library. "
+                "Please install with `pip install openai`"
+            ) from exc
+
+        # Fetch the API key, version and endpoint. Each value is resolved
+        # independently: api_config wins, the environment variable is the
+        # fallback, so a partial api_config does not mask the environment.
+        config = api_config or {}
+
+        azure_endpoint = config.get("azure_endpoint") or os.getenv(
+            "AZURE_OPENAI_ENDPOINT"
+        )
+        if not azure_endpoint:
+            raise ValueError(
+                "AzureOpenAI API endpoint is required. "
+                "Provide it in api_config or set the AZURE_OPENAI_ENDPOINT "
+                "environment variable."
+            )
+
+        api_version = config.get("api_version") or os.getenv("OPENAI_API_VERSION")
+        if not api_version:
+            raise ValueError(
+                "AzureOpenAI API version is required. "
+                "Provide it in api_config or set the OPENAI_API_VERSION "
+                "environment variable."
+            )
+
+        api_key = config.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "AzureOpenAI API key is required. "
+                "Provide it in api_config or set the AZURE_OPENAI_API_KEY "
+                "environment variable."
+            )
+
+        client = AzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+        dims = self._set_model_dims(client, model)
+        super().__init__(model=model, dims=dims, client=client)
+        self.aclient = AsyncAzureOpenAI(
+            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        )
+
+    @staticmethod
+    def _set_model_dims(client, model) -> int:
+        """Probe the deployment with one embedding call and return its vector size.
+
+        Raises:
+            ValueError: If the API call fails or returns an unexpected payload.
+        """
+        try:
+            embedding = (
+                client.embeddings.create(input=["dimension test"], model=model)
+                .data[0]
+                .embedding
+            )
+        except (KeyError, IndexError) as ke:
+            raise ValueError(
+                f"Unexpected response from the AzureOpenAI API: {str(ke)}"
+            ) from ke
+        except Exception as e:  # pylint: disable=broad-except
+            # fall back (TODO get more specific)
+            raise ValueError(
+                f"Error setting embedding model dimensions: {str(e)}"
+            ) from e
+        return len(embedding)
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=60),
+        stop=stop_after_attempt(6),
+        retry=retry_if_not_exception_type(TypeError),
+    )
+    def embed_many(
+        self,
+        texts: List[str],
+        preprocess: Optional[Callable] = None,
+        batch_size: int = 10,
+        as_buffer: bool = False,
+        **kwargs,
+    ) -> List[List[float]]:
+        """Embed many chunks of texts using the AzureOpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing
+                callable to perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (bool, optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+
+        Raises:
+            TypeError: If the wrong input type is passed in for the text.
+        """
+        if not isinstance(texts, list):
+            raise TypeError("Must pass in a list of str values to embed.")
+        if len(texts) > 0 and not isinstance(texts[0], str):
+            raise TypeError("Must pass in a list of str values to embed.")
+
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
+            response = self.client.embeddings.create(input=batch, model=self.model)
+            embeddings += [
+                self._process_embedding(r.embedding, as_buffer) for r in response.data
+            ]
+        return embeddings
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=60),
+        stop=stop_after_attempt(6),
+        retry=retry_if_not_exception_type(TypeError),
+    )
+    def embed(
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: bool = False,
+        **kwargs,
+    ) -> List[float]:
+        """Embed a chunk of text using the AzureOpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (bool, optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+
+        Raises:
+            TypeError: If the wrong input type is passed in for the text.
+        """
+        if not isinstance(text, str):
+            raise TypeError("Must pass in a str value to embed.")
+
+        if preprocess:
+            text = preprocess(text)
+        result = self.client.embeddings.create(input=[text], model=self.model)
+        return self._process_embedding(result.data[0].embedding, as_buffer)
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=60),
+        stop=stop_after_attempt(6),
+        retry=retry_if_not_exception_type(TypeError),
+    )
+    async def aembed_many(
+        self,
+        texts: List[str],
+        preprocess: Optional[Callable] = None,
+        batch_size: int = 1000,
+        as_buffer: bool = False,
+        **kwargs,
+    ) -> List[List[float]]:
+        """Asynchronously embed many chunks of texts using the AzureOpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (bool, optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+
+        Raises:
+            TypeError: If the wrong input type is passed in for the text.
+        """
+        if not isinstance(texts, list):
+            raise TypeError("Must pass in a list of str values to embed.")
+        if len(texts) > 0 and not isinstance(texts[0], str):
+            raise TypeError("Must pass in a list of str values to embed.")
+
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
+            response = await self.aclient.embeddings.create(
+                input=batch, model=self.model
+            )
+            embeddings += [
+                self._process_embedding(r.embedding, as_buffer) for r in response.data
+            ]
+        return embeddings
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=60),
+        stop=stop_after_attempt(6),
+        retry=retry_if_not_exception_type(TypeError),
+    )
+    async def aembed(
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: bool = False,
+        **kwargs,
+    ) -> List[float]:
+        """Asynchronously embed a chunk of text using the AzureOpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (bool, optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+
+        Raises:
+            TypeError: If the wrong input type is passed in for the text.
+        """
+        if not isinstance(text, str):
+            raise TypeError("Must pass in a str value to embed.")
+
+        if preprocess:
+            text = preprocess(text)
+        result = await self.aclient.embeddings.create(input=[text], model=self.model)
+        return self._process_embedding(result.data[0].embedding, as_buffer)
diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py
index c8766171..8da824df 100644
--- a/tests/integration/test_vectorizers.py
+++ b/tests/integration/test_vectorizers.py
@@ -3,6 +3,7 @@
 import pytest
 
 from redisvl.utils.vectorize import (
+    AzureOpenAITextVectorizer,
     CohereTextVectorizer,
     HFTextVectorizer,
     OpenAITextVectorizer,
@@ -24,6 +25,7 @@ def skip_vectorizer() -> bool:
         OpenAITextVectorizer,
         VertexAITextVectorizer,
         CohereTextVectorizer,
+        AzureOpenAITextVectorizer,
     ]
 )
 def vectorizer(request, skip_vectorizer):
@@ -38,6 +40,8 @@ def vectorizer(request, skip_vectorizer):
         return request.param()
     elif request.param == CohereTextVectorizer:
         return request.param()
+    elif request.param == AzureOpenAITextVectorizer:
+        return request.param()
 
 
 def test_vectorizer_embed(vectorizer, skip_vectorizer):