From 53b13a77a72510f95e04bfb5bc89b47315121fb2 Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Thu, 3 Aug 2023 22:06:25 -0400
Subject: [PATCH 1/6] wip

---
 redisvl/vectorize/text/huggingface.py | 17 ++++-
 redisvl/vectorize/text/openai.py      | 91 +++++++++++++++++++++++----
 2 files changed, 92 insertions(+), 16 deletions(-)

diff --git a/redisvl/vectorize/text/huggingface.py b/redisvl/vectorize/text/huggingface.py
index b46bfe0a..171c5785 100644
--- a/redisvl/vectorize/text/huggingface.py
+++ b/redisvl/vectorize/text/huggingface.py
@@ -1,6 +1,7 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
+from redisvl.utils.utils import array_to_buffer
 
 
 class HFTextVectorizer(BaseVectorizer):
@@ -18,21 +19,31 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         self._model_client = SentenceTransformer(model)
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        emb_input: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False
     ) -> List[float]:
         if preprocess:
             emb_input = preprocess(emb_input)
         embedding = self._model_client.encode([emb_input])[0]
-        return embedding.tolist()
+        embedding = embedding.tolist()
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
 
     def embed_many(
         self,
         inputs: List[str],
         preprocess: Optional[Callable] = None,
         chunk_size: int = 1000,
+        as_buffer: Optional[float] = None
     ) -> List[List[float]]:
         embeddings = []
         for batch in self.batchify(inputs, chunk_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
-            embeddings.extend([embedding.tolist() for embedding in batch_embeddings])
+            embeddings.extend([
+                array_to_buffer(embedding.tolist()) if as_buffer else embedding.tolist()
+                for embedding in batch_embeddings
+            ])
         return embeddings
diff --git a/redisvl/vectorize/text/openai.py b/redisvl/vectorize/text/openai.py
index ac24a05b..fb5bf139 100644
--- a/redisvl/vectorize/text/openai.py
+++ b/redisvl/vectorize/text/openai.py
@@ -1,9 +1,10 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
-
+from redisvl.utils.utils import array_to_buffer
 
 class OpenAITextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         dims = 1536
         super().__init__(model, dims, api_config)
@@ -18,42 +19,106 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         openai.api_key = api_config.get("api_key", None)
         self._model_client = openai.Embedding
 
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
+
     def embed_many(
         self,
         inputs: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 10,
+        as_buffer: Optional[float] = False
     ) -> List[List[float]]:
-        results = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Embed many chunks of texts using the OpenAI API.
+
+        Args:
+            inputs (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: _description_
+        """
+        embeddings: List = []
+        for batch in self.batchify(inputs, batch_size, preprocess):
             response = self._model_client.create(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
+            ]
+        return embeddings
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        inputs: List[str],
+        preprocess: Optional[Callable] = None,
+        batch_size: Optional[int] = 10,
+        as_buffer: Optional[float] = False
     ) -> List[float]:
+        """Embed chunks of texts using the OpenAI API.
+
+        Args:
+            inputs (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: _description_
+        """
         if preprocess:
             emb_input = preprocess(emb_input)
         result = self._model_client.create(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
+
     async def aembed_many(
         self,
         inputs: List[str],
         preprocess: Optional[Callable] = None,
         chunk_size: int = 1000,
+        as_buffer: Optional[bool] = False
    ) -> List[List[float]]:
-        results = []
+        """_summary_
+
+        Args:
+            inputs (List[str]): _description_
+            preprocess (Optional[Callable], optional): _description_. Defaults to None.
+            chunk_size (int, optional): _description_. Defaults to 1000.
+            as_buffer (Optional[bool], optional): _description_. Defaults to False.
+
+        Returns:
+            List[List[float]]: _description_
+        """
+        embeddings: List = []
         for batch in self.batchify(inputs, chunk_size, preprocess):
             response = await self._model_client.acreate(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
+            ]
+        return embeddings
 
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        emb_input: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False
     ) -> List[float]:
+        """_summary_
+
+        Args:
+            emb_input (str): _description_
+            preprocess (Optional[Callable], optional): _description_. Defaults to None.
+            as_buffer (Optional[bool], optional): _description_. Defaults to False.
+
+        Returns:
+            List[float]: _description_
+        """
         if preprocess:
             emb_input = preprocess(emb_input)
         result = await self._model_client.acreate(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
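Note: with patch 1 applied, the new as_buffer flag is meant to be called roughly as in the sketch below. The model name is illustrative, and it assumes HFTextVectorizer is importable from redisvl.vectorize.text and that array_to_buffer packs the floats into a byte string suitable for a Redis hash field:

    from redisvl.vectorize.text import HFTextVectorizer

    hf = HFTextVectorizer(model="sentence-transformers/all-MiniLM-L6-v2")

    # default behavior: a plain Python list of floats
    vector = hf.embed("Hello world")

    # as_buffer=True: the same embedding packed as a byte string
    buffer = hf.embed("Hello world", as_buffer=True)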
From 99c02b2fd66cfe2497b3052aee8488a31639b069 Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Thu, 3 Aug 2023 22:26:47 -0400
Subject: [PATCH 2/6] clean up vectorizer interface

---
 redisvl/vectorize/base.py             |  7 ++-
 redisvl/vectorize/text/huggingface.py | 53 ++++++++++++------
 redisvl/vectorize/text/openai.py      | 78 +++++++++++++--------------
 3 files changed, 83 insertions(+), 55 deletions(-)

diff --git a/redisvl/vectorize/base.py b/redisvl/vectorize/base.py
index 64f32568..973d33e8 100644
--- a/redisvl/vectorize/base.py
+++ b/redisvl/vectorize/base.py
@@ -1,5 +1,5 @@
 from typing import Callable, Dict, List, Optional
-
+from redisvl.utils.utils import array_to_buffer
 
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
@@ -51,3 +51,8 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None):
             yield [preprocess(chunk) for chunk in seq[pos : pos + size]]
         else:
             yield seq[pos : pos + size]
+
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
diff --git a/redisvl/vectorize/text/huggingface.py b/redisvl/vectorize/text/huggingface.py
index 171c5785..f0fc47ec 100644
--- a/redisvl/vectorize/text/huggingface.py
+++ b/redisvl/vectorize/text/huggingface.py
@@ -1,10 +1,10 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
-from redisvl.utils.utils import array_to_buffer
 
 
 class HFTextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         # TODO set dims based on model
         dims = 768
@@ -20,30 +20,53 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
 
     def embed(
         self,
-        emb_input: str,
+        text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[float] = False
     ) -> List[float]:
+        """Embed a chunk of text using the Hugging Face sentence transformer.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        embedding = self._model_client.encode([emb_input])[0]
-        embedding = embedding.tolist()
-        if as_buffer:
-            return array_to_buffer(embedding)
-        return embedding
+            text = preprocess(text)
+        embedding = self._model_client.encode([text])[0]
+        return self._process_embedding(embedding.tolist(), as_buffer)
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
         as_buffer: Optional[float] = None
     ) -> List[List[float]]:
+        """Embed many chunks of texts using the Hugging Face sentence
+        transformer.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
-        embeddings = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
             embeddings.extend([
-                array_to_buffer(embedding.tolist()) if as_buffer else embedding.tolist()
-                for embedding in batch_embeddings
+                self._process_embedding(embedding.tolist(), as_buffer) for embedding in batch_embeddings
             ])
         return embeddings
diff --git a/redisvl/vectorize/text/openai.py b/redisvl/vectorize/text/openai.py
index fb5bf139..99e8ffc2 100644
--- a/redisvl/vectorize/text/openai.py
+++ b/redisvl/vectorize/text/openai.py
@@ -1,7 +1,7 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
-from redisvl.utils.utils import array_to_buffer
+
 
 class OpenAITextVectorizer(BaseVectorizer):
     # TODO - add docstring
@@ -19,14 +19,9 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         openai.api_key = api_config.get("api_key", None)
         self._model_client = openai.Embedding
 
-    def _process_embedding(self, embedding: List[float], as_buffer: bool):
-        if as_buffer:
-            return array_to_buffer(embedding)
-        return embedding
-
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: Optional[int] = 10,
         as_buffer: Optional[float] = False
     ) -> List[List[float]]:
         """Embed many chunks of texts using the OpenAI API.
 
         Args:
-            inputs (List[str]): List of text chunks to embed.
+            texts (List[str]): List of text chunks to embed.
             preprocess (Optional[Callable], optional): Optional preprocessing callable to
                 perform before vectorization. Defaults to None.
-            batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
-            as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
 
         Returns:
-            List[List[float]]: _description_
+            List[List[float]]: List of embeddings.
         """
         embeddings: List = []
-        for batch in self.batchify(inputs, batch_size, preprocess):
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = self._model_client.create(input=batch, engine=self._model)
             embeddings += [
                 self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
             ]
         return embeddings
 
     def embed(
         self,
-        inputs: List[str],
+        text: str,
         preprocess: Optional[Callable] = None,
-        batch_size: Optional[int] = 10,
         as_buffer: Optional[float] = False
     ) -> List[float]:
-        """Embed chunks of texts using the OpenAI API.
+        """Embed a chunk of text using the OpenAI API.
 
         Args:
-            inputs (List[str]): List of text chunks to embed.
+            text (str): Chunk of text to embed.
             preprocess (Optional[Callable], optional): Optional preprocessing callable to
                 perform before vectorization. Defaults to None.
-            batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
-            as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
 
         Returns:
-            List[List[float]]: _description_
+            List[float]: Embedding.
         """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = self._model_client.create(input=[emb_input], engine=self._model)
+            text = preprocess(text)
+        result = self._model_client.create(input=[text], engine=self._model)
         return self._process_embedding(result["data"][0]["embedding"], as_buffer)
-
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
         as_buffer: Optional[bool] = False
     ) -> List[List[float]]:
-        """_summary_
+        """Asynchronously embed many chunks of texts using the OpenAI API.
 
         Args:
-            inputs (List[str]): _description_
-            preprocess (Optional[Callable], optional): _description_. Defaults to None.
-            chunk_size (int, optional): _description_. Defaults to 1000.
-            as_buffer (Optional[bool], optional): _description_. Defaults to False.
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
 
         Returns:
-            List[List[float]]: _description_
+            List[List[float]]: List of embeddings.
         """
         embeddings: List = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = await self._model_client.acreate(input=batch, engine=self._model)
             embeddings += [
                 self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
             ]
         return embeddings
 
     async def aembed(
         self,
-        emb_input: str,
+        text: str,
         preprocess: Optional[Callable] = None,
         as_buffer: Optional[bool] = False
     ) -> List[float]:
-        """_summary_
+        """Asynchronously embed a chunk of text using the OpenAI API.
 
         Args:
-            emb_input (str): _description_
-            preprocess (Optional[Callable], optional): _description_. Defaults to None.
-            as_buffer (Optional[bool], optional): _description_. Defaults to False.
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
 
         Returns:
-            List[float]: _description_
+            List[float]: Embedding.
         """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = await self._model_client.acreate(input=[emb_input], engine=self._model)
+            text = preprocess(text)
+        result = await self._model_client.acreate(input=[text], engine=self._model)
         return self._process_embedding(result["data"][0]["embedding"], as_buffer)
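For reference, the cleaned-up interface is expected to be used like the sketch below; the model name and API-key handling are illustrative, not taken from the patches:

    import os
    from redisvl.vectorize.text import OpenAITextVectorizer

    oai = OpenAITextVectorizer(
        model="text-embedding-ada-002",
        api_config={"api_key": os.environ["OPENAI_API_KEY"]},
    )

    texts = ["first chunk", "second chunk", "third chunk"]

    # texts go to the API in batches of `batch_size`;
    # one 1536-dim vector comes back per input text
    vectors = oai.embed_many(texts, batch_size=10)

    # optional preprocessing runs on each text before embedding, and
    # as_buffer=True returns byte strings instead of float lists
    buffers = oai.embed_many(texts, preprocess=str.strip, as_buffer=True)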
""" if preprocess: - emb_input = preprocess(emb_input) - result = await self._model_client.acreate(input=[emb_input], engine=self._model) + text = preprocess(text) + result = await self._model_client.acreate(input=[text], engine=self._model) return self._process_embedding(result["data"][0]["embedding"], as_buffer) From eb8258d8bfde2930b1b8d63e0e504477e75d5b35 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 3 Aug 2023 22:35:01 -0400 Subject: [PATCH 3/6] Add tenacity exponential backoff --- redisvl/vectorize/text/openai.py | 9 +++++++++ requirements.txt | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/redisvl/vectorize/text/openai.py b/redisvl/vectorize/text/openai.py index 99e8ffc2..8c1ac6d9 100644 --- a/redisvl/vectorize/text/openai.py +++ b/redisvl/vectorize/text/openai.py @@ -1,4 +1,9 @@ from typing import Callable, Dict, List, Optional +from tenacity import ( + retry, + stop_after_attempt, + wait_random_exponential, +) # for exponential backoff from redisvl.vectorize.base import BaseVectorizer @@ -19,6 +24,7 @@ def __init__(self, model: str, api_config: Optional[Dict] = None): openai.api_key = api_config.get("api_key", None) self._model_client = openai.Embedding + @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) def embed_many( self, texts: List[str], @@ -48,6 +54,7 @@ def embed_many( ] return embeddings + @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) def embed( self, text: str, @@ -71,6 +78,7 @@ def embed( result = self._model_client.create(input=[text], engine=self._model) return self._process_embedding(result["data"][0]["embedding"], as_buffer) + @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) async def aembed_many( self, texts: List[str], @@ -100,6 +108,7 @@ async def aembed_many( ] return embeddings + @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) async def aembed( self, text: str, diff --git a/requirements.txt b/requirements.txt index c74fdbd7..722f3596 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy redis>=4.3.4 pyyaml coloredlogs -pydantic>=2.0.0 \ No newline at end of file +pydantic>=2.0.0 +tenacity==8.2.2 \ No newline at end of file From f1bb1b8122144608a8556e248022869e6de92d74 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 3 Aug 2023 22:38:01 -0400 Subject: [PATCH 4/6] update base vectorizer class --- redisvl/vectorize/base.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/redisvl/vectorize/base.py b/redisvl/vectorize/base.py index 973d33e8..bee93411 100644 --- a/redisvl/vectorize/base.py +++ b/redisvl/vectorize/base.py @@ -21,27 +21,35 @@ def set_model(self, model: str, dims: Optional[int] = None) -> None: def embed_many( self, - inputs: List[str], + texts: List[str], preprocess: Optional[Callable] = None, - chunk_size: int = 1000, + batch_size: Optional[int] = 1000, + as_buffer: Optional[bool] = False ) -> List[List[float]]: raise NotImplementedError def embed( - self, emb_input: str, preprocess: Optional[Callable] = None + self, + text: str, + preprocess: Optional[Callable] = None, + as_buffer: Optional[bool] = False ) -> List[float]: raise NotImplementedError async def aembed_many( self, - inputs: List[str], + texts: List[str], preprocess: Optional[Callable] = None, - chunk_size: int = 1000, + batch_size: Optional[int] = 1000, + as_buffer: Optional[bool] = False ) -> List[List[float]]: raise NotImplementedError async def aembed( - 
From f1bb1b8122144608a8556e248022869e6de92d74 Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Thu, 3 Aug 2023 22:38:01 -0400
Subject: [PATCH 4/6] update base vectorizer class

---
 redisvl/vectorize/base.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/redisvl/vectorize/base.py b/redisvl/vectorize/base.py
index 973d33e8..bee93411 100644
--- a/redisvl/vectorize/base.py
+++ b/redisvl/vectorize/base.py
@@ -21,27 +21,35 @@ def set_model(self, model: str, dims: Optional[int] = None) -> None:
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False
     ) -> List[List[float]]:
         raise NotImplementedError
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False
     ) -> List[float]:
         raise NotImplementedError
 
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False
     ) -> List[List[float]]:
         raise NotImplementedError
 
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False
     ) -> List[float]:
         raise NotImplementedError

From 71081e71f4f72eece6c5be9ec70f906845d15a5d Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Thu, 3 Aug 2023 22:42:35 -0400
Subject: [PATCH 5/6] update examples

---
 docs/examples/openai_qna.ipynb       | 4 +---
 docs/user_guide/vectorizers_03.ipynb | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/examples/openai_qna.ipynb b/docs/examples/openai_qna.ipynb
index d0bafd7b..d3c1bf91 100644
--- a/docs/examples/openai_qna.ipynb
+++ b/docs/examples/openai_qna.ipynb
@@ -710,13 +710,11 @@
    "source": [
     "import os\n",
     "from redisvl.vectorize.text import OpenAITextVectorizer\n",
-    "from redisvl.utils.utils import array_to_buffer\n",
     "\n",
     "api_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n",
     "oaip = OpenAITextVectorizer(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n",
     "\n",
-    "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist())\n",
-    "chunked_data[\"embedding\"] = chunked_data[\"embedding\"].apply(lambda x: array_to_buffer(x))\n",
+    "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist(), as_buffer=True)\n",
     "chunked_data"
    ]
   },
diff --git a/docs/user_guide/vectorizers_03.ipynb b/docs/user_guide/vectorizers_03.ipynb
index bbf61886..d25cf8bc 100644
--- a/docs/user_guide/vectorizers_03.ipynb
+++ b/docs/user_guide/vectorizers_03.ipynb
@@ -105,7 +105,7 @@
     " \"Today is a sunny day\"\n",
     "]\n",
     "\n",
-    "embeddings = hf.embed_many(sentences)\n"
+    "embeddings = hf.embed_many(sentences, as_buffer=True)\n"
    ]
   },
   {
@@ -183,7 +183,7 @@
     "# the vector is stored as a bytes buffer\n",
     "\n",
     "data = [{\"text\": t,\n",
-    "         \"embedding\": array_to_buffer(v)}\n",
+    "         \"embedding\": v}\n",
     "        for t, v in zip(sentences, embeddings)]\n",
     "\n",
     "index.load(data)"
    ]
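With the base class reshaped in patch 4, a new provider only has to fill in these four methods. A hypothetical subclass might look like the sketch below; ExampleVectorizer and its dummy model are illustrative, not part of the library:

    from typing import Callable, List, Optional

    from redisvl.vectorize.base import BaseVectorizer

    class ExampleVectorizer(BaseVectorizer):
        def embed(
            self,
            text: str,
            preprocess: Optional[Callable] = None,
            as_buffer: Optional[bool] = False,
        ) -> List[float]:
            if preprocess:
                text = preprocess(text)
            # stand-in for a real client call; returns a list of floats
            # sized to the dims declared in __init__
            embedding = [float(len(text))] * self._dims
            # shared helper converts to a byte string when as_buffer=True
            return self._process_embedding(embedding, as_buffer)

    vectorizer = ExampleVectorizer("example-model", dims=4)
    buffer = vectorizer.embed("hello", as_buffer=True)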
From fcd9c959e297cfbe58b45e43824a91b133d4bab0 Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Fri, 4 Aug 2023 13:29:46 -0400
Subject: [PATCH 6/6] styling and linting

---
 redisvl/vectorize/base.py             | 10 ++++++----
 redisvl/vectorize/text/huggingface.py | 13 ++++++++-----
 redisvl/vectorize/text/openai.py      | 19 +++++++++++--------
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/redisvl/vectorize/base.py b/redisvl/vectorize/base.py
index bee93411..592f7938 100644
--- a/redisvl/vectorize/base.py
+++ b/redisvl/vectorize/base.py
@@ -1,6 +1,8 @@
 from typing import Callable, Dict, List, Optional
+
 from redisvl.utils.utils import array_to_buffer
 
+
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
         self._dims = dims
@@ -24,7 +26,7 @@ def embed_many(
         texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: Optional[int] = 1000,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
@@ -32,7 +34,7 @@ def embed(
         self,
         text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
 
@@ -41,7 +43,7 @@ async def aembed_many(
         texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: Optional[int] = 1000,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
@@ -49,7 +51,7 @@ async def aembed(
         self,
         text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
diff --git a/redisvl/vectorize/text/huggingface.py b/redisvl/vectorize/text/huggingface.py
index f0fc47ec..b45349fa 100644
--- a/redisvl/vectorize/text/huggingface.py
+++ b/redisvl/vectorize/text/huggingface.py
@@ -22,7 +22,7 @@ def embed(
         self,
         text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[float] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         """Embed a chunk of text using the Hugging Face sentence transformer.
@@ -46,7 +46,7 @@ def embed_many(
         texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: int = 1000,
-        as_buffer: Optional[float] = None
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         """Embed many chunks of texts using the Hugging Face sentence
@@ -66,7 +66,10 @@ def embed_many(
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
-            embeddings.extend([
-                self._process_embedding(embedding.tolist(), as_buffer) for embedding in batch_embeddings
-            ])
+            embeddings.extend(
+                [
+                    self._process_embedding(embedding.tolist(), as_buffer)
+                    for embedding in batch_embeddings
+                ]
+            )
         return embeddings
diff --git a/redisvl/vectorize/text/openai.py b/redisvl/vectorize/text/openai.py
index 8c1ac6d9..f8a63b58 100644
--- a/redisvl/vectorize/text/openai.py
+++ b/redisvl/vectorize/text/openai.py
@@ -1,9 +1,10 @@
 from typing import Callable, Dict, List, Optional
-from tenacity import (
+
+from tenacity import (  # for exponential backoff
     retry,
     stop_after_attempt,
     wait_random_exponential,
-)  # for exponential backoff
+)
 
 from redisvl.vectorize.base import BaseVectorizer
@@ -30,7 +31,7 @@ def embed_many(
         texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: Optional[int] = 10,
-        as_buffer: Optional[float] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         """Embed many chunks of texts using the OpenAI API.
@@ -50,7 +51,8 @@ def embed_many(
         for batch in self.batchify(texts, batch_size, preprocess):
             response = self._model_client.create(input=batch, engine=self._model)
             embeddings += [
-                self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
             ]
         return embeddings
@@ -59,7 +61,7 @@ def embed(
         self,
         text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[float] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         """Embed a chunk of text using the OpenAI API.
@@ -84,7 +86,7 @@ async def aembed_many(
         texts: List[str],
         preprocess: Optional[Callable] = None,
         batch_size: int = 1000,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         """Asynchronously embed many chunks of texts using the OpenAI API.
@@ -104,7 +106,8 @@ async def aembed_many(
         for batch in self.batchify(texts, batch_size, preprocess):
             response = await self._model_client.acreate(input=batch, engine=self._model)
             embeddings += [
-                self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
             ]
         return embeddings
@@ -113,7 +116,7 @@ async def aembed(
         self,
         text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         """Asynchronously embed a chunk of text using the OpenAI API.