From c4752a3ec955e12330ab6ca021f6e4c85167a195 Mon Sep 17 00:00:00 2001 From: xiayouran Date: Fri, 27 Sep 2024 18:08:47 +0800 Subject: [PATCH 1/3] support TEI --- dsp/modules/sentence_vectorizer.py | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/dsp/modules/sentence_vectorizer.py b/dsp/modules/sentence_vectorizer.py index 59c25982d6..84be684204 100644 --- a/dsp/modules/sentence_vectorizer.py +++ b/dsp/modules/sentence_vectorizer.py @@ -3,6 +3,8 @@ import numpy as np import openai +import math +import requests class BaseSentenceVectorizer(abc.ABC): @@ -306,3 +308,54 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray: embeddings = np.array(embedding_list, dtype=np.float32) return embeddings + + + class TEIVectorizer(BaseSentenceVectorizer): + """The TEIVectorizer class utilizes the TEI (Text Embeddings Inference) Embeddings API to + convert text into embeddings. + + For detailed information on the supported models, visit: https://github.com/huggingface/text-embeddings-inference. + + `model` is the name of the embedding model to use. + `embed_batch_size` is the maximum batch size for a single request. + `api_key` is the key used for request authorization (sent as a Bearer token). + `api_url` is the custom inference endpoint URL. + + To learn more about getting started with TEI, visit: https://github.com/huggingface/text-embeddings-inference.
+ """ + + def __init__( + self, + model: Optional[str] = "bge-base-en-v1.5", + embed_batch_size: int = 256, + api_key: Optional[str] = None, + api_url: str = "", + ): + self.model = model + self.embed_batch_size = embed_batch_size + self.api_key = api_key + self.api_url = api_url + + @property + def _headers(self) -> dict: + return {"Authorization": f"Bearer {self.api_key}"} + + def __call__(self, inp_examples: List["Example"]) -> np.ndarray: + text_to_vectorize = self._extract_text_from_examples(inp_examples) + embeddings_list = [] + + n = math.ceil(len(text_to_vectorize) / self.embed_batch_size) + for i in range(n): + response = requests.post( + self.api_url, + headers=self._headers, + json={ + "inputs": text_to_vectorize[i * self.embed_batch_size:(i + 1) * self.embed_batch_size], + "normalize": True, + "truncate": True + }, + ) + embeddings_list.extend(response.json()) + + embeddings = np.array(embeddings_list, dtype=np.float32) + return embeddings From f1583e3f337b2379b702fcc9ef01ba242480ce2b Mon Sep 17 00:00:00 2001 From: xiayouran Date: Mon, 14 Oct 2024 10:03:49 +0800 Subject: [PATCH 2/3] Fix: 'LM' object has no attribute 'copy' --- dspy/clients/lm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dspy/clients/lm.py b/dspy/clients/lm.py index 994c8bc418..6ef50cfdac 100644 --- a/dspy/clients/lm.py +++ b/dspy/clients/lm.py @@ -63,6 +63,11 @@ def __call__(self, prompt=None, messages=None, **kwargs): def inspect_history(self, n: int = 1): _inspect_history(self, n) + def copy(self, **kwargs): + """Returns a copy of the language model with the same parameters.""" + kwargs = {**self.kwargs, **kwargs} + return self.__class__(model=self.model, **kwargs) + @functools.lru_cache(maxsize=None) def cached_litellm_completion(request): From 0840f8a51ebd9c27aca2a9a4ae187a683576399e Mon Sep 17 00:00:00 2001 From: xiayouran Date: Tue, 15 Oct 2024 21:29:13 +0800 Subject: [PATCH 3/3] Fix: 'LM' object has no attribute 'copy' --- dspy/clients/lm.py | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/dspy/clients/lm.py b/dspy/clients/lm.py index 6ef50cfdac..43a8311ce8 100644 --- a/dspy/clients/lm.py +++ b/dspy/clients/lm.py @@ -65,8 +65,8 @@ def inspect_history(self, n: int = 1): def copy(self, **kwargs): """Returns a copy of the language model with the same parameters.""" - kwargs = {**self.kwargs, **kwargs} - return self.__class__(model=self.model, **kwargs) + kwargs = {**self.__dict__, **kwargs} + return self.__class__(**kwargs) @functools.lru_cache(maxsize=None)