From 7510b12b1596d94b85730bd979adb568d7dc36c6 Mon Sep 17 00:00:00 2001
From: usamajamil43
Date: Tue, 9 Apr 2024 06:21:55 +0500
Subject: [PATCH 01/14] feature(dspy): Add MyScale in Retrieve

---
 README.md                  |   5 +-
 dspy/retrieve/MyScaleRM.py | 222 +++++++++++++++++++++++++++++++++++++
 setup.py                   |   4 +-
 3 files changed, 226 insertions(+), 5 deletions(-)
 create mode 100644 dspy/retrieve/MyScaleRM.py

diff --git a/README.md b/README.md
index f870312e8c..a44d75dc63 100644
--- a/README.md
+++ b/README.md
@@ -66,10 +66,10 @@ Or open our intro notebook in Google Colab: [<img align="center" src="https://colab.resear
 
 ```
-pip install dspy-ai[chromadb]  # or [marqo] or [milvus] or [mongodb] or [pinecone] or [qdrant] or [weaviate]
-```
+pip install dspy-ai[chromadb]  # or [marqo] or [milvus] or [mongodb] or [myscale] or [pinecone] or [qdrant] or [weaviate]
diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py
new file mode 100644
index 0000000000..356e827dbe
--- /dev/null
+++ b/dspy/retrieve/MyScaleRM.py
@@ -0,0 +1,222 @@
+from typing import List, Optional
+import os
+
+import dspy
+from dsp.utils import dotdict
+import openai
+
+# Check for necessary libraries and suggest installation if not found.
+try:
+    import clickhouse_connect
+except ImportError:
+    raise ImportError(
+        "The clickhouse-connect library is required to use MyScaleRM. Install it with `pip install dspy-ai[myscale]`.",
+    )
+
+# Verify the compatibility of the OpenAI library version installed.
+try:
+    major, minor, _ = map(int, openai.__version__.split('.'))
+    OPENAI_VERSION_COMPATIBLE = major >= 1 and minor >= 16
+except Exception:
+    OPENAI_VERSION_COMPATIBLE = False
+
+if not OPENAI_VERSION_COMPATIBLE:
+    raise ImportError(
+        "An incompatible OpenAI library version is installed. Ensure you have version 1.16.1 or later.",
+    )
+
+# Attempt to handle specific OpenAI errors; fallback to general ones if necessary.
+try:
+    import openai.error
+    ERRORS = (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError)
+except Exception:
+    ERRORS = (openai.RateLimitError, openai.APIError)
+
+
+class MyScaleRM(dspy.Retrieve):
+    """
+    A retrieval module that uses MyScaleDB to return the top passages for a given query.
+
+    MyScaleDB is a fork of ClickHouse that focuses on vector similarity search and full
+    text search. MyScaleRM is designed to facilitate easy retrieval of information from
+    MyScaleDB using embeddings. It supports embedding generation through either a local
+    model or the OpenAI API. This class abstracts away the complexities of connecting to
+    MyScaleDB, managing API keys, and processing queries to return semantically
+    relevant results.
+
+    Assumes that a table named `database.table` exists in MyScaleDB, that the
+    table has a column named `vector_column` that stores vector data, and that a vector
+    index has been created on this column. Other metadata are stored in `metadata_columns`.
+
+    Args:
+        client (clickhouse_connect.driver.client.Client): A client connection to the MyScaleDB.
+        table (str): Name of the table within the database to perform queries against.
+        database (str, optional): Name of the database to query within MyScaleDB.
+        metadata_columns (List[str], optional): A list of columns to include in the results.
+        vector_column (str, optional): The name of the column in the table that stores vector data.
+        k (int, optional): The number of closest matches to retrieve for a given query.
+        openai_api_key (str, optional): The API key for accessing OpenAI's services.
+        model (str, optional): Specifies the particular OpenAI model to use for embedding generation.
+        use_local_model (bool): Flag indicating whether a local model is used for embeddings.
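+
+    Note:
+        `forward` retrieves passages by issuing a query of the form
+        `SELECT <metadata_columns>, distance(<vector_column>, <embedding>) AS dist
+        FROM <database>.<table> ORDER BY dist LIMIT <k>`, so a vector index
+        supporting the `distance` function is expected on `vector_column`.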
+ + Examples: + Below is a code snippet that shows how to use MyScaleDB as the default retriever: + ```python + TODO + ``` + + Below is a code snippet that shows how to use MyScaleDB in the forward() function of a module + ```python + TODO + ``` + """ + + def __init__(self, + client: clickhouse_connect.driver.client.Client, + table: str, + database: str = "default", + metadata_columns: List[str] = ["text"], + vector_column: str = "vector", + k: int = 3, + openai_api_key: Optional[str] = None, + openai_model: Optional[str] = None, + local_embed_model: Optional[str] = None): + self.client = client + self.database = database + self.table = table + if not metadata_columns: + raise ValueError("metadata_columns is required") + self.metadata_columns = metadata_columns + self.vector_column = vector_column + self.k = k + self.openai_api_key = openai_api_key + self.model = openai_model + self.use_local_model = False + + if local_embed_model: + self.setup_local_model(local_embed_model) + elif openai_api_key: + os.environ['OPENAI_API_KEY'] = self.openai_api_key + + def setup_local_model(self, model_name: str): + """ + Configures a local model for embedding generation, including model and tokenizer loading. + + Args: + model_name: The name or path to the pre-trained model to load. + + Raises: + ModuleNotFoundError: If necessary libraries (torch or transformers) are not installed. + """ + try: + import torch + from transformers import AutoModel, AutoTokenizer + except ImportError as exc: + raise ModuleNotFoundError( + "You need to install PyTorch and Hugging Face's transformers library to use a local embedding model.", + ) from exc + + self._local_embed_model = AutoModel.from_pretrained(model_name) + self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) + self.use_local_model = True + + if torch.cuda.is_available(): + self.device = torch.device('cuda:0') + elif torch.backends.mps.is_available(): + self.device = torch.device('mps') + else: + self.device = torch.device('cpu') + + self._local_embed_model.to(self.device) + + def get_embeddings(self, queries: List[str]) -> List[List[float]]: + """ + Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration, + and retrieves embeddings for the provided queries. + + Args: + queries: A list of text queries to generate embeddings for. + + Returns: + A list of embeddings, each corresponding to a query in the input list. + + Raises: + ValueError: If neither an OpenAI API key nor a local model has been configured. + """ + if self.openai_api_key and self.model: + return self._get_embeddings_from_openai(queries) + elif self.use_local_model: + return self._get_embedding_from_local_model(queries) + else: + raise ValueError("No valid method for obtaining embeddings is configured.") + + def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: + """ + Uses the OpenAI API to generate embeddings for a list of queries. + + Args: + queries: A list of strings for which to generate embeddings. + + Returns: + A list of lists, where each inner list contains the embedding of a query. + """ + response = openai.embeddings.create( + model=self.model, + input=queries) + return response.data[0].embedding + + def _get_embedding_from_local_model(self, query: str) -> List[float]: + """ + Generates embeddings for a single query using the configured local model. + + Args: + query: The text query to generate an embedding for. + + Returns: + A list of floats representing the query's embedding. 
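+
+        Note:
+            The embedding is obtained by mean-pooling the model's final hidden
+            states over the token dimension, as implemented below.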
+ """ + import torch + self._local_embed_model.eval() # Ensure the model is in evaluation mode + + inputs = self._local_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(self.device) + with torch.no_grad(): + output = self._local_embed_model(**inputs) + embedding = output.last_hidden_state.mean(dim=1).cpu().numpy().tolist()[0] + + return embedding + + def forward(self, user_query: str, k: Optional[int] = None) -> dspy.Prediction: + """ + Executes a retrieval operation based on a user's query and returns the top k relevant results. + + Args: + user_query: The query text to search for. + k: Optional; The number of top matches to return. Defaults to the class's configured k value. + + Returns: + A dspy.Prediction object containing the formatted retrieval results. + + Raises: + ValueError: If the user_query is None. + """ + if user_query is None: + raise ValueError("Query is required") + k = k if k is not None else self.k + embeddings = self.get_embeddings([user_query]) + columns_string = ', '.join(self.metadata_columns) + result = self.client.query(f""" + SELECT {columns_string}, + distance({self.vector_column}, {embeddings}) as dist FROM {self.database}.{self.table} ORDER BY dist LIMIT {k} + """) + + # We convert the metadata into strings to pass to dspy.Prediction + results = [] + for row in result.named_results(): + if len(self.metadata_columns) == 1: + results.append(row[self.metadata_columns[0]]) + else: + row_strings = [f"{column}: {row[column]}" for column in self.metadata_columns] # Format row data + row_string = "\n".join(row_strings) # Combine formatted data + results.append(row_string) # Append to results + + return dspy.Prediction(passages=[dotdict({"long_text": passage}) for passage in results]) # Return results as Prediction diff --git a/setup.py b/setup.py index 49c364f5ac..ff311d670e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="dspy-ai", - version="2.4.3", + version="2.4.1", description="DSPy", long_description=long_description, long_description_content_type='text/markdown', @@ -29,7 +29,7 @@ "pinecone": ["pinecone-client~=2.2.4"], "weaviate": ["weaviate-client~=3.26.1"], "faiss-cpu": ["sentence_transformers", "faiss-cpu"], - "google-vertex-ai": ["google-cloud-aiplatform==1.43.0"], + "myscale":["clickhouse-connect"] }, classifiers=[ "Development Status :: 3 - Alpha", From e150392918bf32d4bc66ea6987502cf7c2f14fd8 Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Tue, 9 Apr 2024 06:38:02 +0500 Subject: [PATCH 02/14] feature(dspy): Add MyScale in Retrieve --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ff311d670e..923508b3dd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="dspy-ai", - version="2.4.1", + version="2.4.3", description="DSPy", long_description=long_description, long_description_content_type='text/markdown', @@ -29,7 +29,8 @@ "pinecone": ["pinecone-client~=2.2.4"], "weaviate": ["weaviate-client~=3.26.1"], "faiss-cpu": ["sentence_transformers", "faiss-cpu"], - "myscale":["clickhouse-connect"] + "google-vertex-ai": ["google-cloud-aiplatform==1.43.0"], + "myscale":["clickhouse-connect"], }, classifiers=[ "Development Status :: 3 - Alpha", From 47b45d267f69ad1344a83ffdae48d1a6f2db3f98 Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Wed, 10 Apr 2024 04:45:34 +0500 Subject: [PATCH 03/14] feature(dspy): Add MyScale in Retrieve --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a44d75dc63..e9e258756b 100644 
--- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ The DSPy documentation is divided into **tutorials** (step-by-step illustration - Hands-on Overviews of DSPy by the community: [DSPy Explained! by Connor Shorten](https://www.youtube.com/watch?v=41EfOY0Ldkc), [DSPy explained by code_your_own_ai](https://www.youtube.com/watch?v=ycfnKPxBMck) - Interviews: [Weaviate Podcast in-person](https://www.youtube.com/watch?v=CDung1LnLbY), and you can find 6-7 other remote podcasts on YouTube from a few different perspectives/audiences. - **Tracing in DSPy** with Arize Phoenix: [Tutorial for tracing your prompts and the steps of your DSPy programs](https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/tracing/dspy_tracing_tutorial.ipynb) +- [DSPy: Not Your Average Prompt Engineering](https://jina.ai/news/dspy-not-your-average-prompt-engineering), why it's crucial for future prompt engineering, and yet why it is challenging for prompt engineers to learn. ### B) Guides From b466b3c218fb32c049368294ff41897e5b60ff26 Mon Sep 17 00:00:00 2001 From: arnavsinghvi11 <54859892+arnavsinghvi11@users.noreply.github.com> Date: Fri, 12 Apr 2024 18:13:57 -0700 Subject: [PATCH 04/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cdd16e9ad3..6bc5131248 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ For the optional (alphabetically sorted) [Chromadb](https://github.com/chroma-co ``` pip install dspy-ai[chromadb] # or [marqo] or [milvus] or [mongodb] or [myscale] or [pinecone] or [qdrant] or [weaviate] - +``` ## 2) Documentation From 1e09778ac9e1abd243f1755bd195f3a30f0fb302 Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Sat, 20 Apr 2024 12:57:00 +0500 Subject: [PATCH 05/14] Add documentation and cache. --- docs/api/MyScaleRM.md | 79 ++++++++++++++++++++++++++++++++++++++ dspy/retrieve/MyScaleRM.py | 37 ++++++++++-------- 2 files changed, 99 insertions(+), 17 deletions(-) create mode 100644 docs/api/MyScaleRM.md diff --git a/docs/api/MyScaleRM.md b/docs/api/MyScaleRM.md new file mode 100644 index 0000000000..717e991a66 --- /dev/null +++ b/docs/api/MyScaleRM.md @@ -0,0 +1,79 @@ +--- +sidebar_position: 8 +--- + +# retrieve.MyScaleRM +## Constructor + +Initializes an instance of the `MyScaleRM` class, which is designed to use MyScaleDB (a ClickHouse fork optimized for vector similarity and full-text search) to retrieve documents based on query embeddings. This class supports embedding generation using either local models or OpenAI's API and manages database interactions efficiently. + +### Syntax +```python +MyScaleRM( + client: clickhouse_connect.driver.client.Client, + table: str, + database: str = 'default', + metadata_columns: List[str] = ['text'], + vector_column: str = 'vector', + k: int = 3, + openai_api_key: Optional[str] = None, + openai_model: Optional[str] = None, + local_embed_model: Optional[str] = None +) +``` +## Parameters for `MyScaleRM` Constructor +- `client` (_clickhouse_connect.driver.client.Client_): A client connection to the MyScaleDB database, used to execute queries and manage interactions with the database. +- `table` (_str_): Specifies the table within MyScaleDB from which data will be retrieved. This table should be equipped with a vector column for conducting similarity searches. +- `database` (_str_, optional): The name of the database where the table is located, defaulting to `"default"`. 
- `metadata_columns` (_List[str], optional_): Columns to include as metadata in the output, defaulting to `["text"]`.
- `vector_column` (_str, optional_): The column that contains vector data, used for similarity searches, defaulting to `"vector"`.
- `k` (_int, optional_): The number of closest matches to return for each query, defaulting to 3.
- `openai_api_key` (_str, optional_): API key for accessing OpenAI services, necessary if using OpenAI for embedding generation.
- `openai_model` (_str, optional_): The specific OpenAI model to use for embeddings, required if an OpenAI API key is provided.
- `local_embed_model` (_str, optional_): Specifies a local model for embedding generation, chosen if local computation is preferred.

## Methods
### `forward`
Executes a retrieval operation based on a user's query and returns the top `k` relevant results using the embeddings generated by the specified method.

### Syntax
```python
def forward(self, user_query: str, k: Optional[int] = None) -> dspy.Prediction
```

## Parameters
- `user_query` (_str_): The query or list of queries for which to retrieve matching passages.
- `k` (_Optional[int], optional_): The number of top matches to retrieve. If not provided, it defaults to the `k` value set during class initialization.

## Returns
- `dspy.Prediction`: Contains the retrieved passages, formatted as a list of `dotdict` objects. Each entry includes:
    - **long_text (str)**: The text content of the retrieved passage.

## Description

The `forward` method leverages MyScaleDB's vector search capabilities to find the top `k` passages that best match the provided query. This method is integral for utilizing the MyScaleRM class to access and retrieve data efficiently based on semantic similarity, facilitated by the chosen embedding generation technique (either via a local model or the OpenAI API).

## Quickstart

This section provides practical examples of how to instantiate and use the `MyScaleRM` class to retrieve data from MyScaleDB efficiently using text embeddings.

```python
from dspy.retrieve.MyScaleRM import MyScaleRM

MyScale_model = MyScaleRM(client=client,
                          table="table_name",
                          openai_api_key="sk-***",
                          openai_model="embeddings_model",
                          vector_column="vector_column_name",
                          metadata_columns=["add_your_columns_here"],
                          k=6)

results = MyScale_model("Please suggest me some funny movies")

passages = results.passages

# Loop through each passage and print the 'long_text'
for passage in passages:
    print(passage['long_text'], "\n")

```
\ No newline at end of file
diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py
index 356e827dbe..d22281b976 100644
--- a/dspy/retrieve/MyScaleRM.py
+++ b/dspy/retrieve/MyScaleRM.py
@@ -1,9 +1,11 @@
-from typing import List, Optional
 import os
+from typing import List, Optional
+
+import openai
 
 import dspy
+from dsp.modules.cache_utils import CacheMemory, cache_turn_on
 from dsp.utils import dotdict
-import openai
 
 # Check for necessary libraries and suggest installation if not found.
@@ -59,16 +61,6 @@ class MyScaleRM(dspy.Retrieve):
         model (str, optional): Specifies the particular OpenAI model to use for embedding generation.
         use_local_model (bool): Flag indicating whether a local model is used for embeddings.
- Examples: - Below is a code snippet that shows how to use MyScaleDB as the default retriever: - ```python - TODO - ``` - - Below is a code snippet that shows how to use MyScaleDB in the forward() function of a module - ```python - TODO - ``` """ def __init__(self, @@ -113,12 +105,16 @@ def setup_local_model(self, model_name: str): from transformers import AutoModel, AutoTokenizer except ImportError as exc: raise ModuleNotFoundError( - "You need to install PyTorch and Hugging Face's transformers library to use a local embedding model.", + """You need to install PyTorch and Hugging Face's transformers library to use a local embedding model. + Install the pytorch using `pip install torch` and transformers using `pip install transformers` """, ) from exc - self._local_embed_model = AutoModel.from_pretrained(model_name) - self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) - self.use_local_model = True + try: + self._local_embed_model = AutoModel.from_pretrained(model_name) + self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) + self.use_local_model = True + except Exception as e: + raise ValueError(f"Failed to load model or tokenizer. Error: {str(e)}") if torch.cuda.is_available(): self.device = torch.device('cuda:0') @@ -128,8 +124,14 @@ def setup_local_model(self, model_name: str): self.device = torch.device('cpu') self._local_embed_model.to(self.device) - + def get_embeddings(self, queries: List[str]) -> List[List[float]]: + if cache_turn_on: + return CacheMemory.cache(self._get_embeddings)(queries) + else: + return self._get_embeddings(queries) + + def _get_embeddings(self, queries: List[str]) -> List[List[float]]: """ Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration, and retrieves embeddings for the provided queries. @@ -160,6 +162,7 @@ def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: Returns: A list of lists, where each inner list contains the embedding of a query. """ + response = openai.embeddings.create( model=self.model, input=queries) From c9765528b14d4ac27c3aa496d10ea8c7090521da Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Tue, 14 May 2024 08:28:32 +0500 Subject: [PATCH 06/14] Added Cache to the embedding methods --- dspy/retrieve/MyScaleRM.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py index d22281b976..79e71bd146 100644 --- a/dspy/retrieve/MyScaleRM.py +++ b/dspy/retrieve/MyScaleRM.py @@ -4,7 +4,8 @@ import openai import dspy -from dsp.modules.cache_utils import CacheMemory, cache_turn_on +import functools +from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on from dsp.utils import dotdict # Check for necessary libraries and suggest installation if not found. 
@@ -124,14 +125,10 @@ def setup_local_model(self, model_name: str):
             self.device = torch.device('cpu')
 
         self._local_embed_model.to(self.device)
-
-    def get_embeddings(self, queries: List[str]) -> List[List[float]]:
-        if cache_turn_on:
-            return CacheMemory.cache(self._get_embeddings)(queries)
-        else:
-            return self._get_embeddings(queries)
 
-    def _get_embeddings(self, queries: List[str]) -> List[List[float]]:
+    @functools.lru_cache(maxsize=None if cache_turn_on else 0)
+    @NotebookCacheMemory.cache
+    def get_embeddings(self, queries: List[str]) -> List[List[float]]:
         """
         Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration,
         and retrieves embeddings for the provided queries.
@@ -151,7 +148,9 @@ def _get_embeddings(self, queries: List[str]) -> List[List[float]]:
             return self._get_embedding_from_local_model(queries)
         else:
             raise ValueError("No valid method for obtaining embeddings is configured.")
-
+
+    #TO DO Add this method as Util method outside MyScaleRM
+    @CacheMemory.cache
     def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]:
         """
         Uses the OpenAI API to generate embeddings for a list of queries.
@@ -167,7 +166,9 @@ def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]:
         model=self.model,
         input=queries)
         return response.data[0].embedding
-
+
+    #TO DO Add this method as Util method outside MyScaleRM
+    @CacheMemory.cache
     def _get_embedding_from_local_model(self, query: str) -> List[float]:
         """
         Generates embeddings for a single query using the configured local model.

From 18b9e6b968bb814f124e0f9d5d2fe23a249879df Mon Sep 17 00:00:00 2001
From: usamajamil43
Date: Wed, 22 May 2024 18:24:23 +0500
Subject: [PATCH 07/14] The last commit

---
 dspy/retrieve/MyScaleRM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py
index 79e71bd146..cca8b087e1 100644
--- a/dspy/retrieve/MyScaleRM.py
+++ b/dspy/retrieve/MyScaleRM.py
@@ -1,10 +1,10 @@
+import functools
 import os
 from typing import List, Optional
 
 import openai
 
 import dspy
-import functools
 from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
 from dsp.utils import dotdict

From bbadff7b3c2c023b70c3eec33a7b964b7dc30d24 Mon Sep 17 00:00:00 2001
From: usamajamil43
Date: Tue, 9 Apr 2024 06:21:55 +0500
Subject: [PATCH 08/14] feature(dspy): Add MyScale in Retrieve

---
 README.md                  |   4 +-
 dspy/retrieve/MyScaleRM.py | 222 +++++++++++++++++++++++++++++++++++++
 setup.py                   |   1 +
 3 files changed, 225 insertions(+), 2 deletions(-)
 create mode 100644 dspy/retrieve/MyScaleRM.py

diff --git a/README.md b/README.md
index f28ad83b7a..067e533288 100644
--- a/README.md
+++ b/README.md
@@ -72,11 +72,11 @@ Or open our intro notebook in Google Colab: [<img align="center" src="https://colab.resear
 
 ```
-pip install dspy-ai[chromadb]  # or [qdrant] or [marqo] or [mongodb] or [pinecone] or [snowflake] or [weaviate] or [milvus]
+pip install dspy-ai[chromadb]  # or [qdrant] or [marqo] or [myscale] or [mongodb] or [pinecone] or [snowflake] or [weaviate] or [milvus]
diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py
new file mode 100644
index 0000000000..356e827dbe
--- /dev/null
+++ b/dspy/retrieve/MyScaleRM.py
@@ -0,0 +1,222 @@
+from typing import List, Optional
+import os
+
+import dspy
+from dsp.utils import dotdict
+import openai
+
+# Check for necessary libraries and suggest installation if not found.
+try:
+    import clickhouse_connect
+except ImportError:
+    raise ImportError(
+        "The clickhouse-connect library is required to use MyScaleRM. Install it with `pip install dspy-ai[myscale]`.",
+    )
+
+# Verify the compatibility of the OpenAI library version installed.
+try:
+    major, minor, _ = map(int, openai.__version__.split('.'))
+    OPENAI_VERSION_COMPATIBLE = major >= 1 and minor >= 16
+except Exception:
+    OPENAI_VERSION_COMPATIBLE = False
+
+if not OPENAI_VERSION_COMPATIBLE:
+    raise ImportError(
+        "An incompatible OpenAI library version is installed. Ensure you have version 1.16.1 or later.",
+    )
+
+# Attempt to handle specific OpenAI errors; fallback to general ones if necessary.
+try:
+    import openai.error
+    ERRORS = (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError)
+except Exception:
+    ERRORS = (openai.RateLimitError, openai.APIError)
+
+
+class MyScaleRM(dspy.Retrieve):
+    """
+    A retrieval module that uses MyScaleDB to return the top passages for a given query.
+ + MyScaleDB is a fork of ClickHouse that focuses on vector similarity search and full + text search. MyScaleRM is designed to facilitate easy retrieval of information from + MyScaleDB using embeddings. It supports embedding generation through either a local + model or the OpenAI API. This class abstracts away the complexities of connecting to + MyScaleDB, managing API keys, and processing queries to return semantically + relevant results. + + Assumes that a table named `database.table` exists in MyScaleDB, and that the + table has column named `vector_column` that stores vector data and a vector index has + been created on this column. Other metadata are stored in `metadata_columns`. + + Args: + client (clickhouse_connect.driver.client.Client): A client connection to the MyScaleDB. + table (str): Name of the table within the database to perform queries against. + database (str, optional): Name of the database to query within MyScaleDB. + metadata_columns(List[str], optional): A list of columns to include in the results. + vector_column (str, optional): The name of the column in the table that stores vector data. + k (int, optional): The number of closest matches to retrieve for a given query. + openai_api_key (str, optional): The API key for accessing OpenAI's services. + model (str, optional): Specifies the particular OpenAI model to use for embedding generation. + use_local_model (bool): Flag indicating whether a local model is used for embeddings. + + Examples: + Below is a code snippet that shows how to use MyScaleDB as the default retriever: + ```python + TODO + ``` + + Below is a code snippet that shows how to use MyScaleDB in the forward() function of a module + ```python + TODO + ``` + """ + + def __init__(self, + client: clickhouse_connect.driver.client.Client, + table: str, + database: str = "default", + metadata_columns: List[str] = ["text"], + vector_column: str = "vector", + k: int = 3, + openai_api_key: Optional[str] = None, + openai_model: Optional[str] = None, + local_embed_model: Optional[str] = None): + self.client = client + self.database = database + self.table = table + if not metadata_columns: + raise ValueError("metadata_columns is required") + self.metadata_columns = metadata_columns + self.vector_column = vector_column + self.k = k + self.openai_api_key = openai_api_key + self.model = openai_model + self.use_local_model = False + + if local_embed_model: + self.setup_local_model(local_embed_model) + elif openai_api_key: + os.environ['OPENAI_API_KEY'] = self.openai_api_key + + def setup_local_model(self, model_name: str): + """ + Configures a local model for embedding generation, including model and tokenizer loading. + + Args: + model_name: The name or path to the pre-trained model to load. + + Raises: + ModuleNotFoundError: If necessary libraries (torch or transformers) are not installed. 
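+
+        Note:
+            Device placement prefers CUDA, then Apple MPS, then CPU, in that
+            order (see the checks below).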
+ """ + try: + import torch + from transformers import AutoModel, AutoTokenizer + except ImportError as exc: + raise ModuleNotFoundError( + "You need to install PyTorch and Hugging Face's transformers library to use a local embedding model.", + ) from exc + + self._local_embed_model = AutoModel.from_pretrained(model_name) + self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) + self.use_local_model = True + + if torch.cuda.is_available(): + self.device = torch.device('cuda:0') + elif torch.backends.mps.is_available(): + self.device = torch.device('mps') + else: + self.device = torch.device('cpu') + + self._local_embed_model.to(self.device) + + def get_embeddings(self, queries: List[str]) -> List[List[float]]: + """ + Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration, + and retrieves embeddings for the provided queries. + + Args: + queries: A list of text queries to generate embeddings for. + + Returns: + A list of embeddings, each corresponding to a query in the input list. + + Raises: + ValueError: If neither an OpenAI API key nor a local model has been configured. + """ + if self.openai_api_key and self.model: + return self._get_embeddings_from_openai(queries) + elif self.use_local_model: + return self._get_embedding_from_local_model(queries) + else: + raise ValueError("No valid method for obtaining embeddings is configured.") + + def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: + """ + Uses the OpenAI API to generate embeddings for a list of queries. + + Args: + queries: A list of strings for which to generate embeddings. + + Returns: + A list of lists, where each inner list contains the embedding of a query. + """ + response = openai.embeddings.create( + model=self.model, + input=queries) + return response.data[0].embedding + + def _get_embedding_from_local_model(self, query: str) -> List[float]: + """ + Generates embeddings for a single query using the configured local model. + + Args: + query: The text query to generate an embedding for. + + Returns: + A list of floats representing the query's embedding. + """ + import torch + self._local_embed_model.eval() # Ensure the model is in evaluation mode + + inputs = self._local_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(self.device) + with torch.no_grad(): + output = self._local_embed_model(**inputs) + embedding = output.last_hidden_state.mean(dim=1).cpu().numpy().tolist()[0] + + return embedding + + def forward(self, user_query: str, k: Optional[int] = None) -> dspy.Prediction: + """ + Executes a retrieval operation based on a user's query and returns the top k relevant results. + + Args: + user_query: The query text to search for. + k: Optional; The number of top matches to return. Defaults to the class's configured k value. + + Returns: + A dspy.Prediction object containing the formatted retrieval results. + + Raises: + ValueError: If the user_query is None. 
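+
+        Note:
+            Passages come back in ascending order of the `distance` score
+            computed by MyScaleDB, i.e. closest vectors first.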
+ """ + if user_query is None: + raise ValueError("Query is required") + k = k if k is not None else self.k + embeddings = self.get_embeddings([user_query]) + columns_string = ', '.join(self.metadata_columns) + result = self.client.query(f""" + SELECT {columns_string}, + distance({self.vector_column}, {embeddings}) as dist FROM {self.database}.{self.table} ORDER BY dist LIMIT {k} + """) + + # We convert the metadata into strings to pass to dspy.Prediction + results = [] + for row in result.named_results(): + if len(self.metadata_columns) == 1: + results.append(row[self.metadata_columns[0]]) + else: + row_strings = [f"{column}: {row[column]}" for column in self.metadata_columns] # Format row data + row_string = "\n".join(row_strings) # Combine formatted data + results.append(row_string) # Append to results + + return dspy.Prediction(passages=[dotdict({"long_text": passage}) for passage in results]) # Return results as Prediction diff --git a/setup.py b/setup.py index 7d1e973653..de166f6a0f 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ "google-vertex-ai": ["google-cloud-aiplatform==1.43.0"], "snowflake": ["snowflake-snowpark-python"], "fastembed": ["fastembed"], + "myscale":["clickhouse-connect"] }, classifiers=[ "Development Status :: 3 - Alpha", From b1d53e3f3d085a4ab1481c24e0c4908937b0fbd5 Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Tue, 9 Apr 2024 06:38:02 +0500 Subject: [PATCH 09/14] feature(dspy): Add MyScale in Retrieve --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index de166f6a0f..0e3ab985b6 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,8 @@ "google-vertex-ai": ["google-cloud-aiplatform==1.43.0"], "snowflake": ["snowflake-snowpark-python"], "fastembed": ["fastembed"], - "myscale":["clickhouse-connect"] + "google-vertex-ai": ["google-cloud-aiplatform==1.43.0"], + "myscale":["clickhouse-connect"], }, classifiers=[ "Development Status :: 3 - Alpha", From 53467661b0b27b88dcb208c7327d083ce1e5bc2a Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Wed, 10 Apr 2024 04:45:34 +0500 Subject: [PATCH 10/14] feature(dspy): Add MyScale in Retrieve --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 067e533288..710d9c3ad9 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ The DSPy documentation is divided into **tutorials** (step-by-step illustration - **Tracing in DSPy** with Arize Phoenix: [Tutorial for tracing your prompts and the steps of your DSPy programs](https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/tracing/dspy_tracing_tutorial.ipynb) - [DSPy: Not Your Average Prompt Engineering](https://jina.ai/news/dspy-not-your-average-prompt-engineering), why it's crucial for future prompt engineering, and yet why it is challenging for prompt engineers to learn. - **Tracing & Optimization Tracking in DSPy** with Parea AI: [Tutorial on tracing & evaluating a DSPy RAG program](https://docs.parea.ai/tutorials/dspy-rag-trace-evaluate/tutorial) +- [DSPy: Not Your Average Prompt Engineering](https://jina.ai/news/dspy-not-your-average-prompt-engineering), why it's crucial for future prompt engineering, and yet why it is challenging for prompt engineers to learn. 
### B) Guides

From e2f3db0e91a556729b5fa3cfac11dcecab681002 Mon Sep 17 00:00:00 2001
From: arnavsinghvi11 <54859892+arnavsinghvi11@users.noreply.github.com>
Date: Fri, 12 Apr 2024 18:13:57 -0700
Subject: [PATCH 11/14] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 710d9c3ad9..39a33d0057 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ or [Milvus](https://github.com/milvus-io/milvus) retrieval integration(s), inclu
 
 ```
 pip install dspy-ai[chromadb]  # or [qdrant] or [marqo] or [myscale] or [mongodb] or [pinecone] or [snowflake] or [weaviate] or [milvus]
-```
+``````
 
 ## 2) Documentation

From 13a61a32e5bc136f07124476205bb6d7350c8299 Mon Sep 17 00:00:00 2001
From: usamajamil43
Date: Sat, 20 Apr 2024 12:57:00 +0500
Subject: [PATCH 12/14] Add documentation and cache.

---
 docs/api/MyScaleRM.md      | 79 ++++++++++++++++++++++++++++++++++++++
 dspy/retrieve/MyScaleRM.py | 37 ++++++++++--------
 2 files changed, 99 insertions(+), 17 deletions(-)
 create mode 100644 docs/api/MyScaleRM.md

diff --git a/docs/api/MyScaleRM.md b/docs/api/MyScaleRM.md
new file mode 100644
index 0000000000..717e991a66
--- /dev/null
+++ b/docs/api/MyScaleRM.md
@@ -0,0 +1,79 @@
---
sidebar_position: 8
---

# retrieve.MyScaleRM
## Constructor

Initializes an instance of the `MyScaleRM` class, which is designed to use MyScaleDB (a ClickHouse fork optimized for vector similarity and full-text search) to retrieve documents based on query embeddings. This class supports embedding generation using either local models or OpenAI's API and manages database interactions efficiently.

### Syntax
```python
MyScaleRM(
    client: clickhouse_connect.driver.client.Client,
    table: str,
    database: str = 'default',
    metadata_columns: List[str] = ['text'],
    vector_column: str = 'vector',
    k: int = 3,
    openai_api_key: Optional[str] = None,
    openai_model: Optional[str] = None,
    local_embed_model: Optional[str] = None
)
```
## Parameters for `MyScaleRM` Constructor
- `client` (_clickhouse_connect.driver.client.Client_): A client connection to the MyScaleDB database, used to execute queries and manage interactions with the database.
- `table` (_str_): Specifies the table within MyScaleDB from which data will be retrieved. This table should be equipped with a vector column for conducting similarity searches.
- `database` (_str_, optional): The name of the database where the table is located, defaulting to `"default"`.
- `metadata_columns` (_List[str], optional_): Columns to include as metadata in the output, defaulting to `["text"]`.
- `vector_column` (_str, optional_): The column that contains vector data, used for similarity searches, defaulting to `"vector"`.
- `k` (_int, optional_): The number of closest matches to return for each query, defaulting to 3.
- `openai_api_key` (_str, optional_): API key for accessing OpenAI services, necessary if using OpenAI for embedding generation.
- `openai_model` (_str, optional_): The specific OpenAI model to use for embeddings, required if an OpenAI API key is provided.
- `local_embed_model` (_str, optional_): Specifies a local model for embedding generation, chosen if local computation is preferred.

## Methods
### `forward`
Executes a retrieval operation based on a user's query and returns the top `k` relevant results using the embeddings generated by the specified method.

### Syntax
```python
def forward(self, user_query: str, k: Optional[int] = None) -> dspy.Prediction
```

## Parameters
- `user_query` (_str_): The query or list of queries for which to retrieve matching passages.
- `k` (_Optional[int], optional_): The number of top matches to retrieve. If not provided, it defaults to the `k` value set during class initialization.

## Returns
- `dspy.Prediction`: Contains the retrieved passages, formatted as a list of `dotdict` objects. Each entry includes:
    - **long_text (str)**: The text content of the retrieved passage.

## Description

The `forward` method leverages MyScaleDB's vector search capabilities to find the top `k` passages that best match the provided query. This method is integral for utilizing the MyScaleRM class to access and retrieve data efficiently based on semantic similarity, facilitated by the chosen embedding generation technique (either via a local model or the OpenAI API).

## Quickstart

This section provides practical examples of how to instantiate and use the `MyScaleRM` class to retrieve data from MyScaleDB efficiently using text embeddings.

```python
from dspy.retrieve.MyScaleRM import MyScaleRM

MyScale_model = MyScaleRM(client=client,
                          table="table_name",
                          openai_api_key="sk-***",
                          openai_model="embeddings_model",
                          vector_column="vector_column_name",
                          metadata_columns=["add_your_columns_here"],
                          k=6)

results = MyScale_model("Please suggest me some funny movies")

passages = results.passages

# Loop through each passage and print the 'long_text'
for passage in passages:
    print(passage['long_text'], "\n")

```
\ No newline at end of file
diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py
index 356e827dbe..d22281b976 100644
--- a/dspy/retrieve/MyScaleRM.py
+++ b/dspy/retrieve/MyScaleRM.py
@@ -1,9 +1,11 @@
-from typing import List, Optional
 import os
+from typing import List, Optional
+
+import openai
 
 import dspy
+from dsp.modules.cache_utils import CacheMemory, cache_turn_on
 from dsp.utils import dotdict
-import openai
 
 # Check for necessary libraries and suggest installation if not found.
@@ -59,16 +61,6 @@ class MyScaleRM(dspy.Retrieve):
         model (str, optional): Specifies the particular OpenAI model to use for embedding generation.
         use_local_model (bool): Flag indicating whether a local model is used for embeddings.
-
-    Examples:
-        Below is a code snippet that shows how to use MyScaleDB as the default retriever:
-        ```python
-        TODO
-        ```
-
-        Below is a code snippet that shows how to use MyScaleDB in the forward() function of a module
-        ```python
-        TODO
-        ```
     """
 
     def __init__(self,
@@ -113,12 +105,16 @@ def setup_local_model(self, model_name: str):
         from transformers import AutoModel, AutoTokenizer
     except ImportError as exc:
         raise ModuleNotFoundError(
-            "You need to install PyTorch and Hugging Face's transformers library to use a local embedding model.",
+            """You need to install PyTorch and Hugging Face's transformers library to use a local embedding model.
+ Install the pytorch using `pip install torch` and transformers using `pip install transformers` """, ) from exc - self._local_embed_model = AutoModel.from_pretrained(model_name) - self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) - self.use_local_model = True + try: + self._local_embed_model = AutoModel.from_pretrained(model_name) + self._local_tokenizer = AutoTokenizer.from_pretrained(model_name) + self.use_local_model = True + except Exception as e: + raise ValueError(f"Failed to load model or tokenizer. Error: {str(e)}") if torch.cuda.is_available(): self.device = torch.device('cuda:0') @@ -128,8 +124,14 @@ def setup_local_model(self, model_name: str): self.device = torch.device('cpu') self._local_embed_model.to(self.device) - + def get_embeddings(self, queries: List[str]) -> List[List[float]]: + if cache_turn_on: + return CacheMemory.cache(self._get_embeddings)(queries) + else: + return self._get_embeddings(queries) + + def _get_embeddings(self, queries: List[str]) -> List[List[float]]: """ Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration, and retrieves embeddings for the provided queries. @@ -160,6 +162,7 @@ def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: Returns: A list of lists, where each inner list contains the embedding of a query. """ + response = openai.embeddings.create( model=self.model, input=queries) From 7e2c1b9e056b30ae602c02be89d16e35b7a42153 Mon Sep 17 00:00:00 2001 From: usamajamil43 Date: Tue, 14 May 2024 08:28:32 +0500 Subject: [PATCH 13/14] Added Cache to the embedding methods --- dspy/retrieve/MyScaleRM.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py index d22281b976..79e71bd146 100644 --- a/dspy/retrieve/MyScaleRM.py +++ b/dspy/retrieve/MyScaleRM.py @@ -4,7 +4,8 @@ import openai import dspy -from dsp.modules.cache_utils import CacheMemory, cache_turn_on +import functools +from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on from dsp.utils import dotdict # Check for necessary libraries and suggest installation if not found. @@ -124,14 +125,10 @@ def setup_local_model(self, model_name: str): self.device = torch.device('cpu') self._local_embed_model.to(self.device) - - def get_embeddings(self, queries: List[str]) -> List[List[float]]: - if cache_turn_on: - return CacheMemory.cache(self._get_embeddings)(queries) - else: - return self._get_embeddings(queries) - def _get_embeddings(self, queries: List[str]) -> List[List[float]]: + @functools.lru_cache(maxsize=None if cache_turn_on else 0) + @NotebookCacheMemory.cache + def get_embeddings(self, queries: List[str]) -> List[List[float]]: """ Determines the appropriate source (OpenAI or local model) for embedding generation based on class configuration, and retrieves embeddings for the provided queries. @@ -151,7 +148,9 @@ def _get_embeddings(self, queries: List[str]) -> List[List[float]]: return self._get_embedding_from_local_model(queries) else: raise ValueError("No valid method for obtaining embeddings is configured.") - + + #TO DO Add this method as Util method outside MyScaleRM + @CacheMemory.cache def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: """ Uses the OpenAI API to generate embeddings for a list of queries. 
@@ -167,7 +166,9 @@ def _get_embeddings_from_openai(self, queries: List[str]) -> List[List[float]]: model=self.model, input=queries) return response.data[0].embedding - + + #TO DO Add this method as Util method outside MyScaleRM + @CacheMemory.cache def _get_embedding_from_local_model(self, query: str) -> List[float]: """ Generates embeddings for a single query using the configured local model. From c60148ebeae4bf387250c767eaf18e53f48f20d3 Mon Sep 17 00:00:00 2001 From: Ahsan Saeed Date: Sun, 2 Jun 2024 21:18:14 +0500 Subject: [PATCH 14/14] run ruff --- dspy/retrieve/MyScaleRM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/retrieve/MyScaleRM.py b/dspy/retrieve/MyScaleRM.py index 79e71bd146..cca8b087e1 100644 --- a/dspy/retrieve/MyScaleRM.py +++ b/dspy/retrieve/MyScaleRM.py @@ -1,10 +1,10 @@ +import functools import os from typing import List, Optional import openai import dspy -import functools from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on from dsp.utils import dotdict
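
Taken together, the series leaves `MyScaleRM` importable from `dspy/retrieve/MyScaleRM.py`. The sketch below illustrates the end-to-end workflow implied by the Quickstart above; the host, credentials, table name, and column names are placeholders rather than values taken from the patches, and `text-embedding-3-small` is just one valid OpenAI embedding model.

```python
import clickhouse_connect
import dspy
from dspy.retrieve.MyScaleRM import MyScaleRM

# Connect to a running MyScaleDB instance (placeholder credentials).
client = clickhouse_connect.get_client(
    host="your-myscale-host",
    port=443,
    username="your-user",
    password="your-password",
)

# Embeddings via the OpenAI API; passing `local_embed_model` instead would
# select a local Hugging Face model.
retriever = MyScaleRM(
    client=client,
    table="documents",
    openai_api_key="sk-***",
    openai_model="text-embedding-3-small",
    vector_column="vector",
    metadata_columns=["text"],
    k=3,
)

# Call the module directly...
results = retriever("suggest some funny movies")
for passage in results.passages:
    print(passage.long_text)

# ...or register it as the default retriever for DSPy programs.
dspy.settings.configure(rm=retriever)
```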