From 71e90dc85b368f2623d91c7eddb023211dabda83 Mon Sep 17 00:00:00 2001 From: Ofer Mendelevitch Date: Tue, 14 May 2024 11:04:54 -0700 Subject: [PATCH 1/2] support multiple corpora in vectara retriever --- dspy/retrieve/vectara_rm.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/dspy/retrieve/vectara_rm.py b/dspy/retrieve/vectara_rm.py index 047c70d6c9..d0efa0ba16 100644 --- a/dspy/retrieve/vectara_rm.py +++ b/dspy/retrieve/vectara_rm.py @@ -18,7 +18,7 @@ class VectaraRM(dspy.Retrieve): """ A retrieval module that uses Vectara to return the top passages for a given query. - Assumes that a Vectara corpus has been created and populated with the following payload: + Assumes that Vectara corpora have been created and populated with the following payload: - document: The text of the passage Args: @@ -67,17 +67,27 @@ def __init__( def _vectara_query( self, query: str, - limit: int = 3, + limit: int = 5, ) -> List[str]: """Query Vectara index to get for top k matching passages. 
Args: query: query string """ - corpus_key = { - "customerId": self._vectara_customer_id, - "corpusId": self._vectara_corpus_id, - "lexicalInterpolationConfig": {"lambda": 0.025 }, - } + # If multiple corpus ids are provided (comma-separated), create a list of corpus keys + if ',' in self._vectara_corpus_id: + corpus_key = [ + { + "customerId": self._vectara_customer_id, + "corpusId": corpus_id, + "lexicalInterpolationConfig": {"lambda": 0.025 }, + } for corpus_id in self._vectara_corpus_id.split(',') + ] + else: + corpus_key = [{ + "customerId": self._vectara_customer_id, + "corpusId": self._vectara_corpus_id, + "lexicalInterpolationConfig": {"lambda": 0.025 }, + }] data = { "query": [ @@ -91,7 +101,7 @@ def _vectara_query( "startTag": START_SNIPPET, "endTag": END_SNIPPET, }, - "corpusKey": [corpus_key], + "corpusKey": corpus_key, }, ], } From c74b45e54936ae8804ab13a28a60c3b05c003438 Mon Sep 17 00:00:00 2001 From: Ofer Mendelevitch Date: Tue, 14 May 2024 14:25:51 -0700 Subject: [PATCH 2/2] simplification --- dspy/retrieve/vectara_rm.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/dspy/retrieve/vectara_rm.py b/dspy/retrieve/vectara_rm.py index d0efa0ba16..d1f642b375 100644 --- a/dspy/retrieve/vectara_rm.py +++ b/dspy/retrieve/vectara_rm.py @@ -74,20 +74,14 @@ def _vectara_query( query: query string """ # If multiple corpus ids are provided (comma-separated), create a list of corpus keys - if ',' in self._vectara_corpus_id: - corpus_key = [ - { - "customerId": self._vectara_customer_id, - "corpusId": corpus_id, - "lexicalInterpolationConfig": {"lambda": 0.025 }, - } for corpus_id in self._vectara_corpus_id.split(',') - ] - else: - corpus_key = [{ + # otherwise by default, the `split(',')` is a no-op so retains the single corpus id + corpus_key = [ + { "customerId": self._vectara_customer_id, - "corpusId": self._vectara_corpus_id, + "corpusId": corpus_id, "lexicalInterpolationConfig": {"lambda": 0.025 }, - }] + } for corpus_id 
in self._vectara_corpus_id.split(',') + ] data = { "query": [