In [1]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products"]
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
doc_contents = [doc.text for doc in documents]
doc_ids = list(range(1, len(doc_contents) + 1))

In [8]:
from qdrant_client import QdrantClient
client = QdrantClient(":memory:")

client.add(
    collection_name="DSpy_Qdrant",
    documents=doc_contents,
    ids=doc_ids,
)

100%|██████████| 77.7M/77.7M [00:06<00:00, 11.1MiB/s]


[1]

In [10]:
from dspy.retrieve.qdrant_rm import QdrantRM
import dspy


qdrant_retriever_model = QdrantRM("DSpy_Qdrant", client, k=10)


ollama_model = dspy.OllamaLocal(model="llama2",model_type='text',
                                max_tokens=350,
                                temperature=0.1,
                                top_p=0.8, frequency_penalty=1.17, top_k=40)


dspy.settings.configure(lm= ollama_model, rm=qdrant_retriever_model)

In [11]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""


    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField()

In [12]:
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""


    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    query = dspy.OutputField()

In [13]:
from dsp.utils import deduplicate


class SimplifiedBaleen(dspy.Module):
    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()


        self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops


    def forward(self, question):
        context = []


        for hop in range(self.max_hops):
            query = self.generate_query[hop](context=context, question=question).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)


        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)

In [14]:
my_question = "What is Data Mesh?"


# Get the prediction. This contains `pred.context` and `pred.answer`.
uncompiled_baleen = SimplifiedBaleen()  # uncompiled (i.e., zero-shot) program
pred = uncompiled_baleen(my_question)


# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What is Data Mesh?
Predicted Answer: Data Mesh is a term used in the context of software development and architecture. It refers to a design pattern or approach where data is distributed across multiple microservices, each responsible for its own subset of data, rather than being centralized in a single monolithic database. This allows for greater scalability, fault tolerance, and flexibility in terms of how data is stored and accessed.

In this context, "Data Mesh" can be thought of as a network or mesh of interconnected microservices that work together to provide a unified view of the data. Each microservice acts as a node in the mesh, communicating with other nodes to exchange data and maintain consistency across the system. This allows for more efficient use of resources, as well as greater resilience and adaptability in response to changing requirements or conditions.

The term "Data Mesh" is often used in contrast to traditional monolithic architecture, where all the da

In [None]:
ollama_model.inspect_history(n=3)

# Signatures

In [19]:
class Event(dspy.Signature):
    description = dspy.InputField(
        desc="Question regarding the content",
    )
    summary = dspy.OutputField(desc="Summary of the content")

In [20]:
class EventExtractor(dspy.Module):

    def __init__(self):
        super().__init__()
        # Retrieve module to get relevant documents
        self.retriever = dspy.Retrieve(k=3)
        # Predict module for the created signature
        self.predict = dspy.Predict(Event)

    def forward(self, query: str):
        # Retrieve the most relevant documents
        results = self.retriever.forward(query)

        # Try to extract events from the retrieved documents
        events = []
        for document in results.passages:
            event = self.predict(description=document)
            events.append(event)

        return events

In [21]:
extractor = EventExtractor()
extractor.forward("What is Data Mesh ?")

[Prediction(
     summary="ThoughtWorks is a global software consulting company that helps clients achieve faster growth through technology and innovation. They offer a range of services including data strategy, data engineering, cloud computing, and more. In this summary, we will explore how ThoughtWorks can help organizations achieve faster growth through their expertise in these areas.\n\nData Strategy: ThoughtWorks helps clients develop a comprehensive data strategy that aligns with their business goals. They provide services such as data governance, data quality, and data integration to ensure that clients have a solid foundation for their data-driven initiatives. By leveraging the power of data, organizations can make more informed decisions, improve customer experiences, and gain a competitive advantage.\n\nData Engineering: ThoughtWorks' expertise in data engineering enables them to design and implement scalable and secure data pipelines that meet clients' needs. They help clie

# RAG pipeline

In [25]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [26]:
rag = RAG()
rag.forward("What is Data Mesh ?")

Prediction(
    context=['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in "Thoughtworks")\n\nMenu\n\nClose\n\n  * [What we do  ](/en-in/what-we-do "What we do")\n\n    * [ Go to overview ](/en-in/what-we-do)\n      * ### Services\n\n        * [ Artificial Intelligence  ](/en-in/what-we-do/ai)\n        * [ Cloud  ](/en-in/what-we-do/cloud)\n        * [ Customer Experience and Products  ](/en-in/what-we-do/customer-experience-product-design)\n        * [ Data and Analytics  ](/en-in/what-we-do/data)\n        * [ Managed Services  ](/en-in/what-we-do/digital-application-management-and-operations)\n        * [ Modernization  ](/en-in/what-we-do/modernization)\n        * [ Platforms  ](/en-in/what-we-do/platforms)\n\n  * [Who we work with  ](/en-in/clients "Who we work with")\n\n    * [ Go to overview ](/en-in/clients)\n    * [Automotive  ](/en-in/clients/automotive "Automotive")\n    * [Healthcare and Life Sciences

# RAG Optimization

In [28]:
from dspy.teleprompt import BootstrapFewShot
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

In [30]:
from dspy.datasets import HotPotQA
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

Downloading builder script: 100%|██████████| 6.42k/6.42k [00:00<00:00, 10.3MB/s]
Downloading readme: 100%|██████████| 9.19k/9.19k [00:00<00:00, 6.95MB/s]
Downloading data: 100%|██████████| 566M/566M [02:05<00:00, 4.51MB/s]
Downloading data: 100%|██████████| 47.5M/47.5M [00:18<00:00, 2.60MB/s]
Downloading data: 100%|██████████| 46.2M/46.2M [00:10<00:00, 4.27MB/s]
Downloading data files: 100%|██████████| 3/3 [03:14<00:00, 64.92s/it]
Generating train split: 100%|██████████| 90447/90447 [00:15<00:00, 5835.65 examples/s]
Generating validation split: 100%|██████████| 7405/7405 [00:01<00:00, 6005.61 examples/s]
Generating test split: 100%|██████████| 7405/7405 [00:01<00:00, 6410.27 examples/s]
  table = cls._concat_blocks(blocks, axis=0)


(20, 50)

# RAG Compiling

In [42]:
class Example:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Example({self.data})"

# Define the training set using the Example class
answer_golden_set = [
    Example({'question': 'What are the considerations to design the right data product?', 'answer': 'Key considerations in designing the right data products are its fulfillment to the use case for a given domain, along with compliance to slo and slis, support for output ports based on persona, metadata for discoverability and access and quality aspects to deliver trust.'}),
    Example({'question': 'What are tools used in?', 'answer': 'Snowflake, Talend, DBT, Collibra, Monte Carlo, Dataops.live, SOLE, OAM Client libraries'}),
    Example({'question': "According to Zhamak Dehghani's principles, effective data products in a Data Mesh architecture should possess several key qualities. Which of the following options correctly lists these qualities?", 'answer': 'Discoverable, Addressable, Trustworthy, Self-Describing, Interoperable, and Secure'})
]

In [44]:
compiled_rag = teleprompter.compile(RAG(), trainset=answer_golden_set)

  0%|          | 0/3 [00:00<?, ?it/s]


AttributeError: 'Example' object has no attribute 'inputs'