In [2]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products"]
)

In [3]:
doc_contents = [doc.text for doc in documents]
doc_ids = list(range(1, len(doc_contents) + 1))

In [4]:
from qdrant_client import QdrantClient
client = QdrantClient(":memory:")

client.add(
    collection_name="DSpy_Qdrant",
    documents=doc_contents,
    ids=doc_ids,
)

100%|██████████| 77.7M/77.7M [00:08<00:00, 9.43MiB/s]


[1]

In [5]:
from dspy.retrieve.qdrant_rm import QdrantRM
import dspy


qdrant_retriever_model = QdrantRM("DSpy_Qdrant", client, k=10)


ollama_model = dspy.OllamaLocal(model="mistral",model_type='text',
                                max_tokens=350,
                                temperature=0.1,
                                top_p=0.8, frequency_penalty=1.17, top_k=40)


dspy.settings.configure(lm= ollama_model, rm=qdrant_retriever_model)

# Dataset 

In [6]:
import pandas as pd
import dspy

df = pd.read_csv("sample_example.csv")

full_dataset = []
for context, question, answer in df.values:
    example = dspy.Example(question=question, answer=answer).with_inputs("question")
    full_dataset.append(example)

split_index = int(len(full_dataset) * 0.7)


trainset = full_dataset[:split_index]
devset = full_dataset[split_index:]


print("First 3 training examples:", trainset[:3])
print("First 3 development examples:", devset[:3])

First 3 training examples: [Example({'question': 'What characteristics should data products have according to Zhamak Dehgahi?', 'answer': 'Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.'}) (input_keys={'question'}), Example({'question': 'Why is it considered a lost opportunity that most data products only support one or two use cases?', 'answer': "It's a lost opportunity because it reflects underutilization and limits the potential benefits across various teams, especially in organizations with decentralized structures or those implementing data mesh."}) (input_keys={'question'}), Example({'question': 'What is the benefit of creating a central marketplace or catalog for internal data products?', 'answer': 'Creating a central marketplace or catalog helps raise awareness and can convince skeptical data consumers to start using the data products.'}) (input_keys={'question'})]
First 3 development examples: [Example({'question': '

# Simple RAG pipeline

### Question Answering Concept

In [9]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField()

In [10]:
GenerateAnswer.signature

'context, question -> answer'

In [11]:
GenerateAnswer.instructions

'Answer questions with short factoid answers.'

In [12]:
from dspy.signatures import signature_to_template
template = signature_to_template(GenerateAnswer)

print(str(template))

Template(Answer questions with short factoid answers., ['Context:', 'Question:', 'Answer:'])


# Zero Shot Prompting

In [13]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [11]:

qa = RAG()

pred = qa("How does Thoughtworks identify potential data products?")

print(f"Predicted Answer: {pred.answer}")

Predicted Answer: Based on the information provided in the given website of Thoughtworks, there is no explicit mention of a specific process or methodology used by the company to identify potential data products. However, we can infer some possible ways that companies like Thoughtworks might approach identifying potential data products based on their focus areas and initiatives:

1. Data Discovery and Analysis: Companies may use various tools and techniques for discovering hidden patterns or trends in large datasets to identify potential areas for creating data products. They may also analyze customer feedback, market demands, and industry trends to determine which data could be valuable as a product.
2. Innovation Labs: Through their "Innovation Lab" initiatives, companies like Thoughtworks might experiment with emerging technologies, conduct proof-of-concepts (PoC), or build prototypes to validate the feasibility of potential data products.
3. Data Monetization: Companies may conside

# Compiling the RAG program

In [14]:
from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.

In [36]:
print(inspect.getsource(gsm8k_metric))

In [37]:
print(inspect.getsource(parse_integer_answer))

In [25]:
from dspy import (
    OpenAI,
    settings,
    context,
    Module,
    ChainOfThought,   
)
from dspy.evaluate import Evaluate
from dspy.datasets import HotPotQA
from dspy.datasets.gsm8k import gsm8k_metric, GSM8K
from dspy.datasets.gsm8k import parse_integer_answer
from dspy.teleprompt import (
    BootstrapFewShot,
    BootstrapFewShotWithRandomSearch,
    BootstrapFinetune,
    LabeledFewShot,
)
from dspy.teleprompt.ensemble import Ensemble
from rich import print
import inspect

In [32]:
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric

In [38]:
optim = BootstrapFewShot(
    metric=validate_context_and_answer,
    max_bootstrapped_demos=8,
    max_labeled_demos=8
)

In [39]:
compile_cot = optim.compile(RAG(),
                            trainset=trainset[:2],
                            valset=devset[:1])

100%|██████████| 2/2 [00:50<00:00, 25.22s/it]


In [19]:
trainset[:2]

[Example({'question': 'What characteristics should data products have according to Zhamak Dehgahi?', 'answer': 'Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.'}) (input_keys={'question'}),
 Example({'question': 'Why is it considered a lost opportunity that most data products only support one or two use cases?', 'answer': "It's a lost opportunity because it reflects underutilization and limits the potential benefits across various teams, especially in organizations with decentralized structures or those implementing data mesh."}) (input_keys={'question'})]

In [20]:
devset

[Example({'question': 'what are tools used in?', 'answer': 'Snowflake, Talend, DBT, Collibra, Monte Carlo, Dataops.live, SOLE, OAM Client libraries'}) (input_keys={'question'}),
 Example({'question': "According to Zhamak Dehghani's principles, effective data products in a Data Mesh architecture should possess several key qualities. Which of the following options correctly lists these qualities?", 'answer': 'Discoverable, Addressable, Trustworthy, Self-Describing, Interoperable, and Secure'}) (input_keys={'question'}),
 Example({'question': 'The three pillars of Data Mesh success', 'answer': 'Organizational change, product thinking, and technology'}) (input_keys={'question'}),
 Example({'question': 'What is Data Mesh?', 'answer': 'A decentralized approach to data architecture and organizational design'}) (input_keys={'question'})]

In [None]:
teleprompter = BootstrapFewShot(metric=validate_context_and_answer ,  max_bootstrapped_demos=15, 
    max_labeled_demos=15)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

In [40]:
compile_cot.save(path="/Users/samvardhan/Desktop/DataEngineer/DSPy_Experiment/model_compile_2.json")

# Executing the Pipeline

In [17]:
my_question = "What is Data Mesh"

pred = compiled_rag(my_question)

print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What is Data Mesh
Predicted Answer: Data Mesh is a data architecture style that aims to make data more accessible, discoverable, and interoperable within an organization. It achieves this by decentralizing the ownership and management of data to different domain teams using a domain-driven design approach. Each dataset is treated as a product with its own lifecycle and versioning, ensuring semantic meaning within each domain while promoting interoperability between datasets through common standards and protocols. Data processing occurs in real-time using event-driven architecture, and metadata about each dataset is managed at its source for easy discovery. Security measures such as access controls, encryption, and data masking are also incorporated to ensure robust protection of the data.
Retrieved Contexts (truncated): ['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in "Thoughtworks")\n\nMenu\n\nClose

In [18]:
ollama_model.inspect_history(n=1)




Answer questions with short factoid answers.

---

Question: What are the considerations to design the right data product?
Answer: Key considerations in designing the right data products are its fulfillment to the use case for a given domain, along with compliance to slo and slis, support for output ports based on persona, metadata for discoverability and access and quality aspects to deliver trust.

Question: What characteristics should data products have according to Zhamak Dehgahi?
Answer: Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.

Question: How can trustworthiness in data products be achieved?
Answer: Trustworthiness in data products can be achieved by being transparent about information quality metrics and performance promises.

Question: How does Thoughtworks identify potential data products?
Answer: Thoughtworks identifies potential data products by working backwards from the use case using the Jobs to be Done 

'\n\n\nAnswer questions with short factoid answers.\n\n---\n\nQuestion: What are the considerations to design the right data product?\nAnswer: Key considerations in designing the right data products are its fulfillment to the use case for a given domain, along with compliance to slo and slis, support for output ports based on persona, metadata for discoverability and access and quality aspects to deliver trust.\n\nQuestion: What characteristics should data products have according to Zhamak Dehgahi?\nAnswer: Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.\n\nQuestion: How can trustworthiness in data products be achieved?\nAnswer: Trustworthiness in data products can be achieved by being transparent about information quality metrics and performance promises.\n\nQuestion: How does Thoughtworks identify potential data products?\nAnswer: Thoughtworks identifies potential data products by working backwards from the use case using th

In [104]:
for name, parameter in compiled_rag.named_predictors():
    print(name)
    print(parameter.demos[0])
    print()

generate_answer
Example({'question': 'The three pillars of Data Mesh success', 'answer': 'Organizational change,product thinking, and technology'}) (input_keys={'question'})



# Measuring and Optimizing Performance

#  Metric 1: Readability

In [254]:
# https://apps.dtic.mil/sti/tr/pdf/AD0667273.pdf
def automated_readability_index(text):
    import re
    characters = len(re.sub(r'\s+', '', text)) # Count characters (ignoring whitespace)
    words = len(text.split()) # Count words by splitting the text
    # Count sentences by finding period, exclamation, or question mark
    sentences = len(re.findall(r'[.!?\n]', text))
    # small change is to add a new line character as grug doesn't seem to use punctuation.
    if words == 0 or sentences == 0:  # Prevent division by zero
        return 0
    # Calculate the Automated Readability Index (ARI)
    ari = (4.71 * (characters / words)) + (0.5 * (words / sentences)) - 21.43
    
    return round(ari, 2)

In [255]:
for ex in full_dataset:
    source_ari = automated_readability_index(ex.question)
    grug_ari = automated_readability_index(ex.answer)
    print(f"ARI {source_ari} => {grug_ari}")

ARI 14.66 => 25.6
ARI 8.77 => 23.64
ARI 10.56 => 15.76
ARI 9.65 => 18.38
ARI 15.04 => 14.71
ARI 6.62 => 9.64
ARI 8.06 => 24.98
ARI -1.03 => 17.04
ARI 14.98 => 0
ARI 0 => 0
ARI -1.77 => 0


In [256]:
# https://dspy-docs.vercel.app/docs/building-blocks/metrics#intermediate-using-ai-feedback-for-your-metric
class AssessBasedOnQuestion(dspy.Signature):
    """Given the assessed text provide a yes or no to the assessment question."""
    assessed_text = dspy.InputField(format=str)
    assessment_question = dspy.InputField(format=str)
    assessment_answer = dspy.OutputField(desc="Yes or No")

In [257]:
example_question_assessment = dspy.Example(assessed_text="This is a test.", assessment_question="Is this a test?", assessment_answer="Yes").with_inputs("assessed_text", "assessment_question")
print(signature_to_template(AssessBasedOnQuestion).query(example_question_assessment))

Assessed Text: This is a test.
Assessment Question: Is this a test?
Assessment Answer: Yes


In [258]:
def similarity_metric(truth, pred, trace=None):
    question = truth.question
    truth_answer = truth.answer
    proposed_answer = pred.answer
    similarity_question = f"""Does the assessed text have the same meaning as the gold_standard text provided?
Gold Standard: "{truth_answer}"
Assessed Answer: "{proposed_answer}"
Provide only a yes or no answer."""
    
    with dspy.context(lm=ollama_model):
        assessor = dspy.Predict(AssessBasedOnQuestion)
        raw_similarity_result = assessor(assessed_text=proposed_answer, assessment_question=similarity_question)
    
    print(f"Question: {question}")
    print(f"Truth Answer: {truth_answer}")
    print(f"Proposed Answer: {proposed_answer}")
    print(f"Assessment Answer: {raw_similarity_result.assessment_answer}")

    raw_similarity = raw_similarity_result.assessment_answer.lower().strip()
    same_meaning = raw_similarity == 'yes'
    return same_meaning

In [259]:
def ari_metric(truth, pred, trace=None):
    truth_answer =  truth.answer
    proposed_answer = pred.answer
    
    gold_ari = automated_readability_index(truth_answer)
    pred_ari = automated_readability_index(proposed_answer)
    print(f"ARI {gold_ari} => {pred_ari}")
    ari_result = pred_ari <= 7.01
    return ari_result

In [260]:
def overall_metric(provided_example, predicted, trace=None):
    similarity = similarity_metric(provided_example, predicted, trace)
    ari = ari_metric(provided_example, predicted, trace)
    if similarity and ari:
        return True
    return False

In [261]:
from dspy.teleprompt import BootstrapFewShot
config = dict(max_bootstrapped_demos=4, max_labeled_demos=4)
optimizer = BootstrapFewShot(metric=overall_metric, **config)
optimizer.max_errors = 1 # helpful to debug errors faster
optimized_cot = optimizer.compile(RAG(), trainset=trainset, valset=devset)



Question: What characteristics should data products have according to Zhamak Dehgahi?
Truth Answer: Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.
Proposed Answer: According to Zhamak Dehgahi, a thought leader in the field of Data Mesh architecture, effective data products should possess the following characteristics: 1) consumer-driven, 2) domain-specific, 3) composable, 4) owned by a team or individual, 5) versioned, and 6) secure. Additionally, they should be scalable to handle increasing amounts of data and growing numbers of consumers.
Assessment Answer: No. The assessed text mentions different characteristics for effective data products than the gold standard text provides.
ARI 25.6 => 20.07




Question: Why is it considered a lost opportunity that most data products only support one or two use cases?
Truth Answer: It's a lost opportunity because it reflects underutilization and limits the potential benefits across various teams, especially in organizations with decentralized structures or those implementing data mesh.
Proposed Answer: Supporting multiple use cases for data products is beneficial because it increases value, efficiency and consistency, flexibility, improves collaboration, saves costs, and enables better decision-making. Limiting these products to just one or two use cases can be considered a lost opportunity as they may not fully leverage the potential of the data being collected and processed within an organization.
Assessment Answer: Yes. Both texts convey that supporting multiple use cases for data products is beneficial, and limiting them to just one or two can result in missed opportunities for maximizing their value within an organization.
ARI 23.64 => 2



Question: What is the benefit of creating a central marketplace or catalog for internal data products?
Truth Answer: Creating a central marketplace or catalog helps raise awareness and can convince skeptical data consumers to start using the data products.
Proposed Answer: Creating a central marketplace or catalog for internal data products offers several benefits that can lead to faster growth and efficiency within an organization. These benefits include improved discoverability of available data products, faster data access with consistent security policies and compliance, increased collaboration between teams, and standardization across the organization's handling of internal data.
Assessment Answer: Yes. While there are some differences in emphasis and language used, both texts convey the idea that creating a central marketplace or catalog for internal data products can bring about benefits such as improved discoverability, faster access with consistent security policies, increased



Question: How can trustworthiness in data products be achieved?
Truth Answer: Trustworthiness in data products can be achieved by being transparent about information quality metrics and performance promises.
Proposed Answer: To achieve trustworthiness in data products, follow these steps:
1. Collect and process high-quality data ethically and representatively.
2. Ensure security throughout all stages of processing by securing access to raw data and implementing encryption and other protective measures for sensitive information.
3. Provide transparency into how the data was collected, processed, and analyzed through documentation or tools that allow users to explore underlying data themselves.
4. Implement robust validation checks throughout all stages of processing to ensure accuracy and consistency in your data product.
5. Continuously monitor performance and quality of your data products for early identification of issues and prevention of potential misuse or misunderstanding by user



Question: How does Thoughtworks identify potential data products?
Truth Answer: Thoughtworks identifies potential data products by working backwards from the use case using the Jobs to be Done (JTBD) framework.
Proposed Answer: Thoughtworks identifies potential data products through a methodology called Data Mesh, which involves several steps including identifying business capabilities, discovering relevant data sources, validating the data for accuracy and completeness, building pipelines to move the data, and developing data products based on the requirements of the use case.
Assessment Answer: Yes. Both texts describe methods used by Thoughtworks to identify potential data products. The first text mentions Data Mesh as the methodology, while the gold standard text refers to working backwards from the use case using Jobs to be Done (JTBD). These approaches are related; JTBD is one way of identifying business capabilities and use cases that can guide data product development within a 



Question: How can the deployment of data monitors be automated?
Truth Answer: The deployment of data monitors can be automated using the Monte Carlo CLI as part of the CI/CD pipeline.
Proposed Answer: To automate the deployment of data monitors, you can follow these steps:

1. Create a template or configuration file for setting up new data monitor instances using Infrastructure as Code (IaC) tools like Terraform, CloudFormation, or Ansible.
2. Write scripts to install the required monitoring software on target machines automatically using package managers like apt-get or yum for Linux systems and PowerShell for Windows systems.
3. Set up a Continuous Integration (CI) pipeline that can build and deploy these configurations as part of an application release process, using tools such as Jenkins, GitLab CI/CD, or Travis CI.
4. Implement monitoring for the monitoring infrastructure itself to detect any issues with the deployment and configuration of data monitors using tools like Nagios, Za

100%|██████████| 7/7 [03:09<00:00, 27.04s/it]

Question: What are the considerations to design the right data product?
Truth Answer: Key considerations in designing the right data products are its fulfillment to the use case for a given domain, along with compliance to slo and slis, support for output ports based on persona, metadata for discoverability and access and quality aspects to deliver trust.
Proposed Answer: To design the right data product, consider the following steps and key considerations at each stage:

1. Understanding your audience and their needs: Identify who will be using your data product, their goals, pain points, motivations, behaviors, and desired outcomes. This includes understanding their use cases and how they plan to utilize the data product.
2. Defining the problem statement: Clearly define the specific business or organizational problem that your data product aims to solve by being as specific as possible about the desired outcomes and benefits of solving this issue with data.
3. Collecting relevant da




In [262]:
from dspy.evaluate import Evaluate
individual_metrics = [similarity_metric, ari_metric]
for metric in individual_metrics:
    evaluate = Evaluate(metric=metric, devset=trainset, num_threads=1, display_progress=True, display_table=5)
    evaluate(optimized_cot)

  0%|          | 0/7 [00:00<?, ?it/s]Question: What characteristics should data products have according to Zhamak Dehgahi?
Truth Answer: Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.
Proposed Answer: According to Zhamak Dehgahi, a thought leader in Data Mesh architecture, data products should have the following characteristics:

1. Domain-driven: A data product is designed and developed with the specific needs of a particular business domain in mind. It provides value to that domain and aligns with its goals and objectives.
2. Self-contained: Each data product should be self-contained, providing all necessary functionality for consuming its data without relying on other products or external systems.
3. Consumable: Data products should be easy to consume by their intended audience through APIs, user interfaces, or other means of access. They should also provide clear documentation and support for usage.
4. Versioned: As data 

Unnamed: 0,question,example_answer,context,pred_answer,similarity_metric
0,What characteristics should data products have according to Zhamak Dehgahi?,"Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.","['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","According to Zhamak Dehgahi, a thought leader in Data Mesh architecture, data products should have the following characteristics: 1. Domain-driven: A data product is designed...",False
1,Why is it considered a lost opportunity that most data products only support one or two use cases?,"It's a lost opportunity because it reflects underutilization and limits the potential benefits across various teams, especially in organizations with decentralized structures or those implementing...","['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","The statement ""most data products only support one or two use cases"" can be considered a lost opportunity because it may limit the potential insights...",False
2,What is the benefit of creating a central marketplace or catalog for internal data products?,Creating a central marketplace or catalog helps raise awareness and can convince skeptical data consumers to start using the data products.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","Creating a central marketplace or catalog for internal data products offers several benefits for an organization. Firstly, it improves discoverability by providing a single platform...",False
3,How can trustworthiness in data products be achieved?,Trustworthiness in data products can be achieved by being transparent about information quality metrics and performance promises.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","To achieve trustworthiness in data products, there are several steps that can be taken: 1. Ethical and Transparent Data Collection: Collect data ethically with informed...",False
4,How does Thoughtworks identify potential data products?,Thoughtworks identifies potential data products by working backwards from the use case using the Jobs to be Done (JTBD) framework.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","Thoughtworks identifies potential data products through various means, including partnerships and collaborations with organizations, working closely with clients in different industries, engaging in research and...",False


  0%|          | 0/7 [00:00<?, ?it/s]ARI 25.6 => 17.11
Average Metric: 0 / 1  (0.0):  14%|█▍        | 1/7 [00:22<02:15, 22.63s/it]ARI 23.64 => 11.66
Average Metric: 0 / 2  (0.0):  29%|██▊       | 2/7 [00:51<02:10, 26.16s/it]ARI 15.76 => 12.96
Average Metric: 0 / 3  (0.0):  43%|████▎     | 3/7 [01:16<01:42, 25.66s/it]ARI 18.38 => 30.49
Average Metric: 0 / 4  (0.0):  57%|█████▋    | 4/7 [01:37<01:11, 23.94s/it]ARI 14.71 => 20.04
Average Metric: 0 / 5  (0.0):  71%|███████▏  | 5/7 [01:54<00:42, 21.46s/it]ARI 9.64 => 9.79
Average Metric: 0 / 6  (0.0):  86%|████████▌ | 6/7 [02:19<00:22, 22.65s/it]ARI 24.98 => 12.23
Average Metric: 0 / 7  (0.0): 100%|██████████| 7/7 [02:48<00:00, 24.03s/it]


Unnamed: 0,question,example_answer,context,pred_answer,ari_metric
0,What characteristics should data products have according to Zhamak Dehgahi?,"Data products should be discoverable, addressable, trustworthy, self-describing, interoperable, and secure.","['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","According to Zhamak Dehgahi's perspective on data products in the context of Data Mesh architecture, effective data products should possess the following characteristics: 1) Consumer-driven:...",False
1,Why is it considered a lost opportunity that most data products only support one or two use cases?,"It's a lost opportunity because it reflects underutilization and limits the potential benefits across various teams, especially in organizations with decentralized structures or those implementing...","['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","The statement ""most data products only support one or two use cases"" implies that there are potential uses for the data beyond what is currently...",False
2,What is the benefit of creating a central marketplace or catalog for internal data products?,Creating a central marketplace or catalog helps raise awareness and can convince skeptical data consumers to start using the data products.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...",Creating a central marketplace or catalog for internal data products can bring several advantages to an organization. Some of these benefits include: 1. Discoverability: A...,False
3,How can trustworthiness in data products be achieved?,Trustworthiness in data products can be achieved by being transparent about information quality metrics and performance promises.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","To achieve trustworthiness in data products, follow these steps: 1) Ensure ethical and transparent data collection with informed consent if necessary; 2) Implement robust security...",False
4,How does Thoughtworks identify potential data products?,Thoughtworks identifies potential data products by working backwards from the use case using the Jobs to be Done (JTBD) framework.,"['[ ![Thoughtworks](/etc.clientlibs/thoughtworks/clientlibs/clientlib-\nsite/resources/images/thoughtworks-logo.svg) ](/en-in ""Thoughtworks"")\n\nMenu\n\nClose\n\n * [What we do ](/en-in/what-we-do ""What we do"")\n\n * [ Go to overview ](/en-in/what-we-do)\n * ### Services\n\n * [ Artificial Intelligence...","Based on the information provided in the text, Thoughtworks identifies potential data products through client needs assessment, market trends analysis, or by leveraging their expertise...",False


In [263]:
result = optimized_cot.forward("What characteristics should data products have according to Zhamak Dehgahi?")

In [264]:
result.answer

'According to Zhamak Dehgahi, a good data product should have the following characteristics: 1) Event-driven, 2) Streaming, 3) Decentralized, 4) Immutable, and 5) Real-time analytics. These characteristics help make data products more responsive, flexible to changing business requirements, scalable and resilient to failures, ensure data integrity and consistency across the system, as well as enable organizations to make informed decisions quickly based on new information.'

In [181]:
optimized_cot.save(path="/Users/samvardhan/Desktop/DataEngineer/DSPy_Experiment/model.json")

# Multi Hop

In [182]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""


    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField()

In [183]:
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""


    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    query = dspy.OutputField()

In [184]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops
    
    def forward(self, question):
        context = []
        
        for hop in range(self.max_hops):
            query = self.generate_query[hop](context=context, question=question).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)

        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)

In [185]:
my_question = "What is Data Mesh?"


# Get the prediction. This contains `pred.context` and `pred.answer`.
uncompiled_baleen = SimplifiedBaleen()  # uncompiled (i.e., zero-shot) program
pred = uncompiled_baleen(my_question)


# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What is Data Mesh?
Predicted Answer: Data Mesh is a term used in the context of data management and processing. It refers to a decentralized architecture where data is stored and processed across multiple nodes or devices, rather than being concentrated on a single centralized location. This approach allows for more efficient use of resources, improved scalability, and enhanced security by distributing the load across multiple nodes.

In this context, "Data Mesh" can be thought of as a network of interconnected data sources, where each node or device contributes to the overall dataset. The mesh structure enables flexible and efficient communication between these nodes, allowing for real-time updates and processing of data without relying on a single centralized point of control.

The term "Data Mesh" is often used in contrast to traditional centralized data architectures, where all data is stored and processed on a single server or cluster of servers. In this model, the entir