In [1]:
import json
import os
import re

import dspy
import yaml

In [2]:
# Read the OpenAI API key from the YAML file one directory above the notebook.
with open(os.path.join("..", "keys.yaml"), "r") as file:
    config = yaml.safe_load(file)
openai_api_key = config["openai_api_key"]

# GPT-4 client, registered as the LM in DSPy's global settings.
llm = dspy.OpenAI(api_key=openai_api_key, model="gpt-4")
dspy.settings.configure(experimental=True, lm=llm)

In [3]:
# DSPy signature for the narrative generator: takes a model context, a feature-
# contribution explanation and a description of its format, and asks for a
# narrative plus a rationalization.
# NOTE: the class docstring and every `desc` string below are prompt text that
# DSPy sends to the LM, not just documentation -- rewording them changes the
# model's behavior.
class Explingo(dspy.Signature):
    """ You are helping users understand an ML model's prediction. Given an explanation and information about the model,
    convert the explanation into a human-readable narrative."""

    # --- inputs ---
    context = dspy.InputField(desc="what the ML model predicts")
    explanation = dspy.InputField(desc="explanation of an ML model's prediction")
    explanation_format = dspy.InputField(desc="format the explanation is given in")

    # --- outputs ---
    # `narrative` is what accuracy/fluency/completeness/conciseness grade.
    narrative = dspy.OutputField(
        desc="human-readable narrative version of the explanation"
    )
    # `rationalization` is graded separately by context_awareness.
    rationalization = dspy.OutputField(
       desc="explains why given features may be relevant"
    )

In [4]:
# Plain predictor over the Explingo signature.
narrify = dspy.Predict(Explingo)

# Smoke test on a hand-written, two-feature house-price explanation.
result = narrify(
    context="The ML model predicts house prices",
    explanation="(total size in square feet, 300, -12000), (number of bedrooms, 2, -8000)",
    explanation_format="(feature name, feature value, SHAP contribution in $)",
)

In [5]:
def create_example(convo):
    """Convert one record of the examples file into a ``dspy.Example``.

    ``convo`` must contain "explanation", "context" and "explanation_format".
    If present, "description" is stored as the example's ``narrative`` and
    "bad_description" as its ``bad_narrative``. Only the three required fields
    are marked as inputs.
    """
    input_names = ("explanation", "context", "explanation_format")
    example = dspy.Example(**{name: convo[name] for name in input_names})

    # Optional reference texts: (key in the record, attribute on the example).
    optional_fields = (
        ("description", "narrative"),
        ("bad_description", "bad_narrative"),
    )
    for record_key, attribute in optional_fields:
        if record_key in convo:
            setattr(example, attribute, convo[record_key])

    return example.with_inputs(*input_names)

# Load the training records. The context manager closes the file handle as soon
# as parsing finishes; the encoding is pinned to UTF-8 so the result does not
# depend on the platform's default text encoding.
with open("examples.json", "r", encoding="utf-8") as examples_file:
    training_data = json.load(examples_file)

# One dspy.Example per record.
examples = [create_example(convo) for convo in training_data]

In [6]:
# Separate LM used only to grade narratives; the scoring helpers switch to it
# with dspy.context(lm=grader). The API key is passed explicitly, like the main
# LM's, so this client does not rely on credentials configured elsewhere.
grader = dspy.OpenAI(
    model="gpt-4-1106-preview",
    max_tokens=1000,
    model_type="chat",
    api_key=openai_api_key,
)

# DSPy signature used by the grader LM to score a text against a 0-2 rubric.
# NOTE: the docstring and `desc` string are prompt text sent to the LM; changing
# them changes grading behavior.
class RubricAssess(dspy.Signature):
    """Assess a narrative based on a rubric."""

    # Text being graded (context_awareness passes the rationalization here).
    narrative = dspy.InputField()
    # What the grader should judge about the text.
    question = dspy.InputField()
    # Meaning of each score level.
    rubric = dspy.InputField()

    # Converted to an integer by compute_score_from_rubric.
    assessment = dspy.OutputField(
        desc="0, 1, or 2, based on the rubric. Include only the number."
    )


# DSPy signature used by the grader LM to answer a yes/no question about a text.
# NOTE: the docstring and `desc` string are prompt text sent to the LM; changing
# them changes grading behavior.
class BooleanAssess(dspy.Signature):
    """Assess a narrative with a yes/no question."""

    # Text being graded.
    narrative = dspy.InputField()
    # Yes/no question about the text.
    question = dspy.InputField()

    # compute_score_from_boolean counts how often this comes back as "yes".
    assessment = dspy.OutputField(desc="yes or no")


def compute_score_from_boolean(metric, question, narrative, iters=10):
    """Score a narrative on a yes/no question by polling the grader LM.

    The question is asked ``iters`` times under the ``grader`` LM; the fraction
    of "yes" answers is scaled to the 0-2 range used by the rubric metrics.

    Args:
        metric: Metric name, used only in the inconsistency warning.
        question: Yes/no question about the narrative.
        narrative: Text being assessed.
        iters: Number of grader calls; must be at least 1.

    Returns:
        float: ``2 * (number of "yes" answers / iters)``.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    if iters < 1:
        raise ValueError("iters must be at least 1, got %r" % (iters,))

    # Build the predictor once; it is the same for every iteration.
    assess = dspy.Predict(BooleanAssess)
    yes_votes = 0

    # NOTE(review): if the grader is deterministic or its responses are cached,
    # every iteration may return the same answer -- confirm the repetition is
    # actually sampling different responses.
    with dspy.context(lm=grader):
        for _ in range(iters):
            answer = assess(question=question, narrative=narrative).assessment
            # The LM does not reliably answer with a bare lowercase "yes"
            # (e.g. "Yes", "yes.", " Yes"), so normalise before comparing.
            if str(answer).strip().lower().startswith("yes"):
                yes_votes += 1
    score = yes_votes / iters

    # A near-even split means the grader cannot make up its mind.
    if 0.3 < score < 0.7:
        print("Inconsistent score for metric %s: %s" % (metric, score))

    return score * 2


def _parse_rubric_score(raw):
    """Turn the grader's raw answer into an integer score.

    A clean integer answer is converted directly. Otherwise the first
    stand-alone 0, 1 or 2 in the text is used, so answers such as "2." or
    "Score: 2" are accepted.

    Raises:
        ValueError: If no score can be found in ``raw``.
    """
    text = str(raw).strip()
    try:
        return int(text)
    except ValueError:
        pass

    # A single digit 0-2 that is not part of a longer number.
    match = re.search(r"(?<!\d)[0-2](?!\d)", text)
    if match is None:
        raise ValueError(f"Could not parse a rubric score from grader output: {raw!r}")
    return int(match.group())


def compute_score_from_rubric(metric, question, rubric, narrative, iters=5):
    """Score a narrative against a rubric by polling the grader LM.

    The question and rubric are put to the ``grader`` LM ``iters`` times and the
    integer scores are averaged.

    Args:
        metric: Metric name, used only in the inconsistency warning.
        question: What the grader should judge about the narrative.
        rubric: Description of each score level.
        narrative: Text being assessed.
        iters: Number of grader calls; must be at least 1.

    Returns:
        float: Mean of the parsed scores.

    Raises:
        ValueError: If ``iters`` is less than 1 or a grader answer contains no
            parseable score.
    """
    if iters < 1:
        raise ValueError("iters must be at least 1, got %r" % (iters,))

    # Build the predictor once; it is the same for every iteration.
    assess = dspy.Predict(RubricAssess)
    scores = []

    with dspy.context(lm=grader):
        for _ in range(iters):
            raw = assess(
                question=question,
                rubric=rubric,
                narrative=narrative,
            ).assessment
            scores.append(_parse_rubric_score(raw))

    # Both extremes in one batch means the grader disagrees with itself.
    if 0 in scores and 2 in scores:
        print("Inconsistent score for metric %s: %s" % (metric, scores))

    return sum(scores) / iters


def accuracy(gold, pred, trace=None):
    """Rubric-grade how faithfully ``pred.narrative`` reflects ``gold.explanation``.

    ``trace`` is not used here. Returns the value of
    ``compute_score_from_rubric`` for the "accuracy" metric.
    """
    # NOTE(review): the prompt wording ("?.", "formatted at", "Contain an error")
    # looks like typos, but it is kept verbatim because changing the prompt
    # changes what the grader sees.
    rubric = "0: Contain an error. 1: Accurate, but misleading. 2: Accurate and clear."
    question = (
        "How accurately does the narrative describe this explanation: "
        f"{gold.explanation}?. "
        f"The explanation is formatted at: {gold.explanation_format}"
    )
    return compute_score_from_rubric(
        metric="accuracy",
        question=question,
        rubric=rubric,
        narrative=pred.narrative,
    )


def fluency(gold, pred, trace=None):
    """Rubric-grade how natural ``pred.narrative`` reads.

    ``gold`` and ``trace`` are not used here. Returns the value of
    ``compute_score_from_rubric`` for the "fluency" metric.
    """
    rubric = "0: Not at all natural. 1: Somewhat natural. 2: Natural."
    question = "How natural and human does the narrative sound?"
    return compute_score_from_rubric(
        metric="fluency",
        question=question,
        rubric=rubric,
        narrative=pred.narrative,
    )


def completeness(gold, pred, trace=None):
    """Check via a yes/no question whether ``pred.narrative`` covers every feature value.

    ``trace`` is not used here. Returns the value of
    ``compute_score_from_boolean`` for the "completeness" metric.
    """
    question = (
        "Does the narrative contain all the feature values from this explanation? "
        f"{gold.explanation}? "
        f"The explanation is formatted at: {gold.explanation_format}"
    )
    return compute_score_from_boolean(
        metric="completeness",
        question=question,
        narrative=pred.narrative,
    )


def conciseness(gold, pred, trace=None):
    """Score ``pred.narrative`` by length: shorter narratives score higher.

    Every 50 whitespace-separated words cost one point, starting from 2 and
    bottoming out at 0 for 100 words or more. ``gold`` and ``trace`` are not
    used.
    """
    word_count = len(pred.narrative.split())
    penalty = word_count / 50
    # Cap the penalty so the score never drops below zero.
    if penalty > 2:
        penalty = 2
    return 2 - penalty


def context_awareness(gold, pred, trace=None):
    """Rubric-grade how well ``pred.rationalization`` explains the model's logic.

    The rationalization, not the narrative, is the text handed to the grader.
    ``gold`` and ``trace`` are not used here.
    """
    rubric = "0: Not at all. 1: Somewhat. 2: Very well."
    question = "How well does the narrative rationalization help explain the model's logic?"
    return compute_score_from_rubric(
        metric="context_awareness",
        question=question,
        rubric=rubric,
        narrative=pred.rationalization,
    )

def all_metrics(gold, pred, trace=None, verbose=False):
    """Run all five metrics and combine them into a single score.

    Args:
        gold: Example providing ``explanation`` and ``explanation_format``.
        pred: Prediction providing ``narrative`` and ``rationalization``.
        trace: When None, the numeric total is returned; otherwise the result
            is a pass/fail flag (used for bootstrapping).
        verbose: If True, print the texts and the per-metric breakdown.

    Returns:
        The sum of the five metric scores when ``trace`` is None; otherwise
        True only if accuracy is exactly 2 and the total is at least 8.
    """
    scorers = (
        ("accuracy", accuracy),
        ("fluency", fluency),
        ("completeness", completeness),
        ("conciseness", conciseness),
        ("context_awareness", context_awareness),
    )
    metrics = {name: scorer(gold, pred, trace) for name, scorer in scorers}
    total_score = sum(metrics.values())

    if verbose:
        print("Explanation:", gold.explanation)
        print("Narrative:", pred.narrative)
        print("Rationalization:", pred.rationalization)
        print("Total Score:", total_score)
        breakdown = "".join(
            f"{metric}: {score}, " for metric, score in metrics.items()
        )
        print(breakdown)
        print("--")

    if trace is not None:
        # Bootstrapping gate: accept only fully accurate narratives scoring 8+.
        return (metrics["accuracy"] == 2) and (total_score >= 8)
    return total_score

In [7]:
# Sanity check: run the un-optimized predictor on the first training example
# (range(1) -> index 0 only) and print the per-metric breakdown.
for i in range(1):
    example = examples[i]
    # example.inputs() holds only the fields marked as inputs in create_example.
    pred = narrify(**example.inputs())
    all_metrics(example, pred, verbose=True)

Explanation: (Above ground living area square feet, 1256.00, -12527.46), (Rates the overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29), (Physical locations within Ames city limits, Edwards, -9913.81), (Wood deck area in square feet, 736.00, 9846.38)
Narrative: The machine learning model predicts house prices based on several factors. The most significant factors include the above ground living area, the overall material and finish of the house, the size of the second floor, the location within Ames city limits, and the size of the wood deck area. For instance, a house with an above ground living area of 1256 square feet, an overall material and finish rating of 5, no second floor, located in Edwards, and with a wood deck area of 736 square feet would have a certain predicted price.
Rationalization: The above ground living area is a significant factor because larger houses tend to be more expensive. The overall material and finish of 

In [8]:
from dspy.teleprompt import BootstrapFewShot

# Compile an optimized predictor with BootstrapFewShot, using all_metrics as the
# metric over the training examples.
# NOTE(review): the saved output of this cell reports "Bootstrapped 0 full traces
# after 25 examples", i.e. no demonstration was accepted. The gate in all_metrics
# (accuracy == 2 and total >= 8) may be too strict -- confirm before relying on
# narrify_optimized.
bootstrap = BootstrapFewShot(metric=all_metrics)
narrify_optimized = bootstrap.compile(narrify, trainset=examples)

100%|██████████| 25/25 [05:41<00:00, 13.68s/it]


Bootstrapped 0 full traces after 25 examples in round 0.
