From 3a37195ab8007342cca1f02b26ac14ed0c5d70bc Mon Sep 17 00:00:00 2001
From: Omar Khattab <okhat@users.noreply.github.com>
Date: Thu, 28 Nov 2024 19:07:08 -0800
Subject: [PATCH] Add MATH reasoning tutorial

---
 docs/docs/index.md                   |  14 +-
 docs/docs/tutorials/math/index.ipynb | 777 +++++++++++++++++++++++++++
 docs/mkdocs.yml                      |   1 +
 dspy/datasets/__init__.py            |   1 +
 dspy/datasets/math.py                |  63 +++
 5 files changed, 849 insertions(+), 7 deletions(-)
 create mode 100644 docs/docs/tutorials/math/index.ipynb
 create mode 100644 dspy/datasets/math.py

diff --git a/docs/docs/index.md b/docs/docs/index.md
index d9b1237b19..1a551c1c6f 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -11,9 +11,9 @@ hide:
 # _Programming_—not prompting—_LMs_
 
 
-DSPy is the framework for _programming—rather than prompting—language models_. It allows you to iterate fast on **building modular AI systems** and provides algorithms for **optimizing their prompts and weights**, whether you're building simple classifiers, sophisticated RAG pipelines, or Agent loops.
+DSPy is the framework for _programming—rather than prompting—language models_. It allows you to iterate fast on **building modular AI systems** and offers algorithms for **optimizing their prompts and weights**, whether you're building simple classifiers, sophisticated RAG pipelines, or Agent loops.
 
-DSPy stands for Declarative Self-improving Python. Instead of brittle prompts, you write compositional _Python code_ and use DSPy's tools to **teach your LM to deliver high-quality outputs**. This [lecture](https://www.youtube.com/watch?v=JEMYuzrKLUw) is a good conceptual introduction. Meet the community, seek help, or start contributing via our [GitHub repo](https://github.com/stanfordnlp/dspy) and [Discord server](https://discord.gg/XCGy2WDCQB).
+DSPy stands for Declarative Self-improving Python. Instead of brittle prompts, you write compositional _Python code_ and use DSPy to **teach your LM to deliver high-quality outputs**. This [lecture](https://www.youtube.com/watch?v=JEMYuzrKLUw) is a good conceptual introduction. Meet the community, seek help, or start contributing via our [GitHub repo](https://github.com/stanfordnlp/dspy) and [Discord server](https://discord.gg/XCGy2WDCQB).
 
 
 !!! info "Getting Started I: Install DSPy and set up your LM"
@@ -117,7 +117,7 @@ DSPy stands for Declarative Self-improving Python. Instead of brittle prompts, y
 
 ## 1) **Modules** help you describe AI behavior as _code_, not strings.
 
-To build reliable AI systems, you must iterate fast. But maintaining prompts makes that hard: it forces you to tinker with strings or data _every time you change your LM, metrics, or pipeline_. Having built over a dozen best-in-class compound LM systems since 2020, we learned this the hard way—and built DSPy to decouple the core definition of an LM system from messy incidental choices about specific LMs or prompting strategies.
+To build reliable AI systems, you must iterate fast. But maintaining prompts makes that hard: it forces you to tinker with strings or data _every time you change your LM, metrics, or pipeline_. Having built over a dozen best-in-class compound LM systems since 2020, we learned this the hard way—and so built DSPy to decouple defining LM systems from messy incidental choices about specific LMs or prompting strategies.
 
 DSPy shifts your focus from tinkering with prompt strings to **programming with structured and declarative natural-language modules**. For every AI component in your system, you specify input/output behavior as a _signature_ and select a _module_ to assign a strategy for invoking your LM. DSPy expands your signatures into prompts and parses your typed outputs, so you can write ergonomic, portable, and optimizable AI systems.
 
@@ -218,10 +218,10 @@ DSPy shifts your focus from tinkering with prompt strings to **programming with
     === "Agents"
 
         ```python linenums="1"       
-        def evaluate_math(expression: str) -> float:
+        def evaluate_math(expression: str):
             return dspy.PythonInterpreter({}).execute(expression)
 
-        def search_wikipedia(query: str) -> str:
+        def search_wikipedia(query: str):
             results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
             return [x['text'] for x in results]
 
@@ -239,9 +239,9 @@ DSPy shifts your focus from tinkering with prompt strings to **programming with
 
 ??? "Using DSPy in practice: from quick scripting to building sophisticated systems."
 
-    Standard prompts conflate interface (“what should the LM do?”) with implementation (“how do we tell it to do that?”). DSPy isolates the former as _signatures_ so we can infer latter or learn it from data — in the context of a bigger program.
+    Standard prompts conflate interface (“what should the LM do?”) with implementation (“how do we tell it to do that?”). DSPy isolates the former as _signatures_ so we can infer the latter or learn it from data — in the context of a bigger program.
     
-    Even before you start using optimizers, DSPy's modules allow you to script effective LM systems in ergonomic, portable ways. Across many tasks and LMs, we maintain _signature test suites_ that assess the reliability of the built-in DSPy adapters. Adapters are the components that map signatures to prompts prior to optimization. If you find a task where a simple prompt consistently outperforms idiomatic DSPy for your LM, consider that a bug and [file an issue](https://github.com/stanfordnlp/dspy/issues). We'll use this to improve the built-in adapters.
+    Even before you start using optimizers, DSPy's modules allow you to script effective LM systems as ergonomic, portable _code_. Across many tasks and LMs, we maintain _signature test suites_ that assess the reliability of the built-in DSPy adapters. Adapters are the components that map signatures to prompts prior to optimization. If you find a task where a simple prompt consistently outperforms idiomatic DSPy for your LM, consider that a bug and [file an issue](https://github.com/stanfordnlp/dspy/issues). We'll use this to improve the built-in adapters.
 
 
 ## 2) **Optimizers** tune the prompts and weights of your AI modules.
diff --git a/docs/docs/tutorials/math/index.ipynb b/docs/docs/tutorials/math/index.ipynb
new file mode 100644
index 0000000000..0b8715bcce
--- /dev/null
+++ b/docs/docs/tutorials/math/index.ipynb
@@ -0,0 +1,777 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tutorial: Math Reasoning\n",
+    "\n",
+    "Let's walk through a quick example of setting up a `dspy.ChainOfThought` module and optimizing it for answering algebra questions.\n",
+    "\n",
+    "Install the latest DSPy via `pip install -U dspy` and follow along."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's tell DSPy that we will use OpenAI's `gpt-4o-mini` in our modules. To authenticate, DSPy will read your `OPENAI_API_KEY` environment variable. You can easily swap this out for [other providers or local models](https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dspy\n",
+    "\n",
+    "gpt4o_mini = dspy.LM('openai/gpt-4o-mini', max_tokens=2000)\n",
+    "gpt4o = dspy.LM('openai/gpt-4o', max_tokens=2000)\n",
+    "dspy.configure(lm=gpt4o_mini)  # we'll use gpt-4o-mini as the default LM, unless otherwise specified"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, let's load some data examples from the [MATH](https://arxiv.org/abs/2103.03874) benchmark. We'll use a training split for optimization and evaluate our module on a held-out dev set.\n",
+    "\n",
+    "Please note that the following step will require:\n",
+    "```bash\n",
+    "pip install git+https://github.com/hendrycks/math.git\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "350 350\n"
+     ]
+    }
+   ],
+   "source": [
+    "from dspy.datasets import MATH\n",
+    "\n",
+    "dataset = MATH(subset='algebra')\n",
+    "print(len(dataset.train), len(dataset.dev))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's inspect one example from the training set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Question: The doctor has told Cal O'Ree that during his ten weeks of working out at the gym, he can expect each week's weight loss to be $1\\%$ of his weight at the end of the previous week. His weight at the beginning of the workouts is $244$ pounds. How many pounds does he expect to weigh at the end of the ten weeks? Express your answer to the nearest whole number.\n",
+      "Answer: 221\n"
+     ]
+    }
+   ],
+   "source": [
+    "example = dataset.train[0]\n",
+    "print(\"Question:\", example.question)\n",
+    "print(\"Answer:\", example.answer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's define our module. It's extremely simple: just a chain-of-thought step that takes a `question` and produces an `answer`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Prediction(\n",
+       "    reasoning=\"Cal O'Ree's weight loss each week is $1\\\\%$ of his weight at the end of the previous week. This means that at the end of each week, he retains $99\\\\%$ of his weight from the previous week. \\n\\nIf we denote his weight at the beginning as \\\\( W_0 = 244 \\\\) pounds, then his weight at the end of week \\\\( n \\\\) can be expressed as:\\n\\\\[\\nW_n = W_{n-1} \\\\times 0.99\\n\\\\]\\nThis can be simplified to:\\n\\\\[\\nW_n = W_0 \\\\times (0.99)^n\\n\\\\]\\nAfter 10 weeks, his weight will be:\\n\\\\[\\nW_{10} = 244 \\\\times (0.99)^{10}\\n\\\\]\\n\\nNow, we calculate \\\\( (0.99)^{10} \\\\):\\n\\\\[\\n(0.99)^{10} \\\\approx 0.904382\\n\\\\]\\n\\nNow, we can calculate his expected weight after 10 weeks:\\n\\\\[\\nW_{10} \\\\approx 244 \\\\times 0.904382 \\\\approx 220.5\\n\\\\]\\n\\nRounding to the nearest whole number, Cal O'Ree can expect to weigh approximately \\\\( 221 \\\\) pounds at the end of the ten weeks.\",\n",
+       "    answer='221'\n",
+       ")"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "module = dspy.ChainOfThought(\"question -> answer\")\n",
+    "module(question=example.question)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, let's set up an evaluator for the zero-shot module above, before prompt optimization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 259.00 / 350 (74.0%): 100%|██████████| 350/350 [01:30<00:00,  3.85it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/28 18:41:55 INFO dspy.evaluate.evaluate: Average Metric: 259 / 350 (74.0%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>example_reasoning</th>\n",
+       "      <th>example_answer</th>\n",
+       "      <th>pred_reasoning</th>\n",
+       "      <th>pred_answer</th>\n",
+       "      <th>method</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>What is the smallest integer value of $c$ such that the function $...</td>\n",
+       "      <td>The given function has a domain of all real numbers if and only if...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>To determine the smallest integer value of \\( c \\) such that the f...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>What is the least value of $x$ that is a solution of $|{-x+3}|=7$?</td>\n",
+       "      <td>In order to have $|{-x+3}| = 7$, we must have $-x + 3 = 7$ or $-x ...</td>\n",
+       "      <td>-4</td>\n",
+       "      <td>To solve the equation \\( |{-x+3}|=7 \\), we need to consider the de...</td>\n",
+       "      <td>-4</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Evaluate $\\left\\lceil -\\frac{7}{4}\\right\\rceil$.</td>\n",
+       "      <td>$-\\frac{7}{4}$ is between $-1$ and $-2$, so $\\left\\lceil -\\frac{7}...</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>To evaluate \\(\\left\\lceil -\\frac{7}{4}\\right\\rceil\\), we first nee...</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>A triangle has vertices at coordinates $(11,1)$, $(2,3)$ and $(3,7...</td>\n",
+       "      <td>We must find the distance between each pair of points by using the...</td>\n",
+       "      <td>10</td>\n",
+       "      <td>To find the length of the longest side of the triangle with vertic...</td>\n",
+       "      <td>10</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Let $f(x) = x + 2$ and $g(x) = 1/f(x)$. What is $g(f(-3))$?</td>\n",
+       "      <td>First, we find that $f(-3) = (-3) + 2 = -1$. Then, $$g(f(-3)) = g(...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>To find \\( g(f(-3)) \\), we first need to evaluate \\( f(-3) \\). The...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                question  \\\n",
+       "0  What is the smallest integer value of $c$ such that the function $...   \n",
+       "1     What is the least value of $x$ that is a solution of $|{-x+3}|=7$?   \n",
+       "2                       Evaluate $\\left\\lceil -\\frac{7}{4}\\right\\rceil$.   \n",
+       "3  A triangle has vertices at coordinates $(11,1)$, $(2,3)$ and $(3,7...   \n",
+       "4            Let $f(x) = x + 2$ and $g(x) = 1/f(x)$. What is $g(f(-3))$?   \n",
+       "\n",
+       "                                                       example_reasoning  \\\n",
+       "0  The given function has a domain of all real numbers if and only if...   \n",
+       "1  In order to have $|{-x+3}| = 7$, we must have $-x + 3 = 7$ or $-x ...   \n",
+       "2  $-\\frac{7}{4}$ is between $-1$ and $-2$, so $\\left\\lceil -\\frac{7}...   \n",
+       "3  We must find the distance between each pair of points by using the...   \n",
+       "4  First, we find that $f(-3) = (-3) + 2 = -1$. Then, $$g(f(-3)) = g(...   \n",
+       "\n",
+       "  example_answer  \\\n",
+       "0              1   \n",
+       "1             -4   \n",
+       "2             -1   \n",
+       "3             10   \n",
+       "4              1   \n",
+       "\n",
+       "                                                          pred_reasoning  \\\n",
+       "0  To determine the smallest integer value of \\( c \\) such that the f...   \n",
+       "1  To solve the equation \\( |{-x+3}|=7 \\), we need to consider the de...   \n",
+       "2  To evaluate \\(\\left\\lceil -\\frac{7}{4}\\right\\rceil\\), we first nee...   \n",
+       "3  To find the length of the longest side of the triangle with vertic...   \n",
+       "4  To find \\( g(f(-3)) \\), we first need to evaluate \\( f(-3) \\). The...   \n",
+       "\n",
+       "  pred_answer     method  \n",
+       "0           1  ✔️ [True]  \n",
+       "1          -4  ✔️ [True]  \n",
+       "2          -1  ✔️ [True]  \n",
+       "3          10  ✔️ [True]  \n",
+       "4           1  ✔️ [True]  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "                <div style='\n",
+       "                    text-align: center;\n",
+       "                    font-size: 16px;\n",
+       "                    font-weight: bold;\n",
+       "                    color: #555;\n",
+       "                    margin: 10px 0;'>\n",
+       "                    ... 345 more rows not displayed ...\n",
+       "                </div>\n",
+       "                "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "74.0"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "THREADS = 24\n",
+    "kwargs = dict(num_threads=THREADS, display_progress=True, display_table=5)\n",
+    "evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
+    "\n",
+    "evaluate(module)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And lastly, let's optimize our module. Since we want strong reasoning, we'll use the large GPT-4o as the teacher model (used to bootstrap reasoning for the small LM at optimization time), but not as the prompt model (used to craft instructions), which stays on GPT-4o-mini.\n",
+    "\n",
+    "GPT-4o will be invoked only a small number of times. The model involved directly in optimization and in the resulting (optimized) program will be GPT-4o-mini.\n",
+    "\n",
+    "We will also specify `max_bootstrapped_demos=4`, which means we want at most four bootstrapped examples in the prompt, and `max_labeled_demos=4`, which means that, in total between bootstrapped and pre-labeled examples, we want at most four."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kwargs = dict(num_threads=THREADS, teacher_settings=dict(lm=gpt4o), prompt_model=gpt4o_mini)\n",
+    "optimizer = dspy.MIPROv2(metric=dataset.metric, auto=\"medium\", **kwargs)\n",
+    "\n",
+    "kwargs = dict(requires_permission_to_run=False, max_bootstrapped_demos=4, max_labeled_demos=4)\n",
+    "optimized_react = optimizer.compile(module, trainset=dataset.train, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 310.00 / 350 (88.6%): 100%|██████████| 350/350 [01:31<00:00,  3.84it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/28 18:59:19 INFO dspy.evaluate.evaluate: Average Metric: 310 / 350 (88.6%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>example_reasoning</th>\n",
+       "      <th>example_answer</th>\n",
+       "      <th>pred_reasoning</th>\n",
+       "      <th>pred_answer</th>\n",
+       "      <th>method</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>What is the smallest integer value of $c$ such that the function $...</td>\n",
+       "      <td>The given function has a domain of all real numbers if and only if...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>The function \\( f(x) = \\frac{x^2 + 1}{x^2 - x + c} \\) will have a ...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>What is the least value of $x$ that is a solution of $|{-x+3}|=7$?</td>\n",
+       "      <td>In order to have $|{-x+3}| = 7$, we must have $-x + 3 = 7$ or $-x ...</td>\n",
+       "      <td>-4</td>\n",
+       "      <td>The equation \\( |{-x+3}|=7 \\) implies two possible cases: 1. \\(-x ...</td>\n",
+       "      <td>-4</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Evaluate $\\left\\lceil -\\frac{7}{4}\\right\\rceil$.</td>\n",
+       "      <td>$-\\frac{7}{4}$ is between $-1$ and $-2$, so $\\left\\lceil -\\frac{7}...</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>To evaluate \\(\\left\\lceil -\\frac{7}{4}\\right\\rceil\\), we first nee...</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>A triangle has vertices at coordinates $(11,1)$, $(2,3)$ and $(3,7...</td>\n",
+       "      <td>We must find the distance between each pair of points by using the...</td>\n",
+       "      <td>10</td>\n",
+       "      <td>To find the length of the sides of the triangle formed by the vert...</td>\n",
+       "      <td>10</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Let $f(x) = x + 2$ and $g(x) = 1/f(x)$. What is $g(f(-3))$?</td>\n",
+       "      <td>First, we find that $f(-3) = (-3) + 2 = -1$. Then, $$g(f(-3)) = g(...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>To find \\( g(f(-3)) \\), we first need to evaluate \\( f(-3) \\). Usi...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>✔️ [True]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                question  \\\n",
+       "0  What is the smallest integer value of $c$ such that the function $...   \n",
+       "1     What is the least value of $x$ that is a solution of $|{-x+3}|=7$?   \n",
+       "2                       Evaluate $\\left\\lceil -\\frac{7}{4}\\right\\rceil$.   \n",
+       "3  A triangle has vertices at coordinates $(11,1)$, $(2,3)$ and $(3,7...   \n",
+       "4            Let $f(x) = x + 2$ and $g(x) = 1/f(x)$. What is $g(f(-3))$?   \n",
+       "\n",
+       "                                                       example_reasoning  \\\n",
+       "0  The given function has a domain of all real numbers if and only if...   \n",
+       "1  In order to have $|{-x+3}| = 7$, we must have $-x + 3 = 7$ or $-x ...   \n",
+       "2  $-\\frac{7}{4}$ is between $-1$ and $-2$, so $\\left\\lceil -\\frac{7}...   \n",
+       "3  We must find the distance between each pair of points by using the...   \n",
+       "4  First, we find that $f(-3) = (-3) + 2 = -1$. Then, $$g(f(-3)) = g(...   \n",
+       "\n",
+       "  example_answer  \\\n",
+       "0              1   \n",
+       "1             -4   \n",
+       "2             -1   \n",
+       "3             10   \n",
+       "4              1   \n",
+       "\n",
+       "                                                          pred_reasoning  \\\n",
+       "0  The function \\( f(x) = \\frac{x^2 + 1}{x^2 - x + c} \\) will have a ...   \n",
+       "1  The equation \\( |{-x+3}|=7 \\) implies two possible cases: 1. \\(-x ...   \n",
+       "2  To evaluate \\(\\left\\lceil -\\frac{7}{4}\\right\\rceil\\), we first nee...   \n",
+       "3  To find the length of the sides of the triangle formed by the vert...   \n",
+       "4  To find \\( g(f(-3)) \\), we first need to evaluate \\( f(-3) \\). Usi...   \n",
+       "\n",
+       "  pred_answer     method  \n",
+       "0           1  ✔️ [True]  \n",
+       "1          -4  ✔️ [True]  \n",
+       "2          -1  ✔️ [True]  \n",
+       "3          10  ✔️ [True]  \n",
+       "4           1  ✔️ [True]  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "                <div style='\n",
+       "                    text-align: center;\n",
+       "                    font-size: 16px;\n",
+       "                    font-weight: bold;\n",
+       "                    color: #555;\n",
+       "                    margin: 10px 0;'>\n",
+       "                    ... 345 more rows not displayed ...\n",
+       "                </div>\n",
+       "                "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "88.57"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluate(optimized_react)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Neat. It was pretty straightforward to improve quality from 74% to over 88% on a held-out set here.\n",
+    "\n",
+    "That said, for reasoning tasks like this, you will often want to consider more advanced strategies, like:\n",
+    "\n",
+    "- A `dspy.ReAct` module with access to a calculator function or `dspy.PythonInterpreter`\n",
+    "- Ensembling multiple optimized prompts with a majority vote (or an Aggregator module) on top"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Just to understand what changed, let's view the prompt after optimization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\u001b[34m[2024-11-28T18:59:19.176586]\u001b[0m\n",
+      "\n",
+      "\u001b[31mSystem message:\u001b[0m\n",
+      "\n",
+      "Your input fields are:\n",
+      "1. `question` (str)\n",
+      "\n",
+      "Your output fields are:\n",
+      "1. `reasoning` (str)\n",
+      "2. `answer` (str)\n",
+      "\n",
+      "All interactions will be structured in the following way, with the appropriate values filled in.\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "{question}\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "{reasoning}\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "{answer}\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "In adhering to this structure, your objective is: \n",
+      "        Analyze the `question` provided, and systematically apply mathematical reasoning to derive the `answer`. Ensure to articulate each step of your thought process in the `reasoning` field, detailing how you identify relationships and formulate equations to arrive at the solution.\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "If $|4x+2|=10$ and $x<0$, what is the value of $x$?\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "The equation $|4x+2|=10$ implies two possible equations: $4x+2=10$ or $4x+2=-10$. We solve each equation separately.\n",
+      "\n",
+      "1. For $4x+2=10$:\n",
+      "   \\[\n",
+      "   4x + 2 = 10 \\\\\n",
+      "   4x = 8 \\\\\n",
+      "   x = 2\n",
+      "   \\]\n",
+      "   However, $x=2$ does not satisfy the condition $x<0$.\n",
+      "\n",
+      "2. For $4x+2=-10$:\n",
+      "   \\[\n",
+      "   4x + 2 = -10 \\\\\n",
+      "   4x = -12 \\\\\n",
+      "   x = -3\n",
+      "   \\]\n",
+      "   The solution $x = -3$ satisfies the condition $x<0$.\n",
+      "\n",
+      "Therefore, the value of $x$ is $\\boxed{-3}$.\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "-3\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "Given that $-4$ is a solution to $x^2 + bx -36 = 0$, what is the value of $b$?\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "The product of the roots of this quadratic is $-36/1=-36$, so the other solution must be $-36/-4=9$. That means that the sum of the solutions is $-4+9=5$. The sum of the solutions is also $-b/1=-b$. Thus, $-b=5$ and $b=\\boxed{-5}$.\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "-5\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "Richard is building a rectangular playground from 200 feet of fencing. The fencing must entirely enclose the playground. What is the maximum area of this playground?\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "Let the length of the playground be $l$ and the width be $w$. We have the equation $2l+2w=200 \\Rightarrow l + w = 100$. We want to maximize the area of this rectangular playground, which is given by $lw$. From our equation, we know that $l=100-w$. Substituting this in to our expression for area, we have \\[(100-w)(w)=100w-w^2\\]We will now complete the square to find the maximum value of this expression. Factoring a $-1$ out, we have \\[-(w^2-100w)\\]In order for the expression inside the parenthesis to be a perfect square, we need to add and subtract $(100/2)^2=2500$ inside the parenthesis. Doing this, we get \\[-(w^2-100w+2500-2500) \\Rightarrow -(w-50)^2+2500\\]Since the maximum value of $-(w-50)^2$ is 0 (perfect squares are always nonnegative), the maximum value of the entire expression is 2500, which is achieved when $w=50$ and $l=100-w=50$ (the playground is a square). Thus, the maximum area of the playground is $\\boxed{2500}$ square feet.\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "2500\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "Alice and Bob are playing a game. Alice starts first. On Alice's turn, she flips a coin. If she gets a heads, she wins. If not, it becomes Bob's turn. On Bob's turn, he flips a coin. If he gets a tails, he wins. If not, it becomes Alice's turn. What is the probability that Alice wins the game?\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "Alice has a $1/2$ chance of winning the game on her first turn. If she doesn't, then the probability that she wins the game on her second turn is $1/8,$ since she must not win on her first flip ($1/2$ chance), Bob must not win on his first flip ($1/2$ chance), and then Alice must win on her second flip ($1/2$ chance). The probability that she wins the game on her third turn is $1/32,$ and in general, the probability that she wins the game on her $k^\\text{th}$ turn is $(1/2)^{2k-1}.$ Thus, the probability that Alice wins is an infinite geometric series with first term $1/2$ and common ratio $1/4.$ So, the probability that Alice wins the game is $$\\frac{\\frac12}{1-\\frac14} = \\boxed{\\frac{2}{3}}.$$OR\n",
+      "\n",
+      "Note that the only difference between the odds of Alice or Bob winning is who goes first. Because Bob goes second, the odds of him winning on his $k^\\text{th}$ flip is half of the odds that Alice wins on her $k^\\text{th}$ flip, since Alice must first get a tails before Bob gets a chance to win. Thus, if $a$ is Alice's chance of winning, and $b$ is Bob's chance of winning, then $a = 2b.$ Also, since someone must win, $a + b = 1.$ It follows that $a = 2/3$ and $b = 1/3,$ so Alice has a $\\boxed{\\frac{2}{3}}$ chance of winning the game.\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "\\frac{2}{3}\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "Find the smallest possible value of $$\\frac{(y-x)^2}{(y-z)(z-x)} + \\frac{(z-y)^2}{(z-x)(x-y)} + \\frac{(x-z)^2}{(x-y)(y-z)},$$ where $x,y,$ and $z$ are distinct real numbers.\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mResponse:\u001b[0m\n",
+      "\n",
+      "\u001b[32m[[ ## reasoning ## ]]\n",
+      "Let \\( S = \\frac{(y-x)^2}{(y-z)(z-x)} + \\frac{(z-y)^2}{(z-x)(x-y)} + \\frac{(x-z)^2}{(x-y)(y-z)} \\).\n",
+      "\n",
+      "To find the minimum value of \\( S \\), we can use the method of symmetry and the Cauchy-Schwarz inequality. \n",
+      "\n",
+      "By applying the Cauchy-Schwarz inequality, we have:\n",
+      "\\[\n",
+      "\\left( \\frac{(y-x)^2}{(y-z)(z-x)} + \\frac{(z-y)^2}{(z-x)(x-y)} + \\frac{(x-z)^2}{(x-y)(y-z)} \\right) \\left( (y-z)(z-x) + (z-x)(x-y) + (x-y)(y-z) \\right) \\geq \\left( (y-x) + (z-y) + (x-z) \\right)^2.\n",
+      "\\]\n",
+      "The right-hand side simplifies to:\n",
+      "\\[\n",
+      "(y-x) + (z-y) + (x-z) = 0,\n",
+      "\\]\n",
+      "which means the right-hand side is \\( 0 \\). Therefore, we have:\n",
+      "\\[\n",
+      "S \\cdot \\left( (y-z)(z-x) + (z-x)(x-y) + (x-y)(y-z) \\right) \\geq 0.\n",
+      "\\]\n",
+      "This implies that \\( S \\) is non-negative.\n",
+      "\n",
+      "Next, we can test specific values for \\( x, y, z \\) to find the minimum. Let's set \\( x = 0, y = 1, z = 2 \\):\n",
+      "\\[\n",
+      "S = \\frac{(1-0)^2}{(1-2)(2-0)} + \\frac{(2-1)^2}{(2-0)(0-1)} + \\frac{(0-2)^2}{(0-1)(1-2)}.\n",
+      "\\]\n",
+      "Calculating each term:\n",
+      "1. The first term:\n",
+      "   \\[\n",
+      "   \\frac{1^2}{(-1)(2)} = \\frac{1}{-2} = -\\frac{1}{2}.\n",
+      "   \\]\n",
+      "2. The second term:\n",
+      "   \\[\n",
+      "   \\frac{1^2}{(2)(-1)} = \\frac{1}{-2} = -\\frac{1}{2}.\n",
+      "   \\]\n",
+      "3. The third term:\n",
+      "   \\[\n",
+      "   \\frac{(-2)^2}{(-1)(-1)} = \\frac{4}{1} = 4.\n",
+      "   \\]\n",
+      "\n",
+      "Thus,\n",
+      "\\[\n",
+      "S = -\\frac{1}{2} - \\frac{1}{2} + 4 = 3.\n",
+      "\\]\n",
+      "\n",
+      "To check if this is indeed the minimum, we can also consider the case when \\( x, y, z \\) are the roots of a quadratic polynomial. By symmetry and the nature of the expression, we can conclude that the minimum value of \\( S \\) is indeed \\( 3 \\).\n",
+      "\n",
+      "Therefore, the smallest possible value of \\( S \\) is \\( \\boxed{3} \\).\n",
+      "\n",
+      "[[ ## answer ## ]]\n",
+      "3\n",
+      "\n",
+      "[[ ## completed ## ]]\u001b[0m\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "dspy.inspect_history()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py310_sept24_user",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 3f842630b5..d69c4f342e 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -45,6 +45,7 @@ nav:
   - Tutorials:
       - Tutorials Overview: tutorials/index.md
       - Retrieval-Augmented Generation: tutorials/rag/index.ipynb
+      - Math Reasoning: tutorials/math/index.ipynb
       - Entity Extraction: tutorials/entity_extraction/index.ipynb
       - Deployment: tutorials/deployment/index.md
   - Community:
diff --git a/dspy/datasets/__init__.py b/dspy/datasets/__init__.py
index bf6e3dd019..e8bb0f3eba 100644
--- a/dspy/datasets/__init__.py
+++ b/dspy/datasets/__init__.py
@@ -2,3 +2,4 @@
 from .dataloader import DataLoader
 from .dataset import Dataset
 from .hotpotqa import HotPotQA
+from .math import MATH
\ No newline at end of file
diff --git a/dspy/datasets/math.py b/dspy/datasets/math.py
new file mode 100644
index 0000000000..e36c576288
--- /dev/null
+++ b/dspy/datasets/math.py
@@ -0,0 +1,63 @@
+import re
+import random
+
+
+class MATH:
+    """MATH problems as `dspy.Example`s, split into `train`/`dev`/`test`, plus an answer-equivalence `metric`."""
+
+    def __init__(self, subset):
+        """Load `subset` of "lighteval/MATH" and divide its shuffled 'test' split into train/dev/test."""
+        import dspy
+        from datasets import load_dataset
+
+        # NOTE: Defaults to sub-splitting MATH's 'test' split into train/dev/test, presuming that current
+        # LMs are trained on MATH's train. Makes no difference for gpt-4o-mini, but might for other models.
+        dataset = [
+            dspy.Example(
+                question=example["problem"], reasoning=example["solution"], answer=extract_answer(example["solution"])
+            ).with_inputs("question")
+            for example in load_dataset("lighteval/MATH", subset)["test"]
+        ]
+
+        random.Random(0).shuffle(dataset)  # fixed seed keeps the shuffle (and so the split) reproducible
+        size = min(350, len(dataset) // 3)  # train and dev get `size` examples each; test gets the rest
+        self.train, self.dev, self.test = dataset[:size], dataset[size:2*size], dataset[2*size:]
+
+    def metric(self, example, pred, trace=None):
+        """Return `math_equivalence.is_equiv(example.answer, pred.answer)`; `trace` is unused."""
+        try:
+            import math_equivalence
+        except ImportError as err:
+            raise ImportError("MATH's metric requires `pip install git+https://github.com/hendrycks/math.git`") from err
+        return math_equivalence.is_equiv(example.answer, pred.answer)
+
+
+def extract_answer(s):
+    r"""Return the contents of the first ``\boxed{...}`` in `s`, or None when `s` has no ``\boxed{``.
+
+    ``\text{...}`` groups and ``\!`` are dropped, unless that would empty the answer (then the text is kept).
+    """
+    marker = "\\boxed{"
+    start = s.find(marker)
+    if start == -1:
+        return None
+
+    depth, chars = 1, []  # depth counts unclosed "{"; it starts at 1 for the brace that opens \boxed{
+    for c in s[start + len(marker):]:
+        if c == "{":
+            depth += 1
+        elif c == "}":
+            depth -= 1
+            if depth == 0:
+                break
+        chars.append(c)
+
+    boxed = "".join(chars)
+    answer = re.sub(r"\\text\{[^}]*\}", "", boxed).replace("\\!", "").strip()
+    return answer or re.sub(r"\\text\{([^}]*)\}", r"\1", boxed).replace("\\!", "").strip()  # e.g. \text{even}
+
+
+r"""
+NOTE: MATH's official math_equivalence.is_equiv does not seem to have perfect recall.
+Consider its behavior on reference values like `\left[\frac{1}{2}, \frac{4}{3}\right]`.
+"""