From f79fe870004b3113239642523ef89a21e957a0c9 Mon Sep 17 00:00:00 2001
From: Omar Khattab <okhat@users.noreply.github.com>
Date: Sat, 23 Nov 2024 22:19:20 -0800
Subject: [PATCH] Update RAG tutorial to use the downsampled RAG-QA Arena
 upload

---
 docs/docs/index.md                  |    2 +-
 docs/docs/tutorials/rag/index.ipynb | 2082 ++++++++++++++++++++++-----
 dspy/evaluate/auto_evaluation.py    |    8 +-
 dspy/utils/__init__.py              |   14 +
 4 files changed, 1725 insertions(+), 381 deletions(-)

diff --git a/docs/docs/index.md b/docs/docs/index.md
index cb16c7c69d..81b0b7f28d 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -368,7 +368,7 @@ BootstrapFS on MATH with a tiny LM like Llama-3.2 with Ollama (maybe with a big
 
 ## 3) **DSPy's Ecosystem** advances open-source AI research.
 
-Compared to working on or with monolithic LMs, DSPy's modular paradigm aims to enable a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. It gives you more control, helps you iterate much faster, and allows your programs to get better over time by applying the latest optimizers or modules.
+Compared to monolithic LMs, DSPy's modular paradigm enables a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. This gives DSPy users more control, helps them iterate much faster, and allows their programs to get better over time by applying the latest optimizers or modules.
 
 The DSPy research effort started at Stanford NLP in Feb 2022, building on what we learned from developing early [compound LM systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) like [ColBERT-QA](https://arxiv.org/abs/2007.00814), [Baleen](https://arxiv.org/abs/2101.00436), and [Hindsight](https://arxiv.org/abs/2110.07752). The first version was released as [DSP](https://arxiv.org/abs/2212.14024) in Dec 2022 and evolved by Oct 2023 into [DSPy](https://arxiv.org/abs/2310.03714). Thanks to [250 contributors](https://github.com/stanfordnlp/dspy/graphs/contributors), DSPy has introduced tens of thousands of people to building and optimizing modular LM programs.
 
diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb
index 59f3b07d3e..a2dbe11087 100644
--- a/docs/docs/tutorials/rag/index.ipynb
+++ b/docs/docs/tutorials/rag/index.ipynb
@@ -55,11 +55,11 @@
      "text": [
       "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n",
       "\n",
-      "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n",
+      "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. Low memory is used for kernel data structures and for user processes that require direct access to memory.\n",
       "\n",
-      "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n",
+      "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n",
       "\n",
-      "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n"
+      "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n"
      ]
     }
    ],
@@ -94,7 +94,7 @@
       "\n",
       "\n",
       "\n",
-      "\u001b[34m[2024-11-10T12:39:19.458514]\u001b[0m\n",
+      "\u001b[34m[2024-11-23T22:12:48.901453]\u001b[0m\n",
       "\n",
       "\u001b[31mSystem message:\u001b[0m\n",
       "\n",
@@ -131,11 +131,11 @@
       "\u001b[32m[[ ## response ## ]]\n",
       "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n",
       "\n",
-      "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n",
+      "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. Low memory is used for kernel data structures and for user processes that require direct access to memory.\n",
       "\n",
-      "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n",
+      "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n",
       "\n",
-      "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n",
+      "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n",
       "\n",
       "[[ ## completed ## ]]\u001b[0m\n",
       "\n",
@@ -170,8 +170,8 @@
      "data": {
       "text/plain": [
        "Prediction(\n",
-       "    reasoning=\"The placement of curly braces on their own line is largely a matter of coding style and conventions. In some programming languages and style guides, such as those used in C, C++, and Java, it is common to place opening curly braces on the same line as the control statement (like `if`, `for`, etc.) and closing braces on a new line. However, other styles, such as the Allman style, advocate for placing both opening and closing braces on their own lines. Ultimately, the decision should be based on the team's coding standards or personal preference, as long as it maintains readability and consistency.\",\n",
-       "    response=\"Curly braces can either appear on their own line or not, depending on the coding style you choose to follow. It's important to adhere to a consistent style throughout your codebase.\"\n",
+       "    reasoning='The placement of curly braces on their own line depends on the coding style and conventions being followed. In some programming languages and style guides, such as the Allman style, curly braces are placed on their own line to enhance readability. In contrast, other styles, like K&R style, place the opening brace on the same line as the control statement. Ultimately, it is a matter of personal or team preference, and consistency within a project is key.',\n",
+       "    response='Curly braces can appear on their own line depending on the coding style you are following. If you prefer a style that enhances readability, such as the Allman style, then yes, they should be on their own line. However, if you are following a different style, like K&R, they may not need to be. Consistency is important, so choose a style and stick with it.'\n",
        ")"
       ]
      },
@@ -191,7 +191,7 @@
    "source": [
     "\n",
     "\n",
-    "Interestingly, asking for reasoning made the output `response` shorter in this case. Is this a good thing or a bad thing? It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n",
+    "Interestingly, asking for reasoning can make the output `response` shorter in some cases. Is this a good thing or a bad thing? It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n",
     "\n",
     "By the way, `dspy.ChainOfThought` is implemented in DSPy, using `dspy.Predict`. This is a good place to `dspy.inspect_history` if you're curious.\n"
    ]
@@ -220,25 +220,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "import ujson\n",
-    "import requests\n",
+    "from dspy.utils import download\n",
     "\n",
-    "def download(url):\n",
-    "    filename = os.path.basename(url)\n",
-    "    remote_size = int(requests.head(url, allow_redirects=True).headers.get('Content-Length', 0))\n",
-    "    local_size = os.path.getsize(filename) if os.path.exists(filename) else 0\n",
+    "# Download question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n",
+    "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl\")\n",
     "\n",
-    "    if local_size != remote_size:\n",
-    "        print(f\"Downloading '{filename}'...\")\n",
-    "        with requests.get(url, stream=True) as r, open(filename, 'wb') as f:\n",
-    "            for chunk in r.iter_content(chunk_size=8192): f.write(chunk)\n",
-    "\n",
-    "# Download 500 question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n",
-    "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_500.json\")\n",
-    "\n",
-    "with open('ragqa_arena_tech_500.json') as f:\n",
-    "    data = ujson.load(f)"
+    "with open(\"ragqa_arena_tech_examples.jsonl\") as f:\n",
+    "    data = [ujson.loads(line) for line in f]"
    ]
   },
   {
@@ -249,8 +238,9 @@
     {
      "data": {
       "text/plain": [
-       "{'question': 'how to transfer whatsapp voice message to computer?',\n",
-       " 'response': 'To transfer voice notes from WhatsApp on your device to your computer, you have the option to select the \"Share\" feature within the app and send the files via Email, Gmail, Bluetooth, or other available services.  \\nYou can also move the files onto your phone\\'s SD card, connect your phone to your computer via a USB cable, then find and transfer the files via File Explorer on your PC. \\nAlternatively, you can choose to attach all the desired voice notes to an email and, from your phone, send them to your own email address.  \\nUpon receiving the email on your computer, you can then download the voice note attachments.'}"
+       "{'question': 'why igp is used in mpls?',\n",
+       " 'response': \"An IGP exchanges routing prefixes between gateways/routers.  \\nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \\nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.\",\n",
+       " 'gold_doc_ids': [2822, 2823]}"
       ]
      },
      "execution_count": 6,
@@ -282,7 +272,7 @@
     {
      "data": {
       "text/plain": [
-       "Example({'question': 'what are high memory and low memory on linux?', 'response': '\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn\\'t permanently mapped in the kernel\\'s space, while \"Low Memory\" is the kernel\\'s space, which the kernel can address directly and is permanently mapped. \\nThe user cannot access the Low Memory as it is set aside for the required kernel programs.'}) (input_keys={'question'})"
+       "Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \\n\\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})"
       ]
      },
      "execution_count": 7,
@@ -305,11 +295,12 @@
     "\n",
     "Now, let's divide the data into:\n",
     "\n",
-    "- Training and Validation sets:\n",
+    "- Training (and with it Validation) set:\n",
     "    - These are the splits you typically give to DSPy optimizers.\n",
     "    - Optimizers typically learn directly from the training examples and check their progress using the validation examples.\n",
     "    - It's good to have 30--300 examples for training and validation each.\n",
     "    - For prompt optimizers in particular, it's often better to pass _more_ validation than training.\n",
+    "    - Below, we'll use 200 examples in total. MIPROv2 will split them into 20% training and 80% validation if you don't pass a valset.\n",
     "\n",
     "- Development and Test sets: The rest, typically on the order of 30--1000, can be used for:\n",
     "    - development (i.e., you can inspect them as you iterate on your system) and\n",
@@ -324,7 +315,7 @@
     {
      "data": {
       "text/plain": [
-       "(50, 100, 150, 200)"
+       "(200, 300, 500)"
       ]
      },
      "execution_count": 8,
@@ -333,9 +324,12 @@
     }
    ],
    "source": [
-    "trainset, valset, devset, testset = data[:50], data[50:150], data[150:300], data[300:500]\n",
+    "import random\n",
     "\n",
-    "len(trainset), len(valset), len(devset), len(testset)"
+    "random.Random(0).shuffle(data)\n",
+    "trainset, devset, testset = data[:200], data[200:500], data[500:1000]\n",
+    "\n",
+    "len(trainset), len(devset), len(testset)"
    ]
   },
   {
@@ -346,8 +340,7 @@
     "\n",
     "What kind of metric can suit our question-answering task? There are many choices, but since the answers are long, we may ask: How well does the system response _cover_ all key facts in the gold response? And the other way around, how well is the system response _not saying things_ that aren't in the gold response?\n",
     "\n",
-    "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/77c2e1cceba427c7f91edb2ed5653276fb0c6de7/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with.\n",
-    "\n"
+    "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/main/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with."
    ]
   },
   {
@@ -359,14 +352,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Question: \t what are high memory and low memory on linux?\n",
+      "Question: \t why are my text messages coming up as maybe?\n",
       "\n",
-      "Gold Response: \t \"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. \n",
-      "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n",
+      "Gold Response: \t This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n",
       "\n",
-      "Predicted Response: \t In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n",
+      "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n",
       "\n",
-      "Semantic F1 Score: 0.87\n"
+      "Predicted Response: \t Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. To resolve this, you can save the contact in your address book or check the message settings in your app.\n",
+      "\n",
+      "Semantic F1 Score: 0.33\n"
      ]
     }
    ],
@@ -374,7 +368,7 @@
     "from dspy.evaluate import SemanticF1\n",
     "\n",
     "# Instantiate the metric.\n",
-    "metric = SemanticF1()\n",
+    "metric = SemanticF1(decompositional=True)\n",
     "\n",
     "# Produce a prediction from our `cot` module, using the `example` above as input.\n",
     "pred = cot(**example.inputs())\n",
@@ -410,7 +404,7 @@
       "\n",
       "\n",
       "\n",
-      "\u001b[34m[2024-11-10T12:39:19.701005]\u001b[0m\n",
+      "\u001b[34m[2024-11-23T22:12:49.329836]\u001b[0m\n",
       "\n",
       "\u001b[31mSystem message:\u001b[0m\n",
       "\n",
@@ -421,8 +415,11 @@
       "\n",
       "Your output fields are:\n",
       "1. `reasoning` (str)\n",
-      "2. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n",
-      "3. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n",
+      "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n",
+      "3. `system_response_key_ideas` (str): enumeration of key ideas in the system response\n",
+      "4. `discussion` (str): discussion of the overlap between ground truth and system response\n",
+      "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n",
+      "6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n",
       "\n",
       "All interactions will be structured in the following way, with the appropriate values filled in.\n",
       "\n",
@@ -438,6 +435,15 @@
       "[[ ## reasoning ## ]]\n",
       "{reasoning}\n",
       "\n",
+      "[[ ## ground_truth_key_ideas ## ]]\n",
+      "{ground_truth_key_ideas}\n",
+      "\n",
+      "[[ ## system_response_key_ideas ## ]]\n",
+      "{system_response_key_ideas}\n",
+      "\n",
+      "[[ ## discussion ## ]]\n",
+      "{discussion}\n",
+      "\n",
       "[[ ## recall ## ]]\n",
       "{recall}        # note: the value you produce must be a single float value\n",
       "\n",
@@ -447,35 +453,50 @@
       "[[ ## completed ## ]]\n",
       "\n",
       "In adhering to this structure, your objective is: \n",
-      "        Compare a system's response to the ground truth to compute its recall and precision.\n",
-      "        If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.\n",
+      "        Compare a system's response to the ground truth to compute recall and precision of key ideas.\n",
+      "        You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n",
       "\n",
       "\n",
       "\u001b[31mUser message:\u001b[0m\n",
       "\n",
       "[[ ## question ## ]]\n",
-      "what are high memory and low memory on linux?\n",
+      "why are my text messages coming up as maybe?\n",
       "\n",
       "[[ ## ground_truth ## ]]\n",
-      "\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. \n",
-      "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n",
+      "This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n",
+      "\n",
+      "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n",
       "\n",
       "[[ ## system_response ## ]]\n",
-      "In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n",
+      "Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. To resolve this, you can save the contact in your address book or check the message settings in your app.\n",
       "\n",
-      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n",
       "\n",
       "\n",
       "\u001b[31mResponse:\u001b[0m\n",
       "\n",
       "\u001b[32m[[ ## reasoning ## ]]\n",
-      "The system response correctly identifies \"Low Memory\" as the portion of RAM directly accessible by the kernel and used for kernel operations, which aligns with the ground truth. It also mentions that \"High Memory\" refers to memory above a certain limit that is not directly accessible by the kernel, which is consistent with the ground truth's description of user space. However, the ground truth emphasizes that Low Memory is set aside for kernel programs and that users cannot access it, which is not explicitly stated in the system response. Overall, the key ideas are present, but the system response lacks the explicit mention of user access limitations for Low Memory.\n",
+      "The system response provides a general explanation for why text messages may show up as \"maybe,\" focusing on the uncertainty of the sender's identity. However, it does not mention the specific iOS features or the potential bug in iOS 11.2 that are highlighted in the ground truth. The ground truth explains that this feature is part of iOS 9's Proactivity features and mentions a specific bug in iOS 11.2, which is not addressed in the system response.\n",
+      "\n",
+      "[[ ## ground_truth_key_ideas ## ]]\n",
+      "1. Proactivity features introduced in iOS 9.\n",
+      "2. The system checks emails to associate phone numbers with contacts.\n",
+      "3. \"Maybe\" is displayed if the number is not saved in contacts.\n",
+      "4. Mention of a bug in iOS 11.2 causing \"Maybe\" to appear incorrectly.\n",
+      "\n",
+      "[[ ## system_response_key_ideas ## ]]\n",
+      "1. Text messages show up as \"maybe\" due to uncertainty about the sender's identity.\n",
+      "2. Occurs when the sender's number is not saved in contacts or is from an unknown number.\n",
+      "3. Suggests saving the contact or checking message settings.\n",
+      "\n",
+      "[[ ## discussion ## ]]\n",
+      "There is some overlap between the ground truth and the system response regarding the uncertainty of the sender's identity and the suggestion to save the contact. However, the system response lacks specific details about the iOS features and the bug mentioned in the ground truth. The ground truth provides a more comprehensive explanation of the \"maybe\" feature, while the system response is more general and does not address the iOS version specifics.\n",
       "\n",
       "[[ ## recall ## ]]\n",
-      "0.85\n",
+      "0.25\n",
       "\n",
       "[[ ## precision ## ]]\n",
-      "0.90\n",
+      "0.5\n",
       "\n",
       "[[ ## completed ## ]]\u001b[0m\n",
       "\n",
@@ -502,12 +523,25 @@
    "execution_count": 11,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 125.68 / 300 (41.9%): 100%|██████████| 300/300 [00:00<00:00, 598.18it/s]"
+     ]
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Average Metric: 55.380830691218016 / 150  (36.9): 100%|██████████| 150/150 [00:00<00:00, 513.51it/s]\n",
-      "2024/11/10 12:39:20 INFO dspy.evaluate.evaluate: Average Metric: 55.380830691218016 / 150 (36.9%)\n"
+      "2024/11/23 22:12:49 INFO dspy.evaluate.evaluate: Average Metric: 125.68228336477591 / 300 (41.9%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
      ]
     },
     {
@@ -533,6 +567,7 @@
        "      <th></th>\n",
        "      <th>question</th>\n",
        "      <th>example_response</th>\n",
+       "      <th>gold_doc_ids</th>\n",
        "      <th>reasoning</th>\n",
        "      <th>pred_response</th>\n",
        "      <th>SemanticF1</th>\n",
@@ -541,44 +576,50 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>why is mercurial considered to be easier than git?</td>\n",
-       "      <td>Mercurial's syntax is considered more familiar, especially for tho...</td>\n",
-       "      <td>Mercurial is often considered easier than Git for several reasons....</td>\n",
-       "      <td>Mercurial is considered easier than Git primarily due to its simpl...</td>\n",
-       "      <td>✔️ [0.545]</td>\n",
+       "      <td>when to use c over c++, and c++ over c?</td>\n",
+       "      <td>If you are equally familiar with both C++ and C, it's advisable to...</td>\n",
+       "      <td>[733]</td>\n",
+       "      <td>C and C++ are both powerful programming languages, but they serve ...</td>\n",
+       "      <td>Use C when you need low-level access to memory, require high perfo...</td>\n",
+       "      <td></td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>open finder window from current terminal location?</td>\n",
-       "      <td>If you type 'open .' in Terminal, it will open the current directo...</td>\n",
-       "      <td>To open a Finder window from the current terminal location on a Ma...</td>\n",
-       "      <td>You can open a Finder window from your current terminal location b...</td>\n",
-       "      <td>✔️ [0.667]</td>\n",
+       "      <td>should images be stored in a git repository?</td>\n",
+       "      <td>One viewpoint expresses that there is no significant downside, esp...</td>\n",
+       "      <td>[6253, 6254, 6275, 6278, 8215]</td>\n",
+       "      <td>Storing images in a Git repository can be beneficial for version c...</td>\n",
+       "      <td>Images can be stored in a Git repository, but it's important to co...</td>\n",
+       "      <td>✔️ [0.444]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                             question  \\\n",
-       "0  why is mercurial considered to be easier than git?   \n",
-       "1  open finder window from current terminal location?   \n",
+       "                                       question  \\\n",
+       "0       when to use c over c++, and c++ over c?   \n",
+       "1  should images be stored in a git repository?   \n",
        "\n",
        "                                                        example_response  \\\n",
-       "0  Mercurial's syntax is considered more familiar, especially for tho...   \n",
-       "1  If you type 'open .' in Terminal, it will open the current directo...   \n",
+       "0  If you are equally familiar with both C++ and C, it's advisable to...   \n",
+       "1  One viewpoint expresses that there is no significant downside, esp...   \n",
+       "\n",
+       "                     gold_doc_ids  \\\n",
+       "0                           [733]   \n",
+       "1  [6253, 6254, 6275, 6278, 8215]   \n",
        "\n",
        "                                                               reasoning  \\\n",
-       "0  Mercurial is often considered easier than Git for several reasons....   \n",
-       "1  To open a Finder window from the current terminal location on a Ma...   \n",
+       "0  C and C++ are both powerful programming languages, but they serve ...   \n",
+       "1  Storing images in a Git repository can be beneficial for version c...   \n",
        "\n",
        "                                                           pred_response  \\\n",
-       "0  Mercurial is considered easier than Git primarily due to its simpl...   \n",
-       "1  You can open a Finder window from your current terminal location b...   \n",
+       "0  Use C when you need low-level access to memory, require high perfo...   \n",
+       "1  Images can be stored in a Git repository, but it's important to co...   \n",
        "\n",
        "   SemanticF1  \n",
-       "0  ✔️ [0.545]  \n",
-       "1  ✔️ [0.667]  "
+       "0              \n",
+       "1  ✔️ [0.444]  "
       ]
      },
      "metadata": {},
@@ -594,7 +635,7 @@
        "                    font-weight: bold;\n",
        "                    color: #555;\n",
        "                    margin: 10px 0;'>\n",
-       "                    ... 148 more rows not displayed ...\n",
+       "                    ... 298 more rows not displayed ...\n",
        "                </div>\n",
        "                "
       ],
@@ -608,7 +649,7 @@
     {
      "data": {
       "text/plain": [
-       "36.92"
+       "41.89"
       ]
      },
      "execution_count": 11,
@@ -640,7 +681,7 @@
    "source": [
     "## Basic Retrieval-Augmented Generation (RAG).\n",
     "\n",
-    "First, let's download the corpus data that we will use for RAG search. The next cell will seek to download 4 GBs, so it may take a few minutes. A future version of this notebook will come with a cache that allows you to skip downloads and the PyTorch installation."
+    "First, let's download the corpus data that we will use for RAG search. An older version of this tutorial used the full corpus of 650,000 documents. To make the tutorial fast and cheap to run, we've downsampled the corpus to about 28,000 documents."
    ]
   },
   {
@@ -649,8 +690,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "download('https://huggingface.co/datasets/colbertv2/lotte_passages/resolve/main/technology/test_collection.jsonl')\n",
-    "download('https://huggingface.co/dspy/cache/resolve/main/index.pt')"
+    "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl\")"
    ]
   },
   {
@@ -659,31 +699,33 @@
    "source": [
     "## Set up your system's retriever.\n",
     "\n",
-    "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Hence, for our RAG system, we can plug any tools for the search step. Here, we'll just use OpenAI Embeddings and PyTorch for top-K search, but this is not a special choice, just a convenient one."
+    "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Here, we'll use OpenAI Embeddings and do top-K search locally, purely for convenience."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 28436 documents. Will encode them below.\n",
+      "Training a 32-byte FAISS index with 337 partitions, based on 28436 x 512-dim embeddings\n"
+     ]
+    }
+   ],
    "source": [
-    "import torch\n",
-    "import functools\n",
-    "from litellm import embedding as Embed\n",
-    "\n",
-    "with open(\"test_collection.jsonl\") as f:\n",
-    "    corpus = [ujson.loads(line) for line in f]\n",
-    "\n",
-    "index = torch.load('index.pt', weights_only=True)\n",
-    "max_characters = 4000 # >98th percentile of document lengths\n",
-    "\n",
-    "@functools.lru_cache(maxsize=None)\n",
-    "def search(query, k=5):\n",
-    "    query_embedding = torch.tensor(Embed(input=query, model=\"text-embedding-3-small\").data[0]['embedding'])\n",
-    "    topk_scores, topk_indices = torch.matmul(index, query_embedding).topk(k)\n",
-    "    topK = [dict(score=score.item(), **corpus[idx]) for idx, score in zip(topk_indices, topk_scores)]\n",
-    "    return [doc['text'][:max_characters] for doc in topK]"
+    "max_characters = 6000  # for truncating >99th percentile of documents\n",
+    "topk_docs_to_retrieve = 5  # number of documents to retrieve per search query\n",
+    "\n",
+    "with open(\"ragqa_arena_tech_corpus.jsonl\") as f:\n",
+    "    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]\n",
+    "    print(f\"Loaded {len(corpus)} documents. Will encode them below.\")\n",
+    "\n",
+    "embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)\n",
+    "search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)"
    ]
   },
   {
@@ -707,12 +749,11 @@
    "outputs": [],
    "source": [
     "class RAG(dspy.Module):\n",
-    "    def __init__(self, num_docs=5):\n",
-    "        self.num_docs = num_docs\n",
+    "    def __init__(self):\n",
     "        self.respond = dspy.ChainOfThought('context, question -> response')\n",
     "\n",
     "    def forward(self, question):\n",
-    "        context = search(question, k=self.num_docs)\n",
+    "        context = search(question).passages\n",
     "        return self.respond(context=context, question=question)"
    ]
   },
@@ -733,8 +774,8 @@
      "data": {
       "text/plain": [
        "Prediction(\n",
-       "    reasoning=\"High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\",\n",
-       "    response=\"In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\"\n",
+       "    reasoning=\"High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\",\n",
+       "    response=\"In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\"\n",
        ")"
       ]
      },
@@ -761,7 +802,7 @@
       "\n",
       "\n",
       "\n",
-      "\u001b[34m[2024-11-10T12:39:22.802994]\u001b[0m\n",
+      "\u001b[34m[2024-11-23T22:13:02.348625]\u001b[0m\n",
       "\n",
       "\u001b[31mSystem message:\u001b[0m\n",
       "\n",
@@ -797,10 +838,10 @@
       "\n",
       "[[ ## context ## ]]\n",
       "[1] «As far as I remember, High Memory is used for application space and Low Memory for the kernel. Advantage is that (user-space) applications cant access kernel-space memory.»\n",
-      "[2] «For the people looking for an explanation in the context of Linux kernel memory space, beware that there are two conflicting definitions of the high/low memory split (unfortunately there is no standard, one has to interpret that in context): High memory defined as the totality of kernel space in VIRTUAL memory. This is a region that only the kernel can access and comprises all virtual addresses greater or equal than PAGE_OFFSET. Low memory refers therefore to the region of the remaining addresses, which correspond to the user-space memory accessible from each user process. For example: on 32-bit x86 with a default PAGE_OFFSET, this means that high memory is any address ADDR with ADDR ≥ 0xC0000000 = PAGE_OFFSET (i.e. higher 1 GB). This is the reason why in Linux 32-bit processes are typically limited to 3 GB. Note that PAGE_OFFSET cannot be configured directly, it depends on the configurable VMSPLIT_x options (source). To summarize: in 32-bit archs, virtual memory is by default split into lower 3 GB (user space) and higher 1 GB (kernel space). For 64 bit, PAGE_OFFSET is not configurable and depends on architectural details that are sometimes detected at runtime during kernel load. On x86_64, PAGE_OFFSET is 0xffff888000000000 for 4-level paging (typical) and 0xff11000000000000 for 5-level paging (source). For ARM64 this is usually 0x8000000000000000. Note though, if KASLR is enabled, this value is intentionally unpredictable. High memory defined as the portion of PHYSICAL memory that cannot be mapped contiguously with the rest of the kernel virtual memory. A portion of the kernel virtual address space can be mapped as a single contiguous chunk into the so-called physical low memory. To fully understand what this means, a deeper knowledge of the Linux virtual memory space is required. I would recommend going through these slides. 
From the slides: This kind of high/low memory split is only applicable to 32-bit architectures where the installed physical RAM size is relatively high (more than ~1 GB). Otherwise, i.e. when the physical address space is small (<1 GB) or when the virtual memory space is large (64 bits), the whole physical space can be accessed from the kernel virtual memory space. In that case, all physical memory is considered low memory. It is preferable that high memory does not exist at all because the whole physical space can be accessed directly from the kernel, which makes memory management a lot simpler and efficient. This is especially important when dealing with DMAs (which typically require physically contiguous memory). See also the answer by @gilles»\n",
-      "[3] «Low and High do not refer to whether there is a lot of usage or not. They represent the way it is organized by the system. According to Wikipedia: High Memory is the part of physical memory in a computer which is not directly mapped by the page tables of its operating system kernel. There is no duration for the free command which simply computes a snapshot of the information available. Most people, including programmers, do not need to understand it more clearly as it is managed in a much simpler form through system calls and compiler/interpreter operations.»\n",
-      "[4] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n",
-      "[5] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n",
+      "[2] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n",
+      "[3] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n",
+      "[4] «The first reference to turn to is Linux Device Drivers (available both online and in book form), particularly chapter 15 which has a section on the topic. In an ideal world, every system component would be able to map all the memory it ever needs to access. And this is the case for processes on Linux and most operating systems: a 32-bit process can only access a little less than 2^32 bytes of virtual memory (in fact about 3GB on a typical Linux 32-bit architecture). It gets difficult for the kernel, which needs to be able to map the full memory of the process whose system call its executing, plus the whole physical memory, plus any other memory-mapped hardware device. So when a 32-bit kernel needs to map more than 4GB of memory, it must be compiled with high memory support. High memory is memory which is not permanently mapped in the kernels address space. (Low memory is the opposite: it is always mapped, so you can access it in the kernel simply by dereferencing a pointer.) When you access high memory from kernel code, you need to call kmap first, to obtain a pointer from a page data structure (struct page). Calling kmap works whether the page is in high or low memory. There is also kmap_atomic which has added constraints but is more efficient on multiprocessor machines because it uses finer-grained locking. The pointer obtained through kmap is a resource: it uses up address space. Once youve finished with it, you must call kunmap (or kunmap_atomic) to free that resource; then the pointer is no longer valid, and the contents of the page cant be accessed until you call kmap again.»\n",
+      "[5] «/proc/meminfo will tell you how free works, but /proc/kcore can tell you what the kernel uses. From the same page: /proc/kcore This file represents the physical memory of the system and is stored in the ELF core file format. With this pseudo-file, and an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data structures. The total length of the file is the size of physical memory (RAM) plus 4KB. /proc/meminfo This file reports statistics about memory usage on the system. It is used by free(1) to report the amount of free and used memory (both physical and swap) on the system as well as the shared memory and buffers used by the kernel. Each line of the file consists of a parameter name, followed by a colon, the value of the parameter, and an option unit of measurement (e.g., kB). The list below describes the parameter names and the format specifier required to read the field value. Except as noted below, all of the fields have been present since at least Linux 2.6.0. Some fileds are displayed only if the kernel was configured with various options; those dependencies are noted in the list. MemTotal %lu Total usable RAM (i.e., physical RAM minus a few reserved bits and the kernel binary code). MemFree %lu The sum of LowFree+HighFree. Buffers %lu Relatively temporary storage for raw disk blocks that shouldnt get tremendously large (20MB or so). Cached %lu In-memory cache for files read from the disk (the page cache). Doesnt include SwapCached. SwapCached %lu Memory that once was swapped out, is swapped back in but still also is in the swap file. (If memory pressure is high, these pages dont need to be swapped out again because they are already in the swap file. This saves I/O.) Active %lu Memory that has been used more recently and usually not reclaimed unless absolutely necessary. Inactive %lu Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. 
Active(anon) %lu (since Linux 2.6.28) [To be documented.] Inactive(anon) %lu (since Linux 2.6.28) [To be documented.] Active(file) %lu (since Linux 2.6.28) [To be documented.] Inactive(file) %lu (since Linux 2.6.28) [To be documented.] Unevictable %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] Mlocked %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] HighTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of highmem. Highmem is all memory above ~860MB of physical memory. Highmem areas are for use by user-space programs, or for the page cache. The kernel must use tricks to access this memory, making it slower to access than lowmem. HighFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free highmem. LowTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of lowmem. Lowmem is memory which can be used for everything that highmem can be used for, but it is also available for the kernels use for its own data structures. Among many other things, it is where everything from Slab is allocated. Bad things happen when youre out of lowmem. LowFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free lowmem. MmapCopy %lu (since Linux 2.6.29) (CONFIG_MMU is required.) [To be documented.] SwapTotal %lu Total amount of swap space available. SwapFree %lu Amount of swap space that is currently unused. Dirty %lu Memory which is waiting to get written back to the disk. Writeback %lu Memory which is actively being written back to the disk. AnonPages %lu (since Linux 2.6.18) Non-file backed pages mapped into user-space page tables. Mapped %lu Files which have been mmaped, such as libraries. Shmem %lu (since Linux 2.6.32) [To be documented.] Slab %lu In-kernel data structures cache. 
SReclaimable %lu (since Linux 2.6.19) Part of Slab, that might be reclaimed, such as caches. SUnreclaim %lu (since Linux 2.6.19) Part of Slab, that cannot be reclaimed on memory pressure. KernelStack %lu (since Linux 2.6.32) Amount of memory allocated to kernel stacks. PageTables %lu (since Linux 2.6.18) Amount of memory dedicated to the lowest level of page tables. Quicklists %lu (since Linux 2.6.27) (CONFIG_QUICKLIST is required.) [To be documented.] NFS_Unstable %lu (since Linux 2.6.18) NFS pages sent to the server, but not yet committed to stable storage. Bounce %lu (since Linux 2.6.18) Memory used for block device bounce buffers. WritebackTmp %lu (since Linux 2.6.26) Memory used by FUSE for temporary writeback buffers. CommitLimit %lu (since Linux 2.6.10) Based on the overcommit ratio (vm.overcommit_ratio), this is the total amount of memory currently available to be allocated on the system. This limit is adhered to only if strict overcommit accounting is enabled (mode 2 in /proc/sys/vm/overcommit_ratio). The CommitLimit is calculated using the following formula: CommitLimit = ([total RAM pages] - [total huge TLB pages]) * overcommit_ratio / 100 + [total swap pages] For example, on a system with 1GB of physical RAM and 7GB of swap with a overcommit_ratio of 30, this formula yields a CommitLimit of 7.3GB. For more details, see the memory overcommit documentation in the kernel source file Documentation/vm/overcommit-accounting. Committed_AS %lu The amount of memory presently allocated on the system. The committed memory is a sum of all of the memory which has been allocated by processes, even if it has not been used by them as of yet. A process which allocates 1GB of memory (using malloc(3) or similar), but touches only 300MB of that memory will show up as using only 300MB of memory even if it has the address space allocated for the entire 1GB. This 1GB is memory which has been committed to by the VM and can be used at any time by the allocating application. 
With strict overcommit enabled on the system (mode 2 /proc/sys/vm/overcommit_memory), allocations w»\n",
       "\n",
       "[[ ## question ## ]]\n",
       "what are high memory and low memory on linux?\n",
@@ -811,10 +852,10 @@
       "\u001b[31mResponse:\u001b[0m\n",
       "\n",
       "\u001b[32m[[ ## reasoning ## ]]\n",
-      "High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\n",
+      "High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\n",
       "\n",
       "[[ ## response ## ]]\n",
-      "In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\n",
+      "In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\n",
       "\n",
       "[[ ## completed ## ]]\u001b[0m\n",
       "\n",
@@ -833,7 +874,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Earlier with a CoT module, we got nearly 37% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?"
+    "Earlier with a CoT module, we got around 42% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?"
    ]
   },
   {
@@ -841,12 +882,25 @@
    "execution_count": 17,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 166.39 / 300 (55.5%): 100%|██████████| 300/300 [00:14<00:00, 20.29it/s]"
+     ]
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Average Metric: 74.61311832900337 / 150  (49.7): 100%|██████████| 150/150 [00:05<00:00, 27.92it/s] \n",
-      "2024/11/10 12:39:28 INFO dspy.evaluate.evaluate: Average Metric: 74.61311832900337 / 150 (49.7%)\n"
+      "2024/11/23 22:13:17 INFO dspy.evaluate.evaluate: Average Metric: 166.39410892098812 / 300 (55.5%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
      ]
     },
     {
@@ -872,6 +926,7 @@
        "      <th></th>\n",
        "      <th>question</th>\n",
        "      <th>example_response</th>\n",
+       "      <th>gold_doc_ids</th>\n",
        "      <th>reasoning</th>\n",
        "      <th>pred_response</th>\n",
        "      <th>SemanticF1</th>\n",
@@ -880,44 +935,50 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>why is mercurial considered to be easier than git?</td>\n",
-       "      <td>Mercurial's syntax is considered more familiar, especially for tho...</td>\n",
-       "      <td>Mercurial is considered easier than Git for several reasons. First...</td>\n",
-       "      <td>Mercurial is considered easier than Git because it has a more fami...</td>\n",
-       "      <td>✔️ [0.797]</td>\n",
+       "      <td>when to use c over c++, and c++ over c?</td>\n",
+       "      <td>If you are equally familiar with both C++ and C, it's advisable to...</td>\n",
+       "      <td>[733]</td>\n",
+       "      <td>C should be used over C++ primarily in scenarios where simplicity ...</td>\n",
+       "      <td>Use C over C++ when working on embedded systems, requiring low-lev...</td>\n",
+       "      <td>✔️ [0.500]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>open finder window from current terminal location?</td>\n",
-       "      <td>If you type 'open .' in Terminal, it will open the current directo...</td>\n",
-       "      <td>To open a Finder window from the current terminal location, you ca...</td>\n",
-       "      <td>You can open a Finder window from your current terminal location b...</td>\n",
-       "      <td>✔️ [0.667]</td>\n",
+       "      <td>should images be stored in a git repository?</td>\n",
+       "      <td>One viewpoint expresses that there is no significant downside, esp...</td>\n",
+       "      <td>[6253, 6254, 6275, 6278, 8215]</td>\n",
+       "      <td>Storing images in a Git repository is generally not recommended du...</td>\n",
+       "      <td>While it is technically possible to store images in a Git reposito...</td>\n",
+       "      <td>✔️ [0.444]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                             question  \\\n",
-       "0  why is mercurial considered to be easier than git?   \n",
-       "1  open finder window from current terminal location?   \n",
+       "                                       question  \\\n",
+       "0       when to use c over c++, and c++ over c?   \n",
+       "1  should images be stored in a git repository?   \n",
        "\n",
        "                                                        example_response  \\\n",
-       "0  Mercurial's syntax is considered more familiar, especially for tho...   \n",
-       "1  If you type 'open .' in Terminal, it will open the current directo...   \n",
+       "0  If you are equally familiar with both C++ and C, it's advisable to...   \n",
+       "1  One viewpoint expresses that there is no significant downside, esp...   \n",
+       "\n",
+       "                     gold_doc_ids  \\\n",
+       "0                           [733]   \n",
+       "1  [6253, 6254, 6275, 6278, 8215]   \n",
        "\n",
        "                                                               reasoning  \\\n",
-       "0  Mercurial is considered easier than Git for several reasons. First...   \n",
-       "1  To open a Finder window from the current terminal location, you ca...   \n",
+       "0  C should be used over C++ primarily in scenarios where simplicity ...   \n",
+       "1  Storing images in a Git repository is generally not recommended du...   \n",
        "\n",
        "                                                           pred_response  \\\n",
-       "0  Mercurial is considered easier than Git because it has a more fami...   \n",
-       "1  You can open a Finder window from your current terminal location b...   \n",
+       "0  Use C over C++ when working on embedded systems, requiring low-lev...   \n",
+       "1  While it is technically possible to store images in a Git reposito...   \n",
        "\n",
        "   SemanticF1  \n",
-       "0  ✔️ [0.797]  \n",
-       "1  ✔️ [0.667]  "
+       "0  ✔️ [0.500]  \n",
+       "1  ✔️ [0.444]  "
       ]
      },
      "metadata": {},
@@ -933,7 +994,7 @@
        "                    font-weight: bold;\n",
        "                    color: #555;\n",
        "                    margin: 10px 0;'>\n",
-       "                    ... 148 more rows not displayed ...\n",
+       "                    ... 298 more rows not displayed ...\n",
        "                </div>\n",
        "                "
       ],
@@ -947,7 +1008,7 @@
     {
      "data": {
       "text/plain": [
-       "49.74"
+       "55.46"
       ]
      },
      "execution_count": 17,
@@ -965,7 +1026,7 @@
    "source": [
     "## Using a DSPy Optimizer to improve your RAG prompt.\n",
     "\n",
-    "Off the shelf, our `RAG` module scores nearly 50%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n",
+    "Off the shelf, our `RAG` module scores 55%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n",
     "\n",
     "If there are many sub-modules in your program, all of them will be optimized together. In this case, there's only one: `self.respond = dspy.ChainOfThought('context, question -> response')`\n",
     "\n",
@@ -974,260 +1035,1259 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24)  # use fewer threads if your rate limit is small\n",
-    "\n",
-    "optimized_rag = tp.compile(RAG(), trainset=trainset, valset=valset,\n",
-    "                           max_bootstrapped_demos=2, max_labeled_demos=2,\n",
-    "                           requires_permission_to_run=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The prompt optimization process here is pretty systematic, you can learn about it for example in this paper. Importantly, it's not a magic button. It's very possible that it can overfit your training set for instance and not generalize well to a held-out set, making it essential that we iteratively validate our programs.\n",
-    "\n",
-    "Let's check on an example here, asking the same question to the baseline `rag = RAG()` program, which was not optimized, and to the `optimized_rag = MIPROv2(..)(..)` program, after prompt optimization."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "You are correct; cmd+Tab does not work on hidden or minimized windows in macOS. It is designed to switch between applications and will only show non-minimized windows of the active application. To access minimized windows, you need to click on them directly or use other shortcuts.\n"
+      "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n",
+      "num_trials: 25\n",
+      "minibatch: True\n",
+      "num_candidates: 19\n",
+      "valset size: 160\n",
+      "\n",
+      "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
+      "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
+      "\n",
+      "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n"
      ]
-    }
-   ],
-   "source": [
-    "baseline = rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n",
-    "print(baseline.response)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. Here are some alternative methods to manage minimized or hidden windows:\n",
-      "\n",
-      "1. **Click on the Minimized Window:**\n",
-      "   - You can directly click on the minimized window in the Dock to restore it.\n",
-      "\n",
-      "2. **Use Command+M:**\n",
-      "   - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\n",
-      "\n",
-      "3. **Use Mission Control:**\n",
-      "   - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\n",
-      "\n",
-      "4. **Third-Party Applications:**\n",
-      "   - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\n",
-      "\n",
-      "5. **Keyboard Shortcuts for Specific Applications:**\n",
-      "   - Some applications may have their own shortcuts for managing windows. Check the preferences or documentation for the specific application you are using.\n",
-      "\n",
-      "By using these methods, you can effectively manage and restore minimized or hidden windows in macOS.\n"
+      "Bootstrapping set 1/19\n",
+      "Bootstrapping set 2/19\n",
+      "Bootstrapping set 3/19\n"
      ]
-    }
-   ],
-   "source": [
-    "pred = optimized_rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n",
-    "print(pred.response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You can use `dspy.inspect_history(n=2)` to view the RAG prompt [before optimization](https://gist.github.com/okhat/5d04648f2226e72e66e26a8cb1456ee4) and [after optimization](https://gist.github.com/okhat/79405b8889b4b07da577ee19f1a3479a).\n",
-    "\n",
-    "Concretely, in of run of this notebook, the optimized prompt:\n",
-    "\n",
-    "1. Constructs the following instruction,\n",
-    "```text\n",
-    "Using the provided `context` and `question`, analyze the information step by step to generate a comprehensive and informative `response`. Ensure that the response clearly explains the concepts involved, highlights key distinctions, and addresses any complexities noted in the context.\n",
-    "```\n",
-    "\n",
-    "2. And includes two fully worked out RAG examples with synthetic reasoning and answers, e.g. `how to transfer whatsapp voice message to computer?`.\n",
-    "\n",
-    "Let's now evaluate on the overall devset."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Average Metric: 89.78303512426604 / 150  (59.9): 100%|██████████| 150/150 [00:00<00:00, 424.18it/s]\n",
-      "2024/11/10 12:39:36 INFO dspy.evaluate.evaluate: Average Metric: 89.78303512426604 / 150 (59.9%)\n"
+      " 10%|█         | 4/40 [00:00<00:04,  8.97it/s]\n"
      ]
     },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>question</th>\n",
-       "      <th>example_response</th>\n",
-       "      <th>reasoning</th>\n",
-       "      <th>pred_response</th>\n",
-       "      <th>SemanticF1</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>why is mercurial considered to be easier than git?</td>\n",
-       "      <td>Mercurial's syntax is considered more familiar, especially for tho...</td>\n",
-       "      <td>Mercurial is often considered easier than Git for several reasons,...</td>\n",
-       "      <td>Mercurial is considered easier than Git for several key reasons: 1...</td>\n",
-       "      <td>✔️ [0.874]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>open finder window from current terminal location?</td>\n",
-       "      <td>If you type 'open .' in Terminal, it will open the current directo...</td>\n",
-       "      <td>To open a Finder window from the current terminal location in macO...</td>\n",
-       "      <td>To open a Finder window from your current terminal location in mac...</td>\n",
-       "      <td>✔️ [0.600]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                             question  \\\n",
-       "0  why is mercurial considered to be easier than git?   \n",
-       "1  open finder window from current terminal location?   \n",
-       "\n",
-       "                                                        example_response  \\\n",
-       "0  Mercurial's syntax is considered more familiar, especially for tho...   \n",
-       "1  If you type 'open .' in Terminal, it will open the current directo...   \n",
-       "\n",
-       "                                                               reasoning  \\\n",
-       "0  Mercurial is often considered easier than Git for several reasons,...   \n",
-       "1  To open a Finder window from the current terminal location in macO...   \n",
-       "\n",
-       "                                                           pred_response  \\\n",
-       "0  Mercurial is considered easier than Git for several key reasons: 1...   \n",
-       "1  To open a Finder window from your current terminal location in mac...   \n",
-       "\n",
-       "   SemanticF1  \n",
-       "0  ✔️ [0.874]  \n",
-       "1  ✔️ [0.600]  "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
+      "Bootstrapping set 4/19\n"
+     ]
     },
     {
-     "data": {
-      "text/html": [
-       "\n",
-       "                <div style='\n",
-       "                    text-align: center;\n",
-       "                    font-size: 16px;\n",
-       "                    font-weight: bold;\n",
-       "                    color: #555;\n",
-       "                    margin: 10px 0;'>\n",
-       "                    ... 148 more rows not displayed ...\n",
-       "                </div>\n",
-       "                "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 15%|█▌        | 6/40 [00:00<00:03,  8.98it/s]\n"
+     ]
     },
     {
-     "data": {
-      "text/plain": [
-       "59.86"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "evaluate(optimized_rag)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Keeping an eye on cost.\n",
-    "\n",
-    "DSPy allows you to track the cost of your programs, which can be used to monitor the cost of your calls. Here, we'll show you how to track the cost of your programs with DSPy."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cost = sum([x['cost'] for x in lm.history if x['cost'] is not None])  # in USD, as calculated by LiteLLM for certain providers"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Saving and loading.\n",
-    "\n",
-    "The optimized program has a pretty simple structure on the inside. Feel free to explore it.\n",
-    "\n",
-    "Here, we'll save `optimized_rag` so we can load it again later without having to optimize from scratch."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n",
+      "Bootstrapping set 5/19\n"
+     ]
+    },
     {
-     "data": {
-      "text/plain": [
-       "Prediction(\n",
-       "    reasoning='The behavior of the Command+Tab shortcut in macOS is designed to switch between applications rather than individual windows. This means that if an application is minimized or hidden, it will not be brought to the forefront using Command+Tab. Instead, the shortcut will only cycle through applications that are currently open and not minimized. To manage minimized windows, users may need to use different shortcuts or methods to restore them.',\n",
-       "    response='In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. Here are some alternative methods to manage minimized or hidden windows:\\n\\n1. **Click on the Minimized Window:**\\n   - You can directly click on the minimized window in the Dock to restore it.\\n\\n2. **Use Command+M:**\\n   - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\\n\\n3. **Use Mission Control:**\\n   - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\\n\\n4. **Third-Party Applications:**\\n   - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\\n\\n5. **Keyboard Shortcuts for Specific Applications:**\\n   - Some applications may have their own shortcuts for managing windows. Check the preferences or documentation for the specific application you are using.\\n\\nBy using these methods, you can effectively manage and restore minimized or hidden windows in macOS.'\n",
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|▎         | 1/40 [00:00<00:04,  9.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
+      "Bootstrapping set 6/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  8%|▊         | 3/40 [00:00<00:04,  9.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
+      "Bootstrapping set 7/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  5%|▌         | 2/40 [00:00<00:03,  9.53it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
+      "Bootstrapping set 8/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 12%|█▎        | 5/40 [00:00<00:03,  8.94it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
+      "Bootstrapping set 9/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  5%|▌         | 2/40 [00:00<00:04,  9.15it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
+      "Bootstrapping set 10/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  8%|▊         | 3/40 [00:00<00:04,  9.11it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
+      "Bootstrapping set 11/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  8%|▊         | 3/40 [00:00<00:04,  8.67it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
+      "Bootstrapping set 12/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  5%|▌         | 2/40 [00:00<00:04,  8.49it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
+      "Bootstrapping set 13/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  5%|▌         | 2/40 [00:00<00:04,  8.91it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
+      "Bootstrapping set 14/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|▎         | 1/40 [00:00<00:04,  9.13it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
+      "Bootstrapping set 15/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|▎         | 1/40 [00:00<00:04,  9.16it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
+      "Bootstrapping set 16/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  5%|▌         | 2/40 [00:00<00:04,  9.24it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
+      "Bootstrapping set 17/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 25%|██▌       | 10/40 [00:01<00:03,  8.74it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 2 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.\n",
+      "Bootstrapping set 18/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  8%|▊         | 3/40 [00:00<00:04,  8.40it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
+      "Bootstrapping set 19/19\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  8%|▊         | 3/40 [00:00<00:04,  8.64it/s]\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "Proposing instructions...\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Using the provided `context` about Mac OS X commands and the `question` related to troubleshooting or file management, generate a detailed response. Begin by outlining the reasoning process step-by-step, then provide a comprehensive answer that not only addresses the question but also includes practical applications and comparisons where relevant.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a technical support assistant. Given the fields `context` and `question`, analyze the provided context to extract relevant information and produce a detailed and coherent `response` that answers the question based on the information available.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a technical support assistant. Given the fields `context` and `question`, provide a clear and structured `response` that outlines the methods for locking the screen in the XFCE desktop environment, using the information available in the `context`. Make sure to highlight the most effective methods and include any relevant details to enhance user understanding.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a technical expert in data integrity and security. Given the fields `context` and `question`, produce a well-reasoned `response` that clearly explains the differences between a hash function and a checksum, incorporating relevant details from the context provided.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 5: You are a shell scripting expert. Given the context that provides various examples and explanations related to temporary files in shell scripting, along with a specific question about how to create a temporary file, produce a detailed response that includes a code snippet demonstrating the use of the `mktemp` command and how to properly manage the temporary file within a shell script. Make sure to emphasize the importance of cleanup after the file is used.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 6: Using the provided `context` and `question`, analyze the information and generate a detailed yet concise `response` that effectively summarizes the main arguments and conclusions regarding the practice of commenting every line of code. Ensure that the response reflects the nuances of the context and provides a clear stance on the issue.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 8: Imagine you are a technical support specialist assisting a user who is facing issues with their Mac OS X commands. They have a critical deadline and need reliable solutions to their questions. Your task is to provide detailed, accurate responses based on the given context and questions. Given the fields `context`, `question`, produce the fields `response` to help the user understand complex technical concepts and troubleshoot effectively.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 9: Imagine you are a technical support agent assisting a user who has lost important data on their LG G2 device and needs to recover their TWRP backups urgently. Given the fields `context`, which contains vital information about where TWRP backups can be stored, and `question`, which asks specifically about the locations of these backups, produce a detailed `response` that guides the user on how to locate their backups effectively.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 10: Using the provided `context` that contains information about deleting empty directories, and the `question` regarding how to recursively delete empty directories in the user's home directory, generate a detailed `response` that includes the appropriate command and an explanation of its components. Ensure to highlight the use of options like `-type`, `-empty`, and `-exec` in the command, and consider providing variations for additional clarity, such as including the verbose option for user feedback.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 11: Based on the provided context and question, generate a detailed response that outlines the steps necessary to export a private key from a Java Keytool keystore. Include commands for converting the keystore from JKS to PKCS#12 format and for using OpenSSL to extract the private key. Emphasize the security implications of handling private keys and provide clear instructions on replacing placeholders with actual values.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a technical support assistant with expertise in Mac OS X commands. Given the fields `context` and `question`, provide a detailed and informative `response` that clarifies the distinctions or relationships between the concepts discussed in the context.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a privacy advocate explaining why someone would trust DuckDuckGo or similar providers with a privacy policy. Given the fields `context` and `question`, provide a detailed response that outlines the reasons for this trust, incorporating aspects such as the clarity of the privacy policy, technical implementations, user control, legal accountability, and community feedback.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 14: Using the provided `context` and `question`, generate a detailed and coherent `response` that explains the reasons someone might trust DuckDuckGo or similar privacy-focused providers based on their privacy policies and practices.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Using the provided `context` that contains detailed information about Mac OS X commands and locations related to user account pictures, along with the `question` regarding where Mac stores these account pictures, generate a structured and informative `response`. Ensure that your response accurately summarizes the key locations and relevant details mentioned in the context, and clearly addresses the question posed.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 16: Using the provided `context` and `question`, generate a comprehensive `response` that summarizes the key points, compares the advantages and disadvantages of the concepts discussed, and offers practical insights based on the information available. Ensure that the response is clear, organized, and addresses the user's inquiry effectively.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Given the context that describes various methods for creating temporary files in shell scripts, along with a specific question about how to create a temporary file, generate a detailed response that includes examples of using the `mktemp` command, ensuring to explain the importance of cleanup after file usage.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a technical support assistant. Given the fields `context`, `question`, produce the fields `response`. Ensure that your response is detailed and provides step-by-step guidance based on the context provided.\n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "\n",
+      "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
+      "Average Metric: 89.01 / 160 (55.6%): 100%|██████████| 160/160 [00:04<00:00, 37.54it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 89.0075423349221 / 160 (55.6%)\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 55.63\n",
+      "\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
+      "\n",
+      "/opt/anaconda3/envs/jun2024_py310/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n",
+      "  warnings.warn(\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.72 / 25 (58.9%): 100%|██████████| 25/25 [00:00<00:00, 96.95it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 14.719867707788584 / 25 (58.9%)\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.88 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88]\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 13.88 / 25 (55.5%): 100%|██████████| 25/25 [00:00<00:00, 99.17it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 13.87639947083419 / 25 (55.5%)\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.51 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.56 / 25 (62.3%): 100%|██████████| 25/25 [00:00<00:00, 99.46it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.563671185234691 / 25 (62.3%)\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.25 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 97.02it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 14.542840231125426 / 25 (58.2%)\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.17 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.75 / 25 (63.0%): 100%|██████████| 25/25 [00:00<00:00, 104.42it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.746005444613344 / 25 (63.0%)\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.98 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 107.78it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 14.683617165143385 / 25 (58.7%)\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.93 / 25 (63.7%): 100%|██████████| 25/25 [00:00<00:00, 106.66it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.934088959267559 / 25 (63.7%)\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.74 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.52 / 25 (62.1%): 100%|██████████| 25/25 [00:00<00:00, 100.22it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.52144781700213 / 25 (62.1%)\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.09 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.54 / 25 (62.2%): 100%|██████████| 25/25 [00:00<00:00, 104.70it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.541098318140321 / 25 (62.2%)\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.16 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 12.33 / 25 (49.3%): 100%|██████████| 25/25 [00:00<00:00, 72.31it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:30 INFO dspy.evaluate.evaluate: Average Metric: 12.332086462618921 / 25 (49.3%)\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 49.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33]\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n",
+      "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 63.74) from minibatch trials...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 98.06 / 160 (61.3%): 100%|██████████| 160/160 [00:01<00:00, 139.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 98.06249092576995 / 160 (61.3%)\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 61.29\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 15.61 / 25 (62.5%): 100%|██████████| 25/25 [00:00<00:00, 105.23it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.612633878081091 / 25 (62.5%)\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.45 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 12'].\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45]\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.03 / 25 (60.1%): 100%|██████████| 25/25 [00:00<00:00, 100.46it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.03300812819276 / 25 (60.1%)\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.13 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 18'].\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13]\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.43 / 25 (57.7%): 100%|██████████| 25/25 [00:00<00:00, 112.91it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.430989267101385 / 25 (57.7%)\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.72 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 95.62it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.681540371022235 / 25 (58.7%)\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 5'].\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 100.56it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.53865209268966 / 25 (58.2%)\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.15 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 14'].\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.83 / 25 (59.3%): 100%|██████████| 25/25 [00:00<00:00, 108.11it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.832026371762414 / 25 (59.3%)\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 17.22 / 25 (68.9%): 100%|██████████| 25/25 [00:00<00:00, 105.12it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 17.216978671345192 / 25 (68.9%)\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.87 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.12 / 25 (60.5%): 100%|██████████| 25/25 [00:00<00:00, 97.80it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.123535939830598 / 25 (60.5%)\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.49 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.26 / 25 (61.0%): 100%|██████████| 25/25 [00:00<00:00, 99.12it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.256960301954985 / 25 (61.0%)\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 61.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 14'].\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.61 / 25 (58.4%): 100%|██████████| 25/25 [00:00<00:00, 102.38it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:34 INFO dspy.evaluate.evaluate: Average Metric: 14.607005004992326 / 25 (58.4%)\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.43 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 9'].\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43]\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n",
+      "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.87) from minibatch trials...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 97.23 / 160 (60.8%): 100%|██████████| 160/160 [00:11<00:00, 14.01it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 97.22622109571304 / 160 (60.8%)\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 16.54 / 25 (66.2%): 100%|██████████| 25/25 [00:00<00:00, 112.10it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 16.54482901646923 / 25 (66.2%)\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.18 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18]\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.84 / 25 (59.4%): 100%|██████████| 25/25 [00:00<00:00, 113.00it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 14.837814582612035 / 25 (59.4%)\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.35 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35]\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.71 / 25 (58.8%): 100%|██████████| 25/25 [00:00<00:00, 105.76it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.711485027993763 / 25 (58.8%)\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.85 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 15'].\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 15.14 / 25 (60.6%): 100%|██████████| 25/25 [00:00<00:00, 95.66it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 15.144601379869599 / 25 (60.6%)\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.58 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 8'].\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 14.26 / 25 (57.0%): 100%|██████████| 25/25 [00:00<00:00, 103.69it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.257718170019547 / 25 (57.0%)\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 0'].\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58, 57.03]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
+      "\n",
+      "\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n",
+      "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.45) from minibatch trials...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Average Metric: 95.14 / 160 (59.5%): 100%|██████████| 160/160 [00:01<00:00, 143.17it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:13:47 INFO dspy.evaluate.evaluate: Average Metric: 95.13659459156446 / 160 (59.5%)\n",
+      "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77, 59.46]\n",
+      "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n",
+      "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
+      "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
+      "\n",
+      "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 61.29!\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24)  # use fewer threads if your rate limit is small\n",
+    "\n",
+    "optimized_rag = tp.compile(RAG(), trainset=trainset,\n",
+    "                           max_bootstrapped_demos=2, max_labeled_demos=2,\n",
+    "                           requires_permission_to_run=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The prompt optimization process here is pretty systematic; you can learn about it, for example, in [this paper](https://arxiv.org/abs/2406.11695). Importantly, it's not a magic button. It can, for instance, overfit your training set and fail to generalize to a held-out set, which makes it essential that we iteratively validate our programs.\n",
+    "\n",
+    "Let's check on an example here, asking the same question to the baseline `rag = RAG()` program, which was not optimized, and to the `optimized_rag = MIPROv2(..)(..)` program, after prompt optimization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "You are correct that cmd+tab does not work on hidden or minimized windows. To switch back to a minimized app, you must first switch to another application and let it take focus before returning to the minimized one.\n"
+     ]
+    }
+   ],
+   "source": [
+    "baseline = rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n",
+    "print(baseline.response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pred = optimized_rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n",
+    "print(pred.response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can use `dspy.inspect_history(n=2)` to view the RAG prompt [before optimization](https://gist.github.com/okhat/5d04648f2226e72e66e26a8cb1456ee4) and [after optimization](https://gist.github.com/okhat/79405b8889b4b07da577ee19f1a3479a).\n",
+    "\n",
+    "Concretely, in one of the runs of this notebook, the optimized prompt does the following (note that it may be different on a later rerun).\n",
+    "\n",
+    "1. Constructs the following instruction,\n",
+    "```text\n",
+    "Using the provided `context` and `question`, analyze the information step by step to generate a comprehensive and informative `response`. Ensure that the response clearly explains the concepts involved, highlights key distinctions, and addresses any complexities noted in the context.\n",
+    "```\n",
+    "\n",
+    "2. And includes two fully worked-out RAG examples with synthetic reasoning and answers, e.g. `how to transfer whatsapp voice message to computer?`.\n",
+    "\n",
+    "Let's now evaluate on the overall devset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Metric: 183.28 / 300 (61.1%): 100%|██████████| 300/300 [00:13<00:00, 22.20it/s] "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/11/23 22:14:01 INFO dspy.evaluate.evaluate: Average Metric: 183.27658621624977 / 300 (61.1%)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>example_response</th>\n",
+       "      <th>gold_doc_ids</th>\n",
+       "      <th>reasoning</th>\n",
+       "      <th>pred_response</th>\n",
+       "      <th>SemanticF1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>when to use c over c++, and c++ over c?</td>\n",
+       "      <td>If you are equally familiar with both C++ and C, it's advisable to...</td>\n",
+       "      <td>[733]</td>\n",
+       "      <td>The context provides insights into the strengths and weaknesses of...</td>\n",
+       "      <td>You should consider using C over C++ in scenarios where simplicity...</td>\n",
+       "      <td>✔️ [0.333]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>should images be stored in a git repository?</td>\n",
+       "      <td>One viewpoint expresses that there is no significant downside, esp...</td>\n",
+       "      <td>[6253, 6254, 6275, 6278, 8215]</td>\n",
+       "      <td>The context discusses the challenges and considerations of storing...</td>\n",
+       "      <td>Storing images in a Git repository is generally considered bad pra...</td>\n",
+       "      <td>✔️ [0.500]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                       question  \\\n",
+       "0       when to use c over c++, and c++ over c?   \n",
+       "1  should images be stored in a git repository?   \n",
+       "\n",
+       "                                                        example_response  \\\n",
+       "0  If you are equally familiar with both C++ and C, it's advisable to...   \n",
+       "1  One viewpoint expresses that there is no significant downside, esp...   \n",
+       "\n",
+       "                     gold_doc_ids  \\\n",
+       "0                           [733]   \n",
+       "1  [6253, 6254, 6275, 6278, 8215]   \n",
+       "\n",
+       "                                                               reasoning  \\\n",
+       "0  The context provides insights into the strengths and weaknesses of...   \n",
+       "1  The context discusses the challenges and considerations of storing...   \n",
+       "\n",
+       "                                                           pred_response  \\\n",
+       "0  You should consider using C over C++ in scenarios where simplicity...   \n",
+       "1  Storing images in a Git repository is generally considered bad pra...   \n",
+       "\n",
+       "   SemanticF1  \n",
+       "0  ✔️ [0.333]  \n",
+       "1  ✔️ [0.500]  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "                <div style='\n",
+       "                    text-align: center;\n",
+       "                    font-size: 16px;\n",
+       "                    font-weight: bold;\n",
+       "                    color: #555;\n",
+       "                    margin: 10px 0;'>\n",
+       "                    ... 298 more rows not displayed ...\n",
+       "                </div>\n",
+       "                "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "61.09"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluate(optimized_rag)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Keeping an eye on cost.\n",
+    "\n",
+    "DSPy allows you to track the cost of your programs, so you can monitor how much your LM calls are costing you. Here, we'll show you how to compute that cost from the LM's call history."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cost = sum([x['cost'] for x in lm.history if x['cost'] is not None])  # in USD, as calculated by LiteLLM for certain providers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Saving and loading.\n",
+    "\n",
+    "The optimized program has a pretty simple structure on the inside. Feel free to explore it.\n",
+    "\n",
+    "Here, we'll save `optimized_rag` so we can load it again later without having to optimize from scratch."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Prediction(\n",
+       "    reasoning='The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.',\n",
+       "    response='The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.'\n",
        ")"
       ]
      },
@@ -1245,6 +2305,278 @@
     "loaded_rag(question=\"cmd+tab does not work on hidden or minimized windows\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\u001b[34m[2024-11-23T22:14:01.562290]\u001b[0m\n",
+      "\n",
+      "\u001b[31mSystem message:\u001b[0m\n",
+      "\n",
+      "Your input fields are:\n",
+      "1. `question` (str)\n",
+      "2. `ground_truth` (str)\n",
+      "3. `system_response` (str)\n",
+      "\n",
+      "Your output fields are:\n",
+      "1. `reasoning` (str)\n",
+      "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n",
+      "3. `system_response_key_ideas` (str): enumeration of key ideas in the system response\n",
+      "4. `discussion` (str): discussion of the overlap between ground truth and system response\n",
+      "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n",
+      "6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n",
+      "\n",
+      "All interactions will be structured in the following way, with the appropriate values filled in.\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "{question}\n",
+      "\n",
+      "[[ ## ground_truth ## ]]\n",
+      "{ground_truth}\n",
+      "\n",
+      "[[ ## system_response ## ]]\n",
+      "{system_response}\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "{reasoning}\n",
+      "\n",
+      "[[ ## ground_truth_key_ideas ## ]]\n",
+      "{ground_truth_key_ideas}\n",
+      "\n",
+      "[[ ## system_response_key_ideas ## ]]\n",
+      "{system_response_key_ideas}\n",
+      "\n",
+      "[[ ## discussion ## ]]\n",
+      "{discussion}\n",
+      "\n",
+      "[[ ## recall ## ]]\n",
+      "{recall}        # note: the value you produce must be a single float value\n",
+      "\n",
+      "[[ ## precision ## ]]\n",
+      "{precision}        # note: the value you produce must be a single float value\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "In adhering to this structure, your objective is: \n",
+      "        Compare a system's response to the ground truth to compute recall and precision of key ideas.\n",
+      "        You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "how to use pip after the os x el capitan upgrade?\n",
+      "\n",
+      "[[ ## ground_truth ## ]]\n",
+      "One quick solution is to install Python using homebrew into a directory such as `/usr/local/bin`, which allows for the use of pip and python regardless of the system provided version.  \n",
+      "Additionally, opting to use `pip3 install` alongside the desired package name helps to solve permission problem.  \n",
+      "A direct and secure approach without altering system settings is to install Python modules for the individual user using the `pip install --user` command followed by the module name.  \n",
+      "An alternative method involves installing Python with homebrew and then obtaining pip through the `sudo easy_install pip` command.  \n",
+      "Lastly, using virtual environments, specifically using virtualenv or virtualenvwrapper, creates an isolated environment for each project, permitting seamless pip usage for package management within these enclosed spaces.\n",
+      "\n",
+      "[[ ## system_response ## ]]\n",
+      "After upgrading to OS X El Capitan, you can use pip by following these steps:\n",
+      "\n",
+      "1. **Check Python Installation**: Ensure that you have the correct version of Python installed. You can check this by running `python --version` or `python3 --version` in the terminal.\n",
+      "\n",
+      "2. **Install pip**: If pip is not already installed, you can install it using the following command:\n",
+      "   ```bash\n",
+      "   sudo easy_install pip\n",
+      "   ```\n",
+      "   Alternatively, if you are using a specific version of Python installed via MacPorts, make sure to use the corresponding pip version, such as `py36-pip` for Python 3.6.\n",
+      "\n",
+      "3. **Verify pip Installation**: After installation, verify that pip is working by running:\n",
+      "   ```bash\n",
+      "   pip --version\n",
+      "   ```\n",
+      "\n",
+      "4. **Using pip**: You can now use pip to install packages. For example, to install the `nltk` package, you would run:\n",
+      "   ```bash\n",
+      "   pip install nltk\n",
+      "   ```\n",
+      "\n",
+      "5. **Managing Multiple Versions**: If you have multiple versions of Python installed (e.g., via MacPorts or Homebrew), ensure you are using the correct pip associated with the desired Python version. You can do this by using the `port select` command to set the active Python version.\n",
+      "\n",
+      "6. **Check PATH**: If you encounter issues with pip not being recognized, check your PATH settings. You may need to add the directory containing pip to your PATH by editing the `/etc/paths` file:\n",
+      "   ```bash\n",
+      "   sudo vim /etc/paths\n",
+      "   ```\n",
+      "   Add the path to the pip installation (e.g., `/usr/local/bin`) at the end of the file, save, and exit.\n",
+      "\n",
+      "By following these steps, you should be able to use pip effectively after your OS X El Capitan upgrade.\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mResponse:\u001b[0m\n",
+      "\n",
+      "\u001b[32m[[ ## reasoning ## ]]\n",
+      "The system response provides a detailed guide on how to use pip after upgrading to OS X El Capitan, including checking the Python installation, installing pip, verifying the installation, and managing multiple versions of Python. The ground truth also covers similar points but emphasizes using Homebrew for Python installation, using `pip3`, and installing packages for the individual user. Both responses address the installation and usage of pip, but the ground truth includes additional methods and best practices for managing Python environments.\n",
+      "\n",
+      "[[ ## ground_truth_key_ideas ## ]]\n",
+      "1. Install Python using Homebrew into `/usr/local/bin`.\n",
+      "2. Use `pip3 install` to solve permission issues.\n",
+      "3. Install Python modules for the individual user with `pip install --user`.\n",
+      "4. Install pip using `sudo easy_install pip`.\n",
+      "5. Use virtual environments (virtualenv or virtualenvwrapper) for isolated package management.\n",
+      "\n",
+      "[[ ## system_response_key_ideas ## ]]\n",
+      "1. Check Python installation version.\n",
+      "2. Install pip using `sudo easy_install pip`.\n",
+      "3. Verify pip installation.\n",
+      "4. Use pip to install packages (e.g., `pip install nltk`).\n",
+      "5. Manage multiple Python versions and check PATH settings.\n",
+      "\n",
+      "[[ ## discussion ## ]]\n",
+      "There is a significant overlap between the ground truth and the system response regarding the installation and usage of pip. Both mention the installation of pip and the verification of its functionality. However, the ground truth provides additional methods for managing Python installations and permissions, such as using Homebrew and user-specific installations. The system response focuses more on the steps to ensure pip is set up correctly after the OS X upgrade, including managing multiple Python versions and checking PATH settings. Overall, while both responses cover the essential aspects of using pip, the ground truth offers a broader perspective on installation methods and user management.\n",
+      "\n",
+      "[[ ## recall ## ]]\n",
+      "0.5\n",
+      "\n",
+      "[[ ## precision ## ]]\n",
+      "0.5\n",
+      "\n",
+      "[[ ## completed ## ]]\u001b[0m\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\u001b[34m[2024-11-23T22:14:01.708696]\u001b[0m\n",
+      "\n",
+      "\u001b[31mSystem message:\u001b[0m\n",
+      "\n",
+      "Your input fields are:\n",
+      "1. `context` (str)\n",
+      "2. `question` (str)\n",
+      "\n",
+      "Your output fields are:\n",
+      "1. `reasoning` (str)\n",
+      "2. `response` (str)\n",
+      "\n",
+      "All interactions will be structured in the following way, with the appropriate values filled in.\n",
+      "\n",
+      "[[ ## context ## ]]\n",
+      "{context}\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "{question}\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "{reasoning}\n",
+      "\n",
+      "[[ ## response ## ]]\n",
+      "{response}\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "In adhering to this structure, your objective is: \n",
+      "        Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "This is an example of the task, though some input or output fields are not supplied.\n",
+      "\n",
+      "[[ ## context ## ]]\n",
+      "Not supplied for this particular example.\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "can i create a shortcut to open a specific application on os x?\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "Not supplied for this particular example.\n",
+      "\n",
+      "[[ ## response ## ]]\n",
+      "While Mac OS X does have a built-in method for setting keyboard shortcuts, it doesn't work for Firefox.  \n",
+      "Karabiner-Elements allows for customization of keyboard shortcuts on Mac versions 10.12 and later, including executing shell commands.  \n",
+      "Alfred, particularly in its Power Pack version, offers the ability to map shortcuts to applications, among its various features.  \n",
+      "Using Automator Services in combination with built-in keyboard shortcuts is another method to create shortcuts without third party apps.  \n",
+      "Spark 3 is a free Shortcuts manager which enables you to create Hot Keys to launch applications and documents, execute AppleScript, control iTunes, etc.  \n",
+      "Some users have found Apptivate to be the easiest to use for assigning global shortcuts to apps.  \n",
+      "To open Terminal with a shortcut on Mac without third-party software, one can utilize a custom Quick Action created in Automator and assign it a unique hotkey, like ⌘ + ⌥ + F12.  \n",
+      "For iTerm2 users with Oh My Zsh, one can replace AppleScript with the 'Launch Application' action in Automator.   \n",
+      "Furthermore, Terminal app already has built-in shortcut keys for opening new terminals or tabs at a folder location: New Terminal at Folder ^+⇧+T New Terminal at Tab Folder ⌥+^+⇧+T.\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## context ## ]]\n",
+      "[1] «On linux, /sbin/nologin comes from the util-linux project, while /bin/false is part of GNU Coreutils. They serve different roles, and nologin has the option of printing a message for people who have it as their shell who are logging in. The linux commands come from BSD, where they seem to have a long history of being different. The FreeBSD false simply returns 1, while the nologin checks to make sure its running on a TTY and sends a message to syslog during login attempts. The linux versions are a bit more complicated (false doing all sorts of fun stuff with internationalization for the output of --help, I assume) but essentially perform the same way.»\n",
+      "[2] «/bin/false is a system command that is used anytime you need to pass a command to a program that should do nothing more than exit with an error. Its the companion to /bin/true. Both of these are very old and standard POSIX utilities and neither produce any output by definition. true is sometimes used for a shell script that should loop indefinitely, like: while true; do ... # Waste time if [ $wasted_time -gt 100000 ]; then exit 0 fi ... done /usr/sbin/nologin is specifically designed to replace a shell and produces output complaining you cant log-in. Before it existed, it was common to use /bin/false for dummy users, but could be confusing since the user doesnt know why theyre kicked off.»\n",
+      "[3] «/bin/false only job is to exit with a non-zero exit code. Try it at the command line: $:> /bin/false $:> echo $? 1 $:> Some institutions use /bin/false in the shell field of the password file. If user tries to login, the shell is /bin/false, so they are exited right away»\n",
+      "[4] «Some FTP servers will allow you FTP access only if you have a valid shell. /sbin/nologin is regarded as a valid shell, whereas /bin/false is not. (I think valid means its exit status is 0, but /etc/shells may also come into it, it probably depends on the system, the FTP software, and your configuration.)»\n",
+      "[5] «You can use false (/bin/false, /usr/bin/false, or shell builtin): $ false || echo It failed. It failed. $ You can also use exit 1 from a subshell: $ (exit 1) || echo Gosh, it failed too. Gosh, it failed too. $»\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "whats the difference between /sbin/nologin and /bin/false\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mAssistant message:\u001b[0m\n",
+      "\n",
+      "[[ ## reasoning ## ]]\n",
+      "The context provides detailed information about the roles and behaviors of `/sbin/nologin` and `/bin/false`. `/sbin/nologin` is designed to prevent users from logging in while providing a message indicating that login is not allowed. In contrast, `/bin/false` simply exits with a non-zero exit code without any output, serving as a command that does nothing. The distinction lies in their intended use cases and the feedback they provide to users.\n",
+      "\n",
+      "[[ ## response ## ]]\n",
+      "The main difference between `/sbin/nologin` and `/bin/false` is their purpose and behavior during login attempts. `/sbin/nologin` is used to prevent users from logging in while displaying a message that informs them they cannot log in. It is considered a valid shell for certain applications, such as FTP servers. On the other hand, `/bin/false` is a command that simply exits with a non-zero exit code and does not provide any output. It is often used in situations where a command needs to do nothing and is not intended to inform the user of any restrictions.\n",
+      "\n",
+      "[[ ## completed ## ]]\n",
+      "\n",
+      "\n",
+      "\u001b[31mUser message:\u001b[0m\n",
+      "\n",
+      "[[ ## context ## ]]\n",
+      "[1] «If you hold command and quickly tap tab you will cycle between your two most recently used applications without bringing up the heads up display. If you want to use a third party app Witch has options to ignore minimized windows.»\n",
+      "[2] «Try this: On your Mac, Navigate to System Preferences Go to Mission Control Uncheck When switching to an application, switch to a Space with open windows for the application Try using the cmd+tab now.»\n",
+      "[3] «Command + Tab until you get the apps icon. Before releasing the Command key, press and hold the Option key. You must switch to another app and let it take focus first. In other words, you cant just Command + Tab to another app and before actually selecting that app (by releasing the Command and Tab keys), switch right back to your minimized app, which you might attempt to do if you minimized it by accident or just simply changed your mind shortly after minimizing. Both the Command and left Option keys must be pressed on the same side (left or right) of the keyboard.»\n",
+      "[4] «Command-option-click does not hide all open windows. As stated in the linked Apple article, this command hides all other open items leaving the clicked item visible. It is possible to hide all other items and minimize your most recently item with command+option+h+m. This is a combination of command+option+h - Hide Others and command+option+m - Minimize.»\n",
+      "[5] «Cmd+` of course will work but I do it in different way. I do the Cmd+tab when i do that, I dont release the Cmd so the opened apps are still visible. While it is visible, I will press ▼. This way it will show all your opened application with animated displayed so you can navigate using arrow keys.»\n",
+      "\n",
+      "[[ ## question ## ]]\n",
+      "cmd+tab does not work on hidden or minimized windows\n",
+      "\n",
+      "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n",
+      "\n",
+      "\n",
+      "\u001b[31mResponse:\u001b[0m\n",
+      "\n",
+      "\u001b[32m[[ ## reasoning ## ]]\n",
+      "The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.\n",
+      "\n",
+      "[[ ## response ## ]]\n",
+      "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n",
+      "\n",
+      "[[ ## completed ## ]]\u001b[0m\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "dspy.inspect_history(n=2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1252,22 +2584,20 @@
     "\n",
     "## What's next?\n",
     "\n",
-    "Improving from around 37% to approximately 60% on this task, in terms of `SemanticF1`, was pretty easy.\n",
+    "Improving from around 42% to approximately 61% on this task, in terms of `SemanticF1`, was pretty easy.\n",
     "\n",
     "But DSPy gives you paths to continue iterating on the quality of your system and we have barely scratched the surface.\n",
     "\n",
     "In general, you have the following tools:\n",
     "\n",
-    "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See this [notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/intro.ipynb) or the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n",
+    "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See, for instance, the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n",
     "2. Explore different [prompt optimizers](https://arxiv.org/abs/2406.11695) or [weight optimizers](https://arxiv.org/abs/2407.10930). See the **[Optimizers Docs](/building-blocks/6-optimizers)**.\n",
     "3. Scale inference time compute using DSPy Optimizers, e.g. this [notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/agents/multi_agent.ipynb).\n",
-    "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb) or [this notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/examples/qa/hotpot/multihop_finetune.ipynb).\n",
+    "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb).\n",
     "\n",
     "How do you decide which ones to proceed with first?\n",
     "\n",
-    "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users.\n",
-    "\n",
-    "Learn more about the [development cycle](/building-blocks/solving_your_task) in DSPy."
+    "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
    ]
   }
  ],
diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py
index d96d58f218..d983321439 100644
--- a/dspy/evaluate/auto_evaluation.py
+++ b/dspy/evaluate/auto_evaluation.py
@@ -55,7 +55,7 @@ def forward(self, example, pred, trace=None):
 ###########
 
 
-class DecompositionalSemanticRecall(dspy.Signature):
+class AnswerCompleteness(dspy.Signature):
     """
     Estimate the completeness of a system's responses, against the ground truth.
     You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
@@ -71,7 +71,7 @@ class DecompositionalSemanticRecall(dspy.Signature):
 
 
 
-class DecompositionalGroundedness(dspy.Signature):
+class AnswerGroundedness(dspy.Signature):
     """
     Estimate the groundedness of a system's responses, against real retrieved documents written by people.
     You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
@@ -89,8 +89,8 @@ class DecompositionalGroundedness(dspy.Signature):
 class CompleteAndGrounded(dspy.Module):
     def __init__(self, threshold=0.66):
         self.threshold = threshold
-        self.completeness_module = dspy.ChainOfThought(DecompositionalSemanticRecall)
-        self.groundedness_module = dspy.ChainOfThought(DecompositionalGroundedness)
+        self.completeness_module = dspy.ChainOfThought(AnswerCompleteness)
+        self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness)
 
     def forward(self, example, pred, trace=None):
         completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
diff --git a/dspy/utils/__init__.py b/dspy/utils/__init__.py
index f12b34b180..ba205504e7 100644
--- a/dspy/utils/__init__.py
+++ b/dspy/utils/__init__.py
@@ -2,3 +2,54 @@
 from dspy.utils.dummies import *
 from dspy.utils.caching import *
 from dspy.utils.logging_utils import *
+
+import os
+import ujson
+import requests
+
+def download(url, timeout=60):
+    """Download `url` into the working directory unless an up-to-date copy exists.
+
+    The local file is named after the last path component of `url`. A HEAD
+    request is sent first, and the transfer is skipped only when a local file
+    already exists and its size equals the advertised `Content-Length`.
+
+    Args:
+        url: HTTP(S) address of the file to fetch.
+        timeout: Seconds `requests` waits for the server to connect or send
+            data; applied to both the HEAD and the GET request.
+
+    Raises:
+        requests.RequestException: If a request fails, times out, or the GET
+            request returns an HTTP error status.
+        OSError: If the local file cannot be written.
+    """
+    filename = os.path.basename(url)
+
+    # A missing or malformed Content-Length means the size is unknown. Do not
+    # treat it as 0: a file that does not exist yet also has local size 0, so
+    # it would look up to date and never be downloaded.
+    head = requests.head(url, allow_redirects=True, timeout=timeout)
+    content_length = head.headers.get('Content-Length', '') if head.ok else ''
+    remote_size = int(content_length) if content_length.isdigit() else None
+
+    if remote_size is not None and os.path.exists(filename):
+        if os.path.getsize(filename) == remote_size:
+            return
+
+    print(f"Downloading '{filename}'...")
+    partial = filename + '.part'
+    try:
+        with requests.get(url, stream=True, timeout=timeout) as r:
+            # Fail loudly instead of saving an HTTP error page as the file.
+            r.raise_for_status()
+            with open(partial, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        # Publish only a complete file, so an interrupted transfer never
+        # clobbers a good local copy.
+        os.replace(partial, filename)
+    finally:
+        # Clean up the temporary file if the transfer did not finish.
+        if os.path.exists(partial):
+            os.remove(partial)