# NOTE: This is not a cookbook

This is a testing notebook used to make sure that multimodal support works.

A cookbook is on the way; if you have particular ideas, message @isaac on Discord.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dspy
from dspy.datasets import DataLoader
from dspy.evaluate.metrics import answer_exact_match
from typing import List
from dspy.evaluate import Evaluate

import dotenv
import litellm

# Turn on LiteLLM's flag for suppressing its debug-info output.
litellm.suppress_debug_info = True

# Load environment variables from a .env file (presumably the API keys for the hosted models — confirm).
dotenv.load_dotenv()

def debug_exact_match(example, pred, trace=None, frac=1.0):
    """Exact-match metric that also echoes what it is comparing.

    Prints the example's inputs, its gold ``answer`` and the prediction (in that
    order), then returns whatever ``answer_exact_match`` returns for the same
    four arguments.
    """
    shown_inputs = example.inputs()
    print(shown_inputs)
    gold_answer = example.answer
    print(gold_answer)
    print(pred)
    verdict = answer_exact_match(example, pred, trace, frac)
    return verdict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Language-model handles used throughout the notebook.
# The three "openai/<hf-model>" entries below all point at the same local endpoint
# (http://localhost:8000/v1) with a placeholder API key, so presumably only the model
# currently being served there can be used at a time — confirm before switching models.
# Launch command for the Qwen server:
# vllm serve Qwen/Qwen2-VL-7B-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --pipeline-parallel-size 2
qwen_lm = dspy.LM(model="openai/Qwen/Qwen2-VL-7B-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
haiku_lm = dspy.LM(model="anthropic/claude-3-haiku-20240307", max_tokens=4096)
# Launch command for the Llama server:
# vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --enforce-eager --max-num-seqs 48
llama_lm = dspy.LM(model="openai/meta-llama/Llama-3.2-11B-Vision-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
internlm_lm = dspy.LM(model="openai/OpenGVLab/InternVL2-8B", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
gpt_lm = dspy.LM(model="openai/gpt-4o-mini", max_tokens=5000)
# NOTE(review): internlm_lm is defined above but not included in this list — confirm whether that is intentional.
all_lms = [qwen_lm, haiku_lm, llama_lm, gpt_lm]

# Qwen is the default LM for every cell that does not override it via dspy.context.
dspy.settings.configure(lm=qwen_lm)

In [4]:
# Signature for single-image question answering: an image and a question in, a free-text answer out.
# NOTE: the class docstring is left exactly as written — presumably DSPy uses it as the task instruction; confirm.
class DogPictureSignature(dspy.Signature):
    """Answer the question based on the image."""
    image: dspy.Image = dspy.InputField()
    question: str = dspy.InputField()
    answer: str = dspy.OutputField()

class DogPicture(dspy.Module):
    """Minimal module that answers a question about a single image.

    Wraps a ``dspy.ChainOfThought`` predictor built from ``DogPictureSignature``.
    """

    def __init__(self) -> None:
        # Run dspy.Module's own initialisation before adding attributes, the same
        # way MMMUModule in this notebook does.
        super().__init__()
        self.predictor = dspy.ChainOfThought(DogPictureSignature)

    def __call__(self, **kwargs):
        """Forward all keyword arguments to the predictor and return its prediction."""
        return self.predictor(**kwargs)

# Smoke test: ask the default LM about one image fetched by URL.
dog_picture = DogPicture()

example = dspy.Example(
    image=dspy.Image.from_url("https://i.pinimg.com/564x/78/f9/6d/78f96d0314d39a1b8a849005123e166d.jpg"),
    question="What is the breed of the dog in the image?",
).with_inputs("image", "question")
print(dog_picture(**example.inputs()))

Prediction(
    reasoning="The dog in the image has a golden coat and floppy ears, which are characteristic features of a Golden Retriever. Additionally, the dog's size and overall appearance are consistent with that of a Golden Retriever puppy.",
    answer='The breed of the dog in the image is a Golden Retriever.'
)


In [8]:
# Dump the raw call history recorded on the Qwen LM object, pretty-printed with rich.
# Alternative view, left disabled:
# qwen_lm.inspect_history()
import rich
rich.print(qwen_lm.history)


In [9]:
# Text-only sanity check: run a plain "question -> answer" predictor, then show the
# prompt/response exchange recorded on the Qwen LM.
p = dspy.Predict("question -> answer")
p(question="What is the capital of France?")
qwen_lm.inspect_history()





[31mSystem message:[0m

Your input fields are:
1. `question` (str)

Your output fields are:
1. `answer` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## answer ## ]]
{answer}

[[ ## completed ## ]]


In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `answer`.


[31mUser message:[0m

[[ ## question ## ]]
What is the capital of France?

Respond with the corresponding output fields, starting with the field `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## answer ## ]]
Paris

[[ ## completed ## ]][0m







In [5]:
%%capture
from concurrent.futures import ThreadPoolExecutor

# Fields exposed to the model as inputs: the first two image slots, the question and its options.
input_keys = ("image_1", "image_2", "question", "options")

# MMMU subject subsets; each one is fetched separately.
subsets = [
    'Accounting', 'Agriculture', 'Architecture_and_Engineering', 'Art', 'Art_Theory',
    'Basic_Medical_Science', 'Biology', 'Chemistry', 'Clinical_Medicine', 'Computer_Science',
    'Design', 'Diagnostics_and_Laboratory_Medicine', 'Economics', 'Electronics', 'Energy_and_Power',
    'Finance', 'Geography', 'History', 'Literature', 'Manage',
    'Marketing', 'Materials', 'Math', 'Mechanical_Engineering', 'Music',
    'Pharmacy', 'Physics', 'Psychology', 'Public_Health', 'Sociology',
]

devset = []
valset = []


def load_dataset(subset_index_subset):
    """Load the dev and validation splits of one MMMU subset.

    Takes an ``(index, subset_name)`` pair and returns ``(index, dev_split, validation_split)``
    so results can be put back into subset order.
    """
    position, subset_name = subset_index_subset
    splits = DataLoader().from_huggingface("MMMU/MMMU", subset_name, split=["dev", "validation"], input_keys=input_keys)
    return position, splits["dev"], splits["validation"]


# One worker thread per subset; results are ordered by subset index before merging.
with ThreadPoolExecutor(max_workers=len(subsets)) as executor:
    results = sorted(executor.map(load_dataset, enumerate(subsets)), key=lambda item: item[0])

for _, dev, val in results:
    devset.extend(dev)
    valset.extend(val)

print(len(devset))
print(len(valset))

In [6]:
import ast

def count_images(dataset):
    """Tally how many examples carry each number of non-empty image inputs.

    For every example, counts the input fields whose name starts with ``image_``
    and whose value is not ``None``.

    Args:
        dataset: iterable of examples exposing ``inputs()``, which returns a mapping.

    Returns:
        dict mapping image count -> number of examples. Keys 0 through 5 are always
        present (possibly with a count of 0); larger counts are added on demand
        instead of raising ``KeyError``.
    """
    image_counts = {i: 0 for i in range(6)}  # Pre-seed counts for 0 to 5 images
    for example in dataset:
        inputs = example.inputs()  # build the inputs view once per example
        count = sum(1 for key in inputs.keys() if key.startswith('image_') and inputs[key] is not None)
        image_counts[count] = image_counts.get(count, 0) + 1
    return image_counts

def count_multiple_choice_questions(dataset):
    """Return how many examples have ``question_type`` equal to ``"multiple-choice"``."""
    total = 0
    for item in dataset:
        if item["question_type"] == "multiple-choice":
            total += 1
    return total
max_images = 5

num_images = 1


def _has_exactly_n_images(candidate, n):
    """True when exactly ``n`` of the candidate's ``image_*`` input fields are not None."""
    fields = candidate.inputs()
    present = [name for name in fields.keys() if name.startswith('image_') and fields[name] is not None]
    return len(present) == n


def _is_multiple_choice(candidate):
    """True when the candidate's ``question_type`` is ``"multiple-choice"``."""
    return candidate["question_type"] == "multiple-choice"


# First keep only examples with exactly `num_images` images ...
devset_filtered = [candidate for candidate in devset if _has_exactly_n_images(candidate, num_images)]
valset_filtered = [candidate for candidate in valset if _has_exactly_n_images(candidate, num_images)]

# ... then keep only the multiple-choice questions among them.
devset_filtered = [candidate for candidate in devset_filtered if _is_multiple_choice(candidate)]
valset_filtered = [candidate for candidate in valset_filtered if _is_multiple_choice(candidate)]

def update_example_image_key(example):
    """Return a copy of ``example`` with ``image_1`` also stored under ``image``.

    The copy's input fields are the original input fields plus ``"image"``;
    the example passed in is not modified.
    """
    aliased = example.copy()
    aliased["image"] = aliased["image_1"]
    input_names = [*example.inputs().keys(), "image"]
    return aliased.with_inputs(*input_names)



# Give every remaining example an `image` field (copied from image_1).
devset_filtered = [update_example_image_key(candidate) for candidate in devset_filtered]
valset_filtered = [update_example_image_key(candidate) for candidate in valset_filtered]

devset_image_counts = count_images(devset_filtered)
valset_image_counts = count_images(valset_filtered)

devset_multiple_choice_questions = count_multiple_choice_questions(devset_filtered)
valset_multiple_choice_questions = count_multiple_choice_questions(valset_filtered)

# Report the image-count histogram for each split.
for _heading, _counts in (
    ("Image counts in devset:", devset_image_counts),
    ("\nImage counts in valset:", valset_image_counts),
):
    print(_heading)
    for count, num_examples in _counts.items():
        print(f"{count} image(s): {num_examples} examples")

# Report how many of the kept examples are multiple choice.
print("\nMultiple choice questions in devset:")
print(devset_multiple_choice_questions, "out of", len(devset_filtered))
print("\nMultiple choice questions in valset:")
print(valset_multiple_choice_questions, "out of", len(valset_filtered))

def convert_multiple_choice_to_letter(dataset):
    """Add a ``choices`` input field to every example.

    ``example["options"]`` is a string holding a Python list literal and is parsed
    with ``ast.literal_eval``. The resulting ``choices`` value is always a string:

    * multiple-choice questions: ``str`` of the options prefixed with letters,
      e.g. ``"['A. foo', 'B. bar']"``;
    * other questions with options: ``str`` of the parsed options;
    * other questions without options: ``"Free response"``.

    The examples passed in are left untouched; each result is a copy whose input
    fields are the original input fields plus ``"choices"``.

    Args:
        dataset: iterable of examples supporting item access, ``copy()``,
            ``inputs()`` and ``with_inputs()``.

    Returns:
        list of converted examples, in the same order as ``dataset``.
    """
    new_dataset = []
    for example in dataset:
        options = ast.literal_eval(example["options"])
        if example["question_type"] == "multiple-choice":
            # chr(65) == "A": label the options A, B, C, ...
            choices = str([f"{chr(65 + i)}. {option}" for i, option in enumerate(options)])
        elif options:
            choices = str(options)
        else:
            # Decide on the parsed list, not on its string form, so this branch is reachable.
            choices = "Free response"

        # Work on a copy so the caller's dataset is not modified as a side effect.
        converted = example.copy()
        converted["choices"] = choices
        new_dataset.append(converted.with_inputs(*example.inputs().keys(), "choices"))
    return new_dataset

# Show one example before and after the `choices` field is added, then convert both splits.
print(devset_filtered[0])
updated_devset = convert_multiple_choice_to_letter(devset_filtered)
print(updated_devset[0])
updated_valset = convert_multiple_choice_to_letter(valset_filtered)




Image counts in devset:
0 image(s): 0 examples
1 image(s): 137 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Image counts in valset:
0 image(s): 0 examples
1 image(s): 805 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Multiple choice questions in devset:
137 out of 137

Multiple choice questions in valset:
805 out of 805
Example({'id': 'dev_Accounting_1', 'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'explanation': '', 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x72A378A7C5E0>, 'image_2': None, 'image_3': None, 'image_4': None, 'image_5': None, 'image_6': None, 'image_7': None, 'img_type': "['Tables']", 'answer': 'D', 'topic_difficulty': 'Easy', 'question_type': 'multiple-choice', 'subfield': 'Financi

In [8]:
import re
from typing import Literal
# Signature for MMMU questions: question text, one image and the answer choices in; a single option letter out.
# NOTE: the class docstring is left exactly as written — presumably DSPy uses it as the task instruction; confirm.
class MMMUSignature(dspy.Signature):
    """Answer with the letter of the correct answer."""

    question: str = dspy.InputField()
    image: dspy.Image = dspy.InputField()
    # NOTE(review): annotated as List[str], but convert_multiple_choice_to_letter stores
    # `choices` as the str(...) of a list — confirm which form is intended.
    choices: List[str] = dspy.InputField()
    # The answer is restricted to one of five option letters.
    answer: Literal["A", "B", "C", "D", "E"] = dspy.OutputField()

class MMMUModule(dspy.Module):
    """Answer an MMMU question and normalise multiple-choice answers to a single letter.

    Args:
        cot: when true (the default) the predictor is ``dspy.ChainOfThought``;
            otherwise it is a plain ``dspy.Predict``. Both use ``MMMUSignature``.
    """

    def __init__(self, cot=True):
        super().__init__()
        if cot:
            self.predictor = dspy.ChainOfThought(MMMUSignature)
        else:
            self.predictor = dspy.Predict(MMMUSignature)

    @staticmethod
    def _extract_letter(raw_answer):
        """Pull an option letter (A-E) out of ``raw_answer``.

        A letter standing on its own ("B", "(B)", "B.") is preferred, so that
        "Answer: B" yields "B" rather than the "A" of "Answer". If there is no
        standalone letter, the first capital A-E anywhere is used. Values that are
        not strings, or that contain no such letter, are returned unchanged.
        """
        if not isinstance(raw_answer, str):
            return raw_answer
        match = re.search(r'\b[A-E]\b', raw_answer) or re.search(r'[A-E]', raw_answer)
        return match.group(0) if match else raw_answer

    def __call__(self, **kwargs):
        """Run the predictor on ``kwargs`` and clean up the returned answer.

        ``kwargs`` must contain ``choices``. When ``"A."`` occurs in it (the
        lettered multiple-choice format), the prediction's ``answer`` is reduced
        to a single option letter where one can be found. Other predictions are
        returned as produced.
        """
        prediction = self.predictor(**kwargs)
        # Multiple choice case. Note this is a substring test when `choices` is a string.
        if "A." in kwargs["choices"]:
            prediction["answer"] = self._extract_letter(prediction["answer"])
        # Free response case: nothing to clean up.
        return prediction


In [9]:


# Run the module once on the first converted dev example and compare against its gold answer.
sample_input = updated_devset[0]
# print(sample_input.inputs())
# print(encode_image(sample_input.inputs()["image_1"]))
mmmu = MMMUModule()
print(sample_input.inputs())
print(mmmu(**sample_input.inputs()))
print(sample_input.answer)

# Shared evaluator over the converted validation set, scored with exact match.
# return_outputs=True is set, so calling it presumably yields the per-example outputs
# alongside the score — the later cells unpack it as `scores, outputs`.
evaluate_mmmu = Evaluate(metric=answer_exact_match, num_threads=50, devset=updated_valset, display_progress=True, max_errors=500, return_outputs=True)

Example({'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x72A378A7C5E0>, 'image_2': None, 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x72A378A7C5E0>, 'choices': "['A. $63,020', 'B. $58,410', 'C. $71,320', 'D. $77,490']"}) (input_keys={'image_2', 'options', 'image_1', 'question', 'choices', 'image'})
Prediction(
    reasoning='To find the missing amounts for company B, we need to balance the income statement. We know the revenues, expenses, gains, and losses for company B. We can calculate the missing amounts by subtracting the known amounts from the total.\n\nRevenues: $1,480,500\nExpenses: $1,518,300\nGains: $78,120\nLosses: $0\n\nNet Income or (Loss): $39,690\n\nWe can calculate the missing amounts as follows:\n\n1. Revenues: $1,480,500

In [12]:

def test_lm(lm, cot=False):
    """Evaluate ``MMMUModule`` on ``updated_valset`` with the given language model.

    Args:
        lm: language model; its ``model`` attribute selects the thread count and it
            is made the active LM through ``dspy.context`` for the evaluation.
        cot: forwarded to ``MMMUModule(cot=...)``.

    Returns:
        ``(scores, num_bad_format)``: the score returned by the evaluator and the
        number of outputs whose prediction has no ``answer`` value.
    """
    # gpt-4o-mini runs with fewer worker threads than the other models.
    worker_count = 10 if lm.model == "openai/gpt-4o-mini" else 30
    evaluator = Evaluate(metric=answer_exact_match, num_threads=worker_count, devset=updated_valset, display_progress=True, max_errors=500, return_outputs=True)
    program = MMMUModule(cot=cot)
    with dspy.context(lm=lm):
        scores, outputs = evaluator(program)
        missing_answers = [record for record in outputs if record[1].get("answer", None) is None]
        return scores, len(missing_answers)

# Evaluate Qwen twice: plain Predict (res1) and ChainOfThought (res1_cot).
# Each result is the (score, num_bad_format) pair returned by test_lm.
res1 = test_lm(qwen_lm)
res1_cot = test_lm(qwen_lm, cot=True)
# Runs for the other models, currently disabled:
# test_lm(haiku_lm)
# test_lm(llama_lm)
# res1 = test_lm(internlm_lm)
# res2 = test_lm(gpt_lm)
# res2_cot = test_lm(gpt_lm, cot=True)
# Results recorded from earlier runs:
# Results:
# MMMU Val(single image only, multiple choice only), N=805
# Temp 0, max_tokens=5k

# 4o-mini:
# Reported: 59.4

# Measured (cot, predict): 60.0, 56.4
# Num bad format (cot, predict): 0, 1

# qwen-7b
# Reported: 54.1
# Measured (cot, predict): 49.0, 49.69
# Num bad format (cot, predict): 17, 0
# Print this run's numbers in the same layout as the notes above.
print("MMMU Validation Set (single image only, multiple choice only), N=805")
print("Temp 0, max_tokens=5k")
print("qwen-7b")
print("Reported:", 54.1)
print("Measured (cot, predict):", f"{res1_cot[0]:.1f}, {res1[0]:.2f}")
print("Num bad format (cot, predict):", f"{res1_cot[1]}, {res1[1]}")
# Matching report for gpt-4o-mini; needs res2 / res2_cot from the disabled runs above.
# print()
# print("gpt-4o-mini")
# print("Reported:", 59.4)
# print("Measured (cot, predict):", f"{res2_cot[0]:.1f}, {res2[0]:.2f}") 
# print("Num bad format (cot, predict):", f"{res2_cot[1]}, {res2[1]}")


Average Metric: 50 / 104  (48.1):  13%|█▎        | 103/805 [00:02<00:15, 44.79it/s]2024/11/01 21:41:15 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 0.0 / 498  (0.0):  62%|██████▏   | 498/805 [33:12<20:28,  4.00s/it]
Average Metric: 54 / 108  (50.0):  13%|█▎        | 107/805 [00:03<00:32, 21.76it/s]2024/11/01 21:41:15 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 65 / 127  (51.2):  16%|█▌        | 126/805 [00:03<00:29, 22.90it/s]2024/11/01 21:41:16 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 216.0 / 443  (48.8):  55%|█████▍    | 442/805 [00:17<00:25, 14.49it/s] 2024/11/01 21:41:30 ERROR dspy.evaluate.

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The conversion cost is the sum of direct materials and direct labor. Direct materials are $15,000 and direct labor is $25,000. Therefore, the conversion cost is $15,000 + $25,000 = $40,000. However, this option is not available. The next step is to consider the factory depreciation expense, factory utilities expense, and payroll staff's salary. These are all indirect costs and do not contribute to the conversion cost. Therefore, the conversion cost is the sum of direct materials and direct labor, which is $15,000 + $25,000 = $40,000. This option is not available, so the next step is to consider the factory depreciation expense, factory utilities expense, and payroll staff's salary. These are all indirect costs and do not contribute to the conversion cost. Therefore, the conversion cost is the sum of direct materials and direct labor, which is $15,000 + $25,000 = $40,000. This option 

2024/11/01 21:45:56 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 787  (48.0):  98%|█████████▊| 787/805 [03:54<00:55,  3.09s/it]2024/11/01 21:45:56 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
2024/11/01 21:45:56 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 789  (47.9):  98%|█████████▊| 788/805 [03:54<00:52,  3.09s/it]2024/11/01 21:45:56 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 790  (47.8):  98%|█████████▊| 789/805 [03:54<00:49,  3.09s/it]2024/11/01 21:45:56 ER

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine the maximum value of live load force in bar CG, we need to consider the effects of both the uniform live load and the concentrated live load. The uniform live load of 0.32 kip/ft will cause a distributed force along the entire length of the truss, while the concentrated live load of 24 kips will be applied at a specific point. The maximum force in bar CG will occur when the concentrated load is applied at the point where it will produce the largest force. This point is typically at the midpoint of the truss span, which is 45 feet from each end. The maximum force in bar CG will be the sum of the forces due to the uniform live load and the concentrated live load. The uniform live load will cause a force of 0.32 kip/ft * 45 ft = 14.4 kips. The concentrated live load will cause a force of 24 kips. Therefore, the maximum force in bar CG will be 14.4 kips + 24 kips = 38.4 kips

2024/11/01 21:46:12 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 792  (47.7):  98%|█████████▊| 791/805 [04:10<00:43,  3.09s/it]2024/11/01 21:46:12 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 793  (47.7):  98%|█████████▊| 792/805 [04:10<00:40,  3.13s/it]2024/11/01 21:46:12 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 794  (47.6):  99%|█████████▊| 793/805 [04:10<00:37,  3.13s/it]2024/11/01 21:46:12 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 

ExpectedExpected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The voltage across the resistor $v_R(t)$ in a series RL circuit can be found using the formula $v_R(t) = v(t) - i(t)R$, where $v(t)$ is the voltage source, $i(t)$ is the current through the inductor, and $R$ is the resistance. The current through the inductor is given by $i(t) = L \frac{dv(t)}{dt}$, where $L$ is the inductance. For a unit ramp voltage source, $v(t) = t$, and the derivative of $v(t)$ is $1$. Therefore, $i(t) = L \cdot 1 = L$. Substituting $v(t)$, $i(t)$, and $R$ into the formula for $v_R(t)$ gives $v_R(t) = t - L \cdot 1 \cdot R = t - L \cdot R$. Given $L = 0.1$H and $R = 1$Ω, $v_R(t) = t - 0.1 \cdot 1 = t - 0.1$. However, this is not one of the given options. The correct approach is to recognize that the voltage across the resistor is the difference between the voltage source and the voltage drop across the inductor, which is given by $L \frac{dv(t)}{dt}$. Fo

2024/11/01 21:46:13 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 797  (47.4):  99%|█████████▉| 797/805 [04:10<00:14,  1.87s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The height \(h\) can be found using the principle of hydrostatics, which states that the pressure at any point in a fluid is the same in all directions. The pressure at the bottom of the gasoline column is equal to the pressure at the bottom of the water column plus the pressure due to the height \(h\) of the gasoline column. The pressure due to the height \(h\) of the gasoline column can be calculated using the formula \(P = \rho g h\), where \(\rho\) is the density of the gasoline, \(g\) is the acceleration due to gravity, and \(h\) is the height of the gasoline column. The density of gasoline is given as 1.60 times the density of water, and the density of water is \(1000 \, \text{kg/m}^3\). The acceleration due to gravity is approximately \(9.81 \, \text{m/s}^2\). The height of the water column is 1.5 meters, and the height of the gasoline column is \(h\). Therefore, the pressure 

2024/11/01 21:46:14 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 798  (47.4):  99%|█████████▉| 798/805 [04:11<00:12,  1.77s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The gage pressure \( p_1 \) needed to provide a 20°C water flow rate \( Q = 60 \, \text{m}^3/\text{h} \) can be calculated using the Bernoulli's equation. The Bernoulli's equation relates the pressure, velocity, and elevation in a fluid flow. For this problem, we need to find the pressure difference between the tank and the open jet.

First, we need to convert the flow rate from \( \text{m}^3/\text{h} \) to \( \text{m}^3/\text{s} \):
\[ Q = 60 \, \text{m}^3/\text{h} = \frac{60}{3600} \, \text{m}^3/\text{s} = 0.01667 \, \text{m}^3/\text{s} \]

Next, we need to find the velocity \( v \) of the water in the pipe:
\[ v = \frac{Q}{A} \]
where \( A \) is the cross-sectional area of the pipe.

The cross-sectional area of the pipe is:
\[ A = \pi \left(\frac{d}{2}\right)^2 = \pi \left(\frac{5}{2}\right)^2 = \frac{25\pi}{4} \, \text{m}^2 \]

Now, we can calculate the velocity:
\[ v = \frac{0.0

2024/11/01 21:46:14 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 799  (47.3):  99%|█████████▉| 799/805 [04:12<00:09,  1.61s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The power extracted by the turbine can be estimated using the formula: Power = Flow rate * Head loss * Efficiency. The flow rate is given as 15,000 gal/min, which needs to be converted to m³/s for consistency with the head loss in feet. 1 gal/min = 0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

2024/11/01 21:46:24 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 800  (47.2):  99%|█████████▉| 799/805 [04:21<00:09,  1.61s/it]2024/11/01 21:46:24 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 801  (47.2):  99%|█████████▉| 800/805 [04:21<00:14,  2.83s/it]

ExpectedExpected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The average normal stress in a cable is given by the formula \(\sigma = \frac{F}{A}\), where \(F\) is the force and \(A\) is the cross-sectional area. Since the force is the same for both cables, the average normal stress will be the same if the cross-sectional areas are the same. The diameter of the cable is given as 10 mm for AC, so we need to find the diameter of AB that will give the same cross-sectional area. The cross-sectional area of a circle is given by \(A = \pi r^2\), where \(r\) is the radius. The radius of the 10 mm diameter cable is 5 mm. We need to find the radius of the AB cable that will have the same cross-sectional area. Let's denote the radius of the AB cable as \(r_{AB}\). We have \(\pi (5^2) = \pi (r_{AB}^2)\). Solving for \(r_{AB}\), we get \(r_{AB} = 5\). The diameter of the AB cable is twice the radius, so the diameter of AB is 10 mm. However, this is

2024/11/01 21:46:26 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 802  (47.1): 100%|█████████▉| 802/805 [04:23<00:06,  2.29s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To use Euler's method, we start with the initial condition f(-1) = 1.5 and the step size of 0.8. We will iterate through the x-values in the table, using the derivative f'(x) to approximate the change in f(x) for each step.

Starting at x = -1, we have:
- f(-1) = 1.5
- f'(-1) = 1

The change in f(x) for the first step is:
- Δf = f'(-1) * Δx = 1 * 0.8 = 0.8

So, the new value of f(x) after the first step is:
- f(-1 + 0.8) = f(-1) + Δf = 1.5 + 0.8 = 2.3

Now, we move to x = -0.6:
- f(-0.6) = 2.3
- f'(-0.6) = 2

The change in f(x) for the second step is:
- Δf = f'(-0.6) * Δx = 2 * 0.8 = 1.6

So, the new value of f(x) after the second step is:
- f(-0.6 + 0.8) = f(-0.6) + Δf = 2.3 + 1.6 = 3.9

However, this value is not in the given choices, so we need to re-evaluate our approach. It seems we made a mistake in our calculation. Let's correct it.

Starting at x = -1, we have:
- f(-1) = 1.5


2024/11/01 21:46:26 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 803  (47.1): 100%|█████████▉| 803/805 [04:24<00:03,  2.00s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The distance D required for the car to coast to a stop can be found by integrating the acceleration a = -C1 - C2*v^2 with respect to time t, and then integrating the velocity v with respect to time t. The initial velocity is v0, and the final velocity is 0. The acceleration a is the derivative of velocity v with respect to time t, so we have:

a = dv/dt = -C1 - C2*v^2

Integrating both sides with respect to time t, we get:

∫a dt = ∫(-C1 - C2*v^2) dt

v = -C1*t - (C2/3)*v^3 + C3

where C3 is the constant of integration. We can solve for v in terms of t:

v = -C1*t - (C2/3)*v^3 + C3

Now, we can integrate v with respect to time t to find the distance D:

∫v dt = ∫(-C1*t - (C2/3)*v^3 + C3) dt

D = -C1*(t^2/2) - (C2/9)*v^4 + C3*t + C4

where C4 is the constant of integration. We can solve for D in terms of v0:

D = -C1*(t^2/2) - (C2/9)*v0^4 + C3*t + C4

Now, we can use the fact that the

2024/11/01 21:46:27 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 804  (47.0): 100%|█████████▉| 804/805 [04:24<00:01,  1.70s/it]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The maximum-normal-stress theory states that yielding will occur when the maximum normal stress exceeds the yield strength. The maximum normal stress is the larger of the two principal stresses. For point a, the maximum normal stress is 200 MPa (σ1), and for point b, it is 150 MPa (σ1). Since the yield strength of the steel is 400 MPa, the frame will experience initial yielding when the maximum normal stress reaches 400 MPa. To find the test load at which this occurs, we need to determine the load that will cause the maximum normal stress to reach 400 MPa. The load can be calculated using the formula: Load = (σ1 - Sy) / (Ssy - Sy), where σ1 is the maximum normal stress, Sy is the yield strength, and Ssy is the strain hardening modulus. For point a, the load is (200 - 400) / (250 - 400) = -200 MPa / -150 MPa = 1.33 kN. For point b, the load is (150 - 400) / (250 - 400) = -250 MPa / -1

2024/11/01 21:46:32 ERROR dspy.evaluate.evaluate: Error for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.
Average Metric: 378.0 / 805  (47.0): 100%|██████████| 805/805 [04:29<00:00,  2.99it/s]
2024/11/01 21:46:32 INFO dspy.evaluate.evaluate: Average Metric: 378.0 / 805 (47.0%)


Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
When the ammeter reads zero, it means that there is no current flowing through the circuit. This implies that the voltage drop across the 1 ohm resistor is equal to the voltage drop across the 2 ohm resistor. Therefore, the voltage drop across the 3 ohm resistor is equal to the voltage drop across the 1 ohm resistor. This means that the current through the 1 ohm resistor is equal to the current through the 2 ohm resistor. Since the current through the 1 ohm resistor is equal to the current through the 2 ohm resistor, the current through the 3 ohm resistor is equal to the current through the 1 ohm resistor. This means that the current through the 3 ohm resistor is equal to the current through the 2 ohm resistor. Therefore, the current through the 3 ohm resistor is equal to the current through the 2 ohm resistor. This means that the current through the 3 ohm resistor is equal to the cu

In [None]:
# Run the shared evaluator on the current `mmmu` module with the default LM;
# `outputs` is what the following analysis cells inspect.
scores, outputs = evaluate_mmmu(mmmu)
# lm.inspect_history()


In [None]:
from collections import Counter
# Distribution of predicted answers; a prediction with no `answer` is tallied as "nothing returned".
c = Counter(output[1].get("answer", "nothing returned") for output in outputs)
# Count predictions that are not a valid option letter. "E" is included because
# MMMUSignature allows answers A through E.
non_letters = sum(
    1
    for output in outputs
    if output[1].get("answer", "nothing returned") not in ("A", "B", "C", "D", "E")
)
print(c)
print(non_letters)




In [None]:
# Accuracy over multiple-choice examples only. Each record is indexed as
# [0] example, [1] prediction, [2] metric value.
mc_records = [record for record in outputs if record[0]["question_type"] == "multiple-choice"]
mc_correct = sum(record[2] for record in mc_records)
total_mc = len(mc_records)
print(mc_correct, total_mc)
print(mc_correct / total_mc)
# Number of predictions that came back without an answer.
print(sum(record[1].get("answer", None) is None for record in outputs))

# Note: Run above here

# Make sure that multiple images work

## No examples

In [None]:
import PIL
def set_image_to_black_square(example, key, image_path="black_image_300x300.png"):
    """Return a copy of ``example`` with the field ``key`` replaced by a placeholder image.

    Args:
        example: example supporting ``copy()``, item assignment, ``inputs()`` and
            ``with_inputs()``.
        key: name of the field to overwrite (e.g. ``"image_1"``).
        image_path: path of the placeholder image file; defaults to the black
            square expected in the working directory.

    Returns:
        The modified copy, with the same input fields as ``example``.

    Note:
        The image file is opened on every call and is not explicitly closed here.
    """
    # A bare `import PIL` does not load the `PIL.Image` submodule, so import it
    # explicitly rather than relying on another library having done so already.
    import PIL.Image

    example_copy = example.copy()
    example_copy[key] = PIL.Image.open(image_path)
    return example_copy.with_inputs(*example.inputs().keys())

# Build validation-set variants with image_1, image_2, or both replaced by a black square,
# printing an equality check against the placeholder file after each one.
# NOTE(review): MMMUSignature reads the `image` field (a copy of image_1 made by
# update_example_image_key), which these variants leave untouched — confirm that the
# blacked-out fields actually reach the model.
print(updated_devset[0]["image_1"])
print(updated_devset[0]["image_2"])

examples_no_image_1 = [set_image_to_black_square(item, "image_1") for item in updated_valset]
print(examples_no_image_1[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_image_1[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))

examples_no_image_2 = [set_image_to_black_square(item, "image_2") for item in updated_valset]
print(examples_no_image_2[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_image_2[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))

examples_no_actual_image = [
    set_image_to_black_square(set_image_to_black_square(item, "image_1"), "image_2")
    for item in updated_valset
]
print(examples_no_actual_image[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_actual_image[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))


In [None]:
# Smoke test: run a fresh chain-of-thought module on the first example of each
# single-image-blacked-out variant and print inputs and prediction.
mmmu = MMMUModule()
print(examples_no_image_1[0].inputs())
print(mmmu(**examples_no_image_1[0].inputs()))

print(examples_no_image_2[0].inputs())
print(mmmu(**examples_no_image_2[0].inputs()))


In [None]:
# Evaluate the same module on the original validation set and on each blacked-out variant.
# NOTE(review): evaluate_mmmu was constructed with return_outputs=True, so each value
# printed below is presumably a (score, outputs) pair rather than a bare score — confirm.
normal = evaluate_mmmu(mmmu, devset=updated_valset)
no_image_1 = evaluate_mmmu(mmmu, devset=examples_no_image_1)
no_image_2 = evaluate_mmmu(mmmu, devset=examples_no_image_2)
no_actual_image = evaluate_mmmu(mmmu, devset=examples_no_actual_image)
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
print("Score with both images:", normal)
print("Score with image_1 set to black square:", no_image_1)
print("Score with image_2 set to black square:", no_image_2)
print("Score with both images set to black squares:", no_actual_image)

## TODO: Test with bootstrapped examples


# Make sure that JPGs work

## Convert images to JPGs

In [None]:
import io
from PIL import Image

def convert_to_jpg(example, keys=("image_1", "image_2")):
    """Return a copy of ``example`` whose PIL images are re-encoded as JPEG.

    Args:
        example: Example-like object supporting ``copy()``, ``in``, item
            get/set, ``inputs()`` and ``with_inputs()``.
        keys: Names of the fields to convert. Defaults to the two image
            fields this helper originally hard-coded, so existing
            single-argument calls behave as before.

    Returns:
        A copy of ``example`` carrying the same input keys. Every field in
        ``keys`` that holds a ``PIL.Image.Image`` is replaced by an image
        opened from in-memory JPEG bytes; missing fields and non-image
        values are left untouched. ``example`` itself is not modified.
    """
    example_copy = example.copy()
    for key in keys:
        if key not in example_copy:
            continue
        image = example_copy[key]
        if not isinstance(image, Image.Image):
            continue

        # Convert to RGB mode first (in case it's not already), then encode
        # as JPEG into an in-memory buffer.
        buffer = io.BytesIO()
        image.convert("RGB").save(buffer, format="JPEG")
        buffer.seek(0)

        # Load the JPEG bytes back as a PIL image; the buffer stays
        # referenced by the returned image object.
        example_copy[key] = Image.open(buffer)

    return example_copy.with_inputs(*example.inputs().keys())

# Convert all images in the dataset to JPG
# Re-encode the images of every validation example as JPEG.
examples_jpg = [convert_to_jpg(example) for example in updated_valset]

# Spot-check the first example before and after conversion.
print("Original image format:", updated_valset[0]['image_1'].format)
print("Converted image format:", examples_jpg[0]['image_1'].format)


In [None]:
# JPEG versions of the full validation set and of the three ablation sets.
examples_jpg = [convert_to_jpg(example) for example in updated_valset]
examples_no_image_1_jpg = [convert_to_jpg(example) for example in examples_no_image_1]
examples_no_image_2_jpg = [convert_to_jpg(example) for example in examples_no_image_2]
examples_no_actual_image_jpg = [convert_to_jpg(example) for example in examples_no_actual_image]

# Smoke test on one converted example: inputs, prediction, then the image format.
mmmu = MMMUModule()
print(examples_no_image_1_jpg[0].inputs())
print(mmmu(**examples_no_image_1_jpg[0].inputs()))
print(examples_no_image_1_jpg[0]["image_1"].format)

In [None]:
# Same evaluation as for the original images, now on the JPEG-converted sets.
# Order: both images, no image_1, no image_2, no real image.
normal, no_image_1, no_image_2, no_actual_image = (
    evaluate_mmmu(mmmu, devset=split)
    for split in (
        examples_jpg,
        examples_no_image_1_jpg,
        examples_no_image_2_jpg,
        examples_no_actual_image_jpg,
    )
)

# Report.
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
print("Score with both images:", normal)
print("Score with image_1 set to black square:", no_image_1)
print("Score with image_2 set to black square:", no_image_2)
print("Score with both images set to black squares:", no_actual_image)

In [None]:
# Inspect the LM call history after the evaluation above.
# NOTE(review): `lm` is not assigned in the visible cells (the setup cell defines
# qwen_lm, haiku_lm, llama_lm, internlm_lm and gpt_lm) — confirm it is bound before
# this cell runs on a fresh kernel.
lm.inspect_history()

# Testing that URLs work

In [None]:

# Colour name -> six-digit hex code (no leading "#").
# Insertion order matters: the example generator samples from list(colors.keys()).
colors = dict(
    White="FFFFFF",
    Red="FF0000",
    Green="00FF00",
    Blue="0000FF",
    Yellow="FFFF00",
    Cyan="00FFFF",
    Magenta="FF00FF",
    Gray="808080",
    Orange="FFA500",
    Purple="800080",
)


def get_color_image_url(color, file_extension="png"):
    """Build the placehold.co URL for a single-colour image.

    Args:
        color: A key of ``colors`` (e.g. ``"Red"``); unknown names raise ``KeyError``.
        file_extension: Extension appended to the URL, ``"png"`` by default.

    Returns:
        The URL string; both colour segments of the path use the same hex code.
    """
    hex_code = colors[color]
    return f"https://placehold.co/300/{hex_code}/{hex_code}.{file_extension}"


In [None]:
import random

def generate_random_2_color_image_examples(n, seed=None):
    """Generate ``n`` two-image colour-identification examples.

    Each example holds the URLs of two differently coloured images, a question
    asking for the colour of one of them ("image_1" or "image_2", picked with
    equal probability) and that colour's name as the answer.

    Args:
        n: Number of examples to generate; ``0`` yields an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same seed always yields the same examples and the
            global ``random`` state is left untouched. When ``None`` (the
            default) the global ``random`` module is used, as before.

    Returns:
        A list of ``dspy.Example`` objects with ``image_1``, ``image_2`` and
        ``question`` marked as inputs.
    """
    # The module and a Random instance expose the same sample()/random() API.
    rng = random if seed is None else random.Random(seed)

    examples = []
    for _ in range(n):
        # sample() returns two distinct names, so the two images always differ.
        color_1, color_2 = rng.sample(list(colors.keys()), 2)
        if rng.random() < 0.5:
            chosen_color, chosen_image = color_1, "image_1"
        else:
            chosen_color, chosen_image = color_2, "image_2"
        example_kwargs = {
            "image_1": get_color_image_url(color_1),
            "image_2": get_color_image_url(color_2),
            "question": f"What color is {chosen_image}?",
            "answer": chosen_color
        }
        examples.append(dspy.Example(**example_kwargs).with_inputs("image_1", "image_2", "question"))
    return examples

# Build 100 synthetic two-colour examples and print the first one as a sanity check.
examples = generate_random_2_color_image_examples(100)
print(examples[0])


In [None]:
# Signature for the two-image colour task: two image inputs plus a question,
# one string answer.
# NOTE(review): the class docstring and the `desc` strings are runtime text that
# DSPy presumably includes in the prompt — do not reword them casually.
class ColorSignature(dspy.Signature):
    """Output the color of the designated image."""
    image_1: dspy.Image = dspy.InputField(desc="An image")
    image_2: dspy.Image = dspy.InputField(desc="An image")
    question: str = dspy.InputField(desc="A question about the image")
    answer: str = dspy.OutputField(desc="The color of the designated image")
# Plain (uncompiled) predictor over the signature; the optimizer cells below start from it.
color_program = dspy.Predict(ColorSignature)


In [None]:
# Show the first example, then the uncompiled program's prediction for its inputs.
print(examples[0])
print(color_program(**examples[0].inputs()))

In [None]:
# Two few-shot optimizers that differ only in their demo budgets.
few_shot_optimizer = dspy.BootstrapFewShot(
    metric=answer_exact_match,
    max_bootstrapped_demos=3,
    max_labeled_demos=10,
)
smaller_few_shot_optimizer = dspy.BootstrapFewShot(
    metric=answer_exact_match,
    max_bootstrapped_demos=1,
    max_labeled_demos=1,
)

# 1000 examples are generated but only the first 400 are used:
# indices 0-199 for training, 200-399 for validation.
dataset = generate_random_2_color_image_examples(1000)
trainset, validationset = dataset[:200], dataset[200:400]

# Exact-match evaluator bound to the validation split.
evaluate_colors = Evaluate(
    metric=answer_exact_match,
    num_threads=300,
    devset=validationset,
)

In [None]:
# Compile the same base program with each optimizer on the training split.
compiled_color_program = few_shot_optimizer.compile(color_program, trainset=trainset)
compiled_smaller_color_program = smaller_few_shot_optimizer.compile(color_program, trainset=trainset)
# Validation scores, in order: uncompiled baseline, larger demo budget, smaller demo budget.
print(evaluate_colors(color_program))
print(evaluate_colors(compiled_color_program))
print(evaluate_colors(compiled_smaller_color_program))

In [None]:
# Run the compiled program on one validation example, then inspect the LM history.
print(compiled_color_program(**validationset[0].inputs()))
# NOTE(review): `lm` is not assigned in the visible cells (the setup cell defines
# qwen_lm, haiku_lm, ...) — confirm it is bound before this cell runs on a fresh kernel.
lm.inspect_history()

# TODO(Isaac): Delete; Archive of old experiments

In [None]:
# Load the "Alanox/stanford-dogs" dataset (split "full") with `image` as the only input key.
# NOTE: this rebinds `dataset`, which the colour-example cell earlier also assigns.
dataset = DataLoader().from_huggingface("Alanox/stanford-dogs", split="full", input_keys=("image",), trust_remote_code=True)

In [None]:
# rename the field from "image" to "image_1"
def rename_field(example, old_name, new_name):
    """Rename a field of ``example`` in place and return the same object.

    Best-effort: when ``old_name`` is not present, ``example`` is returned
    unchanged rather than raising.

    Args:
        example: Mapping-like object supporting item get, set and delete.
        old_name: Current field name.
        new_name: Field name to move the value to.

    Returns:
        The same ``example`` object (mutated when the rename happened).
    """
    # Assign-then-delete with identical names would remove the field entirely.
    if old_name == new_name:
        return example
    try:
        value = example[old_name]
    except KeyError:
        # Nothing to rename; keep the best-effort contract for missing fields.
        return example
    example[new_name] = value
    del example[old_name]
    return example
    
# rename_field mutates each example in place, so all three lists hold the same objects.
dog_dataset = [rename_field(example, "image", "image_1") for example in dataset]
dog_dataset2 = [rename_field(example, "target", "answer") for example in dog_dataset]
# Mark image_1 as the only input field.
dog_dataset3 = [example.with_inputs("image_1") for example in dog_dataset2]
dog_dataset = dog_dataset3
# In-place shuffle; dog_dataset and dog_dataset3 are the same list object.
random.shuffle(dog_dataset)

In [None]:
# Signature for breed classification: one image in, one breed string out.
# NOTE: this redefines (shadows) the DogPictureSignature declared near the top of the notebook.
# NOTE(review): the dataset cell above renames the field to `image_1`, while this
# signature expects `image` — confirm which name is intended before using dog_dataset.
class DogPictureSignature(dspy.Signature):
    """Output the dog breed of the dog in the image."""
    image: dspy.Image = dspy.InputField(desc="An image of a dog")
    answer: str = dspy.OutputField(desc="The dog breed of the dog in the image")

class DogPicture(dspy.Module):
    """Chain-of-thought module that answers ``DogPictureSignature``."""

    def __init__(self) -> None:
        # Initialise dspy.Module's own state before attaching sub-modules.
        super().__init__()
        self.predictor = dspy.ChainOfThought(DogPictureSignature)

    def forward(self, **kwargs):
        """Run the chain-of-thought predictor on the given signature inputs.

        Defined as ``forward`` rather than ``__call__`` so that
        ``dspy.Module.__call__`` stays in charge of invocation; calling the
        instance as ``dog_picture(**kwargs)`` still works.

        Args:
            **kwargs: Input fields of ``DogPictureSignature`` (e.g. ``image``).

        Returns:
            Whatever the wrapped ``dspy.ChainOfThought`` predictor returns.
        """
        return self.predictor(**kwargs)

# Instantiate the module and try it on a single image fetched by URL.
dog_picture = DogPicture()

# Only `image` is set; no input keys are declared via with_inputs() on this example.
example = dspy.Example(image=dspy.Image.from_url("https://i.pinimg.com/564x/78/f9/6d/78f96d0314d39a1b8a849005123e166d.jpg"))
print(dog_picture(**example.inputs()))
# print(dog_picture(**dog_dataset[0].inputs()))

NameError: name 'dog_dataset' is not defined

In [None]:
# TODO: Test inline signature
# TODO: Test json adapter