In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dspy
from dspy.datasets import DataLoader
from dspy.evaluate.metrics import answer_exact_match
from typing import List
from dspy.evaluate import Evaluate

import dotenv
import litellm

litellm.suppress_debug_info = True

dotenv.load_dotenv()

def debug_exact_match(example, pred, trace=None, frac=1.0):
    """Exact-match metric that echoes what it is comparing.

    Prints the example's inputs, the example's gold answer and the prediction
    (one per line), then returns whatever ``answer_exact_match`` returns for
    the same four arguments.
    """
    for shown in (example.inputs(), example.answer, pred):
        print(shown)
    return answer_exact_match(example, pred, trace, frac)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# The three self-hosted models are reached through the same OpenAI-compatible
# endpoint on localhost:8000 (the key is a placeholder for the local server).
# NOTE(review): they share one port, so presumably only one is served at a time.
_local_vllm_kwargs = dict(api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)

# vllm serve Qwen/Qwen2-VL-7B-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --pipeline-parallel-size 2
qwen_lm = dspy.LM(model="openai/Qwen/Qwen2-VL-7B-Instruct", **_local_vllm_kwargs)
# vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --enforce-eager --max-num-seqs 48
llama_lm = dspy.LM(model="openai/meta-llama/Llama-3.2-11B-Vision-Instruct", **_local_vllm_kwargs)
internlm_lm = dspy.LM(model="openai/OpenGVLab/InternVL2-8B", **_local_vllm_kwargs)

# Hosted models.
haiku_lm = dspy.LM(model="anthropic/claude-3-haiku-20240307", max_tokens=4096)
gpt_lm = dspy.LM(model="openai/gpt-4o-mini", max_tokens=5000)

# internlm_lm is not part of this list.
all_lms = [qwen_lm, haiku_lm, llama_lm, gpt_lm]

# Qwen is the default LM for anything not wrapped in dspy.context(...).
dspy.settings.configure(lm=qwen_lm)

In [12]:
%%capture
from concurrent.futures import ThreadPoolExecutor

# Input fields handed to the DataLoader: the first two image slots plus the
# question text and the raw options string.
input_keys = tuple(f"image_{slot}" for slot in (1, 2)) + ("question", "options")

# MMMU subject subsets; each one is fetched separately.
subsets = [
    'Accounting', 'Agriculture', 'Architecture_and_Engineering', 'Art', 'Art_Theory',
    'Basic_Medical_Science', 'Biology', 'Chemistry', 'Clinical_Medicine', 'Computer_Science',
    'Design', 'Diagnostics_and_Laboratory_Medicine', 'Economics', 'Electronics', 'Energy_and_Power',
    'Finance', 'Geography', 'History', 'Literature', 'Manage',
    'Marketing', 'Materials', 'Math', 'Mechanical_Engineering', 'Music',
    'Pharmacy', 'Physics', 'Psychology', 'Public_Health', 'Sociology',
]


def load_dataset(subset_index_subset):
    """Load the dev and validation splits of one MMMU subset.

    Takes an ``(index, subset_name)`` pair and returns
    ``(index, dev_examples, validation_examples)`` so results can be put back
    in subset order.
    """
    position, subset_name = subset_index_subset
    splits = DataLoader().from_huggingface("MMMU/MMMU", subset_name, split=["dev", "validation"], input_keys=input_keys)
    return position, splits["dev"], splits["validation"]


devset = []
valset = []

# One worker thread per subset.
with ThreadPoolExecutor(max_workers=len(subsets)) as executor:
    results = sorted(executor.map(load_dataset, enumerate(subsets)), key=lambda loaded: loaded[0])

# Concatenate the per-subset splits in subset order.
for _, dev, val in results:
    devset.extend(dev)
    valset.extend(val)

print(len(devset))
print(len(valset))

In [18]:
import ast

def count_images(dataset, max_images=5):
    """Build a histogram of how many non-empty image inputs each example has.

    An input counts as an image when its key starts with ``image_`` and its
    value is not ``None``.

    Args:
        dataset: Iterable of examples exposing ``inputs()``, which returns a
            mapping from input field name to value.
        max_images: Counts ``0..max_images`` are always present in the result
            (zero-filled), so the report has a stable set of rows.

    Returns:
        dict mapping image count -> number of examples with that count.
        Counts above ``max_images`` are added as extra keys instead of
        raising ``KeyError``.
    """
    image_counts = {i: 0 for i in range(max_images + 1)}  # Pre-seed counts for 0..max_images images
    for example in dataset:
        # Build the inputs view once per example rather than once per key.
        inputs = example.inputs()
        count = sum(1 for key in inputs.keys() if key.startswith('image_') and inputs[key] is not None)
        image_counts[count] = image_counts.get(count, 0) + 1
    return image_counts

def count_multiple_choice_questions(dataset):
    """Return how many examples in ``dataset`` have ``question_type`` equal to ``"multiple-choice"``."""
    total = 0
    for example in dataset:
        if example["question_type"] == "multiple-choice":
            total += 1
    return total
# NOTE(review): not referenced by any code visible in this notebook.
max_images = 5

# Keep only examples that carry exactly this many images.
num_images = 1


def _present_image_count(example):
    """Number of ``image_*`` input fields of ``example`` whose value is not None."""
    inputs = example.inputs()
    return len([key for key in inputs.keys() if key.startswith('image_') and inputs[key] is not None])


def _is_selected(example):
    """True for multiple-choice examples with exactly ``num_images`` images."""
    return _present_image_count(example) == num_images and example["question_type"] == "multiple-choice"


devset_filtered = [example for example in devset if _is_selected(example)]
valset_filtered = [example for example in valset if _is_selected(example)]

def update_example_image_key(example):
    """Return a copy of ``example`` with ``image_1`` duplicated under ``image``.

    The returned example has the original's input keys plus ``"image"``; the
    example passed in is left as it was.
    """
    input_names = [*example.inputs().keys(), "image"]
    duplicate = example.copy()
    duplicate["image"] = duplicate["image_1"]
    return duplicate.with_inputs(*input_names)



# Give every remaining example an `image` input that mirrors `image_1`.
devset_filtered = [update_example_image_key(example) for example in devset_filtered]
valset_filtered = [update_example_image_key(example) for example in valset_filtered]

# Summaries used to sanity-check the filters above.
devset_image_counts = count_images(devset_filtered)
valset_image_counts = count_images(valset_filtered)

devset_multiple_choice_questions = count_multiple_choice_questions(devset_filtered)
valset_multiple_choice_questions = count_multiple_choice_questions(valset_filtered)

# Per-split histogram of image counts.
for heading, histogram in (
    ("Image counts in devset:", devset_image_counts),
    ("\nImage counts in valset:", valset_image_counts),
):
    print(heading)
    for count, num_examples in histogram.items():
        print(f"{count} image(s): {num_examples} examples")

# Per-split number of multiple-choice questions against the split size.
for heading, multiple_choice_total, split in (
    ("\nMultiple choice questions in devset:", devset_multiple_choice_questions, devset_filtered),
    ("\nMultiple choice questions in valset:", valset_multiple_choice_questions, valset_filtered),
):
    print(heading)
    print(multiple_choice_total, "out of", len(split))

def convert_multiple_choice_to_letter(dataset):
    """Add a ``choices`` input field derived from each example's ``options``.

    ``options`` is the string form of a Python list and is parsed with
    ``ast.literal_eval``. The resulting ``choices`` value is always a string:

    * multiple-choice questions: the ``str()`` of the options prefixed with
      letters, e.g. ``"['A. foo', 'B. bar']"``;
    * other questions with options: the ``str()`` of the parsed options;
    * other questions without options: ``"Free response"``.

    Args:
        dataset: Iterable of examples supporting item access, ``copy()``,
            ``inputs()`` and ``with_inputs(*keys)``.

    Returns:
        A new list of examples with ``choices`` set and marked as an input.
        The examples passed in are not modified.
    """
    new_dataset = []
    for example in dataset:
        options = ast.literal_eval(example["options"])
        # Work on a copy so the caller's examples are left untouched, matching
        # what update_example_image_key does.
        converted = example.copy()
        if example["question_type"] == "multiple-choice":
            # f-string formatting also copes with options that are not strings.
            converted["choices"] = str([f"{chr(65 + i)}. {option}" for i, option in enumerate(options)])
        elif options:
            converted["choices"] = str(options)
        else:
            # Test the parsed list: the stringified form "[]" never equals [],
            # so comparing the string against [] could never take this branch.
            converted["choices"] = "Free response"

        new_dataset.append(converted.with_inputs(*example.inputs().keys(), "choices"))
    return new_dataset

# Show the first dev example before the conversion and the first converted
# example after it, so the added lettered `choices` field can be inspected.
print(devset_filtered[0])
updated_devset = convert_multiple_choice_to_letter(devset_filtered)
print(updated_devset[0])
# The validation split gets the same conversion; it is what the evaluations below run on.
updated_valset = convert_multiple_choice_to_letter(valset_filtered)




Image counts in devset:
0 image(s): 0 examples
1 image(s): 137 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Image counts in valset:
0 image(s): 0 examples
1 image(s): 805 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Multiple choice questions in devset:
137 out of 137

Multiple choice questions in valset:
805 out of 805
Example({'id': 'dev_Accounting_1', 'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'explanation': '', 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x78F25009ACE0>, 'image_2': None, 'image_3': None, 'image_4': None, 'image_5': None, 'image_6': None, 'image_7': None, 'img_type': "['Tables']", 'answer': 'D', 'topic_difficulty': 'Easy', 'question_type': 'multiple-choice', 'subfield': 'Financi

In [22]:
import re
from typing import Literal
# DSPy signature for one MMMU question: question text, one image and the
# answer options in, a single answer letter out.
# NOTE(review): the class docstring is presumably used by DSPy as the task
# instruction, so treat it as runtime text rather than documentation — confirm.
class MMMUSignature(dspy.Signature):
    """Answer with the letter of the correct answer."""

    # Question text.
    question: str = dspy.InputField()
    # The image the question refers to.
    image: dspy.Image = dspy.InputField()
    # NOTE(review): annotated as List[str], but convert_multiple_choice_to_letter
    # stores `choices` as the str() of a list — confirm which form is intended.
    choices: List[str] = dspy.InputField()
    # Answer letter, restricted to A–E by the Literal type.
    answer: Literal["A", "B", "C", "D", "E"] = dspy.OutputField()

class MMMUModule(dspy.Module):
    """Answers an MMMU question and normalises the answer to a single letter.

    Args:
        cot: When true the predictor is ``dspy.ChainOfThought(MMMUSignature)``,
            otherwise ``dspy.Predict(MMMUSignature)``.
    """

    def __init__(self, cot=True):
        super().__init__()
        if cot:
            self.predictor = dspy.ChainOfThought(MMMUSignature)
        else:
            self.predictor = dspy.Predict(MMMUSignature)

    # NOTE(review): this overrides __call__ directly rather than defining
    # forward(); confirm that bypassing dspy.Module.__call__ is intended.
    def __call__(self, **kwargs):
        """Run the predictor on ``kwargs`` and clean up the ``answer`` field.

        For multiple-choice inputs (``choices`` contains ``"A."``) the answer
        is replaced by the first standalone letter A–E found in it. If there
        is no such letter, or the answer is not a string, the answer is left
        as the predictor produced it.

        Returns:
            The prediction from the underlying predictor, with ``answer``
            possibly rewritten in place.
        """
        prediction = self.predictor(**kwargs)
        # Multiple choice case. str() keeps the substring test working whether
        # `choices` arrives as a stringified list or as a real list.
        if "A." in str(kwargs["choices"]):
            raw_answer = prediction["answer"]
            if isinstance(raw_answer, str):
                # Word boundaries stop letters inside ordinary words from being
                # picked up: "Answer: D" must yield "D", not the "A" of "Answer".
                match = re.search(r'\b[A-E]\b', raw_answer)
                if match:
                    prediction["answer"] = match.group(0)
        # Free response case: returned unchanged.
        return prediction


In [23]:


# Smoke test: run the chain-of-thought module (MMMUModule defaults to cot=True)
# on the first converted dev example and show its inputs, the prediction and
# the gold answer.
sample_input = updated_devset[0]
# print(sample_input.inputs())
# print(encode_image(sample_input.inputs()["image_1"]))
mmmu = MMMUModule()
print(sample_input.inputs())
print(mmmu(**sample_input.inputs()))
print(sample_input.answer)

# Evaluator over the converted validation split. return_outputs=True, so later
# cells unpack its result as (scores, outputs).
# NOTE(review): if the mmmu(...) call above raises, this line never runs and
# later cells that use `evaluate_mmmu` depend on leftover kernel state.
evaluate_mmmu = Evaluate(metric=answer_exact_match, num_threads=50, devset=updated_valset, display_progress=True, max_errors=500, return_outputs=True)

Example({'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x78F25009ACE0>, 'image_2': None, 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x78F25009ACE0>, 'choices': "['A. $63,020', 'B. $58,410', 'C. $71,320', 'D. $77,490']"}) (input_keys={'image_1', 'question', 'image_2', 'image', 'options', 'choices'})
Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To find the missing amounts for company B, we need to balance the income statement. We know the revenues, expenses, gains, and losses. We can calculate the missing amounts by subtracting the known amounts from the total.

Revenues: $1,480,500
Expenses: $1,518,300
Gains: $78,120
Losses: $0

Net Income or (Loss): $39,690

We can calculate the 

NotImplementedError: Images are not yet supported in JSON mode.

In [25]:

def test_lm(lm, cot=False):
    """Evaluate ``MMMUModule`` on ``updated_valset`` with ``lm`` as the active LM.

    Args:
        lm: Language model to activate through ``dspy.context``. Its ``model``
            string selects the thread count: 10 for "openai/gpt-4o-mini",
            30 for anything else.
        cot: Forwarded to ``MMMUModule`` to choose the predictor type.

    Returns:
        ``(scores, num_bad_format)`` where ``num_bad_format`` is the number of
        evaluation outputs whose prediction has no ``answer`` (missing or None).
    """
    num_threads = 10 if lm.model == "openai/gpt-4o-mini" else 30
    evaluator = Evaluate(metric=answer_exact_match, num_threads=num_threads, devset=updated_valset, display_progress=True, max_errors=500, return_outputs=True)
    program = MMMUModule(cot=cot)
    with dspy.context(lm=lm):
        scores, outputs = evaluator(program)
        # Each output's second element is the prediction.
        missing_answers = [entry for entry in outputs if entry[1].get("answer", None) is None]
        return scores, len(missing_answers)

# Each model is evaluated without and then with chain-of-thought.
res1 = test_lm(qwen_lm, cot=False)
res1_cot = test_lm(qwen_lm, cot=True)
# test_lm(haiku_lm)
# test_lm(llama_lm)
# res1 = test_lm(internlm_lm)
res2 = test_lm(gpt_lm, cot=False)
res2_cot = test_lm(gpt_lm, cot=True)

# Results noted down from an earlier run:
# MMMU Val(single image only, multiple choice only), N=805
# Temp 0, max_tokens=5k
#
# 4o-mini:
#   Reported: 59.4
#   Measured (cot, predict): 60.0, 56.4
#   Num bad format (cot, predict): 0, 1
#
# qwen-7b
#   Reported: 54.1
#   Measured (cot, predict): 49.0, 49.69
#   Num bad format (cot, predict): 17, 0

print("MMMU Validation Set (single image only, multiple choice only), N=805")
print("Temp 0, max_tokens=5k")

# (label, reported score, result with cot, result without cot)
_report_rows = (
    ("qwen-7b", 54.1, res1_cot, res1),
    ("gpt-4o-mini", 59.4, res2_cot, res2),
)
for row_number, (label, reported, with_cot, without_cot) in enumerate(_report_rows):
    if row_number > 0:
        print()  # blank line between models
    print(label)
    print("Reported:", reported)
    print("Measured (cot, predict):", f"{with_cot[0]:.1f}, {without_cot[0]:.2f}")
    print("Num bad format (cot, predict):", f"{with_cot[1]}, {without_cot[1]}")


Average Metric: 40 / 101  (39.6):  13%|█▎        | 101/805 [00:02<00:24, 28.27it/s][2m2024-10-31T20:19:02.986124Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 48.0 / 113  (42.5):  14%|█▍        | 112/805 [00:02<00:31, 22.34it/s][2m2024-10-31T20:19:03.622709Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 57.0 / 124  (46.0):  15%|█▌        | 123/805 [00:03<00:36, 18.89it/s][2m2024-10-31T20:19:04.166848Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supp

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To find the balances in the work in process inventory, finished goods inventory, and cost of goods sold for January, we need to analyze the job cost sheet provided. The work in process inventory is the sum of the costs for jobs that are not yet completed, the finished goods inventory is the sum of the costs for jobs that have been completed but not yet sold, and the cost of goods sold is the sum of the costs for jobs that have been sold.

From the job cost sheet:
- Job AA2 is not completed and not sold, so its cost is in the work in process inventory.
- Job AA4 is completed and sold, so its cost is in the cost of goods sold.
- Job AA5 is completed and sold, so its cost is in the cost of goods sold.
- Job AA3 is not completed and not sold, so its cost is in the work in process inventory.

The total cost for January is $7,338, which is the sum of the costs for all jobs. The total cost 

Average Metric: 0 / 1  (0.0):   0%|          | 0/805 [00:00<?, ?it/s][2m2024-10-31T20:19:11.037358Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 2.0 / 4  (50.0):   0%|          | 3/805 [00:00<00:14, 53.66it/s][2m2024-10-31T20:19:11.078310Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 3.0 / 8  (37.5):   1%|          | 7/805 [00:00<00:10, 77.00it/s][2m2024-10-31T20:19:11.106959Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine the number of photographs taken by camera A, we need to calculate the area covered by each photograph and then divide the total area of the strip by the area covered by each photograph. The area covered by each photograph is given by the format size of the camera, which is 180 mm x 180 mm. The total area of the strip is 16 km x 1350 m = 21,600,000 m^2. The area covered by each photograph is 180 mm x 180 mm = 32,400 mm^2 = 0.00324 m^2. Therefore, the number of photographs taken by camera A is 21,600,000 m^2 / 0.00324 m^2 = 6,666,666.67. Since we can't take a fraction of a photograph, we round up to the nearest whole number, which is 6,666,667. However, this is not one of the options provided, so we need to convert the answer to a more manageable form. We can do this by dividing the number of photographs by 100,000 to get an approximate number of photographs per kilometer.

Average Metric: 6.0 / 14  (42.9):   2%|▏         | 13/805 [00:00<00:08, 89.31it/s][2m2024-10-31T20:19:11.174076Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 6.0 / 15  (40.0):   2%|▏         | 14/805 [00:00<00:08, 89.31it/s][2m2024-10-31T20:19:11.177514Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 7.0 / 23  (30.4):   3%|▎         | 22/805 [00:00<00:06, 112.06it/s]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The shear at point D can be determined by considering the influence line for the shear at D. The influence line for the shear at D is a straight line, and the shear at D is equal to the area under this line. The area under the influence line for the shear at D is equal to the area under the influence line for the reaction at C, which is a triangle with base 175 feet and height 100 feet. The area of this triangle is (1/2) * 175 * 100 = 8750 square feet. The shear at D is equal to the area under the influence line for the reaction at C, which is 8750 square feet. The shear at D is equal to the area under the influence line for the reaction at C, which is 8750 square feet. The shear at D is equal to the area under the influence line for the reaction at C, which is 8750 square feet. The shear at D is equal to the area under the influence line for the reaction at C, which is 8750 square f

Average Metric: 7.0 / 24  (29.2):   3%|▎         | 23/805 [00:00<00:06, 112.06it/s][2m2024-10-31T20:19:11.253747Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 17.0 / 66  (25.8):   8%|▊         | 65/805 [00:00<00:04, 152.18it/s][2m2024-10-31T20:19:11.673238Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 22.0 / 72  (30.6):   9%|▉         | 72/805 [00:00<00:10, 71.76it/s] 

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The Roman portrait mummy is named "Semion," which is a common name in ancient Rome. The woman in the portrait is likely named after her husband, Semion, who was a common name in ancient Rome. The name "Semion" is also a common name in modern times, and it is often used as a middle name or a nickname. The name "Semion" is also a common name in the Middle East, and it is often used as a first name. The name "Semion" is also a common name in the United States, and it is often used as a first name or a middle name. The name "Semion" is also a common name in the United Kingdom, and it is often used as a first name or a middle name. The name "Semion" is also a common name in Canada, and it is often used as a first name or a middle name. The name "Semion" is also a common name in Australia, and it is often used as a first name or a middle name. The name "Semion" is also a common name in New

Average Metric: 30.0 / 84  (35.7):  10%|█         | 84/805 [00:01<00:17, 41.19it/s][2m2024-10-31T20:19:12.493422Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 61.0 / 137  (44.5):  17%|█▋        | 136/805 [00:03<00:14, 46.73it/s][2m2024-10-31T20:19:14.474583Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 63.0 / 139  (45.3):  17%|█▋        | 139/805 [00:03<00:35, 18.52it/s][2m2024-10-31T20:19:14.690073Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supp

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The voltage across the resistor $v_R(t)$ in a series RL circuit can be found using the formula $v_R(t) = v(t) - i(t)R$, where $v(t)$ is the voltage source, $i(t)$ is the current through the circuit, and $R$ is the resistance. The current through the circuit can be found using the formula $i(t) = L \frac{dv(t)}{dt} + \frac{v(t)}{R}$, where $L$ is the inductance. Given that the voltage source is a unit ramp function, $v(t) = t$, and the initial condition is zero, we can solve for $v_R(t)$.

First, we find the current $i(t)$:
\[i(t) = L \frac{dv(t)}{dt} + \frac{v(t)}{R} = 0.1 \frac{d}{dt}(t) + \frac{t}{1} = 0.1 + t\]

Then, we substitute $i(t)$ into the formula for $v_R(t)$:
\[v_R(t) = v(t) - i(t)R = t - (0.1 + t) = -0.1\]

However, this result is incorrect because we made a mistake in our calculation. The correct formula for the current in an RL circuit with a unit ramp voltage source 

Average Metric: 108.0 / 235  (46.0):  29%|██▉       | 234/805 [00:05<00:09, 59.86it/s][2m2024-10-31T20:19:16.664409Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 108.0 / 236  (45.8):  29%|██▉       | 235/805 [00:05<00:09, 59.86it/s][2m2024-10-31T20:19:16.696010Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 108.0 / 237  (45.6):  29%|██▉       | 236/805 [00:05<00:09, 59.86it/s][2m2024-10-31T20:19:16.716079Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To calculate the net working capital (NWC), we need to find the difference between total liquid assets and total current liabilities. Total liquid assets include cash, checking, savings, and money market funds. Total current liabilities include medical bills payable and utility bills payable.

Total liquid assets = $300 (Cash) + $3,000 (Checking) + $1,200 (Savings) + $1,200 (Money market funds) = $5,700

Total current liabilities = $250 (Medical bills payable) + $150 (Utility bills payable) = $400

NWC = Total liquid assets - Total current liabilities = $5,700 - $400 = $5,300

However, the options provided do not match the calculated NWC. It seems there might be a mistake in the provided options or the calculation. Let's recheck the calculation.

Upon rechecking, it appears there was a mistake in the calculation of total liquid assets. The correct total liquid assets should be:

Tota

Average Metric: 120.0 / 261  (46.0):  32%|███▏      | 260/805 [00:06<00:08, 61.38it/s][2m2024-10-31T20:19:17.104374Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 122.0 / 269  (45.4):  33%|███▎      | 268/805 [00:06<00:10, 49.78it/s][2m2024-10-31T20:19:17.389409Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 126.0 / 274  (46.0):  34%|███▍      | 273/805 [00:06<00:10, 49.78it/s][2m2024-10-31T20:19:17.470535Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
The shear modulus (G) of the polymer can be determined using the formula for shear stress (τ) and shear strain (γ), where τ = Gγ. The shear stress is given by the force (P) divided by the area (A) over which the force is applied. The area is the width (b) times the height (h) of the block. The shear strain is the displacement (Δx) divided by the height (h) of the block. Therefore, G = PΔx / (bh^2).

Given:
- P = 2 kN = 2000 N
- Δx = 2 mm = 0.002 m
- b = 100 mm = 0.1 m
- h = 200 mm = 0.2 m

Substituting the values into the formula, we get:
G = (2000 N * 0.002 m) / (0.1 m * 0.2 m^2) = 2000 N * 0.002 m / (0.1 m * 0.2 m^2) = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2 = 2000 N * 0.002 m / 0.02 m^2

[2m2024-10-31T20:19:17.924432Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 138.0 / 298  (46.3):  37%|███▋      | 297/805 [00:07<00:14, 35.65it/s][2m2024-10-31T20:19:17.940313Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 138.0 / 298  (46.3):  37%|███▋      | 298/805 [00:07<00:12, 41.36it/s][2m2024-10-31T20:19:17.977448Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To calculate the adjusted OR, we need to use the Mantel-Haenszel method, which takes into account the confounding factor of place of residence. The formula for the Mantel-Haenszel OR is:

\[ OR_{MH} = \frac{\sum_{i=1}^{k} \frac{a_i}{b_i} \cdot \frac{c_i}{d_i}}{\sum_{i=1}^{k} \frac{a_i + c_i}{b_i + d_i}} \]

where:
- \( a_i \) is the number of cases in the ith stratum (rural or urban) for the exposed group,
- \( b_i \) is the number of controls in the ith stratum for the exposed group,
- \( c_i \) is the number of cases in the ith stratum for the unexposed group,
- \( d_i \) is the number of controls in the ith stratum for the unexposed group.

For drinking surface water:
- Rural: \( a_1 = 20 \), \( b_1 = 50 \), \( c_1 = 30 \), \( d_1 = 150 \)
- Urban: \( a_2 = 150 \), \( b_2 = 30 \), \( c_2 = 50 \), \( d_2 = 20 \)

Plugging these values into the formula, we get:

\[ OR_{MH} = \frac{\

Average Metric: 180.0 / 378  (47.6):  47%|████▋     | 377/805 [00:08<00:03, 116.90it/s][2m2024-10-31T20:19:19.128546Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 252.0 / 511  (49.3):  63%|██████▎   | 510/805 [00:08<00:00, 296.75it/s][2m2024-10-31T20:19:19.254773Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 284.0 / 613  (46.3):  76%|███████▌  | 612/805 [00:08<00:00, 549.19it/s][2m2024-10-31T20:19:19.349359Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not 

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine the bearing stress acting on collar A, we first need to calculate the area over which the axial force is applied. The bearing stress (\( \sigma_b \)) can be calculated using the formula:

\[
\sigma_b = \frac{F}{A}
\]

where:
- \( F \) is the axial force (5 kN = 5000 N),
- \( A \) is the bearing area.

The collar A has a diameter of 60 mm, so the radius \( r \) is:

\[
r = \frac{60 \, \text{mm}}{2} = 30 \, \text{mm} = 0.03 \, \text{m}
\]

The area \( A \) of the collar can be calculated using the formula for the area of a circle:

\[
A = \pi r^2 = \pi (0.03)^2 \approx 0.002827 \, \text{m}^2
\]

Now, substituting the values into the bearing stress formula:

\[
\sigma_b = \frac{5000 \, \text{N}}{0.002827 \, \text{m}^2} \approx 1765.4 \, \text{kPa} = 1.765 \, \text{MPa}
\]

However, we need to consider the effective area that the axial force is acting on, which is the area o

Average Metric: 350.0 / 616  (56.8):  76%|███████▋  | 615/805 [00:14<00:02, 73.43it/s][2m2024-10-31T20:20:14.338541Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 357.0 / 634  (56.3):  79%|███████▊  | 633/805 [00:15<00:01, 97.05it/s][2m2024-10-31T20:20:14.431305Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 358.0 / 637  (56.2):  79%|███████▉  | 636/805 [00:15<00:01, 97.05it/s][2m2024-10-31T20:20:14.483489Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet

MMMU Validation Set (single image only, multiple choice only), N=805
Temp 0, max_tokens=5k
qwen-7b
Reported: 54.1
Measured (cot, predict): 45.1, 46.21
Num bad format (cot, predict): 99, 35

gpt-4o-mini
Reported: 59.4
Measured (cot, predict): 56.6, 54.66
Num bad format (cot, predict): 50, 5


In [None]:
# Evaluate the module-level `mmmu` program with the module-level evaluator and
# keep the per-example outputs for the analysis cells below.
# NOTE(review): relies on `evaluate_mmmu` and `mmmu` from an earlier cell and
# on whichever LM is configured globally at the time — confirm on a fresh kernel.
scores, outputs = evaluate_mmmu(mmmu)
# lm.inspect_history()


In [9]:
from collections import Counter
# Answer of every evaluated example; predictions without one are bucketed
# under "nothing returned".
returned_answers = [output[1].get("answer", "nothing returned") for output in outputs]
c = Counter(returned_answers)
# NOTE(review): "E" is counted as a non-letter here although MMMUSignature
# allows it as an answer — confirm that is intended.
non_letters = sum(1 for answer in returned_answers if answer not in ["A", "B", "C", "D"])
print(c)
print(non_letters)




Counter({'A': 214, 'B': 213, 'C': 212, 'D': 130, 'nothing returned': 16, 'E': 7, '$750': 1, 'F': 1, 'P, Q': 1, 'None of the provided answer choices can be selected as the correct answer because the necessary data to perform the calculation is not available in the image.': 1, "None of the options can be the sequence of edges added to the minimum spanning tree using Kruskal's algorithm because they all form cycles with the edges already in the tree.": 1, 'None of the given answer choices match the calculated height of 10.5 cm. Therefore, the correct answer is not provided in the given answer choices.': 1, 'None of the answer choices are correct.': 1, 'None of the answer choices provided directly relate to the size of the countries on the map. Therefore, the correct answer is not listed.': 1, 'None of the provided answer choices match the calculated cumulative relative frequency of 0.6.': 1, '59': 1, '1': 1, 'None of the provided answer choices match the calculated value.': 1, 'None of th

In [10]:
# Each output is indexed as [0] example, [1] prediction, [2] score.
multiple_choice_rows = [row for row in outputs if row[0]["question_type"] == "multiple-choice"]
mc_correct = sum(row[2] for row in multiple_choice_rows)
total_mc = len(multiple_choice_rows)
print(mc_correct, total_mc)
print(mc_correct / total_mc)
# Number of predictions that came back without an answer.
print(sum(row[1].get("answer", None) is None for row in outputs))

# Note: Run above here

395.0 805
0.4906832298136646
16


# Make sure that multiple images work

## No examples

In [None]:
import PIL.Image  # plain `import PIL` does not load the Image submodule
def set_image_to_black_square(example, key):
    """Return a copy of `example` whose `key` field is the black placeholder image.

    Args:
        example: An example object supporting `.copy()`, item assignment,
            `.inputs()` and `.with_inputs(...)`.
        key: Name of the image field to overwrite (e.g. "image_1").

    Returns:
        The modified copy, marked with the same input keys as `example`.
        The original `example` is not modified by this function.
    """
    example_copy = example.copy()
    black_square = PIL.Image.open("black_image_300x300.png")
    # Image.open is lazy and keeps the file open until the pixel data is read;
    # loading right away avoids holding one open file handle per example when
    # this is mapped over a whole dataset.
    black_square.load()
    example_copy[key] = black_square
    return example_copy.with_inputs(*example.inputs().keys())

# Show what the two image fields look like before any replacement.
print(updated_devset[0]["image_1"])
print(updated_devset[0]["image_2"])

# Variant 1: only image_1 replaced by the black square.
examples_no_image_1 = [
    set_image_to_black_square(example, "image_1") for example in updated_valset
]
print(examples_no_image_1[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_image_1[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))

# Variant 2: only image_2 replaced by the black square.
examples_no_image_2 = [
    set_image_to_black_square(example, "image_2") for example in updated_valset
]
print(examples_no_image_2[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_image_2[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))

# Variant 3: both image fields replaced, applied one field after the other.
examples_no_actual_image = [
    set_image_to_black_square(example, "image_1") for example in updated_valset
]
examples_no_actual_image = [
    set_image_to_black_square(example, "image_2") for example in examples_no_actual_image
]
print(examples_no_actual_image[0]["image_1"] == PIL.Image.open("black_image_300x300.png"))
print(examples_no_actual_image[0]["image_2"] == PIL.Image.open("black_image_300x300.png"))


In [None]:
mmmu = MMMUModule()
# Smoke test: print the inputs and the module's prediction for the first
# example of each single-image ablation set.
for _ablated_examples in (examples_no_image_1, examples_no_image_2):
    print(_ablated_examples[0].inputs())
    print(mmmu(**_ablated_examples[0].inputs()))


In [None]:
# Evaluate the same module on the untouched validation set and on each
# black-square variant, in that order.
normal = evaluate_mmmu(mmmu, devset=updated_valset)
no_image_1 = evaluate_mmmu(mmmu, devset=examples_no_image_1)
no_image_2 = evaluate_mmmu(mmmu, devset=examples_no_image_2)
no_actual_image = evaluate_mmmu(mmmu, devset=examples_no_actual_image)

# Report: one header line, then one line per condition.
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
for _label, _result in (
    ("Score with both images:", normal),
    ("Score with image_1 set to black square:", no_image_1),
    ("Score with image_2 set to black square:", no_image_2),
    ("Score with both images set to black squares:", no_actual_image),
):
    print(_label, _result)

## TODO: Test with bootstrapped examples


# Make sure that JPGs work

## Convert images to JPGs

In [None]:
import io
from PIL import Image

def convert_to_jpg(example):
    """Return a copy of `example` with its PIL image fields re-encoded as JPEG.

    Only the 'image_1' and 'image_2' fields are considered, and only when the
    field is present and holds a PIL image. Each such image is converted to
    RGB, written as JPEG into an in-memory buffer and reopened from it. The
    returned copy is marked with the same input keys as `example`.
    """
    converted = example.copy()
    for field in ('image_1', 'image_2'):
        if field not in converted:
            continue
        if not isinstance(converted[field], Image.Image):
            continue
        # JPEG has no alpha/palette support, hence the RGB conversion first.
        jpeg_bytes = io.BytesIO()
        example[field].convert('RGB').save(jpeg_bytes, format='JPEG')
        jpeg_bytes.seek(0)
        # Reopen from the buffer so the stored object is a JPEG-backed image.
        converted[field] = Image.open(jpeg_bytes)
    return converted.with_inputs(*example.inputs().keys())

# Convert all images in the dataset to JPG
examples_jpg = [convert_to_jpg(example) for example in updated_valset]

# Verify conversion by comparing the format of the first image before and after
for _caption, _examples in (
    ("Original image format:", updated_valset),
    ("Converted image format:", examples_jpg),
):
    print(_caption, _examples[0]['image_1'].format)


In [None]:
# JPEG-encoded counterparts of the validation set and of each black-square variant.
examples_jpg = [convert_to_jpg(example) for example in updated_valset]
examples_no_image_1_jpg = [convert_to_jpg(example) for example in examples_no_image_1]
examples_no_image_2_jpg = [convert_to_jpg(example) for example in examples_no_image_2]
examples_no_actual_image_jpg = [
    convert_to_jpg(example) for example in examples_no_actual_image
]

# Smoke test on the first converted example, then show its image format.
mmmu = MMMUModule()
print(examples_no_image_1_jpg[0].inputs())
print(mmmu(**examples_no_image_1_jpg[0].inputs()))
print(examples_no_image_1_jpg[0]["image_1"].format)

In [None]:
# Same four-condition comparison as before, this time on the JPEG-encoded sets.
normal = evaluate_mmmu(mmmu, devset=examples_jpg)
no_image_1 = evaluate_mmmu(mmmu, devset=examples_no_image_1_jpg)
no_image_2 = evaluate_mmmu(mmmu, devset=examples_no_image_2_jpg)
no_actual_image = evaluate_mmmu(mmmu, devset=examples_no_actual_image_jpg)

# Report: one header line, then one line per condition.
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
for _label, _result in (
    ("Score with both images:", normal),
    ("Score with image_1 set to black square:", no_image_1),
    ("Score with image_2 set to black square:", no_image_2),
    ("Score with both images set to black squares:", no_actual_image),
):
    print(_label, _result)

In [None]:
# Show the LM call history for manual inspection of the requests sent above.
# NOTE(review): `lm` is not defined in the visible part of this notebook (the
# setup cell creates qwen_lm, haiku_lm, llama_lm, internlm_lm and gpt_lm) —
# confirm it is bound in an earlier cell, otherwise this raises NameError.
lm.inspect_history()

# Testing that URLs work

In [None]:

# Color name -> six-digit RGB hex code (no leading '#'). Insertion order is
# kept because callers sample from the key list.
colors = dict([
    ("White", "FFFFFF"),
    ("Red", "FF0000"),
    ("Green", "00FF00"),
    ("Blue", "0000FF"),
    ("Yellow", "FFFF00"),
    ("Cyan", "00FFFF"),
    ("Magenta", "FF00FF"),
    ("Gray", "808080"),
    ("Orange", "FFA500"),
    ("Purple", "800080"),
])


def get_color_image_url(color, file_extension="png"):
    """Return the placehold.co URL of a 300px image filled with `color`.

    The same hex code is used for both color slots of the URL. `color` must
    be a key of `colors` (a KeyError is raised otherwise); `file_extension`
    is appended after a dot.
    """
    hex_code = colors[color]
    return f"https://placehold.co/300/{hex_code}/{hex_code}.{file_extension}"


In [None]:
import random

def generate_random_2_color_image_examples(n, seed=None):
    """Generate `n` two-image color-identification examples.

    Each example holds the URLs of two differently colored images, a question
    asking for the color of one of them ("image_1" or "image_2", chosen with
    equal probability) and that image's color name as the answer.

    Args:
        n: Number of examples to generate.
        seed: Optional seed. When given, a private `random.Random(seed)` is
            used so the dataset is reproducible across runs; when None, the
            module-level `random` state is used, as before.

    Returns:
        A list of `dspy.Example` objects with inputs "image_1", "image_2"
        and "question".
    """
    rng = random if seed is None else random.Random(seed)
    color_names = list(colors.keys())  # loop-invariant, built once
    examples = []
    for _ in range(n):
        # sample() draws without replacement, so the two colors always differ
        color_1, color_2 = rng.sample(color_names, 2)
        if rng.random() < 0.5:
            chosen_image, chosen_color = "image_1", color_1
        else:
            chosen_image, chosen_color = "image_2", color_2
        example_kwargs = {
            "image_1": get_color_image_url(color_1),
            "image_2": get_color_image_url(color_2),
            "question": f"What color is {chosen_image}?",
            "answer": chosen_color
        }
        examples.append(dspy.Example(**example_kwargs).with_inputs("image_1", "image_2", "question"))
    return examples

# Build 100 random two-color examples and print the first one as a sanity check.
examples = generate_random_2_color_image_examples(100)
print(examples[0])


In [None]:
# DSPy signature for the two-image color task: two image inputs plus a question,
# one string answer. The docstring and `desc` texts are left untouched.
# NOTE(review): presumably DSPy forwards them to the model as prompt text — confirm.
class ColorSignature(dspy.Signature):
    """Output the color of the designated image."""
    # Both image slots are typed dspy.Image; the generated examples supply URL strings for them.
    image_1: dspy.Image = dspy.InputField(desc="An image")
    image_2: dspy.Image = dspy.InputField(desc="An image")
    # The question names which of the two images is being asked about.
    question: str = dspy.InputField(desc="A question about the image")
    answer: str = dspy.OutputField(desc="The color of the designated image")
# Plain single-step predictor over the signature (no chain-of-thought wrapper).
color_program = dspy.Predict(ColorSignature)


In [None]:
# Smoke test: show the first example, then the uncompiled program's prediction
# when called with that example's input fields.
print(examples[0])
print(color_program(**examples[0].inputs()))

In [None]:
# Two BootstrapFewShot configurations that differ only in how many demos they may use.
few_shot_optimizer = dspy.BootstrapFewShot(
    metric=answer_exact_match,
    max_bootstrapped_demos=3,
    max_labeled_demos=10,
)
smaller_few_shot_optimizer = dspy.BootstrapFewShot(
    metric=answer_exact_match,
    max_bootstrapped_demos=1,
    max_labeled_demos=1,
)

# 1000 examples are generated; the first 200 are used for training and the
# next 200 for validation (the remainder is unused in this cell).
dataset = generate_random_2_color_image_examples(1000)
trainset = dataset[:200]
validationset = dataset[200:400]
evaluate_colors = Evaluate(
    metric=answer_exact_match,
    num_threads=300,
    devset=validationset,
)

In [None]:
# Compile the same base program with each optimizer configuration.
compiled_color_program = few_shot_optimizer.compile(color_program, trainset=trainset)
compiled_smaller_color_program = smaller_few_shot_optimizer.compile(color_program, trainset=trainset)

# Score the baseline first, then the two compiled variants, on the validation split.
for _program in (color_program, compiled_color_program, compiled_smaller_color_program):
    print(evaluate_colors(_program))

In [None]:
# Run the compiled program on the first validation example, then show the LM
# call history to inspect the request that was sent.
# NOTE(review): `lm` is not defined in the visible part of this notebook (the
# setup cell creates qwen_lm, haiku_lm, llama_lm, internlm_lm and gpt_lm) —
# confirm it is bound in an earlier cell, otherwise this raises NameError.
print(compiled_color_program(**validationset[0].inputs()))
lm.inspect_history()

# TODO(Isaac): Delete; Archive of old experiments

In [None]:
# Load the "full" split of the Alanox/stanford-dogs Hugging Face dataset with
# "image" as the only input key.
# NOTE: this rebinds `dataset`, which an earlier cell used for the generated
# color examples; re-running that cell's slices after this one would pick up the dog data.
dataset = DataLoader().from_huggingface("Alanox/stanford-dogs", split="full", input_keys=("image",), trust_remote_code=True)

In [None]:
# rename the field from "image" to "image_1"
def rename_field(example, old_name, new_name):
    """Rename the `old_name` field of `example` to `new_name`, in place.

    Best-effort: when `old_name` is absent the example is returned unchanged.
    Renaming a field to itself is a no-op (copy-then-delete would otherwise
    drop the field entirely).

    Args:
        example: Mutable mapping-like object supporting item get/set/delete.
        old_name: Existing field name.
        new_name: Field name to store the value under.

    Returns:
        The same `example` object, for use with `map`.
    """
    if old_name == new_name:
        return example
    try:
        value = example[old_name]
    except KeyError:
        # Missing source field: keep the original best-effort behavior, but
        # only for this expected case instead of hiding every exception.
        return example
    example[new_name] = value
    del example[old_name]
    return example
    
# rename_field mutates each example in place, so the intermediate lists below
# hold the same example objects at successive stages.
dog_dataset = [rename_field(example, "image", "image_1") for example in dataset]
dog_dataset2 = [rename_field(example, "target", "answer") for example in dog_dataset]
# Mark "image_1" as the only input field.
dog_dataset3 = [example.with_inputs("image_1") for example in dog_dataset2]
dog_dataset = dog_dataset3
random.shuffle(dog_dataset)

In [None]:
# DSPy signature for the dog-breed task: one image input, one string answer.
# The docstring and `desc` texts are left untouched.
# NOTE(review): presumably DSPy forwards them to the model as prompt text — confirm.
class DogPictureSignature(dspy.Signature):
    """Output the dog breed of the dog in the image."""
    # Field names match the renamed dataset columns ("image_1" and "answer").
    image_1: dspy.Image = dspy.InputField(desc="An image of a dog")
    answer: str = dspy.OutputField(desc="The dog breed of the dog in the image")

class DogPicture(dspy.Module):
    """Chain-of-thought dog-breed classifier built on DogPictureSignature.

    Called as `DogPicture()(image_1=...)`; keyword arguments are passed
    straight to the underlying predictor and its prediction is returned.
    """

    def __init__(self) -> None:
        # Initialize dspy.Module state before registering sub-predictors;
        # skipping this leaves the base-class attributes unset.
        super().__init__()
        self.predictor = dspy.ChainOfThought(DogPictureSignature)

    def forward(self, **kwargs):
        """Run the predictor on the given signature inputs."""
        # Implemented as forward() rather than __call__ so that calls go
        # through dspy.Module.__call__, which then dispatches here.
        return self.predictor(**kwargs)

# Instantiate the module and print its prediction for the first (shuffled)
# dog example, called with that example's input fields.
dog_picture = DogPicture()
print(dog_picture(**dog_dataset[0].inputs()))

In [None]:
# Exact-match evaluator over the last 500 shuffled dog examples; the error
# budget is set very high so individual failures do not abort the run.
evaluate = Evaluate(
    metric=answer_exact_match,
    num_threads=100,
    devset=dog_dataset[-500:],
    display_progress=True,
    max_errors=10000,
)


In [None]:
# TODO: Test inline signature
# TODO: Test json adapter