In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dspy
from dspy.datasets import DataLoader
from dspy.evaluate.metrics import answer_exact_match
from typing import List
from dspy.evaluate import Evaluate

import dotenv
import litellm

litellm.suppress_debug_info = True

dotenv.load_dotenv()

def debug_exact_match(example, pred, trace=None, frac=1.0):
    """Exact-match metric that echoes what it is scoring.

    Prints the example's inputs, its gold ``answer`` and the prediction, then
    delegates to ``answer_exact_match`` with the same arguments.

    Args:
        example: Example exposing ``inputs()`` and ``answer``.
        pred: Prediction handed to the metric.
        trace: Forwarded unchanged to ``answer_exact_match``.
        frac: Forwarded unchanged to ``answer_exact_match``.

    Returns:
        Whatever ``answer_exact_match`` returns for these arguments.
    """
    shown_inputs = example.inputs()
    print(shown_inputs)
    gold_answer = example.answer
    print(gold_answer)
    print(pred)
    # trace and frac are intentionally not printed; they are only passed through.
    result = answer_exact_match(example, pred, trace, frac)
    return result

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Language-model handles used throughout the notebook.
# qwen_lm, llama_lm and internlm_lm all point at the same api_base
# (http://localhost:8000/v1); the `vllm serve ...` comments record the command
# used to host the corresponding model. The api_key on those handles is a
# placeholder string, not a real credential.
# NOTE(review): presumably only one of the local models is served at a time on
# that port — confirm before evaluating several of them in one session.
# vllm serve Qwen/Qwen2-VL-7B-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --pipeline-parallel-size 2
qwen_lm = dspy.LM(model="openai/Qwen/Qwen2-VL-7B-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
haiku_lm = dspy.LM(model="anthropic/claude-3-haiku-20240307", max_tokens=4096)
# vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --enforce-eager --max-num-seqs 48
llama_lm = dspy.LM(model="openai/meta-llama/Llama-3.2-11B-Vision-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
internlm_lm = dspy.LM(model="openai/OpenGVLab/InternVL2-8B", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
gpt_lm = dspy.LM(model="openai/gpt-4o-mini", max_tokens=5000)
# NOTE(review): internlm_lm is not part of all_lms — confirm whether that is intended.
all_lms = [qwen_lm, haiku_lm, llama_lm, gpt_lm]

# Default LM for every call that is not wrapped in dspy.context(lm=...).
dspy.settings.configure(lm=qwen_lm)

In [4]:
%%capture
from concurrent.futures import ThreadPoolExecutor

# Fields exposed to the program as inputs: the first two image slots plus the text.
input_keys = tuple(f"image_{i}" for i in (1, 2)) + ("question", "options")
subsets = ['Accounting', 'Agriculture', 'Architecture_and_Engineering', 'Art', 'Art_Theory', 'Basic_Medical_Science', 'Biology', 'Chemistry', 'Clinical_Medicine', 'Computer_Science', 'Design', 'Diagnostics_and_Laboratory_Medicine', 'Economics', 'Electronics', 'Energy_and_Power', 'Finance', 'Geography', 'History', 'Literature', 'Manage', 'Marketing', 'Materials', 'Math', 'Mechanical_Engineering', 'Music', 'Pharmacy', 'Physics', 'Psychology', 'Public_Health', 'Sociology']


def load_dataset(subset_index_subset):
    """Load the dev and validation splits of one MMMU subject.

    Args:
        subset_index_subset: ``(index, subset_name)`` pair as produced by
            ``enumerate(subsets)``.

    Returns:
        ``(index, dev_examples, validation_examples)``; the index is carried
        along so results can be put back into subject order.
    """
    position, subject = subset_index_subset
    splits = DataLoader().from_huggingface("MMMU/MMMU", subject, split=["dev", "validation"], input_keys=input_keys)
    return position, splits["dev"], splits["validation"]


devset = []
valset = []

# One worker per subject so all subsets are fetched concurrently.
with ThreadPoolExecutor(max_workers=len(subsets)) as executor:
    results = list(executor.map(load_dataset, enumerate(subsets)))

# Keep the combined sets in the order of `subsets`.
results.sort(key=lambda loaded: loaded[0])

for _, dev, val in results:
    devset.extend(dev)
    valset.extend(val)

# NOTE: the cell starts with %%capture, so these two lines are captured rather than shown.
print(len(devset))
print(len(valset))

In [5]:
import ast

def count_images(dataset, max_images=5):
    """Tally how many examples carry 0, 1, 2, ... non-empty image inputs.

    An input counts as an image when its key starts with ``image_`` and its
    value is not ``None``.

    Args:
        dataset: Iterable of examples whose ``inputs()`` result supports
            ``keys()`` and item access.
        max_images: Largest image count that is always reported, even when no
            example has that many images. Defaults to 5 (buckets 0..5).

    Returns:
        dict mapping image count -> number of examples. Keys ``0..max_images``
        are always present; an example with more images adds its own bucket
        instead of raising ``KeyError``.
    """
    image_counts = {i: 0 for i in range(max_images + 1)}
    for example in dataset:
        # Build the inputs view once per example rather than once per key.
        inputs = example.inputs()
        count = sum(
            1
            for key in inputs.keys()
            if key.startswith('image_') and inputs[key] is not None
        )
        image_counts[count] = image_counts.get(count, 0) + 1
    return image_counts

def count_multiple_choice_questions(dataset):
    """Return how many examples have ``question_type`` equal to "multiple-choice"."""
    total = 0
    for example in dataset:
        if example["question_type"] == "multiple-choice":
            total += 1
    return total
max_images = 5

# Number of non-empty image inputs an example must have to be kept.
num_images = 1


def _is_selected(example):
    """True for multiple-choice examples with exactly ``num_images`` non-empty image inputs."""
    image_total = 0
    for key in example.inputs().keys():
        if key.startswith('image_') and example.inputs()[key] is not None:
            image_total += 1
    return image_total == num_images and example["question_type"] == "multiple-choice"


devset_filtered = list(filter(_is_selected, devset))
valset_filtered = list(filter(_is_selected, valset))

devset_image_counts = count_images(devset_filtered)
valset_image_counts = count_images(valset_filtered)

devset_multiple_choice_questions = count_multiple_choice_questions(devset_filtered)
valset_multiple_choice_questions = count_multiple_choice_questions(valset_filtered)

# Per-bucket image statistics, dev split first, then validation.
for header, counts in (
    ("Image counts in devset:", devset_image_counts),
    ("\nImage counts in valset:", valset_image_counts),
):
    print(header)
    for count, num_examples in counts.items():
        print(f"{count} image(s): {num_examples} examples")

# Sanity check: after filtering, every remaining example should be multiple-choice.
for header, mc_total, kept in (
    ("\nMultiple choice questions in devset:", devset_multiple_choice_questions, devset_filtered),
    ("\nMultiple choice questions in valset:", valset_multiple_choice_questions, valset_filtered),
):
    print(header)
    print(mc_total, "out of", len(kept))

def convert_multiple_choice_to_letter(dataset):
    """Attach an ``answer_choices`` input to every example.

    The ``options`` field holds the string form of a Python list and is parsed
    with ``ast.literal_eval``. ``answer_choices`` is then set to:

    * multiple-choice questions: the string form of the options prefixed with
      ``"A. "``, ``"B. "``, ... in order;
    * other questions with options: the string form of the parsed options;
    * other questions without options: the literal ``"Free response"``.
      (Previously unreachable: the stringified list was compared against ``[]``.)

    Note: ``answer_choices`` is written onto the examples passed in, so the
    input dataset is modified as well.

    Args:
        dataset: Iterable of examples supporting item get/set, ``inputs()`` and
            ``with_inputs()``.

    Returns:
        list of examples (as returned by ``with_inputs``) whose input keys are
        the previous input keys plus ``"answer_choices"``.
    """
    new_dataset = []
    for example in dataset:
        options = ast.literal_eval(example["options"])
        if example["question_type"] == "multiple-choice":
            # chr(65) == "A": label the options A, B, C, ...
            lettered = [chr(65 + i) + ". " + option for i, option in enumerate(options)]
            example["answer_choices"] = str(lettered)
        elif options:
            example["answer_choices"] = str(options)
        else:
            example["answer_choices"] = "Free response"

        updated_example = example.with_inputs(*example.inputs().keys(), "answer_choices")
        new_dataset.append(updated_example)
    return new_dataset

# Show one example before conversion. This print must stay ahead of the call
# below: convert_multiple_choice_to_letter writes `answer_choices` onto the
# examples it receives, so printing afterwards would show the converted form.
print(devset_filtered[0])
updated_devset = convert_multiple_choice_to_letter(devset_filtered)
# Same example after conversion, now carrying the lettered `answer_choices` input.
print(updated_devset[0])
updated_valset = convert_multiple_choice_to_letter(valset_filtered)


Image counts in devset:
0 image(s): 0 examples
1 image(s): 137 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Image counts in valset:
0 image(s): 0 examples
1 image(s): 805 examples
2 image(s): 0 examples
3 image(s): 0 examples
4 image(s): 0 examples
5 image(s): 0 examples

Multiple choice questions in devset:
137 out of 137

Multiple choice questions in valset:
805 out of 805
Example({'id': 'dev_Accounting_1', 'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'explanation': '', 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x7019597751E0>, 'image_2': None, 'image_3': None, 'image_4': None, 'image_5': None, 'image_6': None, 'image_7': None, 'img_type': "['Tables']", 'answer': 'D', 'topic_difficulty': 'Easy', 'question_type': 'multiple-choice', 'subfield': 'Financi

In [6]:
import re
# Signature for one MMMU question with a single image.
# Inputs: question, image_1, answer_choices. Output: answer.
# The class docstring and the `desc` strings are text handed to dspy, so they
# are left exactly as written.
class MMMUSignature(dspy.Signature):
    """Output a rationale and the answer to a multiple choice question about an image with the letter of the correct answer, if present, otherwise the exact answer."""

    question: str = dspy.InputField(desc="A question about the image(s)")
    # Only image_1 is declared; no image_2 field exists on this signature.
    image_1: dspy.Image = dspy.InputField(desc="An image relating to the shown problem")
    # NOTE(review): annotated as List[str], but convert_multiple_choice_to_letter
    # stores answer_choices as the *string* form of a list — confirm which is intended.
    answer_choices: List[str] = dspy.InputField(desc="The answer options for the question")
    answer: str = dspy.OutputField(desc="The single letter of the correct answer. Do not include the entire answer or a period at the end.")

class MMMUModule(dspy.Module):
    """Chain-of-thought MMMU answerer that reduces multiple-choice answers to one letter."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.ChainOfThought(MMMUSignature)

    def __call__(self, **kwargs):
        """Run the predictor and clean up its ``answer`` field.

        Args:
            **kwargs: Inputs of ``MMMUSignature``; ``answer_choices`` is required.

        Returns:
            The prediction from ``self.predictor``. When ``answer_choices``
            contains ``"A."`` (lettered options), ``answer`` is replaced by the
            first standalone capital letter A-E found in it; if there is none,
            the answer is left untouched. Other questions are returned as-is.
        """
        # NOTE(review): dspy modules conventionally implement `forward`; this
        # class overrides `__call__` directly — confirm that is intended.
        prediction = self.predictor(**kwargs)
        # Multiple choice case
        if "A." in kwargs["answer_choices"]:
            # Word boundaries keep capitals inside words from being mistaken for
            # an option letter (e.g. the "A" of "Answer: B" or the "B" of "Both").
            match = re.search(r'\b[A-E]\b', prediction["answer"])
            if match:
                prediction["answer"] = match.group(0)
        # Free response case: nothing to normalise.
        return prediction


In [18]:


# Smoke test: run the module on one dev example and compare with the gold answer.
sample_input = updated_devset[0]
# print(sample_input.inputs())
# print(encode_image(sample_input.inputs()["image_1"]))
mmmu = MMMUModule()
print(sample_input.inputs())
print(mmmu(**sample_input.inputs()))
print(sample_input.answer)

# Evaluator over the converted validation set. return_outputs=True is set, and
# callers in this notebook unpack the result as `scores, outputs`.
evaluate_mmmu = Evaluate(metric=answer_exact_match, num_threads=50, devset=updated_valset, display_progress=True, max_errors=500, return_outputs=True)

Example({'question': 'Each of the following situations relates to a different company. <image 1> For company B, find the missing amounts.', 'options': "['$63,020', '$58,410', '$71,320', '$77,490']", 'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1234x289 at 0x7019597751E0>, 'image_2': None, 'answer_choices': "['A. $63,020', 'B. $58,410', 'C. $71,320', 'D. $77,490']"}) (input_keys={'question', 'answer_choices', 'image_2', 'options', 'image_1'})
Prediction(
    reasoning='To find the missing amounts for company B, we need to balance the income statement. We know the revenues, expenses, gains, and losses. We can calculate the missing amounts by subtracting the known amounts from the total.',
    answer='B'
)
D


In [19]:

def test_lm(lm):
    """Evaluate the global ``mmmu`` program with ``lm`` and print a summary.

    Runs ``evaluate_mmmu(mmmu)`` inside ``dspy.context(lm=lm)`` and prints the
    model name, overall score, summed multiple-choice score, number of
    multiple-choice examples, their ratio, and the number of predictions with
    no ``answer``.

    Args:
        lm: Language model handle; its ``model`` attribute is printed.

    Returns:
        ``(scores, num_bad_format)``.
    """
    with dspy.context(lm=lm):
        scores, outputs = evaluate_mmmu(mmmu)

    # Each output row is indexed as (example, prediction, score).
    mc_rows = [row for row in outputs if row[0]["question_type"] == "multiple-choice"]
    mc_correct = sum(row[2] for row in mc_rows)
    total_mc = len(mc_rows)
    # Avoid ZeroDivisionError when the evaluated set has no multiple-choice rows.
    mc_accuracy = mc_correct / total_mc if total_mc else float("nan")
    num_bad_format = sum(1 for row in outputs if row[1].get("answer", None) is None)
    print(lm.model, scores, mc_correct, total_mc, mc_accuracy, num_bad_format)
    return scores, num_bad_format

# Pick the model to benchmark by un-commenting exactly one call; the other
# calls are kept as toggles for the remaining handles.
# test_lm(qwen_lm)
# test_lm(haiku_lm)
# test_lm(llama_lm)
test_lm(internlm_lm)
# test_lm(gpt_lm)


[2m2024-10-31T03:36:19.769653Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
[2m2024-10-31T03:36:19.778993Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


ExpectedExpected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine if you should accept the offer, we need to calculate the present value of the future cash flows. The present value (PV) of an annuity is calculated using the formula:

\[ PV = \frac{C}{(1 + r)^n} + \frac{C}{(1 + r)^{n-1}} + \frac{C}{(1 + r)^{n-2}} + \ldots + \frac{C}{(1 + r)} \]

where \( C \) is the cash flow, \( r \) is the discount rate, and \( n \) is the number of periods.

For the given cash flows:
- Year 1: -$4,500
- Year 2: -$3,100
- Year 3: -$2,400
- Year 4: -$1,800

The discount rate \( r \) is 10% or 0.10. The number of periods \( n \) is 4.

Calculating the present value:

\[ PV = \frac{-4,500}{(1 + 0.10)^1} + \frac{-3,100}{(1 + 0.10)^2} + \frac{-2,400}{(1 + 0.10)^3} + \frac{-1,800}{(1 + 0.10)^4} \]

\[ PV = \frac{-4,500}{1.10} + \frac{-3,100}{1.21} + \frac{-2,400}{1.331} + \frac{-1,800}{1.4641} \]

\[ PV = -4,090.91 - -2,569.09 - -1,797.27 - -1,238.5

Average Metric: 22.0 / 49  (44.9):   6%|▌         | 48/805 [00:01<00:27, 27.33it/s][2m2024-10-31T03:36:21.589982Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 25.0 / 55  (45.5):   7%|▋         | 54/805 [00:01<00:31, 23.50it/s]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys([]) from [[ The image depicts a typical action potential in a neuron. The action potential is characterized by a rapid change in membrane potential from a negative to a positive value, followed by a return to the negative value. The labeled points on the graph show the progression of the action potential. Point E represents the peak of the action potential, which is the highest point on the graph. This peak corresponds to the action potential phase. ]]

[[ C. Action potential ]]

[[ ## completed ## ]]


Average Metric: 36.0 / 77  (46.8):   9%|▉         | 76/805 [00:03<00:47, 15.22it/s][2m2024-10-31T03:36:23.497018Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 37.0 / 79  (46.8):  10%|▉         | 79/805 [00:03<00:59, 12.22it/s][2m2024-10-31T03:36:23.794750Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Expected dict_keys(['reasoning', 'answer']) but got dict_keys([]) from To determine the price of the basket in 2014, we need to calculate the value of the basket using the prices from 2014. The basket consists of 200 apples and 290 textbooks. The base year is 2015, so we will use the prices from 2015 to calculate the value of the basket in 2014.

First, we need to find the price of the basket in 2015:
- Price of Apples in 2015: $1.10
- Price of Textbooks in 2015: $140

The value of the basket in 2015 is:
\[ 200 \times 1.10 + 290 \times 140 = 220 + 40600 = 40,820 \]

Next, we need to find the deflator for 2014. The deflator is the ratio of the price in 2015 to the price in 2014. We can use the prices from 2014 to calculate this:
- Price of Apples in 2014: $1.40
- Price of Textbooks in 2014: $200

The deflator is:
\[ \frac{1.10}{1.40} \times \frac{140}{200} = \frac{1.10}{1.40} \times \frac{7}{10} = \frac{1.10 \times 7}{1.40 \times 10} = \frac{7.7}{14} = 0.55 \]

Now, we can calculate the

[2m2024-10-31T03:36:23.831619Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 40.0 / 83  (48.2):  10%|█         | 82/805 [00:03<01:04, 11.20it/s][2m2024-10-31T03:36:24.082869Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 44.0 / 88  (50.0):  11%|█         | 87/805 [00:04<00:47, 15.28it/s]

Expected dict_keys(['reasoning', 'answer']) but got dict_keys([]) from [[ The bar chart in <image 1> shows the percentage of different species, with each color representing a different level of danger. The yellow color represents the highest danger level, the blue color represents the second highest, and the orange color represents the lowest danger level. The reasoning for this is that the yellow color is at the top of the chart, indicating the highest percentage, followed by blue, and then orange, indicating the lowest percentage. Therefore, the correct answer is 'A. Vulnerable, Endangered, Critical'.]]

[[ A]]

[[ Completed]]


Average Metric: 49.0 / 97  (50.5):  12%|█▏        | 97/805 [00:05<01:01, 11.48it/s][2m2024-10-31T03:36:25.279936Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 51.0 / 99  (51.5):  12%|█▏        | 98/805 [00:05<01:01, 11.48it/s]

ExpectedExpected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To find the length of \(AD\), we can use the properties of the trapezoid and the given information. Since \(O\) is the intersection of the diagonals, and \(P\) is the midpoint of \(BD\), we can use the fact that \(OP\) is perpendicular to \(BD\). This implies that \(OP\) is the height of the right triangle \(OPD\). Given \(OP = 11\) and \(BD = 43\), we can use the Pythagorean theorem in \(\triangle ODP\) to find \(AD\).

First, we find \(OD\):
\[OD = \frac{BD}{2} = \frac{43}{2} = 21.5\]

Next, we use the Pythagorean theorem in \(\triangle ODP\):
\[AD^2 = OP^2 + OD^2\]
\[AD^2 = 11^2 + 21.5^2\]
\[AD^2 = 121 + 462.25\]
\[AD^2 = 583.25\]
\[AD = \sqrt{583.25}\]
\[AD = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{583.25} = \sqrt{58

[2m2024-10-31T03:36:25.374539Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 52.0 / 100  (52.0):  12%|█▏        | 99/805 [00:05<01:43,  6.85it/s][2m2024-10-31T03:36:25.428529Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine which melody fragments are in compound meters, we need to look at the time signatures. A compound meter has a time signature that includes a number other than 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168

Average Metric: 52.0 / 101  (51.5):  12%|█▏        | 100/805 [00:05<01:42,  6.85it/s][2m2024-10-31T03:36:25.494967Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
[2m2024-10-31T03:36:25.778982Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 52.0 / 101  (51.5):  13%|█▎        | 101/805 [00:06<01:46,  6.61it/s][2m2024-10-31T03:36:26.017478Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m

Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']) from [[ ## reasoning ## ]]
To determine the missing quantities and percent yields, we need to work backwards from compound F. We know that compound F is produced with a yield of 0.0100 mol. Since the reaction is in a one-to-one mole ratio, the amount of compound E required to produce compound F is 0.0100 mol. The yield for this step is 10.0%, so the amount of compound E initially used is 0.0100 mol / 0.10 = 0.1000 mol. Similarly, compound D is required to produce compound E with a yield of 10.0%, so the amount of compound D used is 0.1000 mol / 0.10 = 1.000 mol. Continuing this process, compound C is required to produce compound D with a yield of 25.0%, so the amount of compound C used is 1.000 mol / 0.25 = 4.000 mol. Finally, compound B is required to produce compound C with a yield of 40.0%, so the amount of compound B used is 4.000 mol / 0.40 = 10.000 mol. The percent yield for the conversion of compound B 

[2m2024-10-31T03:36:26.069020Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Images are not yet supported in JSON mode.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 405.0 / 805  (50.3): 100%|██████████| 805/805 [00:08<00:00, 90.35it/s] 

openai/OpenGVLab/InternVL2-8B 50.31 405.0 805 0.5031055900621118 14





(50.31, 14)

In [None]:
# Evaluate with the LM configured through dspy.settings and keep the
# per-example rows in the global `outputs`; the analysis cells that follow read it.
scores, outputs = evaluate_mmmu(mmmu)
# lm.inspect_history()


In [9]:
from collections import Counter

# Answer string of every prediction; a missing answer is recorded as "nothing returned".
returned_answers = [row[1].get("answer", "nothing returned") for row in outputs]
c = Counter(returned_answers)
# Anything other than a bare A/B/C/D counts as a non-letter answer.
non_letters = sum(1 for answer in returned_answers if answer not in ["A", "B", "C", "D"])
print(c)
print(non_letters)




Counter({'A': 214, 'B': 213, 'C': 212, 'D': 130, 'nothing returned': 16, 'E': 7, '$750': 1, 'F': 1, 'P, Q': 1, 'None of the provided answer choices can be selected as the correct answer because the necessary data to perform the calculation is not available in the image.': 1, "None of the options can be the sequence of edges added to the minimum spanning tree using Kruskal's algorithm because they all form cycles with the edges already in the tree.": 1, 'None of the given answer choices match the calculated height of 10.5 cm. Therefore, the correct answer is not provided in the given answer choices.': 1, 'None of the answer choices are correct.': 1, 'None of the answer choices provided directly relate to the size of the countries on the map. Therefore, the correct answer is not listed.': 1, 'None of the provided answer choices match the calculated cumulative relative frequency of 0.6.': 1, '59': 1, '1': 1, 'None of the provided answer choices match the calculated value.': 1, 'None of th

In [10]:
# Rows are indexed as (example, prediction, score); keep the multiple-choice ones.
mc_rows = [row for row in outputs if row[0]["question_type"] == "multiple-choice"]
mc_correct = sum(row[2] for row in mc_rows)
total_mc = len(mc_rows)
print(mc_correct, total_mc)
print(mc_correct / total_mc)
# Number of predictions that came back without an answer field.
print(sum(row[1].get("answer", None) is None for row in outputs))

# Note: Run above here

395.0 805
0.4906832298136646
16


# Make sure that multiple images work

## No examples

In [None]:
import PIL
def set_image_to_black_square(example, key, image_path="black_image_300x300.png"):
    """Return a copy of ``example`` whose ``key`` field holds the placeholder image.

    Args:
        example: Example supporting ``copy()``, item assignment, ``inputs()``
            and ``with_inputs()``.
        key: Field to overwrite, e.g. ``"image_1"``.
        image_path: Path of the placeholder image file. Defaults to the
            black-square file this notebook has always used.

    Returns:
        The modified copy, with the same input keys as ``example``.
    """
    # A bare `import PIL` does not guarantee the Image submodule is loaded.
    import PIL.Image

    placeholder = PIL.Image.open(image_path)
    # Image.open is lazy and keeps the file open; reading the pixels now lets
    # Pillow release the handle, so mapping this over a whole dataset does not
    # accumulate open files.
    placeholder.load()

    example_copy = example.copy()
    example_copy[key] = placeholder
    return example_copy.with_inputs(*example.inputs().keys())

def _print_black_square_flags(examples):
    """Print, for image_1 then image_2 of the first example, whether it equals the placeholder."""
    for image_key in ("image_1", "image_2"):
        print(examples[0][image_key] == PIL.Image.open("black_image_300x300.png"))


print(updated_devset[0]["image_1"])
print(updated_devset[0]["image_2"])

# Validation set with only image_1 replaced by the placeholder.
examples_no_image_1 = [set_image_to_black_square(example, "image_1") for example in updated_valset]
_print_black_square_flags(examples_no_image_1)

# Validation set with only image_2 replaced by the placeholder.
examples_no_image_2 = [set_image_to_black_square(example, "image_2") for example in updated_valset]
_print_black_square_flags(examples_no_image_2)

# Validation set with both image slots replaced.
examples_no_actual_image = [set_image_to_black_square(example, "image_1") for example in updated_valset]
examples_no_actual_image = [set_image_to_black_square(example, "image_2") for example in examples_no_actual_image]
_print_black_square_flags(examples_no_actual_image)


In [None]:
# Fresh module instance, then one prediction per ablated dataset as a smoke test.
mmmu = MMMUModule()
print(examples_no_image_1[0].inputs())
print(mmmu(**examples_no_image_1[0].inputs()))

print(examples_no_image_2[0].inputs())
print(mmmu(**examples_no_image_2[0].inputs()))


In [None]:
# evaluate_mmmu was created with return_outputs=True, so each call returns
# (score, per-example outputs). Unpack it so the summary prints only the score
# instead of dumping every output row.
normal, normal_outputs = evaluate_mmmu(mmmu, devset=updated_valset)
no_image_1, no_image_1_outputs = evaluate_mmmu(mmmu, devset=examples_no_image_1)
no_image_2, no_image_2_outputs = evaluate_mmmu(mmmu, devset=examples_no_image_2)
no_actual_image, no_actual_image_outputs = evaluate_mmmu(mmmu, devset=examples_no_actual_image)
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
print("Score with both images:", normal)
print("Score with image_1 set to black square:", no_image_1)
print("Score with image_2 set to black square:", no_image_2)
print("Score with both images set to black squares:", no_actual_image)

## TODO: Test with bootstrapped examples


# Make sure that JPGs work

## Convert images to JPGs

In [None]:
import io
from PIL import Image

def convert_to_jpg(example):
    """Return a copy of ``example`` with its PIL images re-encoded as JPEG.

    Only the ``image_1`` and ``image_2`` fields are considered, and only when
    the field exists and holds a PIL image. Each such image is converted to
    RGB, saved as JPEG into an in-memory buffer and re-opened from it.

    Args:
        example: Example supporting ``copy()``, ``in``, item get/set,
            ``inputs()`` and ``with_inputs()``.

    Returns:
        The converted copy, with the same input keys as ``example``.
    """
    converted = example.copy()
    for field in ('image_1', 'image_2'):
        if field not in converted:
            continue
        if not isinstance(converted[field], Image.Image):
            continue

        # JPEG needs RGB, so convert first (in case it's not already).
        rgb_image = example[field].convert('RGB')

        # Encode into memory and rewind so the bytes can be read back.
        jpeg_buffer = io.BytesIO()
        rgb_image.save(jpeg_buffer, format='JPEG')
        jpeg_buffer.seek(0)

        # Re-open the encoded bytes as a PIL image.
        converted[field] = Image.open(jpeg_buffer)

    return converted.with_inputs(*example.inputs().keys())

# Convert all images in the dataset to JPG
examples_jpg = [convert_to_jpg(example) for example in updated_valset]

# Verify conversion by comparing the format of the first example before and after.
first_original, first_converted = updated_valset[0], examples_jpg[0]
print("Original image format:", first_original['image_1'].format)
print("Converted image format:", first_converted['image_1'].format)


In [None]:
# JPEG versions of the plain validation set and of each black-square ablation.
examples_jpg = [convert_to_jpg(example) for example in updated_valset]
examples_no_image_1_jpg = [convert_to_jpg(example) for example in examples_no_image_1]
examples_no_image_2_jpg = [convert_to_jpg(example) for example in examples_no_image_2]
examples_no_actual_image_jpg = [convert_to_jpg(example) for example in examples_no_actual_image]

# Smoke test on the first converted example, then show its image format.
mmmu = MMMUModule()
first_jpg_example = examples_no_image_1_jpg[0]
print(first_jpg_example.inputs())
print(mmmu(**first_jpg_example.inputs()))
print(first_jpg_example["image_1"].format)

In [None]:
# evaluate_mmmu was created with return_outputs=True, so each call returns
# (score, per-example outputs). Unpack it so the summary prints only the score
# instead of dumping every output row.
normal, normal_outputs = evaluate_mmmu(mmmu, devset=examples_jpg)
no_image_1, no_image_1_outputs = evaluate_mmmu(mmmu, devset=examples_no_image_1_jpg)
no_image_2, no_image_2_outputs = evaluate_mmmu(mmmu, devset=examples_no_image_2_jpg)
no_actual_image, no_actual_image_outputs = evaluate_mmmu(mmmu, devset=examples_no_actual_image_jpg)
print("Testing on MMMU validation set (N=", len(updated_valset), ")")
print("Score with both images:", normal)
print("Score with image_1 set to black square:", no_image_1)
print("Score with image_2 set to black square:", no_image_2)
print("Score with both images set to black squares:", no_actual_image)

In [None]:
# No variable named `lm` is defined anywhere in this notebook, so the bare
# `lm.inspect_history()` raised NameError on a fresh kernel. Inspect the LM
# configured through dspy.settings instead.
dspy.settings.lm.inspect_history()

# Testing that URLs work

In [None]:

# Colour name -> six-digit hex code (no leading '#'). Insertion order is kept
# because the names are later turned into a list for random sampling.
colors = dict(
    White="FFFFFF",
    Red="FF0000",
    Green="00FF00",
    Blue="0000FF",
    Yellow="FFFF00",
    Cyan="00FFFF",
    Magenta="FF00FF",
    Gray="808080",
    Orange="FFA500",
    Purple="800080",
)


def get_color_image_url(color, file_extension="png"):
    """Build the placehold.co URL for a named colour.

    Args:
        color: Key of ``colors``; an unknown name raises ``KeyError``.
        file_extension: Extension appended to the URL. Defaults to ``"png"``.

    Returns:
        URL string in which the colour's hex code fills both colour segments
        of the path.
    """
    hex_code = colors[color]
    return f"https://placehold.co/300/{hex_code}/{hex_code}.{file_extension}"


In [None]:
import random

def generate_random_2_color_image_examples(n, seed=None):
    """Generate ``n`` two-image colour questions.

    Each example holds the URLs of two differently coloured images, a question
    asking for the colour of one of them, and that colour as the answer.

    Args:
        n: Number of examples to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same examples are produced on every run; when omitted
            the module-level ``random`` state is used, as before.

    Returns:
        list of ``dspy.Example`` with inputs ``image_1``, ``image_2`` and
        ``question``.
    """
    rng = random if seed is None else random.Random(seed)
    color_names = list(colors.keys())

    examples = []
    for _ in range(n):
        # sample() draws without replacement, so the two colours always differ.
        color_1, color_2 = rng.sample(color_names, 2)
        chosen_color = color_1 if rng.random() < 0.5 else color_2
        chosen_image = "image_1" if chosen_color == color_1 else "image_2"
        example_kwargs = {
            "image_1": get_color_image_url(color_1),
            "image_2": get_color_image_url(color_2),
            "question": f"What color is {chosen_image}?",
            "answer": chosen_color
        }
        examples.append(dspy.Example(**example_kwargs).with_inputs("image_1", "image_2", "question"))
    return examples

# Small sample of 100 examples; print the first one to eyeball its structure.
examples = generate_random_2_color_image_examples(100)
print(examples[0])


In [None]:
# Signature for the two-image colour task: image_1, image_2 and a question in,
# a colour name out. The docstring and `desc` strings are text handed to dspy,
# so they are left exactly as written.
class ColorSignature(dspy.Signature):
    """Output the color of the designated image."""
    image_1: dspy.Image = dspy.InputField(desc="An image")
    image_2: dspy.Image = dspy.InputField(desc="An image")
    question: str = dspy.InputField(desc="A question about the image")
    answer: str = dspy.OutputField(desc="The color of the designated image")
# Plain predictor over the signature above.
color_program = dspy.Predict(ColorSignature)


In [None]:
# Smoke test: show one generated example and the uncompiled program's prediction for it.
print(examples[0])
print(color_program(**examples[0].inputs()))

In [None]:
# Two BootstrapFewShot optimizers that differ only in their demo budgets.
few_shot_optimizer = dspy.BootstrapFewShot(metric=answer_exact_match, max_bootstrapped_demos=3, max_labeled_demos=10)
smaller_few_shot_optimizer = dspy.BootstrapFewShot(metric=answer_exact_match, max_bootstrapped_demos=1, max_labeled_demos=1)
# 1000 examples are generated; the first 200 are used for training and the next
# 200 for validation. The remaining 600 are not used in this cell.
dataset = generate_random_2_color_image_examples(1000)
trainset = dataset[:200]
validationset = dataset[200:400]
evaluate_colors = Evaluate(metric=answer_exact_match, num_threads=300, devset=validationset)

In [None]:
# Compile the same base program with both optimizers, then score the
# uncompiled program and the two compiled variants on the validation split.
compiled_color_program = few_shot_optimizer.compile(color_program, trainset=trainset)
compiled_smaller_color_program = smaller_few_shot_optimizer.compile(color_program, trainset=trainset)
print(evaluate_colors(color_program))
print(evaluate_colors(compiled_color_program))
print(evaluate_colors(compiled_smaller_color_program))

In [None]:
print(compiled_color_program(**validationset[0].inputs()))
# No variable named `lm` is defined anywhere in this notebook, so the bare
# `lm.inspect_history()` raised NameError on a fresh kernel. Inspect the LM
# configured through dspy.settings instead.
dspy.settings.lm.inspect_history()

# TODO(Isaac): Delete; Archive of old experiments

In [None]:
# Load the "full" split of the dog dataset with `image` as the only input field.
# This rebinds the global `dataset` that the colour cells also assign.
# NOTE(review): trust_remote_code=True is passed to the loader; presumably it
# allows code shipped with the dataset to run — confirm the source is trusted.
dataset = DataLoader().from_huggingface("Alanox/stanford-dogs", split="full", input_keys=("image",), trust_remote_code=True)

In [None]:
# rename the field from "image" to "image_1"
def rename_field(example, old_name, new_name):
    """Rename one field of ``example`` in place, best-effort.

    Args:
        example: Mapping-like object supporting item get, set and delete.
        old_name: Field to rename.
        new_name: New name for the field.

    Returns:
        The same ``example`` object. It is returned unchanged when
        ``old_name`` is missing, and also when both names are equal
        (previously that case deleted the field).
    """
    if old_name == new_name:
        # Copy-then-delete would drop the field entirely.
        return example
    try:
        value = example[old_name]
    except KeyError:
        # Missing field: nothing to rename. Other errors are no longer swallowed.
        return example
    example[new_name] = value
    del example[old_name]
    return example
    
# Rename image -> image_1 and target -> answer, then mark image_1 as the only input.
dog_dataset = [rename_field(example, "image", "image_1") for example in dataset]
dog_dataset2 = [rename_field(example, "target", "answer") for example in dog_dataset]
dog_dataset3 = [example.with_inputs("image_1") for example in dog_dataset2]
dog_dataset = dog_dataset3
# Shuffles in place; dog_dataset and dog_dataset3 are the same list object.
random.shuffle(dog_dataset)

In [None]:
# Signature for the dog-breed task: one image in, breed name out. The docstring
# and `desc` strings are text handed to dspy, so they are left exactly as written.
class DogPictureSignature(dspy.Signature):
    """Output the dog breed of the dog in the image."""
    image_1: dspy.Image = dspy.InputField(desc="An image of a dog")
    answer: str = dspy.OutputField(desc="The dog breed of the dog in the image")

class DogPicture(dspy.Module):
    """Chain-of-thought dog-breed classifier over ``DogPictureSignature``."""

    def __init__(self) -> None:
        # Run the dspy.Module initialiser as well, matching MMMUModule in this
        # notebook; it was previously skipped here.
        super().__init__()
        self.predictor = dspy.ChainOfThought(DogPictureSignature)

    def __call__(self, **kwargs):
        """Forward the keyword inputs to the predictor and return its prediction."""
        return self.predictor(**kwargs)

# Smoke test: classify the first example of the shuffled dog dataset.
dog_picture = DogPicture()
print(dog_picture(**dog_dataset[0].inputs()))

In [None]:
# Evaluator over the last 500 examples of the shuffled dog dataset.
# It is only constructed here; no cell in this notebook calls it.
evaluate = Evaluate(metric=answer_exact_match, num_threads=100, devset= dog_dataset[-500:], display_progress=True, max_errors=10000)


In [None]:
# TODO: Test inline signature
# TODO: Test json adapter