Playing around with MMLU-Pro: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro

In [None]:
%pip install dspy
%pip install ipywidgets
%pip install IPython
%pip install requests

In [1]:
import dspy
from dspy.datasets import DataLoader
import dspy
import os
import dotenv
import requests
import pydantic

Set up DSPy

In [3]:
# Load variables from a .env file (if one is found) into the environment.
dotenv.load_dotenv()

# Fail fast with an actionable message: a bare `assert` gives no hint about
# what is missing and is skipped entirely when Python runs with -O.
if 'OPENAI_API_KEY' not in os.environ:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file "
        "before running this notebook."
    )

# Build the OpenAI-backed language model and register it as DSPy's default LM.
llm = dspy.OpenAI(model='gpt-4o', max_tokens=4096, temperature=0.1)
dspy.settings.configure(lm=llm)

Download MMLU-Pro

In [6]:
# Fetch MMLU-Pro through DSPy's Hugging Face loader. The result is indexed by
# split name below; only the validation split is used in this notebook.
dl = DataLoader()
mmlu_pro = dl.from_huggingface(
    "TIGER-Lab/MMLU-Pro",
)
validation = mmlu_pro["validation"]
# display(validation)  # uncomment to inspect the split



Get 10 random entries and format for DSPy 

In [7]:
import random

# Draw the sample once, so `try10` is exactly the set of entries that
# `try10ex` is built from (and what the debug display below would show).
try10 = random.sample(validation, 10)
#display(try10)

# Declare question/options as the input fields of each example.
try10ex = [example.with_inputs("question", "options") for example in try10]
for ex in try10ex:
    # Stringify so these values are plain text, matching the str() values the
    # module returns and the metric compares against.
    ex['options'] = str(ex['options'])
    ex['answer_index'] = str(ex['answer_index'])
display(try10ex)


[Example({'question_id': 9, 'question': 'Why are parvoviruses a highly impactful parasite?', 'options': '["They are able to alter the host\'s DNA", \'Because they have no nucleic acid\', \'They can survive in extreme temperatures\', \'Only replicate in dividing cells\', \'They can infect multiple species\', "They don\'t require a host to survive", \'Can integrate into host chromosomes\', \'N/A\', \'N/A\', \'N/A\']', 'answer': 'B', 'answer_index': '1', 'cot_content': "A: Let's think step by step. We refer to Wikipedia articles on virology for help. Paroviruses are highly impactful because they do not have nucleic acid. The answer is (B).", 'category': 'health', 'src': 'cot_lib-virology'}) (input_keys={'options', 'question'}),
 Example({'question_id': 11, 'question': 'Where do most short-period comets come from and how do we know?', 'options': "['The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt.', 'The asteroid belt; short period 

Maybe someday implement typed inputs; the attempt below caused too much trouble, so it is commented out.

In [57]:
# from typing import List
# from dspy.functional import TypedPredictor, TypedChainOfThought
# from pydantic import BaseModel, Field

# class Input(BaseModel):
#     question: str = Field(description="The question.")
#     #options: List[str] = Field(description="A list of options.")
#     options: str = Field(description="A list of options.")

# class OutputCode(BaseModel):
#     #answer: str = Field(description="must be one of the first ten letters in the English alphabet: A,B,C,D,E,F,G,H,I,J")
#     answer_index: str = Field(description="must be one of the first ten letters in the English alphabet: A,B,C,D,E,F,G,H,I,J")
#     cot_content: str = Field(description="your full rationale for why you chose the answer")



Make the signature and the module

In [8]:
# DSPy signature for one MMLU-Pro item: two inputs (the question text and the
# stringified option list) and two outputs (a free-text rationale and the
# index of the chosen option).
# NOTE(review): the class docstring and the `desc` strings are presumably sent
# to the model as prompt text, so treat any edit to them as a prompt change —
# confirm against the DSPy signature documentation.
class MMLUPro(dspy.Signature):
    """Answer questions with a rationale and a multiple choice answer."""

    question = dspy.InputField()
    options = dspy.InputField()
    # Declared before answer_index, so the rationale field precedes the answer field.
    cot_content = dspy.OutputField(desc="your full rationale for why you chose the answer")
    # Index into `options`; the desc restricts the reply to a single choice from 0-9.
    answer_index = dspy.OutputField(desc="reply with the index of the correct answer from the options. you must reply with only one choice from 0-9")
    



In [9]:
class CoTMMLU(dspy.Module):
    """Chain-of-thought module that answers one MMLU-Pro question."""

    def __init__(self):
        super().__init__()

        # Keep the signature on the instance next to the predictor built from it.
        self.signature = MMLUPro
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, question, options):
        """Run the predictor and return its two outputs coerced to strings.

        Args:
            question: The question text.
            options: The answer options (callers in this notebook pass a string).

        Returns:
            A ``dspy.Prediction`` with ``answer_index`` and ``cot_content``,
            both converted with ``str()``.
        """
        raw = self.predictor(question=question, options=options)
        answer_index = str(raw.answer_index)
        cot_content = str(raw.cot_content)
        return dspy.Prediction(answer_index=answer_index, cot_content=cot_content)



test just one question to be sure it works

In [13]:
# Take the first formatted training example and pull out the three fields to
# eyeball before calling the model.
first_example = try10ex[0]
sample_question = first_example['question']
sample_options = first_example['options']
sample_answer_index = first_example['answer_index']

for label, value in (
    ("Sample Question:", sample_question),
    ("Options:", sample_options),
    ("Answer Index:", sample_answer_index),
):
    print(label, value)




Sample Question: Why are parvoviruses a highly impactful parasite?
Options: ["They are able to alter the host's DNA", 'Because they have no nucleic acid', 'They can survive in extreme temperatures', 'Only replicate in dividing cells', 'They can infect multiple species', "They don't require a host to survive", 'Can integrate into host chromosomes', 'N/A', 'N/A', 'N/A']
Answer Index: 1


In [14]:
# Baseline program: a freshly constructed module, before any optimizer has run.
uncompiled = CoTMMLU()
response = uncompiled(
    question=sample_question,
    options=str(sample_options),
)
display(response)




Prediction(
    answer_index='3',
    cot_content='Parvoviruses are highly impactful because they can only replicate in dividing cells. This dependency on host cell division means that they can cause significant damage to tissues where cells are rapidly dividing. For example, in dogs, canine parvovirus targets the rapidly dividing cells in the intestines, leading to severe gastrointestinal illness. In humans, parvovirus B19 targets erythroid progenitor cells in the bone marrow, which can lead to conditions like anemia. This specific replication requirement is a key factor in their pathogenicity.'
)

In [None]:
# Inspect the single most recent LM interaction (n=1) after the baseline call.
# NOTE(review): presumably prints the last prompt and completion — the exact
# output format depends on the installed DSPy version.
llm.inspect_history(n=1)

BootstrapFewShot as a simple optimizer... later we can get crazy; for now, just prove the workflow.

In [15]:
from dspy.teleprompt import BootstrapFewShot

def custom_answer_exact_match(prediction, example, trace=None):
    """Return True when the two answer indices match after normalisation.

    DSPy's optimizers and ``Evaluate`` invoke metrics positionally as
    ``metric(example, prediction, trace)``, i.e. in the opposite order to the
    parameter names used here. The comparison is symmetric and the objects
    DSPy passes support both attribute and item access, so the result is the
    same either way; the names are kept for backward compatibility.

    Args:
        prediction: Object exposing ``answer_index`` as an attribute.
        example: Object exposing ``answer_index`` through item access.
        trace: Unused; accepted because DSPy supplies it while bootstrapping.

    Returns:
        bool: Whether both indices are equal once converted to ``str`` and
        stripped of surrounding whitespace.
    """
    # Model output can carry stray whitespace or newlines, and labels may be
    # ints; normalise both sides so only a genuinely different index fails.
    predicted = str(prediction.answer_index).strip()
    expected = str(example['answer_index']).strip()
    return predicted == expected

# Configure the optimizer first, then compile a fresh student program against
# the ten formatted training examples.
# Options left disabled from earlier experiments:
#   num_threads=30, num_candidate_programs=5
bootstrap_optimizer = BootstrapFewShot(
    metric=custom_answer_exact_match,
    max_labeled_demos=8,
)
compiled = bootstrap_optimizer.compile(student=CoTMMLU(), trainset=try10ex)



  0%|          | 0/10 [00:00<?, ?it/s]

 50%|█████     | 5/10 [00:17<00:17,  3.51s/it]

Bootstrapped 4 full traces after 6 examples in round 0.





In [None]:
# Inspect the single most recent LM interaction (n=1) after bootstrapping.
# NOTE(review): presumably prints the last prompt and completion — the exact
# output format depends on the installed DSPy version.
llm.inspect_history(n=1)

get 10 new questions 

In [16]:
# Hold out the questions already used for bootstrapping, so the compiled
# program is never scored on examples it may carry as few-shot demos.
# assumes question_id is unique within the validation split — TODO confirm
train_ids = {ex['question_id'] for ex in try10ex}
held_out = [ex for ex in validation if ex['question_id'] not in train_ids]

# Draw the test sample once, from the held-out pool only.
try10 = random.sample(held_out, 10)
#display(try10)

try10test = [example.with_inputs("question", "options") for example in try10]
for ex in try10test:
    # Same string formatting as the training examples.
    ex['options'] = str(ex['options'])
    ex['answer_index'] = str(ex['answer_index'])
display(try10test)

[Example({'question_id': 4, 'question': 'A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?', 'options': "['Multiply 5 by 5 to find 25 teams.', 'Divide 30 by 5 to find 6 teams.', 'Add 5 to 30 to find 35 teams.', 'Subtract 30 from 5 to find -25 teams.', 'Divide 5 by 30 to find 0.1667 teams.', 'Add 5 to 30 then divide by 2 to find 17.5 teams.', 'N/A', 'N/A', 'N/A', 'N/A']", 'answer': 'B', 'answer_index': '1', 'cot_content': "A: Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).", 'category': 'math', 'src': 'cot_lib-elementary_mathematics'}) (input_keys={'options', 'question'}),
 Example({'question_id': 52, 'question': 'Which of the following cases established the precedent t

evaluate uncompiled vs compiled on the 10 new questions

In [17]:
from dspy.evaluate import Evaluate

# One evaluator over the ten held-out questions, shared by both programs.
# Alternative that also renders a results table:
#evaluator = Evaluate(devset=try10test, num_threads=1, display_progress=True, display_table=5)
evaluator = Evaluate(devset=try10test, num_threads=1, display_progress=True)

# Score each program with the same metric, printing a label before and a
# separator after each run.
for label, program in (("uncompiled", uncompiled), ("compiled", compiled)):
    print(label)
    evaluator(program, metric=custom_answer_exact_match)
    print("---")



uncompiled


Average Metric: 9 / 10  (90.0): 100%|██████████| 10/10 [00:38<00:00,  3.86s/it]


Average Metric: 9 / 10  (90.0%)
---
compiled


Average Metric: 9 / 10  (90.0): 100%|██████████| 10/10 [00:44<00:00,  4.46s/it]

Average Metric: 9 / 10  (90.0%)
---





future directions:

Does retrieving from Wikipedia with ColBERT give better MMLU-Pro results? A la https://github.com/stanfordnlp/dspy/blob/5db3e3461a5db8ddb0a03c8ae59e606f4efc1e95/intro.ipynb#L4

Should the score be the number of tries needed to get the correct answer, or just an exact-match boolean (0 / 1)?

Does training on MMLU give a smarter bot? We would need to make options optional.

Kind of like: are people who are really good at taking tests actually useful? Are bots that are really good at benchmarks useful?

Use a proper optimizer (COPRO / MIPRO), and work up to using the full 70 examples in the validation set.