In [8]:
from datasets import load_dataset, load_from_disk
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import torch

In [4]:
def process_metamathqa(examples, prefix="Please answer the following question: "):
    """Add a prefix to the inputs and tokenize the inputs and targets.

    Args:
        examples: dataset examples.
        tokenizer: tokenizer.
        prefix (str, optional): Prefix to add to each inputs. Defaults to "Please answer the following question: ".

    Returns:
        dataset: processed examples.
    """
    # The "inputs" are the tokenized answer:
    inputs = [prefix + doc for doc in examples["query"]]

    # The "labels" are the tokenized outputs:
    labels = examples["response"]
    return {"question": inputs, "answer": labels}

def load_metamathqa(dev_mode=False):
    """Load the metamathqa dataset.

    Returns:
        Dataset: metamathqa dataset.
    """
    dataset = load_dataset("meta-math/MetaMathQA")
    if dev_mode:
        dataset["train"] = dataset["train"].select(range(20))  # For development
    else:
        dataset["train"] = dataset["train"].select(range(50000))  # For fine-tuning
    dataset = dataset["train"].train_test_split(test_size=0.2)
    tokenized_dataset = dataset.map(
        process_metamathqa,
        batched=True,
        remove_columns=["query", "response", "type", "original_question"],
    )
    return tokenized_dataset

In [5]:
def process_mmlu_stem(examples):
    alphabet = list(map(chr, range(65, 91)))
    labels = [alphabet[ans] for ans in examples["answer"]]
    inputs = ["Question: " + question +"\n\nOptions:\n" + "\n".join(f"{index}. {value}" for index, value in zip(alphabet[:len(choices)], choices)) + "\n\nAnswer:" for question, choices in zip(examples["question"], examples["choices"])]
    data = {"question": inputs, "answer": labels}
    
    return data

def load_mmlu_stem(dev_mode=False):
    """Load the mmlu_stem dataset.

    Returns:
        Dataset: mmlu_stem dataset.
    """
    dataset = load_dataset("TIGER-Lab/MMLU-STEM")
    dataset['train'] = dataset['train'].shuffle(seed=42)
    if dev_mode:
        dataset["train"] = dataset["train"].select(range(1000))  # For development
    dataset = dataset["train"].train_test_split(test_size=0.2)
    tokenized_dataset = dataset.map(
        process_mmlu_stem,
        remove_columns=["question", "subject", "choices", "answer"],
        batched=True,

    )
    return tokenized_dataset

In [6]:
mmlu = load_mmlu_stem(dev_mode=True)

Map: 100%|██████████| 800/800 [00:00<00:00, 33915.63 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 19584.91 examples/s]


In [7]:
mmlu

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 800
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 200
    })
})

In [8]:
import gpt_wrapper
from gpt_wrapper.chat import Chat
gpt_wrapper.api_base = "http://mnlp-backend-938795011.eu-central-1.elb.amazonaws.com"
gpt_wrapper.api_key = "a216b70d-88dc-4400-b6ee-b7e3f33da050"

In [15]:
chat = Chat.create("mmlu-stem")
def generate_gpt_answers(examples):
    gpt_answer = []
    for question in tqdm(examples["question"]):
        try:
            answer = chat.ask(question).to_dict()['content']
            gpt_answer.append(answer)
        except:
            gpt_answer.append("I don't know.")
    print(chat.budget())
    return {"question": examples["question"], "answer": gpt_answer}

In [16]:
chat.budget()

{'limit': 10000000, 'usage': 2001048}

In [17]:
mmlu_gpt = mmlu.map(generate_gpt_answers, batched=True)
mmlu_gpt.save_to_disk("mmlu_gpt.hf")

100%|██████████| 800/800 [21:38<00:00,  1.62s/it]s]
Map: 100%|██████████| 800/800 [21:38<00:00,  1.62s/ examples]


{'limit': 10000000, 'usage': 3466833}


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Server timeout.
Retrying in 10 seconds...


100%|██████████| 200/200 [05:35<00:00,  1.68s/it]
Map: 100%|██████████| 200/200 [05:35<00:00,  1.68s/ examples]


{'limit': 10000000, 'usage': 3466833}


Saving the dataset (1/1 shards): 100%|██████████| 800/800 [00:00<00:00, 177734.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 200/200 [00:00<00:00, 59684.16 examples/s]


In [32]:
mmlu_gpt_filter = load_from_disk("mmlu_gpt.hf")

In [35]:
counter = 0
for example in test['train']:
    if example['answer'] == "I don't know.":
        counter += 1
counter

627

In [None]:
dataset = DatasetDict()


In [36]:
chat.budget()

{'limit': 10000000, 'usage': 3466833}

In [13]:
model = T5ForConditionalGeneration.from_pretrained("../../checkpoints/checkpoints/sft/full_model/", local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained("../../checkpoints/checkpoints/sft/full_model/", local_files_only=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
def generate_t5_answers(examples):
    inputs = tokenizer(examples["question"], return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"question": examples["question"], "answer": answers}

In [35]:
print(tokenizer.batch_decode(model.generate(**tokenizer("Answer the following question: Explain the pythagorean theorem?",
                                                  return_tensors="pt",
                                                  padding=True,
                                                  truncation=True), max_length=512, num_beams=4, num_return_sequences=1),
                       skip_special_tokens=True)[0])

The pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. In this case, the roots are $a = 1$ and $b = -2$, so the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. Therefore, the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. So, the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. In this case, the roots are $a = 1$ and $b = -2$, so the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. In this case, the roots are $a = 1$ and $b = -2$, so the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. Therefore, the pythagorean theorem states that the sum of the roots of a polynomial $ax2 + bx + c = 0$ is equal to $-fracba$. Therefore,