Datasets to make:

- [x] cola
- [x] qnli
- [x] qqp
- [x] sst2
- [x] ag_news
- [x] commonsense_qa
- [x] mnli
- [x] mmlu


In [1]:
import datasets
from datasets import load_dataset, concatenate_datasets
from promptsource.templates import DatasetTemplates
from pathlib import Path
import json
from tqdm import tqdm
from collections import Counter

In [2]:
def load_prompt(dataset_name, config=None, prompt_idx=0):
    """this function loads a test prompt for a specified dataset to see if promptSource supports it"""
    all_prompts = DatasetTemplates(dataset_name, config) if config != None else DatasetTemplates(dataset_name)
    prompt_name_list = list(all_prompts.name_to_id_mapping.keys())
    prompt = all_prompts[prompt_name_list[prompt_idx]]
    return prompt 

# qnli

In [61]:
# Function to add the original split name to each example
def add_original_split(example, split_name):
    example['original_dataset_subset'] = split_name
    return example

In [62]:
qnli_dataset = load_dataset("nyu-mll/glue","qnli")
print(qnli_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})


In [63]:
for split in ['train', 'validation', 'test']:
    qnli_dataset[split] = qnli_dataset[split].map(lambda x: add_original_split(x, split))

In [64]:
qnli_dataset = qnli_dataset.shuffle(seed=42)

In [65]:
qnli_prompt_template = load_prompt("glue", "qnli", 0)

In [66]:
train_set = []
num_training_samples = 100000

for i in range(num_training_samples):
    dataset_element = qnli_dataset['train'][i]
    input_txt, output_txt = qnli_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        "original_idx": dataset_element['idx']
    }
    
    train_set.append(dataset_obj)

In [15]:
train_set[0]

{'input': 'Apparently the sailor did not connect with the soldier, as Mahan believed he was innovating the term Middle East.\nDoes that sentence have all you need to answer the question "Who did not connect with the soldier?"?',
 'output': 'yes',
 'combined': 'Apparently the sailor did not connect with the soldier, as Mahan believed he was innovating the term Middle East.\nDoes that sentence have all you need to answer the question "Who did not connect with the soldier?"?\nyes',
 'original_dataset_subset': 'train',
 'original_idx': 99928}

In [16]:
qnli_val_set = []
num_eval_samples = 5000

for i in range(num_eval_samples):
    dataset_element = qnli_dataset['train'][i + num_training_samples]
    input_txt, output_txt = qnli_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        "original_idx": dataset_element['idx']
    }
    
    qnli_val_set.append(dataset_obj)

In [67]:
combined_qnli_dataset = {
    "train": train_set
}

save_path = Path(f"datasets/deduped_train_qnli.json")
with save_path.open("w") as f:
    json.dump(combined_qnli_dataset, f)

In [69]:
# show the label distribution of both the train and validation sets
# qnli_train_labels = [x['output'] for x in combined_qnli_dataset["train"]]
qnli_validation_labels = [x['output'] for x in combined_qnli_dataset["train"]]


# qnli_train_counter = Counter(qnli_train_labels)
qnli_validation_counter = Counter(qnli_validation_labels)

# print(qnli_train_counter)
print(qnli_validation_counter)

Counter({'yes': 50031, 'no': 49969})


# qqp

In [70]:
# Function to add the original split name to each example
def add_original_split(example, split_name):
    example['original_dataset_subset'] = split_name
    return example

In [71]:
qqp_dataset = load_dataset("nyu-mll/glue","qqp")
print(qqp_dataset)

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 363846
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})


In [72]:
for split in ['train', 'validation', 'test']:
    qqp_dataset[split] = qqp_dataset[split].map(lambda x: add_original_split(x, split))

In [73]:
qqp_dataset = qqp_dataset.shuffle(seed=42)

In [74]:
qqp_prompt_template = load_prompt("glue", "qqp", 0)

In [76]:
qqp_train_set = []
num_training_samples = 100000

for i in range(num_training_samples):
    dataset_element = qqp_dataset['train'][i]
    input_txt, output_txt = qqp_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        "original_idx": dataset_element['idx']
    }
    
    qqp_train_set.append(dataset_obj)

In [77]:
print(qqp_train_set[0])

{'input': 'I\'m an administrator on the website Quora. There are two posts, one that asks "What would Rhaegar and Jon have thought of each other?" and another that asks "What is the amount of torque a 2000 Jeep Cherokee can output?". I can merge questions if they are asking the same thing. Can I merge these two questions?', 'output': 'no', 'combined': 'I\'m an administrator on the website Quora. There are two posts, one that asks "What would Rhaegar and Jon have thought of each other?" and another that asks "What is the amount of torque a 2000 Jeep Cherokee can output?". I can merge questions if they are asking the same thing. Can I merge these two questions?\nno', 'original_dataset_subset': 'train', 'original_idx': 260796}


In [33]:
qqp_val_set = []
num_eval_samples = 5000

for i in range(num_eval_samples):
    dataset_element = qqp_dataset['train'][i + num_training_samples]
    input_txt, output_txt = qqp_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        "original_idx": dataset_element['idx']
    }
    
    qqp_val_set.append(dataset_obj)

In [78]:
combined_qqp_dataset = {
    "train": qqp_train_set,
    # "validation": qqp_val_set
}

save_path = Path(f"datasets/deduped_train_qqp.json")
with save_path.open("w") as f:
    json.dump(combined_qqp_dataset, f)

In [79]:
# show the label distribution of both the train and validation sets
# qqp_train_labels = [x['output'] for x in combined_qqp_dataset["train"]]
qqp_validation_labels = [x['output'] for x in combined_qqp_dataset["train"]]


# qqp_train_counter = Counter(qqp_train_labels)
qqp_validation_counter = Counter(qqp_validation_labels)

# print(qqp_train_counter)
print(qqp_validation_counter)

Counter({'no': 63074, 'yes': 36926})


# ag_news

In [80]:
# Function to add the original split name to each example
def add_original_split(example, split_name):
    example['original_dataset_subset'] = split_name
    return example

In [81]:
ag_news_dataset = load_dataset("fancyzhx/ag_news")
print(ag_news_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [82]:
for split in ['train', 'test']:
    ag_news_dataset[split] = ag_news_dataset[split].map(lambda x: add_original_split(x, split))

In [83]:
ag_news_dataset = ag_news_dataset.shuffle(seed=42)

In [84]:
ag_news_prompt_template = load_prompt("ag_news", 0)

In [85]:
ag_news_train_set = []
num_training_samples = 100000

for i in range(num_training_samples):
    dataset_element = ag_news_dataset['train'][i]
    input_txt, output_txt = ag_news_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        # "original_idx": dataset_element['idx']
    }
    
    ag_news_train_set.append(dataset_obj)

In [87]:
ag_news_train_set[0]

{'input': 'What label best describes this news article?\nBangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.',
 'output': 'World politics',
 'combined': 'What label best describes this news article?\nBangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.\nWorld politics',
 'original_dataset_subset': 'train'}

In [49]:
ag_news_val_set = []
num_eval_samples = 5000

for i in range(num_eval_samples):
    dataset_element = ag_news_dataset['train'][i + num_training_samples]
    input_txt, output_txt = ag_news_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
        # "original_idx": dataset_element['idx']
    }
    
    ag_news_val_set.append(dataset_obj)

In [88]:
combined_ag_news_dataset = {
    "train": ag_news_train_set
}

save_path = Path(f"datasets/deduped_train_ag_news.json")
with save_path.open("w") as f:
    json.dump(combined_ag_news_dataset, f)

In [89]:
# show the label distribution of both the train and validation sets
# ag_news_train_labels = [x['output'] for x in combined_ag_news_dataset["train"]]
ag_news_validation_labels = [x['output'] for x in combined_ag_news_dataset["train"]]


# ag_news_train_counter = Counter(ag_news_train_labels)
ag_news_validation_counter = Counter(ag_news_validation_labels)

# print(ag_news_train_counter)
print(ag_news_validation_counter)

Counter({'Sports': 25078, 'Science and technology': 25018, 'Business': 25004, 'World politics': 24900})


# mnli

In [90]:
# Function to add the original split name to each example
def add_original_split(example, split_name):
    example['original_dataset_subset'] = split_name
    return example

In [91]:
mnli_dataset = load_dataset("nyu-mll/glue","mnli")
print(mnli_dataset)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})


In [92]:
for split in ['train', 'validation_matched']:
    mnli_dataset[split] = mnli_dataset[split].map(lambda x: add_original_split(x, split))

In [93]:
mnli_dataset = mnli_dataset.shuffle(seed=42)

In [94]:
mnli_prompt_template = load_prompt("glue/mnli", prompt_idx=2)

In [95]:
mnli_train_set = []
num_training_samples = 100000

for i in range(num_training_samples):
    dataset_element = mnli_dataset['train'][i]
    input_txt, output_txt = mnli_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
    }
    
    mnli_train_set.append(dataset_obj)

In [96]:
mnli_train_set[0]

{'input': 'I\'ll hurry over that part. Are we justified in saying that ""I\'ll be quick with that part.""? Yes, no, or maybe?',
 'output': 'Yes',
 'combined': 'I\'ll hurry over that part. Are we justified in saying that ""I\'ll be quick with that part.""? Yes, no, or maybe?\nYes',
 'original_dataset_subset': 'train'}

In [59]:
mnli_val_set = []
num_eval_samples = 5000

for i in range(num_eval_samples):
    dataset_element = mnli_dataset['train'][i + num_training_samples]
    input_txt, output_txt = mnli_prompt_template.apply(dataset_element)
    
    dataset_obj = {
        "input": input_txt,
        "output": output_txt,
        "combined": input_txt + "\n" + output_txt,
        "original_dataset_subset": dataset_element['original_dataset_subset'],
    }
    
    mnli_val_set.append(dataset_obj)

In [97]:
combined_mnli_dataset = {
    "train": mnli_train_set,
    # "validation": mnli_val_set
}

save_path = Path(f"datasets/deduped_train_mnli.json")
with save_path.open("w") as f:
    json.dump(combined_mnli_dataset, f)

In [98]:
# show the label distribution of both the train and validation sets
mnli_train_labels = [x['output'] for x in combined_mnli_dataset["train"]]
mnli_validation_labels = [x['output'] for x in combined_mnli_dataset["validation"]]


mnli_train_counter = Counter(mnli_train_labels)
mnli_validation_counter = Counter(mnli_validation_labels)

print(mnli_train_counter)
print(mnli_validation_counter)

KeyError: 'validation'