In [7]:
from datasets import load_from_disk, Dataset, concatenate_datasets

In [14]:
# Bring each smaller domain training split up to the row count of the city
# training split, writing the result to data/{domain}_train_large.
# Upsampling repeats rows of the original split (drawn from a seeded shuffle)
# and appends them to the untouched original.
target_dataset_size = len(load_from_disk("data/city_train"))

for domain in ["nobel_prize_winner", "verb", "physical_object"]:
    original_dataset = load_from_disk(f"data/{domain}_train")
    original_size = len(original_dataset)
    print(f"Original {domain} dataset size: {original_size}")

    if original_size < target_dataset_size:
        if original_size == 0:
            # The cyclic indexing below would otherwise fail with an opaque
            # ZeroDivisionError (i % 0).
            raise ValueError(f"data/{domain}_train is empty; cannot upsample it")

        print(f"Upsampling {domain} dataset")
        extra_size = target_dataset_size - original_size
        print(f"Extra size: {extra_size}")

        # Walk the shuffled split cyclically so extra_size may exceed the
        # original size (rows are then repeated more than once).
        extra_indices = [i % original_size for i in range(extra_size)]
        shuffled_dataset = original_dataset.shuffle(seed=42).select(extra_indices)
        new_dataset = concatenate_datasets([shuffled_dataset, original_dataset])
    else:
        # Already at or above the target: keep the rows unchanged, but still
        # write the *_train_large copy so code that loads that path does not
        # fail or pick up a stale directory from a previous run.
        print(f"{domain} dataset already has at least {target_dataset_size} rows; saving it unchanged")
        new_dataset = original_dataset

    new_dataset.save_to_disk(f"data/{domain}_train_large")
    print(f"New {domain} dataset size: {len(new_dataset)}")

Original nobel_prize_winner dataset size: 72965
Upsampling nobel_prize_winner dataset
Extra size: 11434


Saving the dataset (0/1 shards):   0%|          | 0/84399 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 84399/84399 [00:01<00:00, 57485.97 examples/s]


New nobel_prize_winner dataset size: 84399
Original verb dataset size: 22931
Upsampling verb dataset
Extra size: 61468


Saving the dataset (1/1 shards): 100%|██████████| 84399/84399 [00:00<00:00, 95022.97 examples/s]


New verb dataset size: 84399
Original physical_object dataset size: 42226
Upsampling physical_object dataset
Extra size: 42173


Saving the dataset (1/1 shards): 100%|██████████| 84399/84399 [00:00<00:00, 92287.13 examples/s]

New physical_object dataset size: 84399





In [15]:
# Merge the per-domain splits into one training set and one test set.
# The training set is saved in full; the test set is randomly downsampled.
TEST_SAMPLE_SIZE = 2000  # upper bound on the number of saved test examples

all_train_sets = []
all_test_sets = []

# NOTE(review): "occupation" is not in the domain list of the upsampling loop
# visible in this notebook, so data/occupation_train_large is presumably
# produced elsewhere -- confirm it exists before running this cell.
for domain in ["nobel_prize_winner", "verb", "physical_object", "occupation"]:
    all_train_sets.append(load_from_disk(f"data/{domain}_train_large"))
    all_test_sets.append(load_from_disk(f"data/{domain}_test"))

# The city split is used as-is, without a *_train_large copy.
all_train_sets.append(load_from_disk("data/city_train"))
all_test_sets.append(load_from_disk("data/city_test"))

train_dataset = concatenate_datasets(all_train_sets)
test_dataset = concatenate_datasets(all_test_sets)

train_dataset.save_to_disk("data/train_large")

# Randomly downsample the test set to at most TEST_SAMPLE_SIZE examples.
# Clamping to the available row count keeps select() from raising on an
# out-of-range index when the merged test set is smaller than the sample size.
test_dataset = test_dataset.shuffle(seed=42)
test_sample_size = min(TEST_SAMPLE_SIZE, len(test_dataset))
test_dataset = test_dataset.select(range(test_sample_size))
test_dataset.save_to_disk("data/test")

Saving the dataset (1/1 shards): 100%|██████████| 421092/421092 [00:00<00:00, 531942.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46629/46629 [00:00<00:00, 502659.33 examples/s]
