In [4]:
import json
import random
from pathlib import Path

# Vocabulary for the counting task: category name -> list of member words.
# Each category's words are kept in a single whitespace-separated string and
# split into a list, so the word order within a category is preserved.
CATEGORY_ITEMS = {
    "fruit": (
        "apple banana cherry grape orange "
        "pear peach mango tangerine plum"
    ).split(),
    "animal": (
        "dog cat horse cow sheep "
        "lion tiger bear rabbit fox"
    ).split(),
    "vehicle": (
        "car bus truck bicycle motorcycle "
        "train boat plane scooter van"
    ).split(),
    "instrument": (
        "guitar piano violin drum flute "
        "trumpet saxophone cello clarinet harp"
    ).split(),
    "furniture": (
        "chair table sofa bed desk "
        "cabinet dresser stool wardrobe bookshelf"
    ).split(),
}


# Every word from every category, flattened in category order into one new list.
GLOBAL_POOL = sum(CATEGORY_ITEMS.values(), [])


# Generation settings.
NUM_EXAMPLES = 5_000                   # number of prompts to generate
LIST_MIN_LEN, LIST_MAX_LEN = 6, 12     # inclusive bounds on words per list
SEED = 42                              # seed for the global `random` generator
OUTFILE = Path("dataset.json")         # output path, relative to the working dir

# Seed the module-level generator so the draws that follow are repeatable.
random.seed(SEED)

# Build the counting dataset and write it to OUTFILE as JSON.
#
# Each example picks a target category, draws a list of distinct words from
# GLOBAL_POOL, and records how many of those words belong to the target
# category. The draws use the module-level `random` generator.

# Fail fast on unusable length bounds instead of hitting a less obvious
# ValueError from random.sample / random.randrange part-way through the loop.
if not 1 <= LIST_MIN_LEN <= LIST_MAX_LEN <= len(GLOBAL_POOL):
    raise ValueError(
        f"Need 1 <= LIST_MIN_LEN <= LIST_MAX_LEN <= {len(GLOBAL_POOL)}, "
        f"got LIST_MIN_LEN={LIST_MIN_LEN}, LIST_MAX_LEN={LIST_MAX_LEN}"
    )

# Loop invariants: the list of category names and a set view of each category
# for membership tests. Hoisting them does not change which random draws are
# made, only avoids rebuilding them on every iteration.
category_names = list(CATEGORY_ITEMS)
category_sets = {name: frozenset(words) for name, words in CATEGORY_ITEMS.items()}

dataset = []

for _ in range(NUM_EXAMPLES):
    target_type = random.choice(category_names)
    target_pool = CATEGORY_ITEMS[target_type]   # list form, needed by random.choice
    target_set = category_sets[target_type]     # set form, for membership tests

    list_len = random.randint(LIST_MIN_LEN, LIST_MAX_LEN)
    sample = random.sample(GLOBAL_POOL, list_len)

    # Ensure at least one match exists: if the draw contains no word from the
    # target category, overwrite one random slot with a target word. The
    # sample held no target words, so this cannot introduce a duplicate.
    if target_set.isdisjoint(sample):
        sample[random.randrange(list_len)] = random.choice(target_pool)

    answer = sum(word in target_set for word in sample)

    # The prompt deliberately ends with an open parenthesis.
    prompt = (
        "Count the number of words in the following list that match the given type, "
        "and put the numerical answer in parentheses.\n"
        f"Type: {target_type}\n"
        f"List: [{', '.join(sample)}]\n"
        "Answer: ("
    )

    dataset.append({
        "prompt":      prompt,
        "answer":      answer,
        "type":        target_type,
        "list_length": list_len
    })

# Explicit encoding so the file does not depend on the platform default.
OUTFILE.write_text(json.dumps(dataset, indent=2), encoding="utf-8")
print(f"Wrote {len(dataset):,d} examples → {OUTFILE.resolve()}")

Wrote 5,000 examples → /net/scratch/slhleosun/counting_items/dataset.json


In [6]:
# Preview only the first few examples; echoing the whole list floods the notebook output.
dataset[:5]

[{'prompt': 'Count the number of words in the following list that match the given type, and put the numerical answer in parentheses.\nType: fruit\nList: [stool, bear, lion, sheep, tangerine, peach]\nAnswer: (',
  'answer': 2,
  'type': 'fruit',
  'list_length': 6},
 {'prompt': 'Count the number of words in the following list that match the given type, and put the numerical answer in parentheses.\nType: furniture\nList: [cello, plane, cherry, banana, bed, cow]\nAnswer: (',
  'answer': 1,
  'type': 'furniture',
  'list_length': 6},
 {'prompt': 'Count the number of words in the following list that match the given type, and put the numerical answer in parentheses.\nType: furniture\nList: [trumpet, horse, cabinet, table, desk, flute]\nAnswer: (',
  'answer': 3,
  'type': 'furniture',
  'list_length': 6},
 {'prompt': 'Count the number of words in the following list that match the given type, and put the numerical answer in parentheses.\nType: instrument\nList: [scooter, cello, bear, apple, d