In [None]:
#!/usr/bin/env python3
import json
import random
from collections import defaultdict

# Pipeline configuration; these module-level names are read by main().
INFILE  = "rush.txt"                            # input text file, parsed line by line with parse_line()
OUTFILE = "rush_no_wall_1000_balanced.json"     # path the selected puzzles are written to as JSON
EXIT    = [3, 6]                                # value of the "exit" field of every puzzle; 1-indexed [row, col]
SEED    = 42                                    # seed for the global `random` generator
TARGET  = 1000                                  # number of puzzles requested from allocate_balanced()

# Seed the global RNG so the sampling in allocate_balanced() starts from a fixed state.
random.seed(SEED)

In [None]:
def parse_line(line: str):
    """Split one raw input line into a ``(moves, board)`` pair.

    The line is split on whitespace. The first token must be an integer move
    count, the last token is discarded, and every token in between is
    concatenated to form the board string, which must be exactly 36
    characters long.

    Returns:
        ``(moves, board)`` on success, or ``None`` when the line has fewer
        than three tokens, the first token is not an integer, or the board
        is not 36 characters long.
    """
    tokens = line.strip().split()
    if len(tokens) < 3:
        return None

    # First token -> move count, last token -> ignored, the rest -> board.
    head, *middle, _ignored = tokens
    try:
        move_count = int(head)
    except ValueError:
        return None

    layout = "".join(middle)
    return (move_count, layout) if len(layout) == 36 else None

def to_cell(c: str):
    """Convert one board character into its matrix cell value.

    The characters 'o', '.' and 'x' map to ``None``; any other character is
    returned unchanged. (Boards containing 'x' are skipped in main() before
    they get here, so the 'x' case is only a fallback.)
    """
    maps_to_null = c == "o" or c == "." or c == "x"
    return None if maps_to_null else c

def board_to_matrix(board: str):
    """Reshape a flat board string into a 6x6 list of rows.

    Character ``r*6 + c`` of ``board`` becomes the cell at row ``r``,
    column ``c``, after being passed through to_cell(). A board shorter than
    36 characters raises ``IndexError``; characters past index 35 are ignored.
    """
    grid = []
    for row in range(6):
        offset = row * 6
        cells = []
        for col in range(6):
            # Index (not slice) so a too-short board still fails loudly.
            cells.append(to_cell(board[offset + col]))
        grid.append(cells)
    return grid

def allocate_balanced(buckets, target):
    """Pick up to ``target`` items, spread as evenly as possible over buckets.

    Args:
        buckets: mapping of sortable key -> sequence of items (for example a
            dict or defaultdict of lists). It is not modified.
        target: total number of items wanted.

    Returns:
        A flat list of the chosen items, grouped by bucket key in ascending
        key order. Its length is ``target`` whenever the buckets hold at
        least that many items in total, and otherwise the total number of
        items available. An empty mapping yields ``[]``.

    Selection works in two passes:
      1. Every bucket gets a quota of ``target // k`` items; the first
         ``target % k`` keys (in sorted order) get one extra. Each bucket
         contributes a random sample of ``min(quota, size)`` items.
      2. Whatever the undersized buckets could not supply is handed out
         round-robin, one item per bucket per round in key order, drawing
         randomly from the items the first pass did not pick.

    Items are tracked by their position in the bucket, not by value, so
    buckets containing equal (or unhashable) items are handled correctly.
    Uses the module-level ``random`` generator.
    """
    keys = sorted(buckets.keys())
    if not keys:
        return []

    base, rem = divmod(target, len(keys))
    quota = {m: base + (1 if i < rem else 0) for i, m in enumerate(keys)}

    # First pass: sample *positions* within each bucket, capped by availability.
    picked_idx = {}
    leftover = 0
    for m in keys:
        want = quota[m]
        have = len(buckets[m])
        take = min(want, have)
        picked_idx[m] = random.sample(range(have), take) if take > 0 else []
        if have < want:
            leftover += want - have

    selected = {m: [buckets[m][j] for j in picked_idx[m]] for m in keys}

    if leftover > 0:
        # Second pass: collect the not-yet-picked items of every bucket and
        # shuffle them so that pop() below draws a random item instead of
        # always taking the last one in input order.
        spare = {}
        for m in keys:
            taken = set(picked_idx[m])
            rest = [item for j, item in enumerate(buckets[m]) if j not in taken]
            random.shuffle(rest)
            spare[m] = rest

        added = 0
        progressed = True
        # Stop when the shortfall is covered or no bucket has anything left.
        while added < leftover and progressed:
            progressed = False
            for m in keys:
                if not spare[m]:
                    continue
                selected[m].append(spare[m].pop())
                added += 1
                progressed = True
                if added >= leftover:
                    break

    # The first pass never exceeds the quotas and the second pass adds at
    # most the shortfall, so the result cannot exceed ``target``.
    out = []
    for m in keys:
        out.extend(selected[m])
    return out

def main():
    """Build the balanced, wall-free puzzle subset and write it to OUTFILE.

    Steps:
      1. Read INFILE line by line; keep the lines parse_line() accepts whose
         board contains no 'x' (wall) character.
      2. Group the kept puzzles by move count and print a summary.
      3. Ask allocate_balanced() for TARGET puzzles spread over move counts.
      4. Write the selection to OUTFILE as a single-line JSON array of
         objects with the keys "name", "exit" and "board".

    Reads the module-level INFILE, OUTFILE, EXIT and TARGET. Returns None.
    """
    # Read, filter out boards with walls, and bucket by move count.
    # Each entry is (line_number, moves, board); line numbers start at 1.
    by_moves = defaultdict(list)
    wall_free_count = 0
    # errors="ignore": undecodable bytes are dropped instead of raising.
    with open(INFILE, "r", encoding="utf-8", errors="ignore") as f:
        for idx, line in enumerate(f, start=1):
            parsed = parse_line(line)
            if not parsed:
                continue
            moves, board = parsed
            if "x" in board:  # exclude any puzzle with walls
                continue
            by_moves[moves].append((idx, moves, board))
            wall_free_count += 1

    # Print summary
    distinct_moves = sorted(by_moves.keys())
    print(f"Wall-free puzzles: {wall_free_count}")
    print(f"Distinct move counts (wall-free): {len(distinct_moves)}")
    for m in distinct_moves:
        print(f"  moves={m}: {len(by_moves[m])} puzzles")

    # Select puzzles balanced across move counts (only wall-free)
    chosen = allocate_balanced(by_moves, TARGET)
    print(f"Selected {len(chosen)} puzzles for output (target={TARGET}).")

    # Build the output records
    puzzles = [
        {
            "name": idx,            # keep original line index as name
            "exit": EXIT,           # 1-indexed [row, col]
            "board": board_to_matrix(board),
            # If you later want to include moves/cluster, add fields here
        }
        for idx, _moves, board in chosen
    ]

    # Inline JSON with exactly one space after each comma. json.dumps already
    # does this with these separators; a textual replace of "," on the result
    # would double the space and also touch commas inside string values.
    json_str = json.dumps(puzzles, ensure_ascii=False, separators=(", ", ": "))

    with open(OUTFILE, "w", encoding="utf-8") as f:
        f.write(json_str)

    print(f"Wrote balanced wall-free dataset to {OUTFILE}")

In [10]:
# Run the pipeline: read INFILE, print the per-move-count summary, write OUTFILE.
main()

Wall-free puzzles: 476118
Distinct move counts (wall-free): 51
  moves=1: 1 puzzles
  moves=2: 8 puzzles
  moves=3: 128 puzzles
  moves=4: 767 puzzles
  moves=5: 3561 puzzles
  moves=6: 10629 puzzles
  moves=7: 21266 puzzles
  moves=8: 34032 puzzles
  moves=9: 48301 puzzles
  moves=10: 59777 puzzles
  moves=11: 61984 puzzles
  moves=12: 53381 puzzles
  moves=13: 41957 puzzles
  moves=14: 31534 puzzles
  moves=15: 24407 puzzles
  moves=16: 19192 puzzles
  moves=17: 14696 puzzles
  moves=18: 11643 puzzles
  moves=19: 9086 puzzles
  moves=20: 7151 puzzles
  moves=21: 5584 puzzles
  moves=22: 4212 puzzles
  moves=23: 3280 puzzles
  moves=24: 2370 puzzles
  moves=25: 1792 puzzles
  moves=26: 1323 puzzles
  moves=27: 995 puzzles
  moves=28: 748 puzzles
  moves=29: 586 puzzles
  moves=30: 391 puzzles
  moves=31: 316 puzzles
  moves=32: 224 puzzles
  moves=33: 176 puzzles
  moves=34: 126 puzzles
  moves=35: 114 puzzles
  moves=36: 84 puzzles
  moves=37: 70 puzzles
  moves=38: 47 puzzles
  move