In [3]:
import json
from pathlib import Path
from collections import Counter
from json import JSONDecodeError

# Combine up to 6 JSON files in this folder into combined.json
# Keep only: question, entities, sparql, id
# Rules:
#  - Include an item only if it has BOTH question and sparql.
#  - If entities is missing, add an empty list [].
#  - If id is missing, generate a new sequential string id.
#  - If item has only id (no question/sparql), skip it.
#  - Ignore all other keys.

# Inputs are read from, and the output is written to, the current working directory.
input_dir = Path.cwd()
output_file = input_dir.joinpath("combined.json")

# Candidate inputs: every *.json directly inside input_dir whose name does not
# begin with "combined" (case-insensitive). The prefix test also covers the
# output file itself, so a previous run's result is never read back in.
files = sorted(
    path
    for path in input_dir.glob("*.json")
    if not path.name.lower().startswith("combined")
)

def iter_items_from_file(path: Path):
    """Yield the top-level records stored in one JSON file.

    A top-level JSON array yields each of its elements in order (elements are
    not filtered by type); a top-level JSON object yields that single object.
    Any other top-level value (string, number, boolean, null) yields nothing.

    The file is decoded as ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark is stripped; with plain ``utf-8`` the mark stays in the text and
    ``json.loads`` rejects the whole file.

    Files that cannot be read, decoded, or parsed are reported with a
    ``Skipping ...`` line on stdout and yield nothing. An empty or
    whitespace-only file is skipped silently.

    Args:
        path: Location of the JSON file to read.

    Yields:
        The decoded records, as described above.
    """
    try:
        text = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeError) as e:
        # Missing/unreadable file, or bytes that are not valid UTF-8.
        print(f"Skipping {path.name}: {e}")
        return

    if not text.strip():
        return

    try:
        data = json.loads(text)
    except JSONDecodeError as e:
        print(f"Skipping {path.name}: JSON parse error -> {e}")
        return
    except RecursionError as e:
        # Pathologically deep nesting; keep the run going for the other files.
        print(f"Skipping {path.name}: {e}")
        return

    if isinstance(data, list):
        yield from data
    elif isinstance(data, dict):
        yield data
    # Any other top-level JSON type is unsupported and yields nothing.

# Accumulators: every dict record found, plus a tally of how many records
# carry each key (used only for the inspection summary).
all_raw_items = []
all_keys_counter = Counter()

# Only the first six discovered files are read.
for json_path in files[:6]:
    # Non-dict entries (numbers, strings, nested lists, ...) are dropped here.
    records = [
        entry for entry in iter_items_from_file(json_path) if isinstance(entry, dict)
    ]
    all_raw_items.extend(records)
    # Feed the Counter key names, not the dicts themselves: updating a Counter
    # with a mapping would add the mapping's values as counts.
    all_keys_counter.update(key for record in records for key in record)

# Output records, the ids already assigned to them, and the counter that
# backs generated ids (the first generated id is "1").
combined = []
seen_ids = set()
next_id = 1


def get_next_id():
    """Return the next sequential id as a string and advance the counter.

    Reads and increments the module-level ``next_id``; successive calls
    return "1", "2", "3", ...
    """
    global next_id
    issued, next_id = str(next_id), next_id + 1
    return issued

# Project every raw record down to {id, question, entities, sparql}.
for obj in all_raw_items:
    # Must have both question and sparql (non-empty) to be included; this
    # also drops records that carry only an id.
    q = obj.get("question")
    s = obj.get("sparql")
    if not (q and s):
        continue

    # Normalise the id to a string. Numeric ids are stringified; string ids
    # are kept as-is. Anything else (None, list, dict) is treated as missing:
    # lists and dicts are unhashable and would make the set lookup below raise.
    _id = obj.get("id")
    if isinstance(_id, (int, float)):
        _id = str(_id)
    elif not isinstance(_id, str):
        _id = None

    # Generate an id when it is missing/empty or already used. This is a loop
    # rather than a single call because a generated id can itself collide with
    # an id that an earlier record brought along (e.g. an input id "1").
    while not _id or _id in seen_ids:
        _id = get_next_id()
    seen_ids.add(_id)

    # Entities: ensure the key exists; empty list if missing or null.
    entities = obj.get("entities")
    if entities is None:
        entities = []

    # Build the minimal projected object; all other input keys are ignored.
    combined.append({
        "id": _id,
        "question": q,
        "entities": entities,
        "sparql": s,
    })

# Persist the combined records as indented UTF-8 JSON, keeping non-ASCII
# characters literal instead of \u-escaping them.
output_file.parent.mkdir(parents=True, exist_ok=True)
serialized = json.dumps(combined, ensure_ascii=False, indent=2)
output_file.write_text(serialized, encoding="utf-8")

# Inspection summary: which files were read, which keys appeared in the
# input records, and how many records were written.
scanned_names = [path.name for path in files[:6]]
detected_keys = sorted(all_keys_counter)
print(f"Scanned files: {scanned_names}")
print(f"Unique input keys detected: {detected_keys}")
print(f"Combined {len(combined)} items into {output_file}")

Scanned files: ['bldg2.json', 'bldg3.json', 'dataset2.json', 'raw_merged_extended_datasets.json', 'raw_merged_schema_datasets.json', 'training_data.json']
Unique input keys detected: ['ID', '_meta', 'category', 'en', 'entities', 'entity', 'explanation', 'id', 'notes', 'question', 'record_id', 'response', 'source_building', 'sparql']
Combined 141494 items into c:\Users\suhas\Documents\GitHub\OntoBot\Assets\combined.json
