# In [8]:
import os
import json


def _extract_question_fields(data):
    """Pull the three exported fields out of one parsed JSON document.

    Reads the first element of ``data["question_info"]`` and of
    ``data["OCR_info"]``. A field that is absent from those elements is
    reported as ``"N/A"``.

    Args:
        data: Parsed JSON object containing ``question_info`` and
            ``OCR_info`` entries.

    Returns:
        A dict with the keys ``question_topic_name``, ``question_sector2``
        and ``question_text``.

    Raises:
        KeyError, IndexError, TypeError, AttributeError: If either entry is
            missing, empty, not indexable, or its first item has no ``get``.
    """
    question_info = data["question_info"][0]
    ocr_info = data["OCR_info"][0]
    return {
        "question_topic_name": question_info.get("question_topic_name", "N/A"),
        "question_sector2": question_info.get("question_sector2", "N/A"),
        "question_text": ocr_info.get("question_text", "N/A"),
    }


def process_json_files(input_root, output_root):
    """Extract selected fields from every ``.json`` file under *input_root*.

    The directory tree below *input_root* is walked recursively. For each
    usable JSON file a reduced JSON file with the same name is written to the
    mirrored location below *output_root*. Files that are empty, undecodable,
    or that lack the expected structure are reported on stdout and skipped,
    so one bad file never aborts the whole run.

    Args:
        input_root: Directory containing the source JSON files.
        output_root: Directory that receives the reduced JSON files; it is
            created if it does not exist.

    Returns:
        None.
    """
    # Ensure the output root directory exists
    os.makedirs(output_root, exist_ok=True)

    # os.walk silently yields nothing for a missing directory; say so
    # explicitly instead of finishing without any output.
    if not os.path.isdir(input_root):
        print(f"Input directory not found: {input_root}")
        return

    # Iterate over all directories and subdirectories
    for subdir, _, files in os.walk(input_root):
        for file in files:
            # Process only JSON files
            if not file.endswith(".json"):
                continue

            input_file_path = os.path.join(subdir, file)

            # Check if the file is empty
            if os.path.getsize(input_file_path) == 0:
                print(f"Skipping empty file: {input_file_path}")
                continue

            # Load JSON file ("utf-8-sig" transparently strips a leading BOM)
            try:
                with open(input_file_path, "r", encoding="utf-8-sig") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Error decoding JSON file {input_file_path}: {e}")
                continue

            # Check for required keys; a non-object top-level value (list,
            # number, string, ...) cannot carry them at all.
            if (
                not isinstance(data, dict)
                or "question_info" not in data
                or "OCR_info" not in data
            ):
                print(f"Missing required keys in JSON file {input_file_path}")
                continue

            # Extract required fields; malformed entries (null, empty list,
            # non-object items) are reported instead of crashing the batch.
            try:
                extracted_data = _extract_question_fields(data)
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                print(f"Error accessing keys in JSON file {input_file_path}: {e}")
                continue

            # Create the mirrored directory only once there is something to
            # write, so skipped files do not leave empty directories behind.
            relative_path = os.path.relpath(subdir, input_root)
            output_dir = os.path.join(output_root, relative_path)
            os.makedirs(output_dir, exist_ok=True)
            output_file_path = os.path.join(output_dir, file)

            # Save the extracted data to a new JSON file
            with open(output_file_path, "w", encoding="utf-8") as f:
                json.dump(extracted_data, f, ensure_ascii=False, indent=4)


# Define input and output directories.
# NOTE: these are machine-specific absolute paths; adjust them before running
# on another machine.
input_root = "D:/programming/python/chunjae/finalproject/images/validation/y"
output_root = "D:/programming/python/chunjae/finalproject/images/processed_data/validation_st"

# Process files only when executed directly (as a script or a notebook cell),
# so that merely importing this module does not start walking the filesystem.
if __name__ == "__main__":
    process_json_files(input_root, output_root)