In [19]:
import re
import json
from docx import Document

def parse_tick_field(text):
    """
    Parse a checkbox-style string into its options and the ticked ones.

    The text is expected to use "☒" for a ticked box and "☐" for an empty
    box, each followed by the option label.
    Example: "☒ Female\t\t☐ Male" becomes:
      {"options": ["Female", "Male"], "selected": ["Female"]}

    Any text before the first marker is ignored. A marker whose label is
    empty or whitespace-only is skipped, so "options" never contains "".

    Returns a dict with "options" (every labelled option, in order of
    appearance) and "selected" (the subset marked with "☒").
    """
    # Splitting on a *captured* marker keeps the markers in the result:
    # [prefix, marker, label, marker, label, ...]
    pieces = re.split(r"([☒☐])", text)
    options = []
    selected = []
    for mark, label in zip(pieces[1::2], pieces[2::2]):
        label = label.strip()
        if not label:
            # A box with no label carries no usable information.
            continue
        options.append(label)
        if mark == "☒":
            selected.append(label)
    return {"options": options, "selected": selected}

def clean_text(text):
    """
    Normalise whitespace in a string.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.
    """
    # str.split() with no separator already treats tabs and newlines as
    # whitespace and discards empty tokens.
    words = text.split()
    return " ".join(words)

def _row_value(cells):
    """
    Collapse the non-key cells of a row into one value.

    Empty cells are dropped and cells containing tick markers are parsed with
    parse_tick_field. Returns None when nothing is left, the single remaining
    item when there is exactly one, otherwise the list of items.
    """
    items = [
        parse_tick_field(cell) if ("☒" in cell or "☐" in cell) else cell
        for cell in cells
        if cell
    ]
    if not items:
        return None
    return items[0] if len(items) == 1 else items


def parse_docx_to_json(doc):
    """
    Takes a python-docx Document and parses its tables into meaningful JSON.

    Heuristics:
      - A table that, once empty rows are dropped, consists of one row with one
        cell is used as a section header.
      - Other tables are assumed to contain key/value fields: the first cell of
        each row is the key, the remaining non-empty cells form the value.
      - If a cell's text includes tick characters, we parse that cell to return
        both the available options and the ones selected.
      - If the same key appears multiple times within a section, the values are
        grouped in a list with one entry per occurrence. A row that itself has
        several value cells stays a single (nested) entry, so occurrences and
        multi-cell rows remain distinguishable.
      - If a section title appears more than once, later tables are added to
        the existing section rather than replacing it.

    Returns a dict mapping section title -> {key: value}. Fields found before
    the first header are stored under "Default", which is always present.
    """
    current_section = "Default"  # fallback if no header is found yet
    result = {current_section: {}}
    # (section, key) pairs whose stored value is already a per-occurrence list.
    # Needed because a first value may itself be a list (multi-cell row).
    repeated = set()

    for table in doc.tables:
        # Convert table rows into lists of cleaned cell text, dropping rows
        # in which every cell is empty.
        table_data = []
        for row in table.rows:
            row_values = [clean_text(cell.text) for cell in row.cells]
            if any(row_values):
                table_data.append(row_values)

        # Heuristic: a single-cell table is a section header. The cell is
        # guaranteed non-empty here because empty rows were filtered out.
        if len(table_data) == 1 and len(table_data[0]) == 1:
            current_section = table_data[0][0]
            # setdefault keeps fields collected under an earlier header with
            # the same title instead of discarding them.
            result.setdefault(current_section, {})
            continue

        # Otherwise, we treat this table as data for the current section.
        section = result[current_section]
        for row in table_data:
            key = row[0]
            if not key:
                continue  # a row without a key cannot be stored

            value = _row_value(row[1:])

            if key not in section:
                section[key] = value
            elif (current_section, key) in repeated:
                section[key].append(value)
            else:
                # First repeat: wrap the earlier value whatever its type.
                section[key] = [section[key], value]
                repeated.add((current_section, key))

    return result

# Usage example: parse "profile.docx" (looked up relative to the current
# working directory) and print the resulting structure as indented JSON.
if __name__ == "__main__":
    document = Document("profile.docx")
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    rendered = json.dumps(
        parse_docx_to_json(document),
        indent=2,
        ensure_ascii=False,
    )
    print(rendered)


{
  "Default": {},
  "Client Information": {
    "Last Name": "Vasconcelos",
    "First/ Middle Name (s)": "Joana Leonor",
    "Address": "Rua de Cedofeita 97, 0890-321 Ponta Delgada",
    "Country of Domicile": "Portugal",
    "Date of birth": "1961-07-24",
    "Nationality": "Portuguese",
    "Passport No/ Unique ID": "QG6601374",
    "ID Type": "passport",
    "ID Issue Date": "2024-06-21",
    "ID Expiry Date": "2029-06-20",
    "Gender": {
      "options": [
        "Female",
        "Male"
      ],
      "selected": [
        "Female"
      ]
    }
  },
  "Account Holder – Contact Management and Services – Contact Info": {
    "Communication Medium": "Telephone 932 718 896"
  },
  "Account Holder – Personal Info": {
    "Is the client or associated person a Politically Exposed Person as defined in the Client Acceptance Policy?": [
      {
        "options": [
          "No",
          "Yes"
        ],
        "selected": [
          "No"
        ]
      },
      "If ‘Yes’, please