# Profile

In [1]:
import re
import json
from docx import Document

def parse_tick_field(text):
    """
    Split a checkbox-style string into its options and the ticked ones.

    Every "☒" (ticked) or "☐" (unticked) marker starts a new option whose
    label runs up to the next marker or the end of the string.

    Returns a dict with two lists, both in order of appearance:
      "options"  - every label found, stripped of surrounding whitespace
      "selected" - only the labels that follow a "☒" marker
    Example: "☒ Female\t\t☐ Male" becomes:
      {"options": ["Female", "Male"], "selected": ["Female"]}
    """
    # One (marker, label) pair per option; a label is any run of non-marker characters.
    pairs = [
        (marker, label.strip())
        for marker, label in re.findall(r"(☒|☐)\s*([^☒☐]+)", text)
    ]
    return {
        "options": [label for _, label in pairs],
        "selected": [label for marker, label in pairs if marker == "☒"],
    }

def clean_text(text):
    """
    Collapse every run of whitespace (spaces, tabs, newlines) into a single
    space and drop leading/trailing whitespace.
    """
    # str.split() without a separator already treats tabs and newlines as whitespace.
    words = text.split()
    return " ".join(words)

def parse_docx_to_json(doc):
    """
    Parse the tables of a python-docx Document into a nested dict.

    Heuristics:
      - A table whose only non-empty row has a single cell is a section header;
        the fields of the following tables are filed under that title.
      - Every other table holds key/value rows: the first cell is the key and the
        remaining non-empty cells are the value (a single string, a list when
        several cells are filled, None when none is). Rows whose first cell is
        empty are skipped.
      - Cell text containing "☒"/"☐" is replaced by the dict returned by
        parse_tick_field (available options plus the selected ones).
      - If the same key appears several times within a section, its values are
        collected in a list holding exactly one entry per occurrence.

    Args:
        doc: object exposing ``tables`` -> ``rows`` -> ``cells`` -> ``text``,
            as a python-docx Document does.

    Returns:
        dict mapping section title -> {key: value}. Fields found before the
        first header are stored under "Default", which is always present.
    """
    result = {"Default": {}}  # fallback section if no header is found yet
    current_section = "Default"
    # (section, key) pairs whose stored value is a list of occurrences. Needed
    # because a single occurrence can itself be a list (multi-cell row), so the
    # stored value's type alone cannot tell the two cases apart.
    grouped_keys = set()

    for table in doc.tables:
        table_data = _non_empty_rows(table)

        # Header table: its single cell is non-empty, otherwise the row would
        # have been dropped as empty.
        if len(table_data) == 1 and len(table_data[0]) == 1:
            current_section = table_data[0][0]
            # setdefault keeps fields already collected if the title reappears.
            result.setdefault(current_section, {})
            continue

        section = result[current_section]
        for row in table_data:
            key = row[0]
            if not key:
                continue  # no key, nothing to file the values under

            value = _row_value(row[1:])
            marker = (current_section, key)
            if key not in section:
                section[key] = value
            elif marker in grouped_keys:
                section[key].append(value)
            else:
                # Second occurrence: wrap the first value, even if it is a list.
                section[key] = [section[key], value]
                grouped_keys.add(marker)

    return result


def _non_empty_rows(table):
    """Return the table's rows as lists of cleaned cell text, dropping all-empty rows."""
    rows = ([clean_text(cell.text) for cell in row.cells] for row in table.rows)
    return [cells for cells in rows if any(cells)]


def _row_value(cells):
    """Turn the non-key cells of a row into None, a single value, or a list of values."""
    values = [
        parse_tick_field(cell) if ("☒" in cell or "☐" in cell) else cell
        for cell in cells
        if cell
    ]
    if not values:
        return None
    return values[0] if len(values) == 1 else values

# Usage example: load "profile.docx" (path relative to the current working
# directory), parse its tables and pretty-print the resulting structure as JSON.
if __name__ == "__main__":
    doc = Document("profile.docx")
    parsed_data = parse_docx_to_json(doc)
    # ensure_ascii=False prints non-ASCII characters (accents, ☒/☐) as-is instead of \u escapes.
    print(json.dumps(parsed_data, indent=2, ensure_ascii=False))


{
  "Default": {},
  "Client Information": {
    "Last Name": "Vasconcelos",
    "First/ Middle Name (s)": "Joana Leonor",
    "Address": "Rua de Cedofeita 97, 0890-321 Ponta Delgada",
    "Country of Domicile": "Portugal",
    "Date of birth": "1961-07-24",
    "Nationality": "Portuguese",
    "Passport No/ Unique ID": "QG6601374",
    "ID Type": "passport",
    "ID Issue Date": "2024-06-21",
    "ID Expiry Date": "2029-06-20",
    "Gender": {
      "options": [
        "Female",
        "Male"
      ],
      "selected": [
        "Female"
      ]
    }
  },
  "Account Holder – Contact Management and Services – Contact Info": {
    "Communication Medium": "Telephone 932 718 896"
  },
  "Account Holder – Personal Info": {
    "Is the client or associated person a Politically Exposed Person as defined in the Client Acceptance Policy?": [
      {
        "options": [
          "No",
          "Yes"
        ],
        "selected": [
          "No"
        ]
      },
      "If ‘Yes’, please

In [10]:
import zipfile
from docx import Document
import os

def locate_profile(client_number: int) -> Document:
    """
    Load profile.docx for one client from that client's zip archive.

    The archive is expected at data/<folder>/client_<client_number>.zip, where
    <folder> names the segment the client falls into. Clients are grouped in
    cycles of 500, each cut into segments of 200, 200 and 100 clients:
      - Clients 1-200   -> folder "client_001_200"
      - Clients 201-400 -> folder "client_201_400"
      - Clients 401-500 -> folder "client_401_500"
      - Clients 501-700 -> folder "client_501_700"
      - Clients 701-900 -> folder "client_701_900"
      - Clients 901-1000-> folder "client_901_1000"
      ... and so on.

    Returns the Document() instance built from profile.docx.

    Raises:
        ValueError: client_number is smaller than 1.
        FileNotFoundError: the zip archive does not exist, or it contains no
            profile.docx.
    """
    if client_number < 1:
        raise ValueError("Client number must be a positive integer.")

    # Zero-based client index -> which 500-client cycle, and the position inside it (0 to 499).
    cycle, position = divmod(client_number - 1, 500)
    first_of_cycle = cycle * 500

    # Pick the segment of the cycle, expressed as 1-based bounds within the cycle.
    if position < 200:
        segment = (1, 200)
    elif position < 400:
        segment = (201, 400)
    else:
        segment = (401, 500)

    # Folder bounds are zero-padded to three digits; the zip file name is not padded.
    padded_bounds = [str(first_of_cycle + bound).zfill(3) for bound in segment]
    folder = "client_" + "_".join(padded_bounds)
    archive_path = os.path.join("data", folder, f"client_{client_number}.zip")

    if not os.path.exists(archive_path):
        raise FileNotFoundError(f"Could not find zip file: {archive_path}")

    with zipfile.ZipFile(archive_path, "r") as archive:
        if "profile.docx" not in archive.namelist():
            raise FileNotFoundError("profile.docx not found in the zip archive.")
        # The document is built while the archive member is still open.
        with archive.open("profile.docx") as profile_stream:
            return Document(profile_stream)


In [None]:
# Load the profile of client 900 from its zip archive, parse its tables and
# pretty-print the result (non-ASCII characters are printed as-is).
doc = locate_profile(900)
parsed_data = parse_docx_to_json(doc)
print(json.dumps(parsed_data, indent=2, ensure_ascii=False))

{
  "Default": {},
  "Client Information": {
    "Last Name": "Schneider",
    "First/ Middle Name (s)": "Simon Julian",
    "Address": "Deák Ferenc tér 44, 534 40 Győr",
    "Country of Domicile": "Hungary",
    "Date of birth": "1956-07-09",
    "Nationality": "German",
    "Passport No/ Unique ID": "ZE3638628",
    "ID Type": "passport",
    "ID Issue Date": "2018-03-24",
    "ID Expiry Date": "2028-03-23",
    "Gender": {
      "options": [
        "Female",
        "Male"
      ],
      "selected": [
        "Male"
      ]
    }
  },
  "Account Holder – Contact Management and Services – Contact Info": {
    "Communication Medium": "Telephone +36 38 496 1433"
  },
  "Account Holder – Personal Info": {
    "Is the client or associated person a Politically Exposed Person as defined in the Client Acceptance Policy?": [
      {
        "options": [
          "No",
          "Yes"
        ],
        "selected": [
          "No"
        ]
      },
      "If ‘Yes’, please complete the App

# Account

In [17]:
!pip install PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [29]:
import json
from io import BytesIO
from PyPDF2 import PdfReader

def extract_form_data(pdf_input):
    """
    Extract the interactive form (AcroForm) field values of a PDF as a dictionary.

    Parameters:
        pdf_input (str, path, bytes-like or file-like object): A PDF file path,
            an open binary stream, or the raw PDF data.

    Returns:
        dict: Maps each field name to the field's "/V" entry, or to an empty
            string when the field has no "/V" entry. An empty dictionary is
            returned (and a notice printed) when the reader reports no fields.
    """
    # Only raw data needs wrapping in a stream. Paths and open streams are
    # handed to PdfReader unchanged; wrapping them in BytesIO would raise TypeError.
    if isinstance(pdf_input, (bytes, bytearray, memoryview)):
        pdf_input = BytesIO(pdf_input)

    reader = PdfReader(pdf_input)

    # get_fields() yields nothing usable when the PDF has no AcroForm fields.
    fields = reader.get_fields()
    if not fields:
        print("No AcroForm fields found or the PDF is not a standard fillable form.")
        return {}

    # Keep only the value of each field; fields without a value map to "".
    return {field_name: field_info.get('/V', '') for field_name, field_info in fields.items()}

In [30]:
import zipfile
import os

def locate_account(client_number: int) -> bytes:
    """
    Return the raw bytes of account.pdf from one client's zip archive.

    The archive is looked up at data/<folder>/client_<client_number>.zip.
    Clients are grouped in cycles of 500, each cut into segments of 200, 200
    and 100 clients; <folder> is "client_<first>_<last>" for the segment the
    client belongs to, both bounds zero-padded to three digits
    (e.g. client 700 -> "client_501_700").

    Raises:
        ValueError: client_number is smaller than 1.
        FileNotFoundError: the zip archive does not exist, or it contains no
            account.pdf.
    """
    if client_number < 1:
        raise ValueError("Client number must be a positive integer.")

    # Zero-based client index -> which 500-client cycle, and the slot inside it (0 to 499).
    cycle_index, slot = divmod(client_number - 1, 500)
    cycle_base = cycle_index * 500

    # (exclusive slot limit, first client, last client) of each segment in a cycle.
    # The loop stops at the first segment containing the slot; the last one is the fallback.
    for slot_limit, first, last in ((200, 1, 200), (400, 201, 400), (500, 401, 500)):
        if slot < slot_limit:
            break

    # Folder bounds are zero-padded; the zip file name itself is not.
    folder = f"client_{str(cycle_base + first).zfill(3)}_{str(cycle_base + last).zfill(3)}"
    archive_path = os.path.join("data", folder, f"client_{client_number}.zip")

    if not os.path.exists(archive_path):
        raise FileNotFoundError(f"Could not find zip file: {archive_path}")

    with zipfile.ZipFile(archive_path, "r") as archive:
        if "account.pdf" not in archive.namelist():
            raise FileNotFoundError("account.pdf not found in the zip archive.")
        return archive.read("account.pdf")

In [31]:
# Read account.pdf of client 700 from its zip archive, extract its form
# fields and print them as indented JSON.
account_pdf_bytes = locate_account(700)
json_out = extract_form_data(account_pdf_bytes)
print(json.dumps(json_out, indent=4))

{
    "account_name": "Davide Leonardo Bruno",
    "account_holder_name": "Davide Leonardo",
    "account_holder_surname": "Marino",
    "passport_number": "HO7528324",
    "chf": "/Off",
    "eur": "/Yes",
    "usd": "/Off",
    "other_ccy": "",
    "building_number": "81",
    "postal_code": "3878 53",
    "city": "Maastricht",
    "country": "Netherlands",
    "name": "Davide Leonardo Bruno",
    "phone_number": "+31 06 26905184",
    "email": "davide.bruno@planet.nl",
    "street_name": "Rembrandtplein"
}
