In [None]:
import pandas as pd

from dapla_pseudo import Depseudonymize

# Locations of the example datasets used by the cells in this notebook.
JSON_FILE = "data/personer_pseudonymized.json"
CSV_FILE = "data/personer_pseudonymized.csv"

# Load the JSON dataset with an explicit dtype per column: the text columns
# use the pandas "string" dtype and "kjonn" is read as a categorical.
df = pd.read_json(
    JSON_FILE,
    dtype=dict(
        fnr="string",
        fornavn="string",
        etternavn="string",
        kjonn="category",
        fodselsdato="string",
    ),
)
# Preview the first rows as the cell output.
df.head()

# Case: Single field, default encryption (DAEAD)

In [None]:
# Depseudonymize the "fnr" column of the DataFrame loaded above using the
# default encryption; the trailing comments name each step of the chain.
result = (
    Depseudonymize               ### GENERAL WORKFLOW ###
    .from_pandas(df)             # Select dataset
    .on_fields("fnr")            # Select fields in dataset
    .with_default_encryption()   # Select encryption method on fields
    .run()                       # Apply depseudonymization
)
result.to_polars().head()        # Convert result to Polars DataFrame

# Case: Single field, default encryption from file

In [None]:
# Read from a bucket and depseudonymize the "fnr" column
result = (
    Depseudonymize
    .from_file("gs://path/to/bucket/file.json")
    .on_fields("fnr")
    .with_default_encryption()
    .run()
)
# Write to bucket. The target path deliberately differs from the path that
# was read, so the example does not write its output over its own input.
result.to_file("gs://path/to/bucket/file_depseudonymized.json")

# Read from local filesystem and depseudonymize the "fnr" column
result = (
    Depseudonymize
    .from_file(JSON_FILE)
    .on_fields("fnr")
    .with_default_encryption()
    .run()
)
# Write to local filesystem (placeholder path - replace with a real location)
result.to_file("/not/a/real/path.json")

# Case: Single field, FPE encryption (used e.g. for existing stable ID/snr/sid)

In [None]:
# Depseudonymize the "fnr" column with the PAPIS-compatible encryption
# instead of the default one.
result = (
    Depseudonymize
    .from_pandas(df)
    .on_fields("fnr")
    .with_papis_compatible_encryption()
    .run()
)
# Show the first rows of the result as a Polars DataFrame.
result.to_polars().head()

# Case: Multiple fields, default encryption (DAEAD)

In [None]:
# Depseudonymize several columns in one run by passing every field name
# to "on_fields()".
result = (
    Depseudonymize
    .from_pandas(df)
    .on_fields("fornavn", "etternavn", "fodselsdato")
    .with_default_encryption()
    .run()
)
# Show the first rows of the result as a Polars DataFrame.
result.to_polars().head()

# Case: Hierarchical dataset field selection
When working with hierarchical datasets, we sometimes need more granular control for selecting fields.

The `on_fields()` method supports glob syntax *if and only if* the dataset is read directly from a file.

In [None]:

# JSON_HIERARCHICAL follows the structure of the following JSON:
"""{
    "person_info": {
        "fnr": "11854898347",
        "fornavn": "Donald",
        "etternavn": "Duck"
    },
    "kjonn": "M",
    "fodselsdato": "020995"
}"""
JSON_HIERARCHICAL = "data/personer_hierarchical.json"

# NOTE: Only showing the first methods until "on_fields()" - the rest of the code is the same as in other examples.
# Each `result` below is therefore whatever "on_fields()" returns, not the output of ".run()".
result = (
    Depseudonymize
    .from_file(JSON_HIERARCHICAL)
    .on_fields("fnr")             # -> Select all nested fields that match the name "fnr"
)

result = (
    Depseudonymize
    .from_file(JSON_HIERARCHICAL)
    .on_fields("person_info/fnr") # -> Select the nested field "fnr" inside any match of the field "person_info"
)

result = (
    Depseudonymize
    .from_file(JSON_HIERARCHICAL)
    .on_fields("p*/fnr")          # -> Select the nested field "fnr" inside a match of any field that starts with 'p'
)

result = (
    Depseudonymize
    .from_file(JSON_HIERARCHICAL)
    .on_fields("person_info/f*")  # -> Select any field that starts with "f" inside the field "person_info"
)