In [None]:
from io import StringIO
from pathlib import Path

from dapla_pseudo import PseudoData
import pandas as pd

# Locations of the example datasets (relative paths).
JSON_FILE = "data/personer.json"
CSV_FILE = "data/personer.csv"

# Load the JSON example into a pandas DataFrame with explicit dtypes:
# every column is read as a string, except "kjonn" which is categorical.
df = pd.read_json(
    JSON_FILE,
    dtype={
        "fnr": "string", "fornavn": "string", "etternavn": "string",
        "kjonn": "category",
        "fodselsdato": "string",
    },
)
# Preview the first rows of the loaded data.
df.head()

# Case: Single field, default encryption (DAEAD)

In [None]:
# General workflow: pick the dataset, pick the fields, pick the encryption
# method for those fields, then run the pseudonymization.
result = (
    PseudoData.from_pandas(df)    # 1. dataset
    .on_fields("fnr")             # 2. field(s) within the dataset
    .with_default_encryption()    # 3. encryption method for the selected field(s)
    .pseudonymize()               # 4. apply the pseudonymization
)
# Show the first rows of the result as a Polars DataFrame.
result.to_polars().head()

# Case: Single field, default encryption from file

In [None]:
# Pseudonymizing straight from a file is only recommended when the data *cannot* be
# read into a DataFrame - typically because the dataset is hierarchical and reading
# from file is the only option.

# --- Bucket: read, pseudonymize "fnr", write ---
result = (
    PseudoData.from_file("gs://path/to/bucket/file.json")
    .on_fields("fnr")
    .with_default_encryption()
    .pseudonymize()
)
# NOTE(review): the output path equals the input path - confirm that overwriting is intended.
result.to_file("gs://path/to/bucket/file.json")

# --- Local filesystem: read, pseudonymize "fnr", write ---
result = (
    PseudoData.from_file(JSON_FILE)
    .on_fields("fnr")
    .with_default_encryption()
    .pseudonymize()
)
result.to_file("/not/a/real/path.json")

# Case: Single field, Stable ID mapping

In [None]:
# Apply the stable ID mapping to the single field "fnr".
result = PseudoData.from_pandas(df).on_fields("fnr").with_stable_id().pseudonymize()

# Preview the result as a Polars DataFrame.
result.to_polars().head()

# Case: Single field, FPE encryption (used for e.g. existing stable ID/snr/sid)

In [None]:
# Pseudonymize the single field "fnr" with the PAPIS-compatible encryption.
result = (
    PseudoData
    .from_pandas(df)
    .on_fields("fnr")
    .with_papis_compatible_encryption()
    .pseudonymize()
)
# Preview the result as a Polars DataFrame.
result.to_polars().head()

# Case: Multiple fields, default encryption (DAEAD)

In [None]:
# Several fields can be selected in a single "on_fields()" call;
# the default encryption is then applied to all of them.
result = (
    PseudoData
    .from_pandas(df)
    .on_fields("fornavn", "etternavn", "fodselsdato")
    .with_default_encryption()
    .pseudonymize()
)
# Preview the result as a Polars DataFrame.
result.to_polars().head()

# Case: Chaining calls
Calls may simply be chained together to apply different pseudonymization to different fields.

In [None]:
# Each "with_*" call applies only to the fields selected by the "on_fields()"
# call directly before it.
result = (
    PseudoData
    .from_pandas(df)
    # Stable ID mapping for "fnr" only
    .on_fields("fnr")
    .with_stable_id()
    # Default encryption for "fornavn", "etternavn" and "fodselsdato" only
    .on_fields("fornavn", "etternavn", "fodselsdato")
    .with_default_encryption()
    .pseudonymize()
)
# Preview the result as a Polars DataFrame.
result.to_polars().head()

# Case: Hierarchical dataset field selection
When working with hierarchical datasets, we sometimes need more granular control for selecting fields.

The "on_fields()" method supports glob syntax *if and only if* the data is read directly from a file.

In [None]:
import json

# JSON_HIERARCHICAL follows the structure of the following JSON:
#
# {
#     "person_info": {
#         "fnr": "11854898347",
#         "fornavn": "Donald",
#         "etternavn": "Duck"
#     },
#     "kjonn": "M",
#     "fodselsdato": "020995"
# }
JSON_HIERARCHICAL = "data/personer_hierarchical.json"

# Only the calls up to and including "on_fields()" are shown here -
# the rest of the chain is the same as in the other examples.

# Select all nested fields that match the name "fnr"
result = (
    PseudoData.from_file(JSON_HIERARCHICAL)
    .on_fields("fnr")
)

# Select the nested field "fnr" inside any match of the field "person_info"
result = (
    PseudoData.from_file(JSON_HIERARCHICAL)
    .on_fields("person_info/fnr")
)

# Select the nested field "fnr" inside a match of any field that starts with "p"
result = (
    PseudoData.from_file(JSON_HIERARCHICAL)
    .on_fields("p*/fnr")
)

# Select any field that starts with "f" inside the field "person_info"
result = (
    PseudoData.from_file(JSON_HIERARCHICAL)
    .on_fields("person_info/f*")
)

# Case: Single field SID mapping with specific SID version
The SID version is deduced from the date supplied as `sid_snapshot_date`.

In [None]:
from datetime import date

# With date string (YYYY-MM-DD)
result = (
    PseudoData.from_pandas(df)
    .on_fields("fnr")
    .with_stable_id(sid_snapshot_date="2023-07-01")
    .pseudonymize()
)
# Jupyter only renders the *last* expression of a cell, so this intermediate
# preview has to be displayed explicitly ("display" is provided by the IPython kernel).
display(result.to_polars().head())

# With date type
result = (
    PseudoData.from_pandas(df)
    .on_fields("fnr")
    .with_stable_id(sid_snapshot_date=date.today())
    .pseudonymize()
)
result.to_polars().head()

# Case: Validate field SID mapping
Validate that all values in a column have valid SIDs

In [None]:
from dapla_pseudo import Validator

# Validate the stable ID mapping for the values in the "fnr" column.
result = Validator.from_pandas(df).on_field("fnr").validate_map_to_stable_id()

# Preview the validation result as a Polars DataFrame.
result.to_polars().head()