## Loader - Benchmark

In [1]:
from petsard.loader import Loader

load = Loader(filepath="benchmark://adult-income")
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.schema.schema_types.SchemaMetadata'>


# Loader - Load from csv

In [2]:
load = Loader(filepath="benchmark/adult-income.csv")
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.schema.schema_types.SchemaMetadata'>


# Just simplify

In [3]:
data = data.loc[0:99, :]
data.shape

(100, 15)

# Splitter

In [4]:
from petsard.loader import Splitter

split = Splitter(num_samples=5, train_split_ratio=0.8)
split_data, split_meta, exist_index = split.split(data=data)

train_data = split_data[1]["train"]  # 第一次分割的訓練集
test_data = split_data[1]["validation"]  # 第一次分割的測試集

print(f"train_data: {type(train_data)}, test_data: {type(test_data)}")
print(f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}")

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (80, 15), test_data shape: (20, 15)


## Splitter - custom_data

In [5]:
split = Splitter(
    method="custom_data",
    filepath={
        "ori": "benchmark://adult-income_ori",
        "control": "benchmark://adult-income_control",
    },
)
split_data, split_meta, exist_index = split.split()

train_data = split_data[1]["train"]  # 第一次分割的訓練集
test_data = split_data[1]["validation"]  # 第一次分割的測試集

print(f"train_data: {type(train_data)}, test_data: {type(test_data)}")
print(f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}")

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (151, 15), test_data shape: (9769, 15)


# Processor (Pre-proc)

In [6]:
from petsard.processor import Processor

default_proc = Processor(
    metadata=meta,
)

default_proc.fit(data=train_data)

default_preproc_data = default_proc.transform(data=train_data)
print(f"default_preproc_data shape: {default_preproc_data.shape}")

default_preproc_data shape: (136, 15)


## Processor - Missing only

In [7]:
proc = Processor(
    metadata=meta,
    config={
        "missing": {
            "age": "missing_mean",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["missing"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (150, 15)


0    25
1    38
2    28
3    44
4    18
5    34
6    29
7    24
8    55
9    36
Name: age, dtype: int8

## Processor - Outlier only

In [8]:
proc = Processor(
    metadata=meta,
    config={
        "outlier": {
            "age": "outlier_zscore",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["outlier"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (137, 15)


0    25
1    38
2    28
3    44
4    18
5    34
6    29
7    24
8    36
9    26
Name: age, dtype: int8

## Processor - Encoder only

In [9]:
proc = Processor(
    metadata=meta,
    config={
        "encoder": {
            "workclass": "encoder_onehot",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["encoder"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data.loc[
    :, [col for col in preproc_data.columns if col.startswith("workclass_")]
].head(10)

preproc_data shape: (151, 20)


Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0


## Processor - Scaler only

In [10]:
proc = Processor(
    metadata=meta,
    config={
        "scaler": {
            "age": "scaler_minmax",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["scaler"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (151, 15)


0    0.145455
1    0.381818
2    0.200000
3    0.490909
4    0.018182
5    0.309091
6    0.218182
7    0.127273
8    0.690909
9    0.345455
Name: age, dtype: float64

# Synthesizer

In [11]:
from petsard.synthesizer import Synthesizer

syn = Synthesizer(
    method="default",
)

syn.create(metadata=meta)
syn_data = syn.fit_sample(data=default_preproc_data)
print(f"syn_data shape: {syn_data.shape}")

syn_data shape: (136, 15)


# Processor (Post-proc)

In [12]:
postproc_data = default_proc.inverse_transform(data=syn_data)
print(f"postproc_data shape: {postproc_data.shape}")

postproc_data shape: (136, 15)


# Constrainer

In [13]:
from petsard.constrainer import Constrainer

config = {}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

cnst_data shape: (136, 15)
satisfy_data shape: (136, 15)


## Constrainer - nan_groups

In [14]:
config = {
    "nan_groups": {
        "gender": "delete",  # gender 缺值的紀錄直接刪除
        "age": {  # age 缺值的紀錄把 workclass, education, education-num 欄位設為缺值
            "erase": ["workclass", "education", "educational-num"],
        },
        "marital-status": {
            "copy": "relationship",  # marital-status 缺值的紀錄把 relationship 欄位的值複製過來
        },
    }
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

cnst_data shape: (136, 15)
satisfy_data shape: (136, 15)


## Constrainer - field_constraints

In [15]:
config = {
    "field_constraints": [
        "age >= 20 & age <= 70",  # age 欄位的值必須在 20 到 70 歲之間
    ],
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

cnst_data shape: (123, 15)
Trial 10: Got 123 rows, need 13 more
Trial 20: Got 123 rows, need 13 more
Trial 30: Got 123 rows, need 13 more
Trial 40: Got 123 rows, need 13 more
Trial 50: Got 123 rows, need 13 more
Trial 60: Got 123 rows, need 13 more
Trial 70: Got 123 rows, need 13 more
Trial 80: Got 123 rows, need 13 more
Trial 90: Got 123 rows, need 13 more
Trial 100: Got 123 rows, need 13 more
Trial 110: Got 123 rows, need 13 more
Trial 120: Got 123 rows, need 13 more
Trial 130: Got 123 rows, need 13 more
Trial 140: Got 123 rows, need 13 more
Trial 150: Got 123 rows, need 13 more
Trial 160: Got 123 rows, need 13 more
Trial 170: Got 123 rows, need 13 more
Trial 180: Got 123 rows, need 13 more
Trial 190: Got 123 rows, need 13 more
Trial 200: Got 123 rows, need 13 more
Trial 210: Got 123 rows, need 13 more
Trial 220: Got 123 rows, need 13 more
Trial 230: Got 123 rows, need 13 more
Trial 240: Got 123 rows, need 13 more
Trial 250: Got 123 rows, need 13 more
Trial 260: Got 123 rows, need 13

  satisfy_data = cnst.resample_until_satisfy(


## Constrainer - field_combinations

In [16]:
config = {
    "field_combinations": [
        (
            {
                "marital-status": "relationship"
            },  # 指定 'marital-status' 和 'relationship' 欄位之間的關係
            {  # 定義每個 'marital-status' 對應的 'relationship' 欄位值
                "Divorced": ["Not-in-family", "Own-child", "Unmarried"],
                "Married-civ-spouse": ["Husband", "Wife"],
                "Married-spouse-absent": ["Not-in-family"],
                "Separated": ["Not-in-family", "Unmarried"],
                "Never-married": ["Not-in-family", "Own-child", "Unmarried"],
                "Widowed": ["Not-in-family", "Unmarried"],
            },
        )
    ],
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

cnst_data shape: (93, 15)
satisfy_data shape: (136, 15)


# Evaluator

In [17]:
from petsard.evaluator import Evaluator

evaluation = Evaluator(method="default")
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": syn_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 955.48it/s]|
Column Shapes Score: 0.26%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 290.51it/s]|
Column Pair Trends Score: 6.23%

Overall Score (Average): 3.24%

        Score  Column Shapes  Column Pair Trends
result   0.03            NaN                 NaN
          Property        Metric  Score
age  Column Shapes  KSComplement    0.0
                         Property                 Metric  Score  \
age workclass  Column Pair Trends  ContingencySimilarity    0.0   

               Real Correlation  Synthetic Correlation  
age workclass               NaN                    NaN  


## Evaluator - Singling-out

In [18]:
evaluation = Evaluator(
    method="anonymeter-singlingout",
    n_attacks=3,
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

Found 2 failed queries out of 3. Check DEBUG messages for more details.


        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.2          0.0          0.8         0.43             0.37   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_queries': ["age>= 20 & workclass== 'Local-gov' & education== 'Some-college'"], 'baseline_queries': []}


## Evaluator - Linkability

In [19]:
evaluation = Evaluator(
    method="anonymeter-linkability",
    n_attacks=3,
    aux_cols=[
        ["age", "marital-status", "relationship", "race", "gender"],
        ["workclass", "education", "educational-num", "occupation"],
    ],
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'str']

## Evaluator - Inference

In [None]:
evaluation = Evaluator(
    method="anonymeter-inference",
    n_attacks=3,
    secret="income",
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0          0.9         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.43               0.37          0.57              0.37  
None


  self._sanity_check()


## Evaluator - Diagnostic

In [None]:
evaluation = Evaluator(method="sdmetrics-diagnosticreport")
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 1134.99it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1388.38it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

        Score  Data Validity  Data Structure
result    1.0            NaN             NaN
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0


## Evaluator - Quality

In [None]:
evaluation = Evaluator(method="sdmetrics-qualityreport")
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 145.69it/s]|
Column Shapes Score: 94.23%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 409.22it/s]|
Column Pair Trends Score: 60.21%

Overall Score (Average): 77.22%

        Score  Column Shapes  Column Pair Trends
result   0.77            NaN                 NaN
          Property        Metric     Score
age  Column Shapes  KSComplement  0.961759
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.859197   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


In [None]:
evaluation = Evaluator(
    method="mlutility-classification",
    target="income",
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))

   ori_mean  ori_std  syn_mean  syn_std  diff
0      0.86     0.01      0.77      0.0 -0.09


## Evaluator - Classification

# Describer

In [None]:
from petsard.evaluator import Describer


desc = Describer(method="default")
desc.create()

desc_result = desc.eval(
    data={
        "data": satisfy_data,
    }
)

print(desc_result["global"].head(1))
print(desc_result["columnwise"].head(1))
print(desc_result["pairwise"].head(1))

   row_count  col_count  na_count
0      22795         15         0
     mean  median    std   min   max  kurtosis  skew     q1     q3  na_count  \
age  38.2   36.86  12.11  17.0  78.0     -0.44  0.47  28.52  46.49       0.0   

    nunique  
age    <NA>  
  column1 column2  corr
0     age     age   1.0


# Reporter

## Reporter - Save Data

In [None]:
from petsard.reporter import Reporter

rpt = Reporter(
    method="save_data",
    source="Postprocessor",
)
rpt.create({("Postprocessor", "exp1"): satisfy_data})
rpt.report()

Now is petsard_Postprocessor[exp1] save to csv...


## Reporter - Save Report

In [None]:
rpt = Reporter(
    method="save_report",
    granularity="global",
)
rpt.create(
    {
        ("Evaluator", "eval1_[global]"): desc_result["global"],
    }
)
rpt.report()

Now is petsard[Report]_[global] save to csv...
