# Step 5: Measure Population Fidelity (PF)

In [1]:
import pandas as pd 
import sys
import os
import re

from sdv.metadata import SingleTableMetadata

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration; the cells below read its 'folders' and
# 'run_dataset' entries.
config = getExperimentConfig()
# Mapping of directory/file locations used by this notebook
# ('settings_dir', 'real_dir' and 'sd_dir' are read from it).
folders = config['folders']

# Per-dataset settings loaded from the settings directory. The evaluation loop
# iterates over this and indexes each entry with ['meta'].
# NOTE(review): presumably one unpickled settings object per original dataset - confirm in utils.
settings = getPicklesFromDir(folders['settings_dir'])

In [2]:
# Compute population-fidelity (PF) measures for every synthetic dataset and
# append them to a results CSV. Results already present in the CSV are reused,
# so the cell can be re-run after an interruption and only computes what is
# missing.

# NOTE(review): this hard-coded path overrides the configured location
# (previously: pf_measures_filepath = folders['pf_measures_filepath']).
# Confirm the override is intentional before sharing the notebook.
pf_measures_filepath = '../data/result/Cluster_1.csv'

# Create the output directory up front so a missing folder is not discovered
# only after the first (long-running) PF computation has finished.
results_dir = os.path.dirname(pf_measures_filepath)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Read the results file if it exists; if not, start from an empty frame that
# already has the column used for the "already evaluated" lookup.
if os.path.exists(pf_measures_filepath):
    result_df = pd.read_csv(pf_measures_filepath)
else:
    result_df = pd.DataFrame(columns=['Dataset id'])

# Optional allow-list of dataset ids; None means "run on every dataset".
run_dataset = config['run_dataset']

for dataset_setting in settings:
    dataset_meta = dataset_setting['meta']
    original_data_id = dataset_meta['id']

    # If run_dataset lists dataset ids, run the experiment only on those.
    if run_dataset is not None and original_data_id not in run_dataset:
        print(f"Skip pf for data id: {original_data_id}")
        continue

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)
    # No synthetic datasets for this data id: skip and try the next one.
    if not synthetic_datasets:
        print(f"Did not find synthetic dataset files for data id: {original_data_id}")
        continue

    # os.path.join tolerates configured directories with or without a
    # trailing separator.
    original_data = pd.read_csv(os.path.join(folders['real_dir'], dataset_meta['filename']),
                                dtype=dataset_meta['cols_dtype'])
    # load_from_json is called on the class, so no throwaway instance is built.
    metadata = SingleTableMetadata.load_from_json(dataset_meta['meta_filepath']).to_dict()

    print(f"Data id: {original_data_id}")
    for sd_filename in synthetic_datasets:
        sd_id = os.path.splitext(sd_filename)[0]

        # Skip datasets that have already been evaluated.
        if (result_df['Dataset id'] == sd_id).any():
            continue

        # The quality level is encoded in the id as 'Q<digits>' (e.g. SD0Q1000_0).
        quality_match = re.search(r'Q\d+', sd_id)
        if quality_match is None:
            # Report and move on instead of failing the whole run with an IndexError.
            print(f"Skip {sd_id}: no quality tag 'Q<number>' found in the file name")
            continue
        quality = quality_match.group(0)

        print(f"Computing PF on: {sd_id}")
        synthetic_data = pd.read_csv(os.path.join(folders['sd_dir'], sd_filename))

        # NOTE(review): pf_measures is assumed to be a DataFrame that carries a
        # 'Dataset id' column matching sd_id - confirm in PF_metrics.
        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              custom_metadata=dataset_meta,
                                              sdv_metadata=metadata,
                                              SD_id=sd_id)
        pf_measures['Quality'] = quality
        pf_measures['Original dataset'] = original_data_id
        result_df = pd.concat([result_df, pf_measures], axis=0, ignore_index=True)

        # Checkpoint after every new result so an interrupted run loses at most
        # one computation; nothing is rewritten when no new row was added.
        result_df.to_csv(pf_measures_filepath, index=False)

Data id: D0
Computing PF on: SD0Q1000_0
Best run was number 9
N_Clusters: 8
Value: 2.7780055030645046, Time: 99.0905853
Computing PF on: SD0Q1000_1
Best run was number 4
N_Clusters: 8
Value: 2.589754108822777, Time: 85.3902251
Computing PF on: SD0Q1000_2
Best run was number 37
N_Clusters: 8
Value: 3.154691085454195, Time: 75.02919490000002
Computing PF on: SD0Q1000_3
Best run was number 13
N_Clusters: 8
Value: 2.6628798777669473, Time: 94.21524149999999
Computing PF on: SD0Q1000_4
Best run was number 18
N_Clusters: 8
Value: 2.284705179681267, Time: 89.74104730000005
Computing PF on: SD0Q1000_5
Best run was number 16
N_Clusters: 8
Value: 2.110674435604803, Time: 86.29725480000002
Computing PF on: SD0Q1000_6
Best run was number 47
N_Clusters: 8
Value: 2.446583461638348, Time: 84.67134490000001
Computing PF on: SD0Q1000_7
Best run was number 99
N_Clusters: 8
Value: 2.4827142182806994, Time: 84.04000020000001
Computing PF on: SD0Q1000_8
Best run was number 15
N_Clusters: 8
Value: 1.9590635

Best run was number 61
N_Clusters: 9
Value: 14.78932865538567, Time: 46.958470900000066
Computing PF on: SD1Q10_3
Best run was number 92
N_Clusters: 9
Value: 13.32072403598103, Time: 46.25960589999977
Computing PF on: SD1Q10_4
Best run was number 53
N_Clusters: 9
Value: 13.326649705991771, Time: 48.26311059999989
Computing PF on: SD1Q10_5
Best run was number 58
N_Clusters: 9
Value: 13.66731667207673, Time: 48.14868650000062
Computing PF on: SD1Q10_6
Best run was number 48
N_Clusters: 9
Value: 11.703191927338358, Time: 47.604516200000035
Computing PF on: SD1Q10_7
Best run was number 8
N_Clusters: 9
Value: 13.909885904242966, Time: 47.46745100000044
Computing PF on: SD1Q10_8
Best run was number 99
N_Clusters: 9
Value: 12.55483677216344, Time: 54.50429179999992
Computing PF on: SD1Q10_9
Best run was number 93
N_Clusters: 9
Value: 15.226928824861606, Time: 58.14947459999985
Computing PF on: SD1Q1500_0
Best run was number 66
N_Clusters: 9
Value: 1.0707079828425792, Time: 40.73792270000013
C

Best run was number 6
N_Clusters: 23
Value: 2.7460515364624665, Time: 319.48584309999933
Computing PF on: SD205Q500_4
Best run was number 73
N_Clusters: 23
Value: 1.7442284722487174, Time: 317.55369150000115
Computing PF on: SD205Q500_5
Best run was number 15
N_Clusters: 23
Value: 2.948233391316423, Time: 299.8985114999996
Computing PF on: SD205Q500_6
Best run was number 5
N_Clusters: 23
Value: 1.7646528834619701, Time: 313.81990000000224
Computing PF on: SD205Q500_7
Best run was number 24
N_Clusters: 23
Value: 2.042848138467349, Time: 316.1168250999981
Computing PF on: SD205Q500_8
Best run was number 5
N_Clusters: 23
Value: 2.352540319545088, Time: 313.0288069999988
Computing PF on: SD205Q500_9
Best run was number 86
N_Clusters: 23
Value: 2.3605289717124087, Time: 331.41579809999894
Skip pf for data id: D3
Data id: D305
Computing PF on: SD305Q1000_0
Best run was number 3
N_Clusters: 35
Value: 5.723599532076075, Time: 3292.919630100001
Computing PF on: SD305Q1000_1
Best run was number 

Best run was number 30
N_Clusters: 23
Value: 4.485467362475245, Time: 310.18004979999387
Computing PF on: SD405Q100_3
Best run was number 43
N_Clusters: 23
Value: 4.228972244651827, Time: 315.53714650002075
Computing PF on: SD405Q100_4
Best run was number 65
N_Clusters: 23
Value: 3.426758593406982, Time: 313.0143420000095
Computing PF on: SD405Q100_5
Best run was number 77
N_Clusters: 23
Value: 3.5104698108766637, Time: 308.4106171000167
Computing PF on: SD405Q100_6
Best run was number 18
N_Clusters: 23
Value: 3.1275139961382306, Time: 316.54720780000207
Computing PF on: SD405Q100_7
Best run was number 11
N_Clusters: 23
Value: 3.0783208320763005, Time: 316.0455687000067
Computing PF on: SD405Q100_8
Best run was number 15
N_Clusters: 23
Value: 3.1913483219279, Time: 314.97670130000915
Computing PF on: SD405Q100_9
Best run was number 82
N_Clusters: 23
Value: 3.500984422646428, Time: 288.93188889999874
Computing PF on: SD405Q10_0
Best run was number 30
N_Clusters: 23
Value: 17.82013965547