# Step 5: Measure Population Fidelity (PF)

In [None]:
import pandas as pd 
import sys
import os
import re

from sdv.metadata import SingleTableMetadata

sys.path.append('../src')
from PF_metrics import compute_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration and keep a handle on its 'folders'
# entry, which holds the directory/file paths used by the rest of the notebook.
config = getExperimentConfig()
folders = config['folders']

# Settings loaded from the pickles in folders['settings_dir'].
# NOTE(review): presumably one settings object per original dataset, each with
# a 'meta' entry — confirm against utils.getPicklesFromDir.
settings = getPicklesFromDir(folders['settings_dir'])

In [None]:
# Read the results file if it exists so that synthetic datasets which were
# already evaluated are skipped; otherwise start from an empty frame that
# only carries the 'Dataset id' lookup column.
pf_measures_filepath = folders['pf_measures_filepath']

if os.path.exists(pf_measures_filepath):
    result_df = pd.read_csv(pf_measures_filepath)
else:
    result_df = pd.DataFrame(columns=['Dataset id'])

run_dataset = config['run_dataset']

# Quality tag embedded in the synthetic dataset id ("Q" followed by digits).
# Compiled once outside the loops; raw string because '\d' is an invalid
# escape sequence in a normal string literal.
quality_pattern = re.compile(r'Q\d+')

for dataset_setting in settings:
    dataset_meta = dataset_setting['meta']
    original_data_id = dataset_meta['id']

    # If run_dataset lists dataset ids, run the experiment only on those.
    if run_dataset is not None and original_data_id not in run_dataset:
        print(f"Skip pf for data id: {original_data_id}")
        continue

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)
    # No synthetic datasets for this data id: skip and try the next one.
    if not synthetic_datasets:
        print(f"Did not find synthetic dataset files for data id: {original_data_id}")
        continue

    # os.path.join instead of '+' so the result does not depend on whether
    # the configured directory ends with a path separator.
    original_data = pd.read_csv(
        os.path.join(folders['real_dir'], dataset_meta['filename']),
        dtype=dataset_meta['cols_dtype'],
    )
    metadata = SingleTableMetadata().load_from_json(dataset_meta['meta_filepath']).to_dict()

    print(f"Data id: {original_data_id}")
    for sd_filename in synthetic_datasets:
        sd_id = os.path.splitext(sd_filename)[0]

        # Skip synthetic datasets that already have a row in the results.
        if (result_df['Dataset id'] == sd_id).any():
            continue

        # Without a quality tag the result row cannot be labelled; report
        # and move on instead of aborting the whole run with an IndexError.
        quality_match = quality_pattern.search(sd_id)
        if quality_match is None:
            print(f"Skip {sd_id}: no quality tag (Q<number>) found in the file name")
            continue

        print(f"Computing PF on: {sd_id}")
        synthetic_data = pd.read_csv(os.path.join(folders['sd_dir'], sd_filename))

        pf_measures = compute_pf_measures(original_data=original_data,
                                          synthetic_data=synthetic_data,
                                          custom_metadata=dataset_meta,
                                          sdv_metadata=metadata,
                                          SD_id=sd_id)
        pf_measures['Quality'] = quality_match.group(0)
        pf_measures['Original dataset'] = original_data_id
        result_df = pd.concat([result_df, pf_measures], axis=0, ignore_index=True)

        # Save after every new evaluation so an interrupted run can resume;
        # nothing is written when no new row was added.
        result_df.to_csv(pf_measures_filepath, index=False)