# Step 5: Measure Population Fidelity (PF)

In [None]:
import pandas as pd 
import sys
import os
import re

# Make the project's source directory importable from this notebook.
# NOTE(review): `utils` and `preprocessing` are presumably modules in ../src - confirm.
sys.path.append('../src')
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Experiment configuration; its 'folders' entry holds the directory paths
# used throughout the notebook (settings, real data, synthetic data).
config = getExperimentConfig()
folders = config['folders']

# Per-dataset settings loaded from the settings directory. Later cells iterate
# over this collection and read each entry's ['meta'] mapping.
settings = getPicklesFromDir(folders['settings_dir'])

# Clustering model and the preprocessing helpers used by later cells.
from kmodes.kprototypes import KPrototypes
from preprocessing import one_hot_encode, compute_zscore

In [None]:
# Select the original dataset and one of its synthetic counterparts, then load
# the original data and derive the number of clusters from its size.

# Output path for the results. The configured path is deliberately overridden
# by a hard-coded file for this run.
#pf_measures_filepath = folders['pf_measures_filepath']
pf_measures_filepath = '../data/result/Cluster_2.csv'

result_df = pd.DataFrame(columns=['Dataset id'])
run_dataset = 'D1'

# Pick the settings entry whose meta id matches the dataset to run. The copy is
# shallow: nested objects such as ['meta'] are still shared with `settings`.
dataset_setting = next(
    (data.copy() for data in settings if data['meta']['id'] == run_dataset),
    None,
)
if dataset_setting is None:
    # Fail here with a clear message instead of a later KeyError on 'meta'.
    raise KeyError(f"No settings entry found for dataset id {run_dataset!r}")

original_data_id = dataset_setting['meta']['id']
synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)

# Position of the synthetic dataset to evaluate within the returned list.
sd_index = 12
if not synthetic_datasets:
    # Previously this only printed a message and then crashed on the index below.
    raise IndexError(
        f"Did not find synthetic dataset files for data id {original_data_id!r}"
    )
if sd_index >= len(synthetic_datasets):
    raise IndexError(
        f"Synthetic dataset index {sd_index} is out of range: only "
        f"{len(synthetic_datasets)} file(s) found for data id {original_data_id!r}"
    )

original_data = pd.read_csv(
    folders['real_dir'] + dataset_setting['meta']['filename'],
    dtype=dataset_setting['meta']['cols_dtype'],
)
# Number of clusters is 2% of the number of rows in the original data.
n_clusters = round(original_data.shape[0] * 0.020)
print(f"Data id: {original_data_id}")

sd_filename = synthetic_datasets[sd_index]
sd_id = os.path.splitext(sd_filename)[0]

# The synthetic dataset id is expected to carry a quality tag such as "Q10".
# A raw string is required so that \d is not treated as a string escape.
quality_tags = re.findall(r'Q\d+', sd_id)
if not quality_tags:
    raise ValueError(f"No quality tag (Q<number>) found in synthetic dataset id {sd_id!r}")
quality = quality_tags[0]
sd_path = folders['sd_dir'] + sd_filename

display(sd_id)

In [None]:
# Stack the original and the synthetic rows into one frame and work out which
# columns are numeric and which are categorical.
synthetic_data = pd.read_csv(sd_path, dtype=dataset_setting['meta']['cols_dtype'])
c_df = pd.concat([original_data, synthetic_data], axis=0, copy=True, ignore_index=True)

numeric_cols = dataset_setting['meta']['numeric_features']

# Categorical columns = target + categorical features + ordinal feature names.
# Either of the last two may be None in the settings.
cat_cols = [dataset_setting['meta']['target']]
if dataset_setting['meta']['categorical_features'] is not None:
    cat_cols.extend(dataset_setting['meta']['categorical_features'])
if dataset_setting['meta']['ordinal_features'] is not None:
    cat_cols.extend(dataset_setting['meta']['ordinal_features'].keys())

# Drop duplicates while keeping first-seen order, so the column order (and the
# indices below) is the same on every run; set() ordering of strings is not.
cat_cols = list(dict.fromkeys(cat_cols))

# Positions of the categorical columns within the combined frame.
cat_indices = [c_df.columns.get_loc(col) for col in cat_cols]

# Missing values per column. A bare expression in the middle of a cell is
# discarded, so it has to be displayed explicitly.
display(c_df.isna().sum())

display(c_df.shape)
display(numeric_cols, len(numeric_cols))
display(cat_cols, len(cat_cols))
display(cat_indices)

In [None]:
from preprocessing import create_preprocessing_pipeline

# Treat the target as a categorical feature for preprocessing, transform the
# combined data and cluster it with K-Prototypes.

# Build a new 'meta' mapping instead of appending in place. An in-place append
# would add the target again on every re-run of this cell and would also alter
# the entry inside `settings`, because `dataset_setting` is only a shallow copy.
target_col = dataset_setting['meta']['target']
cat_features = dataset_setting['meta']['categorical_features'] or []
if target_col not in cat_features:
    dataset_setting['meta'] = {
        **dataset_setting['meta'],
        'categorical_features': [*cat_features, target_col],
    }

preprocessor = create_preprocessing_pipeline(dataset_setting['meta'])
t_df = preprocessor.fit_transform(c_df)

# Number of clusters is 2% of the number of rows in the original data.
n_clusters = round(original_data.shape[0] * 0.020)
display(f"N_clusters: {n_clusters}")
display(t_df.shape)
display(dataset_setting['meta']['target'],dataset_setting['meta']['categorical_features'],dataset_setting['meta']['numeric_features'])

kproto = KPrototypes(n_clusters=n_clusters, init='Cao', n_jobs=-1, max_iter=200, verbose=3)

# NOTE(review): cat_indices were computed from column positions in c_df, while
# t_df is the preprocessor's output - confirm the positions still refer to the
# categorical columns after the transformation.
clusters = kproto.fit_predict(t_df, categorical=cat_indices)

In [None]:
# Manual preprocessing of the combined frame: encode the categorical columns,
# standardise the numeric ones, and join both parts column-wise.
# NOTE(review): one_hot_encode presumably returns the encoded frame together
# with the names of the columns it produced - confirm against `preprocessing`.
t_cat_df, ohe_cols = one_hot_encode(c_df, cat_cols)
t_num_df = compute_zscore(c_df, numeric_cols)

t_df = pd.concat(
    [t_num_df[numeric_cols], t_cat_df],
    axis=1,
    ignore_index=False,
    copy=True,
)

# Sanity check: the two counts differ if the join produced duplicate column names.
display(f"Num cols: {len(t_df.columns)}, Unique cols: {len(t_df.columns.unique())}")
display(t_num_df[numeric_cols].columns)

# Positions of the encoded categorical columns within the joined frame.
cat_indices = list(map(t_df.columns.get_loc, ohe_cols))


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocess the combined frame with scikit-learn: standardise the numeric
# columns and one-hot encode the categorical ones.

# Drop duplicates while keeping first-seen order so the encoded column order is
# the same on every run; set() ordering of strings is not.
cat_cols = list(dict.fromkeys(cat_cols))

list_column_transformer = []
num_transformer = Pipeline([('scaler', StandardScaler())])
list_column_transformer.append(('num', num_transformer, numeric_cols))

# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the new name first and fall back for older versions.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_transformer = Pipeline([('ohe', one_hot_encoder)])
list_column_transformer.append(('cat', cat_transformer, cat_cols))

preprocessor = ColumnTransformer(list_column_transformer)

# Positions of the categorical columns in the untransformed frame c_df.
# NOTE(review): these do not describe column positions in the transformed t_df.
cat_indices = [c_df.columns.get_loc(col) for col in cat_cols]
t_df = preprocessor.fit_transform(c_df)

display(t_df.shape)
t_df

