### Generate a report of the Hellinger distance between real and synthetic tables

In [1]:
from pathlib import Path
import os
import sys 
import pandas as pd
import numpy as np
import json
import copy

sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from fuzzy_sql.fuzzy_sql import *


In [2]:
def fix_index_type(index_names: list) -> list:
    """Return the labels as strings, spelling boolean labels as '1' / '0'.

    Every label is converted with ``str``. A label whose text is exactly
    'True' becomes '1' and one whose text is exactly 'False' becomes '0';
    any other label keeps its string form. A new list is returned and the
    input list is left untouched.
    """
    bool_spelling={'True':'1','False':'0'}
    as_text=(str(label) for label in index_names)
    # dict.get falls back to the label itself when it is not a boolean spelling
    return [bool_spelling.get(text, text) for text in as_text]

In [3]:
def _class_counts(column: pd.Series) -> pd.Series:
    """Count the occurrences of each class in ``column``, keyed by normalized string label."""
    counts=column.value_counts()
    # fix_index_type turns labels into strings and spells True/False as '1'/'0'
    counts.index=fix_index_type(list(counts.index))
    # Normalization can make two labels collide (e.g. 'True' and '1'); merge them so
    # every class appears once. groupby also returns the labels in sorted order.
    return counts.groupby(level=0).sum()


def calc_hlngr(real: pd.DataFrame, meta:dict, syn:pd.DataFrame) -> float:
    """Mean Hellinger distance between real and synthetic tables over nominal variables.

    For every variable ``v`` with ``meta[v] == 'nominal'`` the class frequencies of
    ``real[v]`` and ``syn[v]`` are turned into probability vectors ``p`` and ``q`` and
    the distance ``sqrt(sum((sqrt(p) - sqrt(q))**2)) / sqrt(2)`` is computed. The
    per-variable distances are averaged.

    Args:
        real: The real table; must contain every nominal variable named in ``meta``.
        meta: Mapping of variable name to its type label; only 'nominal' entries are used.
        syn: The synthetic table; must contain the same nominal variables.

    Returns:
        The mean of the per-variable distances, or NaN when ``meta`` names no
        nominal variable.

    Raises:
        KeyError: If a nominal variable is missing from ``real`` or ``syn``.
    """
    cat_vars=[var for var in meta.keys() if meta[var]=='nominal']
    if not cat_vars:
        # Same value np.mean([]) would give, without the RuntimeWarning
        return float('nan')

    vars_hlngr=[] #this will catch hellinger distance calculated for each variable in cat_vars
    for var in cat_vars:
        real_var=_class_counts(real[var])
        syn_var=_class_counts(syn[var])

        if len(real_var.index)<len(syn_var.index):
            print("Added for debugging. It is likely that a continuous variable is mistakenly defined as nominal in metadata ")

        # Always align both tables on the union of their classes. A class missing from
        # one side gets a zero count, which penalizes the mismatch. Doing this only when
        # the class counts differ would miss the case of equally many but different
        # classes, where unmatched labels turn into NaN and silently drop out of the sum.
        all_classes=real_var.index.union(syn_var.index)
        real_var=real_var.reindex(all_classes, fill_value=0)
        syn_var=syn_var.reindex(all_classes, fill_value=0)

        real_prob=real_var/real_var.sum()
        syn_prob=syn_var/syn_var.sum()
        var_hlngr_dist=np.sqrt(np.sum((np.sqrt(real_prob)-np.sqrt(syn_prob))**2))/np.sqrt(2)
        vars_hlngr.append(var_hlngr_dist)
    hlngr_dist=np.mean(vars_hlngr)
    return hlngr_dist

In [4]:
#set paths
# The project root defaults to the original location but can be overridden with the
# FUZZY_SQL_ROOT environment variable, so the notebook runs on other machines unchanged.
root_dir=Path(os.environ.get('FUZZY_SQL_ROOT','/home/samer/projects/fuzzy_sql'))
real_dir=os.path.join(root_dir,'data/tabular/ready/real') # real tables (<name>.csv)
meta_dir=os.path.join(root_dir,'data/tabular/ready/metadata') # metadata (<name>.json)
syn_dir=os.path.join(root_dir,'data/tabular/ready/synthetic') # synthetic tables (<name>.csv)



In [5]:
# Extract real and synthetic data names.
# NOTE(review): extract_fnames and find_syn_fnames are not defined in this notebook;
# presumably they come from the `fuzzy_sql.fuzzy_sql` star import — confirm.
real_names=extract_fnames(real_dir)
real_names.sort() # in-place sort, so the names are passed on in a stable order
# names_dict is later iterated by real name, and names_dict[real_name] is iterated
# as the synthetic dataset names belonging to that real dataset.
names_dict=find_syn_fnames(syn_dir, real_names)

Extracted the names of 40 real datasets
Extracted the names of all available synthetic datasets corresponding to 40 real datasets


In [6]:
# carry out analysis
# hlngr_dict maps each real dataset name to a list of (synthetic name, Hellinger distance)
# tuples. Datasets without metadata or without synthetic counterparts keep an empty list.
hlngr_dict={}
for real_name in names_dict:
    hlngr_dict[real_name]=[]
    real_path=os.path.join(real_dir, real_name+'.csv')
    meta_path=os.path.join(meta_dir, real_name+'.json')
    if not os.path.exists(meta_path): # skip if there is no metadata defined for the dataset
        continue
    if len(names_dict[real_name])==0: #skip if  no synthetic data is available
        continue

    # The real table and its metadata are the same for every synthetic counterpart,
    # so read them once per real dataset instead of once per synthetic file.
    # dtype=str keeps every value as text so class labels compare consistently.
    real=pd.read_csv(real_path, dtype=str, encoding='iso-8859-1')
    with open(meta_path) as f:
        meta=json.load(f)

    for syn_name in names_dict[real_name]:
        syn_path=os.path.join(syn_dir, syn_name+'.csv')
        syn=pd.read_csv(syn_path, dtype=str, encoding='iso-8859-1')

        hlngr=calc_hlngr(real, meta, syn)
        print(f'The Hellinger distance between {real_name} and {syn_name} is {hlngr}')
        hlngr_dict[real_name].append((syn_name,hlngr))
        



The Hellinger distance between C1 and C1_syn_default_19 is 0.11268710033576193
The Hellinger distance between C1 and C1_syn_default_3 is 0.11271912122341053
The Hellinger distance between C1 and C1_syn_default_16 is 0.11343320275179172
The Hellinger distance between C1 and C1_syn_default_17 is 0.11250740257846226
The Hellinger distance between C1 and C1_syn_default_11 is 0.11335498724105723
The Hellinger distance between C1 and C1_syn_default_15 is 0.11341614238534009
The Hellinger distance between C1 and C1_syn_default_9 is 0.11236795514437506
The Hellinger distance between C1 and C1_syn_default_1 is 0.11376686313543455
The Hellinger distance between C1 and C1_syn_default_20 is 0.11315473822120849
The Hellinger distance between C1 and C1_syn_default_14 is 0.1127187796398484
The Hellinger distance between C1 and C1_syn_default_2 is 0.11379060159027467
The Hellinger distance between C1 and C1_syn_default_10 is 0.1133305269946761
The Hellinger distance between C1 and C1_syn_default_13 is

In [7]:
# Display the collected results: real dataset name -> list of (synthetic name, Hellinger distance)
hlngr_dict

{'C1': [('C1_syn_default_19', 0.11268710033576193),
  ('C1_syn_default_3', 0.11271912122341053),
  ('C1_syn_default_16', 0.11343320275179172),
  ('C1_syn_default_17', 0.11250740257846226),
  ('C1_syn_default_11', 0.11335498724105723),
  ('C1_syn_default_15', 0.11341614238534009),
  ('C1_syn_default_9', 0.11236795514437506),
  ('C1_syn_default_1', 0.11376686313543455),
  ('C1_syn_default_20', 0.11315473822120849),
  ('C1_syn_default_14', 0.1127187796398484),
  ('C1_syn_default_2', 0.11379060159027467),
  ('C1_syn_default_10', 0.1133305269946761),
  ('C1_syn_default_13', 0.11388135624347928),
  ('C1_syn_default_8', 0.11173766607754455),
  ('C1_syn_default_12', 0.11308297706316546),
  ('C1_syn_default_4', 0.11231312896744328),
  ('C1_syn_default_7', 0.11378382604344248),
  ('C1_syn_default_5', 0.11353180810211772),
  ('C1_syn_default_6', 0.11294411124778997),
  ('C1_syn_default_18', 0.11253427307294045),
  ('C1_syn_06', 0.0745311491896218)],
 'C10': [('C10_syn_default_20', 0.0429072748099

In [8]:
# #define path tuples
# from sys import meta_path


# path_tuple=[]
# for real_name in list(names_dict.keys()):
#     for syn_name in names_dict[real_name]:
#         real_path=os.path.join(real_dir, real_name+'.csv')
#         syn_path=os.path.join(syn_dir, syn_name+'.csv')
#         meta_path=os.path.join(meta_dir, real_name+'.json') #If no corresponding metadata is available, skip tuple
#         if not os.path.exists(meta_path):
#             continue
#         path_tuple.append((real_path, meta_path, syn_path))
