### Generate a report of Hellinger and Euclidean distances between real and synthetic data for random queries

In [None]:
from pathlib import Path
import os
import sys 
import pandas as pd
from csv import writer  
import numpy as np
import weasyprint

sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from fuzzy_sql.fuzzy_sql import *


In [None]:
# Resolve the project root and the directories that hold the prepared real
# tables, their metadata, and the synthetic tables.
root_dir = Path('/home/samer/projects/fuzzy_sql')
real_dir, meta_dir, syn_dir = (
    os.path.join(root_dir, subdir)
    for subdir in (
        'data/tabular/ready/real',
        'data/tabular/ready/metadata',
        'data/tabular/ready/synthetic',
    )
)

In [None]:
# Extract real and synthetic data names.
# NOTE(review): extract_fnames and find_syn_fnames are not defined in this
# file; presumably they come from the star import of fuzzy_sql.fuzzy_sql.
real_names=extract_fnames(real_dir)
real_names.sort()  # sorted in place before being handed to find_syn_fnames
# names_dict is consumed by summarize_queries, which iterates its keys as real
# dataset names and indexes it to get the synthetic names for each of them.
names_dict=find_syn_fnames(syn_dir, real_names)

In [None]:
def summarize_queries(names_dict,no_of_queries):
    """Summarize query distance scores for every real/synthetic dataset pair.

    For each real dataset name in ``names_dict`` and each synthetic name
    listed under it, ``fuzz_tabular`` is run with ``no_of_queries``
    'twin_aggfltr' queries. The NaN-ignoring mean, median and standard
    deviation of the returned ``'hlngr_dist'`` and ``'ecldn_dist'`` values
    are then recorded.

    Real datasets are skipped when their metadata JSON file does not exist
    or when they have no synthetic names. File locations are built from the
    module-level ``real_dir``, ``meta_dir`` and ``syn_dir``.

    Args:
        names_dict: Mapping of real dataset name to its synthetic dataset
            names (file names without extension).
        no_of_queries: Number of queries passed to ``fuzz_tabular`` per pair.

    Returns:
        A ``pandas.DataFrame`` with one row per processed pair and the
        columns ``real_name``, ``syn_name``, ``no_queries``, ``hlngr_mean``,
        ``hlngr_median``, ``hlngr_stddev``, ``ecldn_mean``, ``ecldn_median``
        and ``ecldn_stddev``.
    """
    # (output column, scored-queries column, NaN-aware reducer)
    stat_columns = (
        ('hlngr_mean', 'hlngr_dist', np.nanmean),
        ('hlngr_median', 'hlngr_dist', np.nanmedian),
        ('hlngr_stddev', 'hlngr_dist', np.nanstd),
        ('ecldn_mean', 'ecldn_dist', np.nanmean),
        ('ecldn_median', 'ecldn_dist', np.nanmedian),
        ('ecldn_stddev', 'ecldn_dist', np.nanstd),
    )

    # Column-oriented accumulator; insertion order fixes the column order.
    summary = {column: [] for column in ('real_name', 'syn_name', 'no_queries')}
    summary.update((column, []) for column, _, _ in stat_columns)

    for real_name, syn_names in names_dict.items():
        real_path = os.path.join(real_dir, real_name + '.csv')
        meta_path = os.path.join(meta_dir, real_name + '.json')
        # Nothing to score without metadata or without synthetic datasets.
        if not os.path.exists(meta_path) or len(syn_names) == 0:
            continue

        for syn_name in syn_names:
            syn_path = os.path.join(syn_dir, syn_name + '.csv')
            scored_queries = fuzz_tabular(
                no_of_queries,
                'twin_aggfltr',
                real_path,
                meta_path,
                syn_path,
                run_folder='../.runs',
                printme=False,
            )

            summary['real_name'].append(real_name)
            summary['syn_name'].append(syn_name)
            summary['no_queries'].append(no_of_queries)
            for column, dist_column, reducer in stat_columns:
                summary[column].append(reducer(scored_queries[dist_column]))

    return pd.DataFrame(summary)

   

In [None]:
# Generate the most abstracted summary: one row per real dataset, averaging
# the statistics of all its synthetic trials.
#test_dict={k: names_dict[k] for k in list(names_dict)[:2]}
syn_summary=summarize_queries(names_dict,10)
# Number of synthetic trials per real dataset that have a non-NaN Hellinger mean.
counts=syn_summary.groupby('real_name').count()['hlngr_mean'].values
# numeric_only=True is required: syn_summary still holds the string column
# 'syn_name', and pandas >= 2.0 raises TypeError when asked to average it.
# Older pandas silently dropped such columns, so the result is unchanged there.
real_summary=syn_summary.groupby(['real_name']).mean(numeric_only=True)
# Both groupby results are sorted by 'real_name', so counts lines up row by row.
real_summary.insert(1,'no_syn',counts)


In [None]:
# Generate the HTML reports: the detailed per-synthetic-dataset report first,
# then the per-real-dataset summary. Each frame file is read as a str.format
# template and its 'pandas_table' field is filled with the rendered table.
for frame_path, report_path, table in (
    ('../reports/SEP202022/frame_syn.html', '../reports/SEP202022/report_syn.html', syn_summary),
    ('../reports/SEP202022/frame_real.html', '../reports/SEP202022/report_real.html', real_summary),
):
    with open(frame_path,'r') as f:
        html_string=f.read()
    with open(report_path, 'w') as f:
        f.write(html_string.format(pandas_table=table.to_html(classes='mystyle')))

In [None]:
# Convert the HTML reports to PDF: the detailed report first, then the summary.
# Re-importing weasyprint is a no-op when the module is already loaded.
import weasyprint

for html_path, pdf_path in (
    ('../reports/SEP202022/report_syn.html', '../reports/SEP202022/report_syn.pdf'),
    ('../reports/SEP202022/report_real.html', '../reports/SEP202022/report_real.pdf'),
):
    # write_pdf() is called without a target, and its return value is written to disk.
    pdf=weasyprint.HTML(html_path).write_pdf()
    with open(pdf_path,'w+b') as f:
        f.write(pdf)