## Analysis of Synthetic Data Generated from the Berka Dataset
#### Make sure the main directory structure and files are downloaded locally into directory called berka_results
#### Link: https://drive.google.com/drive/folders/12OvCmPUDUtffQnu_ZiCQ4SWjPL3i0kvd

In [1]:
import sqlalchemy
import psycopg2
from sqlalchemy import text
from sqlalchemy import create_engine
import pandas as pd
from genai_evaluation import multivariate_ecdf, ks_statistic
from nogan_synthesizer.preprocessing import wrap_category_columns, unwrap_category_columns
import os
from pandasql import sqldf
from typing import Dict, List

In [2]:
DATASETS_DIR = "./berka_results"
ORIG_DIR = f"{DATASETS_DIR}/original"
GRETEL_DIR = f"{DATASETS_DIR}/gretel"
YDATA_DIR = f"{DATASETS_DIR}/ydata"
MOSTLYAI_DIR = f"{DATASETS_DIR}/mostlyai"
SDV_DIR = f"{DATASETS_DIR}/sdv"

In [3]:
def read_csv_files(directory):
    csv_dict = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                filename_without_extension = os.path.splitext(file)[0]
                df = pd.read_csv(file_path, parse_dates=True)
                csv_dict[filename_without_extension] = df
    return csv_dict

#### Read Datasets

In [4]:
## Original
orig_data = read_csv_files(ORIG_DIR)

## Gretel
gretel_synth_data = read_csv_files(GRETEL_DIR)

## Mostly AI
mostlyai_synth_data = read_csv_files(MOSTLYAI_DIR)

## SDV 
sdv_synth_data = read_csv_files(SDV_DIR)

## YData
ydata_synth_data = read_csv_files(YDATA_DIR)

In [5]:
tables = [
    'account', 
    'card', 
    'client', 
    'disposition', 
    'district',
    'loan', 
    'orders', 
    'transaction'
]

In [6]:
print("----- Original Data Details ------")
for table in tables:
    print(f"{table} Shape: {orig_data[table].shape}")

print("")
print("----- Gretel Synth Data Details ------")
for table in tables:
    print(f"{table} Shape: {gretel_synth_data[table].shape}")

print("")
print("----- MostlyAI Synth Data Details ------")
for table in tables:
    print(f"{table} Shape: {mostlyai_synth_data[table].shape}")

print("")
print("----- YData Synth Data Details ------")
for table in tables:
    print(f"{table} Shape: {ydata_synth_data[table].shape}")

----- Original Data Details ------
account Shape: (4500, 4)
card Shape: (892, 4)
client Shape: (5369, 3)
disposition Shape: (5369, 4)
district Shape: (77, 16)
loan Shape: (682, 7)
orders Shape: (6471, 6)
transaction Shape: (49498, 9)

----- Gretel Synth Data Details ------
account Shape: (4500, 4)
card Shape: (892, 4)
client Shape: (5369, 3)
disposition Shape: (5369, 4)
district Shape: (77, 16)
loan Shape: (682, 7)
orders Shape: (6471, 6)
transaction Shape: (49498, 9)

----- MostlyAI Synth Data Details ------
account Shape: (3960, 4)
card Shape: (616, 4)
client Shape: (6299, 3)
disposition Shape: (4730, 4)
district Shape: (77, 16)
loan Shape: (816, 7)
orders Shape: (7823, 6)
transaction Shape: (43560, 9)

----- YData Synth Data Details ------
account Shape: (3908, 4)
card Shape: (892, 4)
client Shape: (5413, 3)
disposition Shape: (5369, 4)
district Shape: (77, 16)
loan Shape: (90, 7)
orders Shape: (5404, 6)
transaction Shape: (42986, 9)


In [7]:
def get_foreign_keys_issues(data:Dict, foreign_key_constraints:List[Dict]):
    foreign_key_issues = []
    for constraint in foreign_key_constraints:
        child, child_column, parent, parent_column = constraint.values()
        values_present = data[child][child_column].isin(data[parent][parent_column]).all()
        if not values_present:
            foreign_key_issues.append(f"{child}-{parent}")
    return foreign_key_issues

def get_primary_keys_issues(data:Dict, primary_key_constraints:List[Dict]):
    primary_key_issues = []
    for constraint in primary_key_constraints:
        table, key_column = constraint.values()
        if ydata_synth_data[table].shape[0] != ydata_synth_data[table][key_column].nunique():
            primary_key_issues.append(table)
    return primary_key_issues

In [8]:
primary_key_constraints = [
    {'table': 'account', 'key_column': 'account_id' },
    {'table': 'card', 'key_column': 'card_id' },
    {'table': 'client', 'key_column': 'client_id' },
    {'table': 'disposition', 'key_column': 'disposition_id' },
    {'table': 'district', 'key_column': 'district_id' },
    {'table': 'loan', 'key_column': 'loan_id' },
    {'table': 'orders', 'key_column': 'orders_id' },
    {'table': 'transaction', 'key_column': 'transaction_id' }
]

foreign_key_constraints = \
[
    {
    "child_table": "account", 
    "child_column": "district_id",
    "parent_table": "district", 
    "parent_column": "district_id" 
    },
    {
    "child_table": "client", 
    "child_column": "district_id",
    "parent_table": "district", 
    "parent_column": "district_id" 
    },
    {
    "child_table": "disposition", 
    "child_column": "account_id",
    "parent_table": "account", 
    "parent_column": "account_id" 
    },    
    {
    "child_table": "transaction", 
    "child_column": "account_id",
    "parent_table": "account", 
    "parent_column": "account_id" 
    },
    {
    "child_table": "loan", 
    "child_column": "account_id",
    "parent_table": "account", 
    "parent_column": "account_id" 
    },
    {
    "child_table": "orders", 
    "child_column": "account_id",
    "parent_table": "account", 
    "parent_column": "account_id" 
    },
    {
    "child_table": "disposition", 
    "child_column": "client_id",
    "parent_table": "client", 
    "parent_column": "client_id" 
    },
    {
    "child_table": "card", 
    "child_column": "disposition_id",
    "parent_table": "disposition", 
    "parent_column": "disposition_id" 
    }                 
]

In [9]:
constraints = {
    "primary_keys": 8,
    "foreign_keys": 8
}

total_constraints = 16

### Evaluate

#### Set Seed

In [10]:
pd.core.common.random_state(None)
seed = 1047
ks_seed = 1034

In [11]:
def evaluate_synth(synth_solution:str, original_data:Dict, synth_data:Dict, tables:List, 
                   primary_key_constraints:List[Dict], foreign_key_constraints:List[Dict], 
                   total_constraints, verbose = False
                  ):
    ## Data Integrity
    if verbose:
        print(f"---------{synth_solution} Evaluation-----------")
        print(f"{synth_solution} Constraint Issues Details")
    primary_keys_issues = get_primary_keys_issues(synth_data, primary_key_constraints)
    if verbose and primary_keys_issues:
        print(f"Primary Key Issues: {primary_keys_issues}")
    primary_keys_issue_count = len(primary_keys_issues)

    foreign_keys_issues = get_foreign_keys_issues(synth_data, foreign_key_constraints)
    if verbose and foreign_keys_issues:
        print(f"Foreign Key Issues: {foreign_keys_issues}")    
    foreign_keys_issue_count = len(foreign_keys_issues)
    data_integrity_score = 1 - ((primary_keys_issue_count + foreign_keys_issue_count)/total_constraints)
    if verbose:
        print(f" -Primary Keys Issues Count: {primary_keys_issue_count}")
        print(f" -Foreign Keys Issues Count: {foreign_keys_issue_count}")
        print(f" -Data Integrity Score: {data_integrity_score}")

    genai_eval_scores = []
    for table in tables:
        # print(table)
        orig_df = original_data[table]
        synth_df = synth_data[table]

        for col in orig_df.columns:
            if col in ['date', 'issued']:
                orig_df[col] = pd.to_datetime(orig_df[col])
                synth_df[col] = pd.to_datetime(synth_df[col])

        # Remove id columns
        non_id_cols = [col for col in orig_df.columns if not col.endswith('_id')]
        orig_df = orig_df[non_id_cols]
        synth_df = synth_df[non_id_cols]
        cat_columns = orig_df.select_dtypes(exclude=["number","bool_","datetime64[ns]"]).columns.tolist()
        # cat_columns = [col for col in cat_columns if not col.endswith('_id')]
        # print(f"Category Cols: {cat_columns}")
        date_columns = orig_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()
        if date_columns:
            # print(f"Date Cols: {date_columns}")
            orig_df = orig_df.drop(date_columns, axis = 1)
            synth_df = synth_df.drop(date_columns, axis = 1)
        
        # Encode Category Columns
        if cat_columns:
            wrapped_orig, idx_to_key_orig, key_to_idx_orig = wrap_category_columns(orig_df,cat_columns)
            wrapped_synth, idx_to_key_synth, key_to_idx_synth = wrap_category_columns(synth_df,cat_columns)
        else:
            wrapped_orig = orig_df
            wrapped_synth = synth_df
        # Calculate ECDF
        if verbose:
            print("----Calculating ECDF------")
        query_orig, ecdf_orig, ecdf_synth = \
            multivariate_ecdf(wrapped_orig, 
                              wrapped_synth, 
                              n_nodes = 3000, 
                              verbose = verbose,
                              random_seed=ks_seed) 

        # Calculate KS Stat
        ks_stat = ks_statistic(ecdf_orig, ecdf_synth)
        
        genai_eval_scores.append({"table":table, "eval_score": ks_stat})
    return {"data_integrity_score": data_integrity_score, "eval_scores": genai_eval_scores}

In [12]:
gretel_results = evaluate_synth(synth_solution = "gretel", 
                                original_data = orig_data, 
                                synth_data = gretel_synth_data,
                                tables = tables,
                                primary_key_constraints = primary_key_constraints,
                                foreign_key_constraints = foreign_key_constraints, 
                                total_constraints = total_constraints,
                                verbose=False)
print(f"Gretel Data Integrity Score: {gretel_results['data_integrity_score']}")
print(f"Gretel GenAI Eval Score: {pd.DataFrame(gretel_results['eval_scores']).eval_score.median():.3f}")

Gretel Data Integrity Score: 1.0
Gretel GenAI Eval Score: 0.270


In [13]:
mostlyai_results = evaluate_synth(synth_solution = "mostlyai", 
                                original_data = orig_data, 
                                synth_data = mostlyai_synth_data,
                                tables = tables,
                                primary_key_constraints = primary_key_constraints,
                                foreign_key_constraints = foreign_key_constraints, 
                                total_constraints = total_constraints,
                                verbose=False)
print(f"MostlyAI Data Integrity Score: {mostlyai_results['data_integrity_score']}")
print(f"MostlyAI GenAI Eval Score: {pd.DataFrame(mostlyai_results['eval_scores']).eval_score.mean():.3f}")

MostlyAI Data Integrity Score: 1.0
MostlyAI GenAI Eval Score: 0.314


In [14]:
ydata_results = evaluate_synth(synth_solution = "ydata", 
                                original_data = orig_data, 
                                synth_data = ydata_synth_data,
                                tables = tables,
                                primary_key_constraints = primary_key_constraints,
                                foreign_key_constraints = foreign_key_constraints, 
                                total_constraints = total_constraints,
                                verbose=False)
print(f"YData Data Integrity Score: {ydata_results['data_integrity_score']}")
print(f"YData GenAI Eval Score: {pd.DataFrame(ydata_results['eval_scores']).eval_score.mean():.3f}")

YData Data Integrity Score: 1.0
YData GenAI Eval Score: 0.160


In [15]:
gretel_synth_time = "40 mins"
mostlyai_synth_time = "2 mins"
ydata_synth_time = "7 mins"
results = [
    {"method": "Gretel", "synth_time": gretel_synth_time, 
     "data_integrity_score": gretel_results['data_integrity_score'], 
     "genai_eval_score": pd.DataFrame(gretel_results['eval_scores']).eval_score.mean().round(2)},
    {"method": "Mostly AI", "synth_time": mostlyai_synth_time, 
     "data_integrity_score": mostlyai_results['data_integrity_score'], 
     "genai_eval_score": pd.DataFrame(mostlyai_results['eval_scores']).eval_score.mean().round(2) },
    {"method": "YData AI", "synth_time": ydata_synth_time, 
     "data_integrity_score": ydata_results['data_integrity_score'], 
     "genai_eval_score": pd.DataFrame(ydata_results['eval_scores']).eval_score.mean().round(2) }  
]

pd.DataFrame(results)

Unnamed: 0,method,synth_time,data_integrity_score,genai_eval_score
0,Gretel,40 mins,1.0,0.28
1,Mostly AI,2 mins,1.0,0.31
2,YData AI,7 mins,1.0,0.16
