In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.multi_table import evaluate_quality

# Load Processed Data From Generation Stage

In [2]:
with open('pkl/real_data_collection.pkl', 'rb') as f:
    real_data_collection = pickle.load(f)

In [3]:
with open('pkl/synthetic_data_full_epoch.pkl', 'rb') as f:
    synthetic_data_collection = pickle.load(f)

In [4]:
with open('pkl/sdv_metadata.pkl', 'rb') as f:
    sdv_metadata = pickle.load(f)

In [5]:
with open('pkl/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

In [6]:
real_data_collection

{'agency':     agency_id  agency_url  agency_timezone  agency_lang  agency_phone
 0           0           0                0            0             0
 1           1           0                0            0             0
 2           2           0                0            0             0
 3           3           0                0            0             0
 4           4           0                0            0             0
 5           5           0                0            0             0
 6           6           0                0            0             0
 7           7           0                0            0             0
 8           8           0                0            0             0
 9           9           0                0            0             0
 10         10           0                0            0             0
 11         11           0                0            0             0
 12         12           0                0            0           

In [7]:
synthetic_data_collection

{'agency':     agency_id  agency_url  agency_timezone  agency_lang  agency_phone
 0           0           0                0            0             0
 1           1           0                0            0             0
 2           2           0                0            0             0
 3           3           0                0            0             0
 4           4           0                0            0             0
 5           5           0                0            0             0
 6           6           0                0            0             0
 7           7           0                0            0             0
 8           8           0                0            0             0
 9           9           0                0            0             0
 10         10           0                0            0             0
 11         11           0                0            0             0
 12         12           0                0            0           

# Metrics

### Cosine Complement Test

In [8]:
def cos_test(df1, df2):
    cos_sim = metrics.pairwise.cosine_similarity(df1.values.T, df2.values.T)
    mean_cos_sim = np.mean(cos_sim)
    return mean_cos_sim

In [9]:
def batch_cos_test(fake_collection, real_collection):
    test_dict = {}
    for df_name in fake_collection.keys():
        mean_cos_sim = cos_test(real_collection[df_name], fake_collection[df_name])
        test_dict[df_name] = 1-mean_cos_sim
    return test_dict

In [10]:
%time batch_cos_test(synthetic_data_collection, real_data_collection)

CPU times: total: 1.77 s
Wall time: 227 ms


{'agency': 0.96,
 'calendar': 0.7973341253002043,
 'calendar_dates': 0.09911736825504547,
 'routes': 0.7722847960862144,
 'stops': 0.8700902255396581,
 'stop_times': 0.5878181395847124,
 'trips': 0.5210619437775119}

### KS (Kolmogorov-Smirnov) Complement Test

In [11]:
def ks_test(df1, df2):
    ks_stats = []
    # wasserstein_distances = []
    p_values = []
    for column in df1.columns:
        ks_stat, ks_p_value = ks_2samp(df1[column], df2[column])
        ks_stats.append(ks_stat)
        p_values.append(ks_p_value)
        # w_dist = wasserstein_distance(df1[column], df2[column])
        # wasserstein_distances.append(w_dist)
    mean_ks_stat = np.mean(ks_stats)
    # mean_wasserstein_distance = np.mean(wasserstein_distances)
    mean_p_value = np.mean(p_values)
    return mean_ks_stat, mean_p_value

In [12]:
def batch_ks_test(collection1, collection2):
    stats_dict = {}
    for df_name in collection1:
        ks_results, p_value = ks_test(collection1[df_name], collection2[df_name])
        stats_dict[df_name] = 1-ks_results
    return stats_dict

In [13]:
%time batch_ks_test(synthetic_data_collection, real_data_collection)

CPU times: total: 3.44 s
Wall time: 2.36 s


{'agency': 1.0,
 'calendar': 0.556198347107438,
 'calendar_dates': 0.413946587537092,
 'routes': 0.7325581395348837,
 'stops': 0.7054413662992751,
 'stop_times': 0.6916818543840958,
 'trips': 0.7225025460605499}

## Cardinality Metrics

In [14]:
def cardinality_shape_similarity(real_tables, fake_tables, relationships):
    ks_scores = []

    for parent_table_name, relationship in relationships.items():
        parent_primary_key = relationship["primary_key"]
        children = relationship["child"]

        for child_table_name, child_foreign_key in children.items():
            # Calculate cardinality for real data
            real_child_table = real_tables[child_table_name]
            real_cardinality = real_child_table[child_foreign_key].value_counts().values

            # Calculate cardinality for synthetic data
            fake_child_table = fake_tables[child_table_name]
            fake_cardinality = fake_child_table[child_foreign_key].value_counts().values

            # Compute KSComplement score
            ks_statistic, _ = ks_2samp(real_cardinality, fake_cardinality)
            ks_complement = 1 - ks_statistic
            ks_scores.append(ks_complement)

    # Calculate CardinalityShapeSimilarity metric as the mean of KSComplement scores
    cardinality_shape_similarity_metric = sum(ks_scores) / len(ks_scores)
    
    return cardinality_shape_similarity_metric

In [15]:
cardinality_shape_similarity(real_data_collection, synthetic_data_collection, metadata)

0.7855401551279061

# SDMetrics (optional)

In [16]:
quality_report = evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
    metadata=sdv_metadata)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.88s/it]



Overall Quality Score: 61.77%

Properties:
Column Shapes: 63.02%
Column Pair Trends: 41.95%
Parent Child Relationships: 80.35%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Join Test

## Join Data

In [17]:
def auto_join(df_list, how='inner'):
    result = df_list[0]
    for df in df_list[1:]:
        shared_columns = list(set(result.columns) & set(df.columns))
        if not shared_columns:
            continue
        result = result.merge(df, on=shared_columns, how=how)
    return result

In [18]:
df_list = [synthetic_data_collection['routes'], synthetic_data_collection['trips']]
fake_routes_trips = auto_join(df_list)

In [19]:
df_list = [synthetic_data_collection['stops'], synthetic_data_collection['stop_times'], synthetic_data_collection['trips']]
fake_stops_stop_times_trips = auto_join(df_list)

In [20]:
df_list = [real_data_collection['routes'], real_data_collection['trips']]
real_routes_trips = auto_join(df_list)

In [21]:
df_list = [real_data_collection['stops'], real_data_collection['stop_times'], real_data_collection['trips']]
real_stops_stop_times_trips = auto_join(df_list)

## Measue # of Rows Abandoned

In [22]:
def size_difference_in_percentage(df1, df2):
    df1_len = len(df1)
    df2_len = len(df2)
    min_ = min(df1_len, df2_len)
    max_ = max(df1_len, df2_len)
    leftout = (((max_ - min_) / max_) + ((max_ - min_) / min_)) / 2
    return leftout

In [23]:
size_difference_in_percentage(fake_routes_trips, real_routes_trips)

0.0

In [24]:
size_difference_in_percentage(fake_stops_stop_times_trips, real_stops_stop_times_trips)

0.0

## CosSim Test For Joined Data

In [25]:
min_ = min(len(fake_routes_trips), len(real_routes_trips))
cos_test_result = cos_test(fake_routes_trips.sample(min_), real_routes_trips.sample(min_))

In [26]:
1-cos_test_result

0.66276639035927

In [27]:
min_ = min(len(fake_stops_stop_times_trips), len(real_stops_stop_times_trips))
cos_test_result = cos_test(fake_stops_stop_times_trips.sample(min_), real_stops_stop_times_trips.sample(min_))

In [28]:
1-cos_test_result

0.7084451705643939

## KS Test For Joined Data

In [29]:
min_ = min(len(fake_routes_trips), len(real_routes_trips))
ks_stat, p_value = ks_test(fake_routes_trips.sample(min_), real_routes_trips.sample(min_))

In [30]:
1-ks_stat

0.7052338458689722

In [31]:
min_ = min(len(fake_stops_stop_times_trips), len(real_stops_stop_times_trips))
ks_stat, p_value = ks_test(fake_stops_stop_times_trips.sample(min_), real_stops_stop_times_trips.sample(min_))

In [32]:
1-ks_stat

0.6901966300851271