In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import SingleTableMetadata
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality as st_evaluate_quality
from sdv.evaluation.single_table import run_diagnostic as st_run_diagnostic
from sdv.evaluation.multi_table import evaluate_quality as mt_evaluate_quality
from sdv.evaluation.multi_table import run_diagnostic as mt_run_diagnostic
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot

# Load Processed Data From Generation Stage

In [2]:
# Restore the real tables saved as a pickle by an earlier stage.
# Unpickling runs arbitrary code, so only load artifacts this project produced.
with open('pkl/gtfs/real_data_collection.pkl', mode='rb') as real_pkl:
    real_data_collection = pickle.load(real_pkl)

In [3]:
# Restore the synthetic tables (full-epoch artifact) saved as a pickle.
# Unpickling runs arbitrary code, so only load artifacts this project produced.
with open('pkl/gtfs/synthetic_data_full_epoch.pkl', mode='rb') as synthetic_pkl:
    synthetic_data_collection = pickle.load(synthetic_pkl)

In [4]:
# Disabled alternative: uncommenting these two lines loads a different pickle
# (synthetic_data_10epoch.pkl) into synthetic_data_collection, replacing any
# value bound to that name earlier. Presumably this is the output of a shorter
# 10-epoch training run -- TODO confirm against the generation stage.
# with open('pkl/gtfs/synthetic_data_10epoch.pkl', 'rb') as f:
#     synthetic_data_collection = pickle.load(f)

In [5]:
# Restore the pickled SDV metadata object passed to the evaluation calls.
# Unpickling runs arbitrary code, so only load artifacts this project produced.
with open('pkl/gtfs/sdv_metadata.pkl', mode='rb') as metadata_pkl:
    sdv_metadata = pickle.load(metadata_pkl)

# Metrics

In [6]:
# Build the multi-table quality report comparing the synthetic collection
# against the real one; the report object is kept for later inspection.
mt_quality_report = mt_evaluate_quality(
    metadata=sdv_metadata,
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.77s/it]



Overall Quality Score: 86.92%

Properties:
Column Shapes: 90.49%
Column Pair Trends: 78.68%
Parent Child Relationships: 91.6%


## Overall Shape Score Per Table

In [7]:
# Mean 'Column Shapes' quality score for each table, rounded to 3 decimals.
(
    mt_quality_report
    .get_details(property_name='Column Shapes')
    .groupby('Table')['Quality Score']
    .mean()
    .round(3)
)

Table
agency            1.000
calendar          0.882
calendar_dates    0.982
routes            0.764
stop_times        0.966
stops             0.839
trips             1.000
Name: Quality Score, dtype: float64

## Overall Trend Score Per Table

In [8]:
# Mean 'Column Pair Trends' quality score for each table, rounded to 3 decimals.
(
    mt_quality_report
    .get_details(property_name='Column Pair Trends')
    .groupby('Table')['Quality Score']
    .mean()
    .round(3)
)

Table
agency            1.000
calendar          0.707
calendar_dates    0.966
routes            0.572
stop_times        0.896
stops             0.743
trips             1.000
Name: Quality Score, dtype: float64

## Shape Score Per Table and Metric

In [9]:
# Mean 'Column Shapes' quality score for each (table, metric) pair.
(
    mt_quality_report
    .get_details(property_name='Column Shapes')
    .groupby(['Table','Metric'])['Quality Score']
    .mean()
    .round(3)
)

Table           Metric      
agency          TVComplement    1.000
calendar        KSComplement    0.562
                TVComplement    0.974
calendar_dates  KSComplement    0.973
                TVComplement    0.991
routes          TVComplement    0.764
stop_times      KSComplement    0.953
                TVComplement    1.000
stops           KSComplement    0.737
                TVComplement    0.941
trips           TVComplement    1.000
Name: Quality Score, dtype: float64

## Trend Score Per Table and Metric

In [10]:
# Mean 'Column Pair Trends' quality score for each (table, metric) pair.
(
    mt_quality_report
    .get_details(property_name='Column Pair Trends')
    .groupby(['Table','Metric'])['Quality Score']
    .mean()
    .round(3)
)

Table           Metric               
agency          ContingencySimilarity    1.000
calendar        ContingencySimilarity    0.700
                CorrelationSimilarity    0.985
calendar_dates  ContingencySimilarity    0.966
routes          ContingencySimilarity    0.572
stop_times      ContingencySimilarity    0.901
                CorrelationSimilarity    0.886
stops           ContingencySimilarity    0.702
                CorrelationSimilarity    0.936
trips           ContingencySimilarity    1.000
Name: Quality Score, dtype: float64

# Diagnostics

In [11]:
# Bind the diagnostic report to a name, as is done for the quality reports,
# so it stays available for inspection instead of being discarded after the
# cell echoes its object repr.
mt_diagnostic_report = mt_run_diagnostic(
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
    metadata=sdv_metadata)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [03:34<00:00, 53.70s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data

DANGER:
x More than 50% of the synthetic rows are copies of the real data





<sdmetrics.reports.multi_table.diagnostic_report.DiagnosticReport at 0x1ba81625430>

# Join Test

## Join Data

In [12]:
def auto_join(df_list, how='inner'):
    """Merge a sequence of DataFrames left to right on their shared columns.

    Starting from the first frame, every following frame is merged into the
    running result, using all column names the two have in common as the
    join key.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        Frames to join, in order. Must contain at least one frame.
    how : str, default 'inner'
        Join type forwarded unchanged to ``DataFrame.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged frame. When nothing is merged (a single input frame, or no
        later frame shares a column) the first frame itself is returned.

    Raises
    ------
    IndexError
        If ``df_list`` is empty.

    Warns
    -----
    UserWarning
        When a frame shares no column with the running result and is skipped.
    """
    import warnings  # local import so the cell does not rely on another cell

    result = df_list[0]
    for position, df in enumerate(df_list[1:], start=1):
        right_columns = set(df.columns)
        # Take the keys in the running result's column order. A bare set
        # intersection has no stable order for string labels across
        # interpreter runs, which may alter output for key-sorted join types.
        shared_columns = list(dict.fromkeys(
            column for column in result.columns if column in right_columns
        ))
        if not shared_columns:
            # Skipping is kept (best effort), but it is no longer silent: a
            # dropped table would otherwise go unnoticed in the comparison.
            warnings.warn(
                f"auto_join: frame at position {position} shares no column "
                "with the running result and was skipped",
                UserWarning,
                stacklevel=2,
            )
            continue
        result = result.merge(df, on=shared_columns, how=how)
    return result

In [13]:
# Join the real routes and trips tables on the column names they share.
df_list = [real_data_collection[table] for table in ('routes', 'trips')]
real_routes_trips = auto_join(df_list)

In [14]:
# Join the synthetic routes and trips tables on the column names they share.
df_list = [synthetic_data_collection[table] for table in ('routes', 'trips')]
fake_routes_trips = auto_join(df_list)

In [15]:
# Create single-table metadata for the joined frame and let SDV detect the
# column types from the real joined data.
routes_trips_meta = SingleTableMetadata()
routes_trips_meta.detect_from_dataframe(data=real_routes_trips)

In [16]:
# Set the sdtype of each listed column explicitly, in the order given.
# NOTE(review): GTFS defines wheelchair_accessible and bikes_allowed as 0/1/2
# enums -- confirm that 'boolean' (rather than 'categorical') is intended.
for column_name, sdtype in [
    ('route_id', 'id'),
    ('agency_id', 'id'),
    ('route_type', 'categorical'),
    ('contract_id', 'id'),
    ('service_id', 'id'),
    ('trip_id', 'id'),
    ('direction_id', 'categorical'),
    ('shape_id', 'id'),
    ('wheelchair_accessible', 'boolean'),
    ('bikes_allowed', 'boolean'),
]:
    routes_trips_meta.update_column(column_name=column_name, sdtype=sdtype)

## Check Join Size Difference Between Real and Fake Data

In [17]:
def size_difference_in_percentage(df1, df2):
    """Return the symmetric relative size difference between two sized objects.

    The result is the mean of the length gap relative to the larger input and
    the gap relative to the smaller input::

        ((max - min) / max + (max - min) / min) / 2

    Despite the function name, the value is a fraction and is not multiplied
    by 100; 0.0 means both inputs have the same length.

    Parameters
    ----------
    df1, df2 : objects supporting ``len()``
        The two inputs to compare; argument order does not matter.

    Returns
    -------
    float
        0.0 when the lengths are equal (including both empty),
        ``float('inf')`` when exactly one input is empty, otherwise the mean
        relative gap described above.
    """
    smaller, larger = sorted((len(df1), len(df2)))
    gap = larger - smaller
    if gap == 0:
        # Covers the both-empty case, which previously divided by zero.
        return 0.0
    if smaller == 0:
        # The gap relative to an empty input is unbounded; report it as
        # infinity instead of raising ZeroDivisionError.
        return float('inf')
    return (gap / larger + gap / smaller) / 2

In [18]:
# Relative row-count gap between the synthetic and real joined tables.
size_difference_in_percentage(
    df1=fake_routes_trips,
    df2=real_routes_trips,
)

0.0

## Compute Quality Metrics on the Joined Table

In [19]:
# Build the single-table quality report for the joined routes/trips frames,
# using the metadata prepared for the joined table.
st_quality_report = st_evaluate_quality(
    metadata=routes_trips_meta,
    real_data=real_routes_trips,
    synthetic_data=fake_routes_trips,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  6.85it/s]


Overall Quality Score: 80.96%

Properties:
Column Shapes: 87.01%
Column Pair Trends: 74.91%





In [20]:
# Per-column 'Column Shapes' scores from the single-table report.
st_quality_report.get_details(
    property_name='Column Shapes',
)

Unnamed: 0,Column,Metric,Quality Score
0,route_type,TVComplement,0.932753
1,route_color,TVComplement,0.495757
2,route_text_color,TVComplement,0.793661
3,direction_id,TVComplement,0.998611
4,wheelchair_accessible,TVComplement,1.0
5,bikes_allowed,TVComplement,1.0


In [21]:
# Per-column-pair 'Column Pair Trends' scores from the single-table report.
st_quality_report.get_details(
    property_name='Column Pair Trends',
)

Unnamed: 0,Column 1,Column 2,Metric,Quality Score,Real Correlation,Synthetic Correlation
0,route_color,route_type,ContingencySimilarity,0.451933,,
1,route_text_color,route_type,ContingencySimilarity,0.758047,,
2,direction_id,route_type,ContingencySimilarity,0.932753,,
3,route_type,wheelchair_accessible,ContingencySimilarity,0.932753,,
4,bikes_allowed,route_type,ContingencySimilarity,0.932753,,
5,route_color,route_text_color,ContingencySimilarity,0.36617,,
6,direction_id,route_color,ContingencySimilarity,0.4923,,
7,route_color,wheelchair_accessible,ContingencySimilarity,0.495757,,
8,bikes_allowed,route_color,ContingencySimilarity,0.495757,,
9,direction_id,route_text_color,ContingencySimilarity,0.793661,,
