In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.multi_table import evaluate_quality
from sdv.multi_table import HMASynthesizer

In [2]:
# Load the real multi-table dataset from its pickle file.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('pkl/university/university.pkl', 'rb') as real_data_file:
    real_data_collection = pickle.load(real_data_file)

In [3]:
# Load a previously generated synthetic dataset from disk.
# NOTE(review): the file name suggests a "full epoch" run; the model that
# produced it is not shown in this notebook -- confirm its provenance.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('pkl/university/university_synth_full_epoch.pkl', 'rb') as synthetic_data_file:
    synthetic_data_collection = pickle.load(synthetic_data_file)

In [4]:
# Load the pickled metadata object that is later passed to HMASynthesizer
# and to evaluate_quality.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('pkl/university/university_sdvmetadata.pkl', 'rb') as metadata_file:
    sdv_metadata = pickle.load(metadata_file)

In [5]:
# For every table in the real dataset, record how many rows it contains.
generation_dict = {
    table_name: {'nrows': len(table)}
    for table_name, table in real_data_collection.items()
}

In [6]:
# Show the per-table row counts held in generation_dict.
generation_dict

{'RA': {'nrows': 25},
 'course': {'nrows': 10},
 'prof': {'nrows': 6},
 'registration': {'nrows': 92},
 'student': {'nrows': 38}}

# HMA

In [7]:
# Build an (unfitted) HMA synthesizer from the loaded SDV metadata.
synthesizer = HMASynthesizer(metadata=sdv_metadata)

In [8]:
%%time
%%capture
synthesizer.fit(real_data_collection)
synthesizer.save(filepath='bm_models/university_models/university_hma.pkl')

CPU times: total: 8.7 s
Wall time: 8.7 s


In [9]:
%%time
%%capture
synthesizer = HMASynthesizer.load(filepath='bm_models/university_models/university_hma.pkl')
hma_synthetic_data = synthesizer.sample(
    scale=1
)

CPU times: total: 3.53 s
Wall time: 477 ms


In [10]:
# Quality report for the data sampled from the HMA synthesizer, measured
# against the real dataset.
evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=hma_synthetic_data,
    metadata=sdv_metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 30.11it/s]



Overall Quality Score: 70.92%

Properties:
Column Shapes: 79.52%
Column Pair Trends: 64.72%
Parent Child Relationships: 68.51%


<sdmetrics.reports.multi_table.quality_report.QualityReport at 0x1c346bd3f40>

In [11]:
# Quality report for the synthetic dataset that was loaded from disk
# (synthetic_data_collection), measured against the same real dataset.
evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
    metadata=sdv_metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 30.11it/s]



Overall Quality Score: 68.05%

Properties:
Column Shapes: 76.68%
Column Pair Trends: 48.04%
Parent Child Relationships: 79.43%


<sdmetrics.reports.multi_table.quality_report.QualityReport at 0x1c346cbb070>