In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.multi_table import evaluate_quality
from sdv.multi_table import HMASynthesizer

In [2]:
# Load the real biodegrade data collection from a local pickle.
# It is used below as a mapping of table name -> table (see `.items()` / `len`);
# presumably the tables are pandas DataFrames -- TODO confirm.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pkl/biodegrade/biodegrade.pkl', 'rb') as f:
    real_data_collection = pickle.load(f)

In [3]:
# Load a previously generated synthetic data collection from a local pickle.
# It is scored against the real data further down via evaluate_quality.
# NOTE(review): which model produced this file is not visible here -- confirm.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pkl/biodegrade/biodegrade_synth_full_epoch.pkl', 'rb') as f:
    synthetic_data_collection = pickle.load(f)

In [4]:
# Load the pickled SDV metadata object; it is passed to HMASynthesizer and
# evaluate_quality below (presumably a MultiTableMetadata instance -- TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('pkl/biodegrade/biodegrade_sdvmetadata.pkl', 'rb') as f:
    sdv_metadata = pickle.load(f)

In [5]:
# Record the row count of every real table, keyed by table name.
generation_dict = {
    table_name: {'nrows': len(table)}
    for table_name, table in real_data_collection.items()
}

In [6]:
# Display the per-table row counts computed from the real data.
generation_dict

{'atom': {'nrows': 6568},
 'bond': {'nrows': 6616},
 'gmember': {'nrows': 6647},
 'group': {'nrows': 1736},
 'molecule': {'nrows': 328}}

# HMA (Hierarchical Modeling Algorithm) synthesizer

In [7]:
# Build an (unfitted) HMA synthesizer from the loaded multi-table metadata;
# fitting happens in the next cell.
synthesizer = HMASynthesizer(metadata=sdv_metadata)

In [8]:
%%time
%%capture

synthesizer.fit(real_data_collection)
synthesizer.save(filepath='bm_models/biodegrade_models/biodegrade_hma.pkl')

CPU times: total: 5min 1s
Wall time: 5min 1s


In [9]:
%%time
%%capture

synthesizer = HMASynthesizer.load(filepath='bm_models/biodegrade_models/biodegrade_hma.pkl')
hma_synthetic_data = synthesizer.sample(
    scale=1
)

CPU times: total: 2min 41s
Wall time: 1min 56s


In [10]:
# Score the HMA samples against the real data and display the per-column
# breakdown of the 'Column Shapes' property.
(
    evaluate_quality(
        real_data=real_data_collection,
        synthetic_data=hma_synthetic_data,
        metadata=sdv_metadata,
    )
    .get_details(property_name='Column Shapes')
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  6.52it/s]



Overall Quality Score: 75.77%

Properties:
Column Shapes: 82.65%
Column Pair Trends: 76.24%
Parent Child Relationships: 68.44%


Unnamed: 0,Table,Column,Metric,Quality Score
0,atom,nb_rows_in_bond,KSComplement,0.937105
1,atom,nb_rows_in_gmember,KSComplement,0.656615
2,atom,type,TVComplement,0.666964
3,bond,type,KSComplement,0.790586
4,group,nb_rows_in_gmember,KSComplement,0.93606
5,group,type,TVComplement,0.451613
6,molecule,activity,KSComplement,0.853659
7,molecule,logp,KSComplement,0.935976
8,molecule,mweight,KSComplement,0.95122
9,molecule,add_numerical,KSComplement,0.957317


In [11]:
# Score the pre-generated synthetic collection against the same real data and
# metadata; the resulting quality report is kept in `eq`.
eq = evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
    metadata=sdv_metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 36.76it/s]



Overall Quality Score: 70.52%

Properties:
Column Shapes: 75.35%
Column Pair Trends: 53.98%
Parent Child Relationships: 82.22%
