In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.multi_table import evaluate_quality
from sdv.multi_table import HMASynthesizer

In [2]:
# Real IMDB data collection, restored from a local pickle.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('pkl/imdb/imdb.pkl', 'rb') as real_pkl:
    real_data_collection = pickle.load(real_pkl)

In [3]:
# Previously generated synthetic collection; it is scored against the real
# data in the last evaluation cell of this notebook.
with open('pkl/imdb/imdb_synth_full_epoch.pkl', 'rb') as synth_pkl:
    synthetic_data_collection = pickle.load(synth_pkl)

In [4]:
# Pickled SDV metadata (presumably a MultiTableMetadata -- confirm); it is
# passed to both HMASynthesizer and evaluate_quality below.
with open('pkl/imdb/imdb_sdvmetadata.pkl', 'rb') as metadata_pkl:
    sdv_metadata = pickle.load(metadata_pkl)

In [5]:
# Map every table name in the real collection to its row count.
generation_dict = {
    table_name: {'nrows': len(table)}
    for table_name, table in real_data_collection.items()
}

In [6]:
# Show the per-table row counts held in generation_dict.
generation_dict

{'actors': {'nrows': 1907},
 'directors': {'nrows': 34},
 'directors_genres': {'nrows': 285},
 'movies': {'nrows': 36},
 'movies_directors': {'nrows': 41},
 'movies_genres': {'nrows': 103},
 'roles': {'nrows': 1989}}

# HMA baseline: fit, sample and evaluate SDV's `HMASynthesizer`

In [7]:
# Build an (unfitted) HMA synthesizer from the loaded SDV metadata.
synthesizer = HMASynthesizer(metadata=sdv_metadata)

In [8]:
%%time
%%capture
synthesizer.fit(real_data_collection)
synthesizer.save(filepath='bm_models/imdb_models/imdb_hma.pkl')

CPU times: total: 21.5 s
Wall time: 21.5 s


In [9]:
%%time
%%capture
synthesizer = HMASynthesizer.load(filepath='bm_models/imdb_models/imdb_hma.pkl')
hma_synthetic_data = synthesizer.sample(
    scale=1
)

CPU times: total: 25.5 s
Wall time: 21.2 s


In [10]:
# Score the HMA samples against the real data; the returned report is the
# cell's displayed value.
evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=hma_synthetic_data,
    metadata=sdv_metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.13it/s]



Overall Quality Score: 46.65%

Properties:
Column Shapes: 68.03%
Column Pair Trends: 42.55%
Parent Child Relationships: 29.37%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


<sdmetrics.reports.multi_table.quality_report.QualityReport at 0x189c0c5aac0>

In [11]:
# Score the pre-generated synthetic collection (loaded from
# imdb_synth_full_epoch.pkl) against the same real data and metadata.
evaluate_quality(
    real_data=real_data_collection,
    synthetic_data=synthetic_data_collection,
    metadata=sdv_metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.22it/s]



Overall Quality Score: 36.54%

Properties:
Column Shapes: 56.86%
Column Pair Trends: 45.14%
Parent Child Relationships: 7.62%


<sdmetrics.reports.multi_table.quality_report.QualityReport at 0x189c0d84310>