In [29]:
import pandas as pd
import src.data_processing as dp
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from realtabformer import REaLTabFormer

In [6]:
# Read the breast-cancer CSV through the project helper, which hands back
# three objects that are unpacked here as the train / test / sample sets.
train_data, test_data, sample_data = dp.csv_data_split(
    "../data/breast-cancer-wisconsin.csv"
)

# Table metadata, loaded from JSON by the project helper; it is passed to the
# sdmetrics reports further down.
my_metadata_dict = dp.metadata(
    "../data/metadata.json"
)

In [14]:
# Directory of the saved "small" model run that is restored below.
small_model_dir = "../models/small_model_full/id000017334601255547021312"
model = REaLTabFormer.load_from_dir(small_model_dir)

In [27]:
# Request exactly as many synthetic rows as there are rows in test_data, so
# the two tables compared by the reports have the same size.
n_synthetic_rows = len(test_data)
synthetic_data = model.sample(n_samples=n_synthetic_rows)


The device=cuda is not available, using device=cpu instead.



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%


In [20]:
# Build the sdmetrics quality report comparing test_data (first argument)
# with the generated synthetic_data; verbose=False silences progress output.
quality = QualityReport()
quality.generate(
    test_data,
    synthetic_data,
    my_metadata_dict,
    verbose=False,
)

In [22]:
# Per-property score table of the quality report (Property / Score columns).
quality_properties = quality.get_properties()
print(quality_properties)

             Property     Score
0       Column Shapes  0.886861
1  Column Pair Trends  0.913510


In [25]:
# Build the sdmetrics diagnostic report for the same test_data /
# synthetic_data pair; verbose=False silences progress output.
diagnostic = DiagnosticReport()
diagnostic.generate(
    test_data,
    synthetic_data,
    my_metadata_dict,
    verbose=False,
)

In [26]:
# Per-property score table of the diagnostic report (Property / Score columns).
diagnostic_properties = diagnostic.get_properties()
print(diagnostic_properties)

         Property     Score
0   Data Validity  0.998673
1  Data Structure  1.000000


In [30]:
# Saved model directories to compare, keyed by the short name that is used
# as the "Model" label in the results table.
models_dict = dict(
    small_model="../models/small_model_full/id000017334601255547021312",
    regular_model="../models/regular_model_full/id000017334628363135500288",
)

In [33]:
# Relative weight of each report property in the combined "Total Score".
# All four properties count equally and the weights sum to 1. The key order
# also fixes the column order of the results table.
SCORE_WEIGHTS = {
    "Column Shapes": 0.25,
    "Column Pair Trends": 0.25,
    "Data Validity": 0.25,
    "Data Structure": 0.25,
}


def _property_scores(report, wanted):
    """Look up the scores of the named properties in a generated report.

    Args:
        report: Object whose ``get_properties()`` returns a DataFrame with
            ``Property`` and ``Score`` columns (here a ``QualityReport`` or
            ``DiagnosticReport`` on which ``generate`` has been called).
        wanted: Iterable of property names to extract.

    Returns:
        dict mapping each requested property name to its score, in the
        order the names were requested.

    Raises:
        KeyError: If a requested property is not present in the report.
    """
    # Fetch the table once instead of once per lookup.
    properties = report.get_properties()
    scores = {}
    for name in wanted:
        matches = properties.loc[properties["Property"] == name, "Score"]
        if matches.empty:
            raise KeyError(
                f"Property {name!r} not found in {type(report).__name__}"
            )
        scores[name] = matches.values[0]
    return scores


# One row (dict) per evaluated model; turned into a DataFrame after the loop.
results = []

for model_name, model_path in models_dict.items():
    # Restore the saved model from its directory.
    model = REaLTabFormer.load_from_dir(model_path)

    # One synthetic row per row of test_data.
    # NOTE(review): no random seed is set before sampling, and the recorded
    # outputs for the same model differ between cells, so scores are
    # presumably not reproducible run to run -- confirm and seed if needed.
    synthetic_data = model.sample(n_samples=len(test_data))

    # Evaluate the synthetic table against test_data with both reports.
    quality = QualityReport()
    quality.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)
    diagnostic = DiagnosticReport()
    diagnostic.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)

    # Collect the four individual scores, keeping the table's column order.
    scores = {
        **_property_scores(quality, ["Column Shapes", "Column Pair Trends"]),
        **_property_scores(diagnostic, ["Data Validity", "Data Structure"]),
    }

    # Weighted total; terms are added in SCORE_WEIGHTS order.
    total_score = sum(
        weight * scores[name] for name, weight in SCORE_WEIGHTS.items()
    )

    results.append({"Model": model_name, **scores, "Total Score": total_score})

# Build the comparison table in one go from the collected rows.
results_df = pd.DataFrame(results)

# Last expression of the cell: rendered as a table by the notebook.
results_df


The device=cuda is not available, using device=cpu instead.



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%



The device=cuda is not available, using device=cpu instead.



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%


Unnamed: 0,Model,Column Shapes,Column Pair Trends,Data Validity,Data Structure,Total Score
0,small_model,0.827007,0.897929,0.998009,1.0,0.930736
1,regular_model,0.837956,0.90575,0.999336,1.0,0.935761
