In [2]:
import pandas as pd
import transformers.pytorch_utils
from torch.nn.utils import prune
import transformers.pytorch_utils
import src.data_processing as dp
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from realtabformer import REaLTabFormer
import numpy as np
from sdmetrics.single_table import BinaryDecisionTreeClassifier

In [3]:
# Split the breast-cancer CSV into its train / test / sample parts via the project helper
cancer_csv_path = "../data/breast-cancer-wisconsin.csv"
train_data, test_data, sample_data = dp.csv_data_split(cancer_csv_path)

# Load the table metadata that is handed to the SDMetrics reports and metrics below
cancer_metadata_path = "../data/cancer_metadata.json"
my_metadata_dict = dp.metadata(cancer_metadata_path)

In [4]:
# Restore the saved "regular" REaLTabFormer from its model directory
regular_model_dir = "../models/rtf_regular/id000017342890144858071040"
model = REaLTabFormer.load_from_dir(regular_model_dir)

In [5]:
# Sample exactly as many synthetic rows as the test split contains
n_test_rows = len(test_data)
synthetic_data = model.sample(n_samples=n_test_rows)



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%


In [6]:
# Score the synthetic sample against the test split with the SDMetrics quality report
quality = QualityReport()
quality.generate(
    test_data,
    synthetic_data,
    my_metadata_dict,
    verbose=False,
)
# Last expression of the cell: displays the per-property score table
quality.get_properties()

Unnamed: 0,Property,Score
0,Column Shapes,0.927737
1,Column Pair Trends,0.907774


In [7]:
# Run the SDMetrics diagnostic report on the same test / synthetic pair
diagnostic = DiagnosticReport()
diagnostic.generate(
    test_data,
    synthetic_data,
    my_metadata_dict,
    verbose=False,
)
# Last expression of the cell: displays the per-property score table
diagnostic.get_properties()

Unnamed: 0,Property,Score
0,Data Validity,0.992037
1,Data Structure,1.0


In [8]:
# Saved model directories to evaluate, keyed by the label shown in the results table

models_dict = dict(
    small_model="../models/rtf_small/id000017342868701547638784",
    regular_model="../models/rtf_regular/id000017342890144858071040",
    # Disabled entry, kept for reference.
    # NOTE(review): this rtf_large id (...369984) is not the same as the rtf_large
    # id listed in `model_dirs` further down (...560320) - confirm which is current.
    # large_model="../models/rtf_large/id000017341472610579369984",
)

In [9]:
# One summary row (a dict) per evaluated model is collected in this list
results = list()

In [10]:
# Number of synthetic datasets sampled (and scored) per model
n_generations = 5

# Start from an empty list so that re-running this cell does not append a
# second copy of every model's row to the results table.
results = []

# (SDMetrics property name, results-table column) pairs, in output column order
score_columns = [
    ("Column Shapes", "Avg Column Shapes"),
    ("Column Pair Trends", "Avg Column Pair Trends"),
    ("Data Validity", "Avg Data Validity"),
    ("Data Structure", "Avg Data Structure"),
]


def _property_scores(report):
    """Return a ``{property name: score}`` dict for a generated SDMetrics report.

    ``report.get_properties()`` is called once and its ``Property`` / ``Score``
    columns are paired up, instead of re-querying the report for every lookup.
    """
    properties = report.get_properties()
    return dict(zip(properties["Property"], properties["Score"]))


# Loop through the different models
for model_name, model_path in models_dict.items():
    # Load the model
    model = REaLTabFormer.load_from_dir(model_path)

    # One list of per-round scores for each tracked property
    scores_by_property = {prop: [] for prop, _column in score_columns}

    # Generate multiple synthetic datasets and score each of them
    for _ in range(n_generations):
        synthetic_data = model.sample(n_samples=len(test_data))

        # Generate quality and diagnostic reports
        quality = QualityReport()
        quality.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)
        diagnostic = DiagnosticReport()
        diagnostic.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)

        # Merge both reports' property scores and record the tracked ones
        round_scores = {**_property_scores(quality), **_property_scores(diagnostic)}
        for prop, prop_scores in scores_by_property.items():
            prop_scores.append(round_scores[prop])

    # Average each property over the rounds
    row = {"Model": model_name}
    for prop, column in score_columns:
        row[column] = sum(scores_by_property[prop]) / n_generations

    # Total score: different weights could be applied to each property,
    # but for simplicity every one of the four gets 0.25
    row["Avg Total Score"] = sum(0.25 * row[column] for _prop, column in score_columns)

    results.append(row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

results_df



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%


Unnamed: 0,Model,Avg Column Shapes,Avg Column Pair Trends,Avg Data Validity,Avg Data Structure,Avg Total Score
0,small_model,0.492263,0.614016,0.942402,1.0,0.76217
1,regular_model,0.91854,0.922007,0.994559,1.0,0.958776


In [11]:
def load_and_generate_synthetic_data(model_dir, n_samples, metadata):
    """Load the REaLTabFormer saved in ``model_dir`` and sample from it.

    Args:
        model_dir: Directory passed to ``REaLTabFormer.load_from_dir``.
        n_samples: Number of rows requested from ``sample``.
        metadata: Accepted for call-site compatibility; not used by this function.

    Returns:
        Whatever ``sample(n_samples=n_samples)`` returns for the loaded model.
    """
    return REaLTabFormer.load_from_dir(model_dir).sample(n_samples=n_samples)

def evaluate_model(test_data, synthetic_data, target, metadata):
    """Score synthetic data with ``BinaryDecisionTreeClassifier.compute``.

    The synthetic data is passed as ``train_data`` and ``test_data`` as
    ``test_data``, together with the ``target`` column name and ``metadata``.

    Returns:
        The value returned by ``BinaryDecisionTreeClassifier.compute``.
    """
    compute_kwargs = {
        "test_data": test_data,
        "train_data": synthetic_data,
        "target": target,
        "metadata": metadata,
    }
    return BinaryDecisionTreeClassifier.compute(**compute_kwargs)

# Model directories
# NOTE(review): the rtf_large id here (...560320) differs from the commented-out
# rtf_large id in `models_dict` (...369984) - confirm which one is intended.
model_dirs = [
    "../models/rtf_small/id000017342868701547638784",
    "../models/rtf_regular/id000017342890144858071040",
    "../models/rtf_large/id000017342929846661560320"
]

# Number of sample-and-score repetitions per model
n_runs = 5

# Evaluate each model
for model_dir in model_dirs:
    # Load once per directory; the load does not depend on the run index, so
    # repeating it inside the inner loop only re-reads the same files.
    # A dedicated name is used so the notebook-level `model` that later cells
    # rely on is not rebound here.
    candidate_model = REaLTabFormer.load_from_dir(model_dir)

    scores = []
    for _ in range(n_runs):
        # Generate synthetic data
        synthetic_data = candidate_model.sample(n_samples=len(test_data))

        # Evaluate the synthetic data and keep the score
        scores.append(
            evaluate_model(test_data, synthetic_data, target='Class', metadata=my_metadata_dict)
        )

    # Compute average score
    average_score = np.mean(scores)
    print(f"Average Evaluation for model {model_dir}: {average_score}")




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
Average Evaluation for model ../models/rtf_small/id000017342868701547638784: 0.29873013278537963




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
Average Evaluation for model ../models/rtf_regular/id000017342890144858071040: 0.9169137818622868




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
Average Evaluation for model ../models/rtf_large/id000017342929846661560320: 0.8949550788769803


In [12]:
def print_tensor(model):
    """Print the name and size of every 2-D parameter of ``model``.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` has ``dim()`` and ``size()``.

    Returns:
        None. One line per 2-D parameter is written to stdout.
    """
    for name, param in model.named_parameters():
        # Only matrices are listed; parameters of any other rank are skipped
        if param.dim() == 2:
            print(name, param.size())

# Attempt to convert to sparse (Sparse tensors not allowed for REalTabFormer)
def convert_to_sparse(model):
    """Experimental attempt to turn the 2-D parameters of ``model`` into sparse tensors.

    NOTE(review): as written this function does not change ``model``. The sparse
    tensor is assigned to the loop variable only and is never written back, so the
    object returned is the same ``model`` with its parameters untouched.

    Args:
        model: Object exposing ``named_parameters()``.

    Returns:
        The same object that was passed in (``test`` is an alias, not a copy).
    """
    # Alias of the input; no copy is made
    test = model
    for name, param in test.named_parameters():
        if param.dim() == 2:  # Apply to weight matrices
            # Convert to sparse tensor
            # NOTE(review): this rebinds the local name `param` only; the result is
            # discarded on the next iteration and the model's parameter is unchanged.
            param  = param.data.to_sparse()
        
    return test

In [13]:
# Print the dotted name of every Conv1D submodule found under model.model
conv1d_type = transformers.pytorch_utils.Conv1D
conv1d_layer_names = [
    layer_name
    for layer_name, layer in model.model.named_modules()
    if isinstance(layer, conv1d_type)
]
for layer_name in conv1d_layer_names:
    print(layer_name)

transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.mlp.c_fc
transformer.h.2.mlp.c_proj
transformer.h.3.attn.c_attn
transformer.h.3.attn.c_proj
transformer.h.3.mlp.c_fc
transformer.h.3.mlp.c_proj
transformer.h.4.attn.c_attn
transformer.h.4.attn.c_proj
transformer.h.4.mlp.c_fc
transformer.h.4.mlp.c_proj
transformer.h.5.attn.c_attn
transformer.h.5.attn.c_proj
transformer.h.5.mlp.c_fc
transformer.h.5.mlp.c_proj


In [14]:
def compute_sparsity(model):
    """Measure the fraction of parameter entries in ``model`` that are exactly zero.

    Args:
        model: Object exposing ``parameters()``; each parameter must support
            ``numel()`` and ``(param == 0).sum().item()``.

    Returns:
        A ``(sparsity, total_params, zero_params)`` tuple, where ``sparsity`` is
        ``zero_params / total_params``. For a model without any parameters the
        sparsity is reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_params = 0
    zero_params = 0
    for param in model.parameters():
        total_params += param.numel()
        zero_params += (param == 0).sum().item()

    # Guard the division: an empty model has no entries to be sparse
    sparsity = zero_params / total_params if total_params else 0.0
    return sparsity, total_params, zero_params

# Report the sparsity of the currently loaded model's underlying network
sparsity, total_params, zero_params = compute_sparsity(model.model)
for report_line in (
    f"Sparsity: {sparsity * 100:.2f}%",
    f"Total: {total_params}",
    f"Zero: {zero_params}",
):
    print(report_line)

Sparsity: 0.00%
Total: 43435008
Zero: 0


In [15]:
# Persist whatever `model` is currently bound in the kernel to ../models/small/.
# NOTE(review): when the cells run top to bottom, `model` was last assigned in the
# evaluation loop over `models_dict`, whose final entry is "regular_model" - so this
# appears to write the regular model into a directory named "small". Confirm the
# intended model and target path before relying on the saved artefacts.
model.save("../models/small/")

Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model
