In [2]:
import pandas as pd
import transformers.pytorch_utils
import torch
import torch.nn as nn
from torch.nn.utils import prune
import transformers.pytorch_utils
import src.data_processing as dp
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from realtabformer import REaLTabFormer

In [3]:
# Split the breast-cancer CSV into three parts using the project helper.
train_data, test_data, sample_data = dp.csv_data_split("../data/breast-cancer-wisconsin.csv")
# Load the metadata that is later handed to the sdmetrics reports.
my_metadata_dict = dp.metadata("../data/cancer_metadata.json")
# Show test_data as the cell output.
test_data

Unnamed: 0,ID,CT,UCSi,UCSh,Madh,SECS,BN,BC,NN,Mi,Class
175,1001010,1,1,1,1,0,1,1,1,1,0
162,1198611,3,1,1,1,0,1,3,1,1,0
356,190561,1,3,0,1,3,1,0,1,1,0
488,1065899,1,1,1,1,0,1,3,1,1,0
409,1057938,3,1,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
181,1006811,10,5,6,10,6,10,7,7,10,1
448,1080058,1,1,1,1,0,1,1,0,1,0
112,1173035,3,3,0,1,0,3,3,1,1,0
557,183936,3,1,1,1,0,1,0,1,1,0


In [5]:
# Load a saved REaLTabFormer checkpoint from the rtf_small directory.
model = REaLTabFormer.load_from_dir("../models/rtf_small/id000017342868701547638784")

In [6]:
# Request one synthetic row per row of test_data.
synthetic_data = model.sample(n_samples=len(test_data))



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%


In [7]:
# Build an sdmetrics QualityReport comparing test_data with the synthetic sample,
# then show its per-property scores as the cell output.
quality = QualityReport()
quality.generate(test_data,synthetic_data,my_metadata_dict,verbose=False)
quality.get_properties()

Unnamed: 0,Property,Score
0,Column Shapes,0.489051
1,Column Pair Trends,0.60661


In [8]:
# Build an sdmetrics DiagnosticReport for the same real/synthetic pair,
# then show its per-property scores as the cell output.
diagnostic = DiagnosticReport()
diagnostic.generate(test_data,synthetic_data,my_metadata_dict,verbose=False)
diagnostic.get_properties()

Unnamed: 0,Property,Score
0,Data Validity,0.935634
1,Data Structure,1.0


In [None]:
# Display name -> checkpoint directory for each model iterated by the evaluation loop.
models_dict = {
    "small_model": "../models/rtf_small/id000017342868701547638784",
    "regular_model": "../models/rtf_regular/id000017342890144858071040",
    # The large model is currently left out of the comparison (entry commented out).
    # "large_model": "../models/rtf_large/id000017341472610579369984"
}

In [None]:
# Accumulator for one summary dict per evaluated model; it is appended to by the
# evaluation loop and converted to a DataFrame afterwards.
results = []

In [None]:
n_generations = 5

# Start from an empty list every time this cell runs. Without this, re-running the
# cell would append a second copy of every model's row to the summary table.
results = []


def _report_score(properties, property_name):
    """Return the 'Score' of the first row whose 'Property' equals `property_name`."""
    return properties.loc[properties['Property'] == property_name, 'Score'].values[0]


# Loop through different models
for model_name, model_path in models_dict.items():
    # Load the model
    model = REaLTabFormer.load_from_dir(model_path)

    # Per-generation scores for this model
    column_shapes_scores = []
    column_pair_trends_scores = []
    data_validity_scores = []
    data_structure_scores = []

    # Generate multiple synthetic datasets and compute scores
    for _ in range(n_generations):
        synthetic_data = model.sample(n_samples=len(test_data))

        # Generate quality and diagnostic reports
        quality = QualityReport()
        quality.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)
        diagnostic = DiagnosticReport()
        diagnostic.generate(test_data, synthetic_data, my_metadata_dict, verbose=False)

        # Fetch each properties table once and read every score from it
        quality_properties = quality.get_properties()
        diagnostic_properties = diagnostic.get_properties()

        column_shapes_scores.append(_report_score(quality_properties, 'Column Shapes'))
        column_pair_trends_scores.append(_report_score(quality_properties, 'Column Pair Trends'))
        data_validity_scores.append(_report_score(diagnostic_properties, 'Data Validity'))
        data_structure_scores.append(_report_score(diagnostic_properties, 'Data Structure'))

    # Calculate average scores
    avg_column_shapes = sum(column_shapes_scores) / n_generations
    avg_column_pair_trends = sum(column_pair_trends_scores) / n_generations
    avg_data_validity = sum(data_validity_scores) / n_generations
    avg_data_structure = sum(data_structure_scores) / n_generations

    # Weighted total: the two quality properties count 40% each,
    # the two diagnostic properties 10% each.
    avg_total_score = (
        0.40 * avg_column_shapes +
        0.40 * avg_column_pair_trends +
        0.10 * avg_data_validity +
        0.10 * avg_data_structure
    )

    # Append results
    results.append({
        "Model": model_name,
        "Avg Column Shapes": avg_column_shapes,
        "Avg Column Pair Trends": avg_column_pair_trends,
        "Avg Data Validity": avg_data_validity,
        "Avg Data Structure": avg_data_structure,
        "Avg Total Score": avg_total_score
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

results_df

In [None]:
# Keep the first rows of train_data in `temp` and display them.
temp = train_data.head()
temp

In [None]:
# Predict the 'Class' column for test_data with the 'Class' column dropped from the input.
# NOTE(review): `model` is whichever checkpoint was assigned most recently (the single
# load near the top, or the last iteration of the evaluation loop) — confirm which
# model this cell is meant to use.
predictions = model.predict(
    data=test_data.drop('Class', axis=1),
    target_col='Class',
    disable_progress_bar = True,
    fillunk=False,
    target_pos_val=1
)
predictions

In [None]:
# Show the first ten true 'Class' values of test_data.
test_data['Class'].head(10)

In [None]:
import numpy as np
from sdmetrics.single_table import BinaryDecisionTreeClassifier

def load_and_generate_synthetic_data(model_dir, n_samples, metadata):
    """Load a saved REaLTabFormer model and sample synthetic rows from it.

    Args:
        model_dir: Directory passed to ``REaLTabFormer.load_from_dir``.
        n_samples: Number of rows requested from ``model.sample``.
        metadata: Accepted for call-site symmetry; not used by this function.

    Returns:
        Whatever ``model.sample`` returns for ``n_samples`` rows.
    """
    return REaLTabFormer.load_from_dir(model_dir).sample(n_samples=n_samples)

def evaluate_model(test_data, synthetic_data, target, metadata):
    """Score synthetic data with sdmetrics' BinaryDecisionTreeClassifier metric.

    The synthetic data is passed as the metric's ``train_data`` and the real
    ``test_data`` as its ``test_data``.

    Args:
        test_data: Real rows passed as ``test_data``.
        synthetic_data: Synthetic rows passed as ``train_data``.
        target: Name of the target column.
        metadata: Metadata forwarded unchanged to the metric.

    Returns:
        The value returned by ``BinaryDecisionTreeClassifier.compute``.
    """
    compute_kwargs = {
        "test_data": test_data,
        "train_data": synthetic_data,
        "target": target,
        "metadata": metadata,
    }
    return BinaryDecisionTreeClassifier.compute(**compute_kwargs)

# Model directories
model_dirs = [
    "../models/rtf_small/id000017342868701547638784",
    "../models/rtf_regular/id000017342890144858071040",
    "../models/rtf_large/id000017342929846661560320"
]

# Number of runs
n_runs = 5

# Evaluate each model
for model_dir in model_dirs:
    # Load the checkpoint once per directory; only the sampling has to be repeated.
    # A separate name is used so the notebook-level `model` is not rebound here.
    loaded_model = REaLTabFormer.load_from_dir(model_dir)

    scores = []
    for _ in range(n_runs):
        # Generate synthetic data
        synthetic_data = loaded_model.sample(n_samples=len(test_data))

        # Evaluate the synthetic data
        evaluation_score = evaluate_model(test_data, synthetic_data, target='Class', metadata=my_metadata_dict)

        # Append score to list
        scores.append(evaluation_score)

    # Compute average score
    average_score = np.mean(scores)
    print(f"Average Evaluation for model {model_dir}: {average_score}")


In [21]:
def prune_conv1d_layer(layer, amount):
    """Structurally prune ``layer.weight`` along dim 1 and make the result permanent.

    Calls ``prune.ln_structured`` with norm order ``-inf`` on dimension 1, then
    ``prune.remove`` so that ``weight`` is a plain parameter again.

    Args:
        layer: Module that owns a ``weight`` parameter.
        amount: Pruning amount forwarded to ``prune.ln_structured``.
    """
    target_param = 'weight'
    prune.ln_structured(
        module=layer,
        name=target_param,
        amount=amount,
        n=float('-inf'),
        dim=1,
    )
    prune.remove(module=layer, name=target_param)
    
def apply_structured_pruning(model, amount):
    """Prune every transformers ``Conv1D`` submodule of ``model``.

    Args:
        model: Module whose ``named_modules()`` are scanned.
        amount: Pruning amount handed to ``prune_conv1d_layer`` for each match.
    """
    conv1d_layers = [
        submodule
        for _, submodule in model.named_modules()
        if isinstance(submodule, nn.Module)
        and isinstance(submodule, transformers.pytorch_utils.Conv1D)
    ]
    for conv1d_layer in conv1d_layers:
        prune_conv1d_layer(conv1d_layer, amount)
                
def print_tensor(model):
    """Print the name and size of every 2-D parameter of ``model``.

    Parameters with any other number of dimensions (e.g. 1-D biases) are skipped.

    Args:
        model: Module whose ``named_parameters()`` are inspected.
    """
    for name, param in model.named_parameters():
        if param.dim() == 2:
            print(name, param.size())

def convert_to_sparse(model):
    """Iterate over the 2-D parameters of ``model`` and return ``model``.

    NOTE(review): the assignment inside the loop only rebinds the local loop
    variable ``param`` to a new sparse tensor. Nothing is written back to the
    module, so the returned object is the same model with its parameters
    unchanged. If a sparse copy of the weights is wanted, the sparse tensors
    need to be stored somewhere (how is a design decision for the caller).
    """
    # `test` is just another name for the same model object, not a copy.
    test = model
    for name, param in test.named_parameters():
        if param.dim() == 2:  # Apply to weight matrices
            # Builds a sparse tensor, but only binds it to the local name `param`.
            param  = param.data.to_sparse()
        
    return test



In [22]:
# List the name and size of every 2-D parameter of the wrapped model.
print_tensor(model.model)

transformer.wte.weight torch.Size([156, 512])
transformer.wpe.weight torch.Size([1024, 512])
transformer.h.0.attn.c_attn.weight torch.Size([512, 1536])
transformer.h.0.attn.c_proj.weight torch.Size([512, 512])
transformer.h.0.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.0.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.1.attn.c_attn.weight torch.Size([512, 1536])
transformer.h.1.attn.c_proj.weight torch.Size([512, 512])
transformer.h.1.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.1.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.2.attn.c_attn.weight torch.Size([512, 1536])
transformer.h.2.attn.c_proj.weight torch.Size([512, 512])
transformer.h.2.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.2.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.3.attn.c_attn.weight torch.Size([512, 1536])
transformer.h.3.attn.c_proj.weight torch.Size([512, 512])
transformer.h.3.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.3.mlp.c_proj.weight torch.S

In [17]:
# NOTE(review): convert_to_sparse returns the object it was given, so `model_file`
# refers to the same object as `model.model`.
model_file = convert_to_sparse(model.model)

In [None]:
# Print the qualified name of every transformers Conv1D submodule of the wrapped model.
for name, module in model.model.named_modules():
    if not isinstance(module, transformers.pytorch_utils.Conv1D):
        continue
    print(name)

In [None]:
import torch

def compute_sparsity(model):
    """Measure what fraction of a model's parameter elements are exactly zero.

    Args:
        model: Module whose ``parameters()`` are counted.

    Returns:
        A tuple ``(sparsity, total_params, zero_params)`` where ``sparsity`` is
        ``zero_params / total_params``. A model with no parameters yields
        ``(0.0, 0, 0)`` instead of raising ``ZeroDivisionError``.
    """
    total_params = 0
    zero_params = 0
    for param in model.parameters():
        total_params += param.numel()
        zero_params += (param == 0).sum().item()

    # Guard the division so parameter-free modules are reported as 0% sparse.
    sparsity = zero_params / total_params if total_params else 0.0
    return sparsity, total_params, zero_params

# Example usage: report the sparsity of the wrapped model as a percentage,
# followed by the raw total and zero element counts.
sparsity, total_params, zero_params = compute_sparsity(model.model)
print(f"Sparsity: {sparsity * 100:.2f}%")
print(f"Total: {total_params}")
print(f"Zero: {zero_params}")



In [None]:
# Save the current `model` under ../models/small/.
model.save("../models/small/")

In [None]:
from torch.quantization import quantize_dynamic

def apply_quantization_to_conv1d(model):
    """Switch ``model`` to eval mode and return its dynamically quantized version.

    ``quantize_dynamic`` is called with ``dtype=torch.qint8`` and otherwise
    default arguments; no ``qconfig_spec`` is supplied, so the set of layer
    types that get quantized is whatever ``quantize_dynamic`` uses by default.

    Args:
        model: Module to quantize. It is put into evaluation mode in place.

    Returns:
        The module returned by ``quantize_dynamic``.
    """
    model.eval()
    return quantize_dynamic(model, dtype=torch.qint8)


In [None]:
# Quantize the wrapped model. A tensor's `dtype` attribute is read-only, so
# parameters cannot be re-typed by assignment (doing so raises AttributeError);
# the int8 conversion of supported layers is done by quantize_dynamic itself.
quantized_model = apply_quantization_to_conv1d(model.model)


In [None]:
# Write the quantized model's state dict to 'quantized_model.pt' (a relative path,
# so it is resolved against the current working directory).
torch.save(quantized_model.state_dict(), 'quantized_model.pt')


In [None]:
# List the 2-D parameters that the quantized model still exposes via named_parameters().
print_tensor(quantized_model)

In [None]:
# Display the wrapped model's repr as the cell output.
model.model

In [None]:
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic, default_dynamic_qconfig
from transformers.pytorch_utils import Conv1D

# Define a custom quantization configuration: both nn.Linear and transformers'
# Conv1D are assigned the default dynamic qconfig.
qconfig_spec = {
    nn.Linear: default_dynamic_qconfig,
    Conv1D: default_dynamic_qconfig,  # Add Conv1D for GPT2
}

# Start from PyTorch's default dynamic-quantization module mapping and add an
# entry that maps Conv1D to itself.
# NOTE(review): the mapping's values are presumably expected to be quantized
# module classes; Conv1D is a plain float layer, so this entry cannot produce a
# quantized Conv1D — confirm against the quantize_dynamic documentation.
from torch.quantization.quantization_mappings import get_default_dynamic_quant_module_mappings
custom_mapping = get_default_dynamic_quant_module_mappings()
custom_mapping[Conv1D] = Conv1D

# Apply dynamic quantization
def _conv1d_to_linear(conv):
    """Build an nn.Linear that computes the same affine map as a transformers Conv1D.

    Conv1D stores its weight as (in_features, out_features) while nn.Linear
    stores (out_features, in_features), so the weight is copied transposed.
    """
    in_features, out_features = conv.weight.shape
    linear = nn.Linear(
        in_features,
        out_features,
        device=conv.weight.device,
        dtype=conv.weight.dtype,
    )
    with torch.no_grad():
        linear.weight.copy_(conv.weight.t())
        linear.bias.copy_(conv.bias)
    return linear


def _swap_conv1d_for_linear(module):
    """Recursively replace every Conv1D child of `module` with an equivalent nn.Linear."""
    for child_name, child in module.named_children():
        if isinstance(child, Conv1D):
            setattr(module, child_name, _conv1d_to_linear(child))
        else:
            _swap_conv1d_for_linear(child)


def quantize_gpt2_model(model):
    """Return a dynamically quantized (int8) copy of a GPT-2 style model.

    Dynamic quantization has no quantized counterpart for transformers' Conv1D,
    and mapping Conv1D to itself does not provide one. The Conv1D layers of a
    private copy are therefore first rewritten as equivalent nn.Linear layers,
    which dynamic quantization does support.

    Args:
        model: Float model to quantize. It is not modified.

    Returns:
        A quantized copy of ``model``.
    """
    import copy  # local import keeps this cell self-contained

    quantizable = copy.deepcopy(model)
    _swap_conv1d_for_linear(quantizable)
    return quantize_dynamic(
        quantizable,
        qconfig_spec=qconfig_spec,
        mapping=custom_mapping,
        dtype=torch.qint8,  # Specify the desired dtype
        inplace=True,  # `quantizable` is already a private copy of the input
    )

# Example usage: quantize the wrapped model and keep the result in `quantized_model`
# (this rebinds the name used by the earlier quantization cell).
quantized_model = quantize_gpt2_model(model.model)


In [None]:
# Load the saved rtf_small weights through the same project-relative models
# directory the other cells use, instead of a machine-specific absolute path.
sparse_state_dict = torch.load("../models/rtf_small/id000017342868701547638784/rtf_model.pt")
