In [1]:
import pandas as pd
from transformers.pytorch_utils import Conv1D,prune_conv1d_layer,find_pruneable_heads_and_indices
import torch
import torch.nn as nn
from torch.nn.utils import prune
import transformers.pytorch_utils
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
import src.data_processing as dp
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from realtabformer import REaLTabFormer

In [0]:
# def prune_conv1d(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D:
#     index = index.to(layer.weight.device)  # Ensure the index tensor is on the same device as the layer
#     W = layer.weight.index_select(dim, index).clone().detach()  # Prune weights along the specified dimension
# 
#     # Adjust bias if present, ensuring it matches the pruned weight size
#     if layer.bias is not None:
#         b = layer.bias[index].clone().detach()
#     else:
#         b = None  # No bias in the layer
# 
#     # Calculate new size after pruning
#     new_size = list(layer.weight.size())
#     new_size[dim] = len(index)  # Adjust the pruned dimension to match the selected indices
# 
#     # Create a new Conv1D layer with the pruned size
#     new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
#     new_layer.weight.requires_grad = False
#     new_layer.weight.copy_(W.contiguous())
#     new_layer.weight.requires_grad = True
# 
#     # If there's a bias, copy it as well
#     if b is not None:
#         new_layer.bias.requires_grad = False
#         new_layer.bias.copy_(b.contiguous())
#         new_layer.bias.requires_grad = True
# 
#     return new_layer

In [22]:

def prune_heads_custom(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)


def apply_structured_pruning(model):
    pruned_layers = []
    
    for name, module in model.named_modules():
        if isinstance(module, Conv1D):
            pruned_layers.append(name)
    
    for name in pruned_layers:
        if 'transformer.h.' in name:
            block_index = int(name.split('.')[2])
            
            block = model.transformer.h[block_index]
            
            if 'attn' in name:
                test = [0,1,2,3]
                prune_heads_custom(block.attn,test)
            # elif 'mlp' in name:
            #     if 'c_fc' in name:
            #         block.mlp.c_fc = pruned_layer
            #     elif 'c_proj' in name:
            #         block.mlp.c_proj = pruned_layer
        
        # print(f"Replaced layer {name} with pruned weight shape: {pruned_layer.weight.shape}")

In [5]:
def compute_attention_scores(model, dataset):
    # Example method to calculate attention scores
    attention_scores = {i: [] for i in range(model.num_heads)}
    
    for input_data in dataset:
        attention_weights = model.get_attention_weights(input_data)  # Retrieve attention weights
        for head_idx in range(model.num_heads):
            attention_scores[head_idx].append(torch.mean(attention_weights[head_idx]).item())
    
    return attention_scores

def print_tensor(model):
    for name, param in model.named_parameters():
        if param.dim() == 2:
            print(name,param.size())

def convert_to_sparse(model, pruned_layers):
    for name, param in model.named_parameters():
        # Check if the layer is in the pruned list
        if any(layer_name in name for layer_name in pruned_layers):
            if param.dim() == 2:  # Sparse only weight matrices
                sparse_param = param.to_sparse()
                param.data = sparse_param

def check_layer_names(model):
    for name, module in model.named_modules():
        print(f"Layer Name: {name}, Module: {module.__class__.__name__}")

In [6]:
train_data, test_data, sample_data = dp.csv_data_split("../data/breast-cancer-wisconsin.csv")
my_metadata_dict = dp.metadata("../data/cancer_metadata.json")

In [17]:
rtf_model = REaLTabFormer.load_from_dir("../models/rtf_small_copy/id000017342868701547638784")

In [8]:
compute_attention_scores(rtf_model.model,train_data)

AttributeError: 'GPT2LMHeadModel' object has no attribute 'num_heads'

In [18]:
for i, block in enumerate(rtf_model.model.transformer.h):
    num_heads = block.mlp
    print(f"Layer {i} has {num_heads} attention heads.")

Layer 0 has GPT2MLP(
  (c_fc): Conv1D()
  (c_proj): Conv1D()
  (act): NewGELUActivation()
  (dropout): Dropout(p=0.1, inplace=False)
) attention heads.
Layer 1 has GPT2MLP(
  (c_fc): Conv1D()
  (c_proj): Conv1D()
  (act): NewGELUActivation()
  (dropout): Dropout(p=0.1, inplace=False)
) attention heads.
Layer 2 has GPT2MLP(
  (c_fc): Conv1D()
  (c_proj): Conv1D()
  (act): NewGELUActivation()
  (dropout): Dropout(p=0.1, inplace=False)
) attention heads.
Layer 3 has GPT2MLP(
  (c_fc): Conv1D()
  (c_proj): Conv1D()
  (act): NewGELUActivation()
  (dropout): Dropout(p=0.1, inplace=False)
) attention heads.


In [19]:
synthetic_data = rtf_model.sample(n_samples=len(test_data))

quality = QualityReport()
diagnostic = DiagnosticReport()

quality.generate(test_data,synthetic_data,metadata=my_metadata_dict,verbose=False)
diagnostic.generate(test_data,synthetic_data,metadata=my_metadata_dict,verbose=False)

print(quality.get_properties())
print(diagnostic.get_properties())



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
             Property     Score
0       Column Shapes  0.937226
1  Column Pair Trends  0.916875
         Property     Score
0   Data Validity  0.998673
1  Data Structure  1.000000


In [24]:
print_tensor(rtf_model.model)

transformer.wte.weight torch.Size([156, 512])
transformer.wpe.weight torch.Size([1024, 512])
transformer.h.0.attn.c_attn.weight torch.Size([512, 768])
transformer.h.0.attn.c_proj.weight torch.Size([256, 512])
transformer.h.0.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.0.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.1.attn.c_attn.weight torch.Size([512, 768])
transformer.h.1.attn.c_proj.weight torch.Size([256, 512])
transformer.h.1.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.1.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.2.attn.c_attn.weight torch.Size([512, 768])
transformer.h.2.attn.c_proj.weight torch.Size([256, 512])
transformer.h.2.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.2.mlp.c_proj.weight torch.Size([2048, 512])
transformer.h.3.attn.c_attn.weight torch.Size([512, 768])
transformer.h.3.attn.c_proj.weight torch.Size([256, 512])
transformer.h.3.mlp.c_fc.weight torch.Size([512, 2048])
transformer.h.3.mlp.c_proj.weight torch.Size(

In [23]:
# indices = torch.LongTensor([i for i in range(0, 512, 2 )])  # Keep 100 out of 512 channels
apply_structured_pruning(rtf_model.model)

In [None]:
rtf_model.model

In [25]:
def compute_sparsity(model):
    total_params = 0
    zero_params = 0
    for param in model.parameters():
        total_params += param.numel()
        zero_params += (param == 0).sum().item()
    
    sparsity = zero_params / total_params
    return sparsity,total_params, zero_params

# Example usage
sparsity, total_params, zero_params = compute_sparsity(rtf_model.model)
print(f"Sparsity: {sparsity * 100:.2f}%")
print(f"Total: {total_params}")
print(f"Zero: {zero_params}")



Sparsity: 0.00%
Total: 11114496
Zero: 0


In [26]:
11114496/13214720

0.8410693529639675

In [27]:
def model_size(model):
    # Compute the total size of the model in bytes
    return sum(p.numel() * p.element_size() for p in model.parameters())

print(f"Original model size: {model_size(rtf_model.model)} bytes")

Original model size: 44457984 bytes


In [31]:
synthetic_data = rtf_model.sample(n_samples=(len(test_data)))

quality = QualityReport()
diagnostic = DiagnosticReport()

quality.generate(test_data,synthetic_data,metadata=my_metadata_dict,verbose=False)
diagnostic.generate(test_data,synthetic_data,metadata=my_metadata_dict,verbose=False)

print(quality.get_properties())
print(diagnostic.get_properties())



  0%|          | 0/137 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
             Property     Score
0       Column Shapes  0.891971
1  Column Pair Trends  0.803713
         Property     Score
0   Data Validity  0.998673
1  Data Structure  1.000000
