# Import Libraries

In [None]:
import os
import sys
import torch
import pprint
import pandas as pd
from collections import defaultdict
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForImageClassification,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM
)


# --- Path and Module Setup ---
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path: sys.path.append(src_path)

from utils import calculate_delta_parameters, calculate_parameters_size, calculate_compressed_size

from evaluation import evaluate_accuracy
from finetuner import fine_tune_model

from pipeline import compress_model, decompress_model
from runner import run_classification_experiment





# Configure pandas for better table display
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 140)

print("Setup complete.")


Setup complete.


# Debugging 

## Basic Testing

In [None]:
# Load Models
pretrained_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
finetuned_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-SST-2")

# Load and Prepare Dataset
eval_dataset = load_dataset("glue", "sst2", split="validation")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
def tokenize_function(e): return tokenizer(e["sentence"], padding="max_length", truncation=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataloader = DataLoader(tokenized_eval_dataset, batch_size=16)

print("Models and data loaded.")

In [None]:
# --- Compression ---
compressed_data = compress_model(
    pretrained_model,
    finetuned_model,
    patch_size=8,
    bit_strategy=[(2, 0.5), (0, 0.5)]
)

# --- Decompression ---
reconstructed_model = decompress_model(pretrained_model, compressed_data)
print("\nCompression and Decompression pipelines finished.")

In [None]:
# --- Accuracy Evaluation ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

original_accuracy = evaluate_model_accuracy(finetuned_model, eval_dataloader, device)
reconstructed_accuracy = evaluate_model_accuracy(reconstructed_model, eval_dataloader, device)

print("\n--- Accuracy Comparison ---")
print(f"Original fine-tuned model accuracy: {original_accuracy:.4f}")
print(f"Reconstructed model accuracy:   {reconstructed_accuracy:.4f}")
print(f"Accuracy drop: {(original_accuracy - reconstructed_accuracy):.4f}")

# --- Storage Size Evaluation ---
uncompressed_delta_weights = calculate_delta_parameters(pretrained_model, finetuned_model)
original_delta_size = calculate_parameters_size(uncompressed_delta_weights)
compressed_delta_size = calculate_compressed_size(compressed_data)

print("\n--- Storage Size Comparison ---")
print(f"Original delta parameters size:  {original_delta_size:.2f} MB")
print(f"Compressed delta data size:      {compressed_delta_size:.2f} MB")
if compressed_delta_size > 0:
    print(f"Compression Ratio: {(original_delta_size / compressed_delta_size):.2f}x")

In [None]:
all_results = []
device_to_use = "cpu"

# --- Experiment Definitions ---
pretrained_id = "roberta-base"
finetuned_variants = [
    "textattack/roberta-base-SST-2",
    # You can add other RoBERTa models fine-tuned on SST-2 here if you find them
]
patch_sizes = [8, 16, 32]
bit_strategies = [
    [(2, 0.5), (0, 0.5)],
    [(4, 0.5), (0, 0.5)]
]

# --- Main Loop ---
for finetuned_id in finetuned_variants:
    for p_size in patch_sizes:
        for bit_strat in bit_strategies:
            result = run_classification_experiment(
                pretrained_model_id=pretrained_id,
                finetuned_model_id=finetuned_id,
                patch_size=p_size,
                bit_strategy=bit_strat,
                device=device_to_use
            )
            all_results.append(result)

In [None]:
# In model_evaluation.ipynb (Cell 2)

# This list defines the groups of experiments we want to run.
experiments_config_groups = [
    {
        "group_name": "RoBERTa-SST2",
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        # --- Define lists of hyperparameters to test ---
        "patch_sizes": [8, 16, 32],
        "bit_strategies": [
            [(2, 0.5), (0, 0.5)],
            [(4, 0.5), (0, 0.5)]
        ]
    },
    {
        "group_name": "ViT-CIFAR10",
        # --- FIX IS HERE: Corrected the typo from 'in2k' to 'in21k' ---
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_sizes": [16],
        "bit_strategies": [
            [(2, 0.5), (0, 0.5)]
        ]
    }
]

In [None]:
# In model_evaluation.ipynb (Cell 3)

all_results = []
device_to_use = "cpu"

# Loop through each group of experiments
for experiment_group in experiments_config_groups:
    # Loop through each hyperparameter combination
    for p_size in experiment_group["patch_sizes"]:
        for bit_strat in experiment_group["bit_strategies"]:
            
            # --- Create the final, flat config for this specific run ---
            config = {
                "pretrained_model_id": experiment_group["pretrained_model_id"],
                "finetuned_model_id": experiment_group["finetuned_model_id"],
                "model_class": experiment_group["model_class"],
                "task_info": experiment_group["task_info"],
                "patch_size": p_size,      # Use the singular key 'patch_size'
                "bit_strategy": bit_strat  # Use the singular key 'bit_strategy'
            }

            try:
                result = run_classification_experiment(
                    config=config,
                    device=device_to_use
                )
                all_results.append(result)
            except Exception as e:
                print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
                print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")

In [None]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")

## Multimodal Testing

In [2]:
experiments_config = [
    {
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    {
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    {
    "pretrained_model_id": "distilbert-base-uncased",
    "finetuned_model_id": "distilbert-base-uncased-finetuned-sst-2-english",
    "model_class": AutoModelForSequenceClassification,
    "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
    "patch_size": 16,
    "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
        # --- NEW: Swin Transformer (Image Classification) Experiment ---
    {
        "pretrained_model_id": "microsoft/swin-tiny-patch4-window7-224",
        "finetuned_model_id":  "rs127/swin-tiny-patch4-window7-224-finetuned-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16, # You can experiment with other patch sizes like 8 or 32
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }

]

In [3]:
all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")


Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.29it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.39it/s]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.74it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.75it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.que

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:06<00:00,  1.11it/s]

Releasing models from memory...


All experiments have been completed.





In [4]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")


--- Experiment Results ---


Unnamed: 0,model_name,transform,dwt_coeffs_kept,jpeg_quant,patch_size,bit_strategy,original_accuracy,reconstructed_accuracy,accuracy_drop,original_delta_mb,compressed_delta_mb,compression_ratio
0,roberta-base-SST-2,dct,all,False,16,"[(2, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,126.481941,3.759361
1,vit-base-patch16-224-cifar10,dct,all,False,16,"[(2, 0.5), (0, 0.5)]",0.995,0.985,0.01,327.325233,88.122108,3.714451
2,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,16,"[(2, 0.5), (0, 0.5)]",0.91,0.89,0.02,255.413094,68.691658,3.718255
3,swin-tiny-patch4-window7-224-finetuned-cifar10,dct,all,False,16,"[(2, 0.5), (0, 0.5)]",0.97,0.945,0.025,105.117249,27.76413,3.786081


In [None]:
from huggingface_hub import list_models

def find_huggingface_models(search_query: str, limit: int = 10):
    """
    Searches the Hugging Face Hub for models matching a query and prints a sorted list.

    Args:
        search_query (str): The term to search for (e.g., "swin tiny cifar10").
        limit (int): The maximum number of results to display.
    """
    print(f"--- Searching for models matching: '{search_query}' ---")
    
    # list_models returns a generator of models. We sort them by download count.
    models = list(list_models(
        search=search_query,
        sort="downloads",
        direction=-1,
        limit=limit
    ))
    
    if not models:
        print("No models found.")
        return

    print(f"Found {len(models)} models (sorted by popularity):\n")
    for model in models:
        print(f"ID: {model.modelId}")
        print(f"  Task: {model.pipeline_tag} | Downloads: {model.downloads}\n")

# --- Run a search to find a Swin Transformer ---
# This will find popular Swin Tiny models fine-tuned on CIFAR-10
find_huggingface_models("swin tiny cifar10")

## DWT

In [None]:
experiments_config = [
    # --- RoBERTa Experiment with DCT ---
    {
        "transform_type": "dct",
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    # --- RoBERTa Experiment with DWT ---
    {
        "transform_type": "dwt",
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16, # Patch sizes that are powers of 2 are good for DWT
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- ViT Experiment with DCT ---
    {
        "transform_type": "dct",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    # --- ViT Experiment with DWT ---
    {
        "transform_type": "dwt",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }
]

In [None]:
all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Transform: {config.get('transform_type')}")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")

In [None]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")

## Table Quantization

In [None]:
experiments_config = [
    # --- RoBERTa Experiment 1: DCT without JPEG Quantization (Baseline) ---
    {
        "transform_type": "dct",
        "use_jpeg_quantization": False, # JPEG feature is OFF
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 64, # Using 16x16 patch for a direct comparison
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    
    # --- RoBERTa Experiment 2: DCT with JPEG Quantization ---
    {
        "transform_type": "dct",
        "use_jpeg_quantization": True,  # JPEG feature is ON
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 64,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },


    # --- ViT Experiment with DCT ---
    {
        "transform_type": "dct",
        "use_jpeg_quantization": False, # JPEG feature is OFF
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 64,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    # --- ViT Experiment with DWT ---
    {
        "transform_type": "dct",
        "use_jpeg_quantization": True,  # JPEG feature is ON
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 64,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }

    # # --- RoBERTa Experiment 3: DWT without JPEG Quantization ---
    # {
    #     "transform_type": "dwt",
    #     "use_jpeg_quantization": False, # JPEG feature is OFF
    #     "pretrained_model_id": "roberta-base",
    #     "finetuned_model_id": "textattack/roberta-base-SST-2",
    #     "model_class": AutoModelForSequenceClassification,
    #     "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
    #     "patch_size": 32,  # Using 16x16 patch for a direct comparison
    #     "bit_strategy": [(2, 0.5), (0, 0.5)]
    # },
    
    # # --- RoBERTa Experiment 4: DWT with JPEG Quantization ---
    # {
    #     "transform_type": "dwt",
    #     "use_jpeg_quantization": True,  # JPEG feature is ON
    #     "pretrained_model_id": "roberta-base",
    #     "finetuned_model_id": "textattack/roberta-base-SST-2",
    #     "model_class": AutoModelForSequenceClassification,
    #     "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
    #     "patch_size": 32,
    #     "bit_strategy": [(2, 0.5), (0, 0.5)]
    # }
]

In [None]:
all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        # The runner will now check for the 'use_jpeg_quantization' key
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Transform: {config.get('transform_type')}, JPEG: {config.get('use_jpeg_quantization')}")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")

In [None]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")

In [None]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")

## Post Transform

In [None]:
experiments_config = [
    # --- Experiment 1: DCT with Pre-Transform Importance (Baseline) ---
    {
        "transform_type": "dct",
        "importance_mode": "pre", # The original method
        "use_jpeg_quantization": False,
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    
    # --- Experiment 2: DCT with Post-Transform Importance (New Innovation) ---
    {
        "transform_type": "dct",
        "importance_mode": "post", # The new method
        "use_jpeg_quantization": False,
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- Experiment 3: DWT with Pre-Transform Importance (Baseline) ---
        {
        "transform_type": "dwt",
        "importance_mode": "pre", # The original method
        "use_jpeg_quantization": False,
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    
    # --- Experiment 4: DWT with Post-Transform Importance (New Innovation) ---
    {
        "transform_type": "dwt",
        "importance_mode": "post", # The new method
        "use_jpeg_quantization": False,
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }
]

In [None]:
all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        # The runner will now check for the 'use_jpeg_quantization' key
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Transform: {config.get('transform_type')}, JPEG: {config.get('use_jpeg_quantization')}")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")

In [None]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")

## Multi coeffient DWT

In [None]:
experiments_config = [
    # --- Baseline DCT Experiment ---
    {
        "transform_type": "dct",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "pretrained_model_id": "roberta-base",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 32,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- DWT Strategy 1: Keep ALL coefficients (Less compression, higher accuracy) ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "all",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "pretrained_model_id": "roberta-base",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 32,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    
    # --- DWT Strategy 2: Your Idea - Keep LL, LH, HL ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "ll_lh_hl",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "pretrained_model_id": "roberta-base",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 32,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- DWT Strategy 3: Keep only LL (Most aggressive compression) ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "ll_only",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "pretrained_model_id": "roberta-base",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
        "patch_size": 32,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }
]

In [4]:
experiments_config = [
    # --- ViT Experiment 1: DCT (Baseline) ---
    {
        "transform_type": "dct",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- ViT Experiment 2: DWT (Keep ALL coefficients) ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "all",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },
    
    # --- ViT Experiment 3: DWT (Keep LL, LH, HL) ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "ll_lh_hl",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    },

    # --- ViT Experiment 4: DWT (Keep only LL) ---
    {
        "transform_type": "dwt",
        "dwt_coeffs_to_keep": "ll_only",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test"},
        "patch_size": 16,
        "bit_strategy": [(2, 0.5), (0, 0.5)]
    }
]

In [5]:
all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Config: {config}")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")


Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.20s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.19s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]


Releasing models from memory...


All experiments have been completed.


In [6]:
if all_results:
    results_df = pd.DataFrame(all_results)
    print("\n--- All Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")


--- All Experiment Results ---


Unnamed: 0,model_name,transform,dwt_coeffs_kept,jpeg_quant,patch_size,bit_strategy,original_accuracy,reconstructed_accuracy,accuracy_drop,original_delta_mb,compressed_delta_mb,compression_ratio
0,vit-base-patch16-224-cifar10,dct,all,False,16,"[(2, 0.5), (0, 0.5)]",0.995,0.985,0.01,327.325233,88.122108,3.714451
1,vit-base-patch16-224-cifar10,dwt,all,False,16,"[(2, 0.5), (0, 0.5)]",0.995,0.995,0.0,327.325233,95.715858,3.41976
2,vit-base-patch16-224-cifar10,dwt,ll_lh_hl,False,16,"[(2, 0.5), (0, 0.5)]",0.995,0.995,0.0,327.325233,72.934608,4.487927
3,vit-base-patch16-224-cifar10,dwt,ll_only,False,16,"[(2, 0.5), (0, 0.5)]",0.995,0.84,0.155,327.325233,27.372108,11.958349


In [None]:
from huggingface_hub import list_models

def find_huggingface_models(search_query: str, limit: int = 100):
    """
    Searches the Hugging Face Hub for models matching a query and prints a sorted list.

    Args:
        search_query (str): The term to search for (e.g., "swin tiny cifar10").
        limit (int): The maximum number of results to display.
    """
    print(f"--- Searching for models matching: '{search_query}' ---")
    
    # list_models returns a generator of models. We sort them by download count.
    models = list(list_models(
        search=search_query,
        sort="downloads",
        direction=-1,
        limit=limit
    ))
    
    if not models:
        print("No models found.")
        return

    print(f"Found {len(models)} models (sorted by popularity):\n")
    for model in models:
        print(f"ID: {model.modelId}")
        print(f"  Task: {model.pipeline_tag} | Downloads: {model.downloads}\n")

# --- Run a search to find a Swin Transformer ---
# This will find popular Swin Tiny models fine-tuned on CIFAR-10
find_huggingface_models("vit-base-patch32-224-in21k")

## Fine-tuning

In [None]:
# --- Cell 2: Define and Run a Fine-Tuning Job ---
# Example 1: Fine-tuning RoBERTa on SST-2
roberta_config = {
    "base_model_id": "roberta-base",
    "model_class": AutoModelForSequenceClassification,
    "dataset_name": "glue",
    "dataset_config": "sst2",
    "text_column": "sentence",
    "label_column": "label",
    "validation_split": "validation",
    "output_dir": "../models/roberta-base-finetuned-sst2", # Where to save the new model
    "num_epochs": 1 # Use 1 epoch for a quick test
}

# Run the fine-tuning
fine_tune_model(roberta_config)


# Example 2: Fine-tuning ViT on CIFAR-10
# vit_config = {
#     "base_model_id": "google/vit-base-patch16-224-in21k",
#     "model_class": AutoModelForImageClassification,
#     "dataset_name": "cifar10",
#     "image_column": "img",
#     "label_column": "label",
#     "validation_split": "test",
#     "output_dir": "../models/vit-base-finetuned-cifar10",
#     "num_epochs": 1
# }

# fine_tune_model(vit_config)

--- Starting Fine-Tuning Job ---
Base Model: roberta-base
Dataset: glue
Dataset found. Available splits: ['train', 'validation', 'test']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting training...




Step,Training Loss
100,0.3543
200,0.0


KeyboardInterrupt: 

# Final Test

## Models 

In [48]:
models_to_test = [
    {
        "group_name": "RoBERTa-SST2",
        "pretrained_model_id": "roberta-base",
        "finetuned_model_id": "textattack/roberta-base-SST-2",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
    },
    {
        "group_name": "DistilBERT-SST2",
        "pretrained_model_id": "distilbert-base-uncased",
        "finetuned_model_id": "distilbert-base-uncased-finetuned-sst-2-english",
        "model_class": AutoModelForSequenceClassification,
        "task_info": {"name": "glue", "config": "sst2", "split": "validation", "text_column": "sentence"},
    },
    {
        "group_name": "ViT-CIFAR10",
        "pretrained_model_id": "google/vit-base-patch16-224-in21k",
        "finetuned_model_id": "nateraw/vit-base-patch16-224-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test", "image_column": "img"},
    },
    {
        "group_name": "Swin-CIFAR10",
        "pretrained_model_id": "microsoft/swin-tiny-patch4-window7-224",
        "finetuned_model_id":  "rs127/swin-tiny-patch4-window7-224-finetuned-cifar10",
        "model_class": AutoModelForImageClassification,
        "task_info": {"name": "cifar10", "config": None, "split": "test", "image_column": "img"},
    }
]

## DCT Only

### Configs

In [50]:
# Define the hyperparameters for the baseline experiments
# These are the variables we want to loop through.
patch_sizes_to_test = [8, 16, 32]
bit_strategies_to_test = [
    # Strategy 1: A more aggressive compression. 34% at 2 bits, 33% at 1 bits, 33% pruned.
    [(2, 0.34), (1, 0.33), (0, 0.33)],
    # Strategy 2: A more aggressive compression. 34% at 4 bits, 33% at 2 bits, 33% pruned.
    [(4, 0.34), (2, 0.33), (0, 0.33)]
]

# Automatically generate the full list of experiment configurations
experiments_config = []
for model_info in models_to_test:
    for p_size in patch_sizes_to_test:
        for bit_strat in bit_strategies_to_test:
            
            # Create a new config for each combination
            config = {
                # --- Baseline-specific settings ---
                "transform_type": "dct",        # Only test DCT for the baseline
                "use_jpeg_quantization": False, # JPEG feature is OFF
                "importance_mode": "pre",       # Standard importance scoring
                "dwt_coeffs_to_keep": "all",    # DWT setting (irrelevant for DCT, but good to have)

                # --- Model and task info ---
                "pretrained_model_id": model_info["pretrained_model_id"],
                "finetuned_model_id": model_info["finetuned_model_id"],
                "model_class": model_info["model_class"],
                "task_info": model_info["task_info"],
                
                # --- Hyperparameters for this specific run ---
                "patch_size": p_size,
                "bit_strategy": bit_strat
            }
            experiments_config.append(config)

# --- 4. (Optional) Print a summary ---
print(f"Total number of experiments configured: {len(experiments_config)}")

Total number of experiments configured: 24


### Run Experiments

In [51]:
dct_all_results = []
device_to_use = "cpu"

for config in experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        dct_all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")


Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.33it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.36it/s]


Releasing models from memory...


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.39it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.40it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


Releasing models from memory...


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.32it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.42it/s]


Releasing models from memory...


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.34it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.36it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.62it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.72it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.79it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.76it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.65it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.75it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.75it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.77it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.56it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.68it/s]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.19s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.19s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.24it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.25it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:06<00:00,  1.17it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.18it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipp

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipp

Evaluating Accuracy: 100%|██████████| 7/7 [00:06<00:00,  1.15it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.21it/s]

Releasing models from memory...


All experiments have been completed.





### Table

In [52]:
if dct_all_results:
    results_df = pd.DataFrame(dct_all_results)
    print("\n--- Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")


--- Experiment Results ---


Unnamed: 0,model_name,transform,dwt_coeffs_kept,jpeg_quant,patch_size,bit_strategy,original_accuracy,reconstructed_accuracy,accuracy_drop,original_delta_mb,compressed_delta_mb,compression_ratio
0,roberta-base-SST-2,dct,all,False,8,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.945,0.0,475.491219,143.092537,3.322963
1,roberta-base-SST-2,dct,all,False,8,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.945,0.945,0.0,475.491219,143.092537,3.322963
2,roberta-base-SST-2,dct,all,False,16,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.94,0.005,475.491219,126.481941,3.759361
3,roberta-base-SST-2,dct,all,False,16,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.945,0.945,0.0,475.491219,126.481941,3.759361
4,roberta-base-SST-2,dct,all,False,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.95,-0.005,475.491219,122.328743,3.886995
5,roberta-base-SST-2,dct,all,False,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.945,0.95,-0.005,475.491219,122.328743,3.886995
6,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,8,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.91,0.91,0.0,255.413094,77.583992,3.292085
7,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,8,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.905,0.005,255.413094,77.583992,3.292085
8,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,16,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.91,0.905,0.005,255.413094,68.691658,3.718255
9,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,16,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.905,0.005,255.413094,68.691658,3.718255


### Find the Best DCT Configs

In [53]:
# --- Step 1: Define your criteria for "best" ---
ACCURACY_DROP_THRESHOLD = 0.01  # 1%


# --- Step 2: The script with tiered logic ---

# Calculate accuracy drop
for result in dct_all_results:
    result['accuracy_drop'] = result['original_accuracy'] - result['reconstructed_accuracy']

# Group results
grouped_results = defaultdict(list)
for result in dct_all_results:
    grouped_results[result['model_name']].append(result)

# Find the best configuration for each group using the new tiered logic
best_configs = {}
for model_name, results_list in grouped_results.items():
    
    # Tier 1: Find configs that IMPROVED accuracy (accuracy_drop < 0)
    improving_configs = [config for config in results_list if config['accuracy_drop'] < 0]
    if improving_configs:
        print(f"Info: Found {len(improving_configs)} accuracy-improving config(s) for '{model_name}'.")
        # From the improvers, pick the one with the highest compression ratio
        best_config = max(improving_configs, key=lambda x: x['compression_ratio'])
        best_configs[model_name] = best_config
        continue # Move to the next model

    # Tier 2: If no improvers, find configs with acceptable accuracy drop (0 <= drop <= threshold)
    acceptable_configs = [config for config in results_list if 0 <= config['accuracy_drop'] <= ACCURACY_DROP_THRESHOLD]
    if acceptable_configs:
        print(f"Info: Found {len(acceptable_configs)} acceptable config(s) for '{model_name}'.")
        # From the acceptable ones, pick the one with the highest compression ratio
        best_config = max(acceptable_configs, key=lambda x: x['compression_ratio'])
        best_configs[model_name] = best_config
        continue

    # Tier 3: If none of the above, find the 'least harmful' option (minimum positive drop)
    print(f"Warning: No config for '{model_name}' met the ideal criteria. Finding the 'least harmful' option.")
    least_harmful_config = min(results_list, key=lambda x: x['accuracy_drop'])
    best_configs[model_name] = least_harmful_config


# --- Step 4: Print the results cleanly ---
print("\n--- Best DCT Configuration per Model (Improved Logic) ---")
pprint.pprint(best_configs)

Info: Found 2 accuracy-improving config(s) for 'roberta-base-SST-2'.
Info: Found 5 acceptable config(s) for 'distilbert-base-uncased-finetuned-sst-2-english'.
Info: Found 6 acceptable config(s) for 'vit-base-patch16-224-cifar10'.
Info: Found 1 accuracy-improving config(s) for 'swin-tiny-patch4-window7-224-finetuned-cifar10'.

--- Best DCT Configuration per Model (Improved Logic) ---
{'distilbert-base-uncased-finetuned-sst-2-english': {'accuracy_drop': 0.005000054836273193,
                                                     'bit_strategy': '[(4, '
                                                                     '0.34), '
                                                                     '(2, '
                                                                     '0.33), '
                                                                     '(0, '
                                                                     '0.33)]',
                                                     'co

## Innovations

### Configs

In [54]:
import ast
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForImageClassification
)

# --- Step 2: Define the innovations you want to test ---
innovations_to_test = {
    "Post-Transform": {"importance_mode": "post"},
    "Quantization-Table": {"use_jpeg_quantization": True},
    "DWT-all": {"transform_type": "dwt", "dwt_coeffs_to_keep": "all"},
    "DWT-ll_lh_hl": {"transform_type": "dwt", "dwt_coeffs_to_keep": "ll_lh_hl"},
    "DWT-ll_only": {"transform_type": "dwt", "dwt_coeffs_to_keep": "ll_only"},
}

# --- Step 3: Automatically generate the new experiment configurations ---
innovation_experiments_config = []

for model_meta in models_to_test:
    # Generate the key used in 'best_configs' from the fine-tuned model ID
    model_key = model_meta['finetuned_model_id'].split('/')[-1]

    if model_key in best_configs:
        best_params = best_configs[model_key]
        
        # Safely convert the bit_strategy string from the results back to a Python list
        best_bit_strategy = ast.literal_eval(best_params['bit_strategy'])

        # Create a new experiment config for each innovation
        for innovation_name, innovation_params in innovations_to_test.items():
            
            # Start with a base configuration using all metadata and best hyperparameters
            config = {
                "pretrained_model_id": model_meta["pretrained_model_id"],
                "finetuned_model_id": model_meta["finetuned_model_id"],
                "model_class": model_meta["model_class"],
                "task_info": model_meta["task_info"],
                "patch_size": best_params['patch_size'],
                "bit_strategy": best_bit_strategy,
                
                # Default settings that can be overridden by the innovation
                "transform_type": "dct",
                "use_jpeg_quantization": False,
                "importance_mode": "pre",
            }
            
            # Apply the specific innovation for this run
            config.update(innovation_params)
            
            innovation_experiments_config.append(config)

# --- Step 4: (Optional) Print a summary ---
print(f"Total number of new experiments to run: {len(innovation_experiments_config)}")


Total number of new experiments to run: 20


### Run Experiments

In [55]:
innovation_all_results = []
device_to_use = "cpu"

for config in innovation_experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        innovation_all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")


Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.40it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
JPEG-style quantization ENABLED.
Starting model compression using transform: DCT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for laye

Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.42it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


Releasing models from memory...


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.35it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.36it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.38it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.32it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.74it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
JPEG-style quantization ENABLED.
Starting model compression using transform: DCT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.49it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.58it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.79it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.80it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.81it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.75it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.78it/s]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
JPEG-style quantization ENABLED.
Starting model compression using transform: DCT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.atten

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.20s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.15s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.17s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.16s/it]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
JPEG-style quantization ENABLED.
Starting model compression using transform: DCT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skippi

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.20it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]


Releasing models from memory...


All experiments have been completed.


### Table

In [56]:
if innovation_all_results:
    results_df = pd.DataFrame(innovation_all_results)
    print("\n--- Experiment Results ---")
    display(results_df)
else:
    print("No results to display. Please check for errors in the previous cell.")


--- Experiment Results ---


Unnamed: 0,model_name,transform,dwt_coeffs_kept,jpeg_quant,patch_size,bit_strategy,original_accuracy,reconstructed_accuracy,accuracy_drop,original_delta_mb,compressed_delta_mb,compression_ratio
0,roberta-base-SST-2,dct,all,False,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.95,-0.005,475.491219,122.328743,3.886995
1,roberta-base-SST-2,dct,all,True,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.91,0.035,475.491219,122.328743,3.886995
2,roberta-base-SST-2,dwt,all,False,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.925,0.02,475.491219,125.099495,3.800904
3,roberta-base-SST-2,dwt,ll_lh_hl,False,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.925,0.02,475.491219,94.621223,5.025207
4,roberta-base-SST-2,dwt,ll_only,False,32,"[(2, 0.34), (1, 0.33), (0, 0.33)]",0.945,0.68,0.265,475.491219,33.66468,14.124335
5,distilbert-base-uncased-finetuned-sst-2-english,dct,all,False,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.905,0.005,255.413094,66.468575,3.842614
6,distilbert-base-uncased-finetuned-sst-2-english,dct,all,True,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.895,0.015,255.413094,66.468575,3.842614
7,distilbert-base-uncased-finetuned-sst-2-english,dwt,all,False,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.865,0.045,255.413094,67.95063,3.758804
8,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_lh_hl,False,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.885,0.025,255.413094,51.648018,4.945264
9,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_only,False,32,"[(4, 0.34), (2, 0.33), (0, 0.33)]",0.91,0.84,0.07,255.413094,19.042793,13.412586


## DWT Only

### Configs

In [None]:
patch_sizes_to_test = [16, 32, 64]
bit_strategies_to_test = [
    # Strategy 1: 50% of patches at 2 bits, 50% pruned (0 bits). This is the main setting from the Delta-DCT paper.
    [(2, 0.5), (0, 0.5)],
    # Strategy 2: A slightly higher fidelity version for comparison. 50% at 4 bits, 50% pruned.
    [(4, 0.5), (0, 0.5)]

]


# --- 3. Automatically generate the full list of experiment configurations ---
dwt_experiments_config = []
for model_info in models_to_test:
    for p_size in patch_sizes_to_test:
        for bit_strat in bit_strategies_to_test:
            
            # Create a new config for each combination
            config = {
                # --- Baseline-specific settings ---
                "transform_type": "dwt",        # Only test DWT for the baseline
                "use_jpeg_quantization": False, # JPEG feature is OFF
                "importance_mode": "pre",       # Standard importance scoring
                "dwt_coeffs_to_keep": "ll_lh_hl",    # DWT setting 

                # --- Model and task info ---
                "pretrained_model_id": model_info["pretrained_model_id"],
                "finetuned_model_id": model_info["finetuned_model_id"],
                "model_class": model_info["model_class"],
                "task_info": model_info["task_info"],
                
                # --- Hyperparameters for this specific run ---
                "patch_size": p_size,
                "bit_strategy": bit_strat
            }
            dwt_experiments_config.append(config)

# --- 4. (Optional) Print a summary ---
print(f"Total number of experiments configured: {len(dwt_experiments_config)}")

Total number of experiments configured: 24


### Run Experiments

In [46]:
dwt_all_results = []
device_to_use = "cpu"

for config in dwt_experiments_config:
    try:
        result = run_classification_experiment(
            config=config,
            device=device_to_use
        )
        dwt_all_results.append(result)
    except Exception as e:
        print(f"\n!!!!!! An error occurred during experiment: {config.get('finetuned_model_id')} !!!!!!")
        print(f"Error: {e}\n")

print("\n\nAll experiments have been completed.")

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.32it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.35it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Compressing layer: roberta.embeddings.token_type_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.value.bias'. Storing unco

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.36it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.35it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.30it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.37it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.37it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.37it/s]


Releasing models from memory...


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.39it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.36it/s]


Releasing models from memory...

Loading models: roberta-base and textattack/roberta-base-SST-2


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream 

Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: roberta.embeddings.word_embeddings.weight...
Compressing layer: roberta.embeddings.position_embeddings.weight...
Skipping compression for layer 'roberta.embeddings.token_type_embeddings.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'roberta.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.query.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.query.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.key.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attention.self.key.bias'. Storing uncompressed.
Compressing layer: roberta.encoder.layer.0.attention.self.value.weight...
Skipping compression for layer 'roberta.encoder.layer.0.attent

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.40it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.39it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.57it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.68it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.72it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.68it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.73it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.63it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.71it/s]


Releasing models from memory...

Loading models: distilbert-base-uncased and distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: glue
Starting model compression using transform: DWT...
Compressing layer: distilbert.embeddings.word_embeddings.weight...
Compressing layer: distilbert.embeddings.position_embeddings.weight...
Skipping compression for layer 'distilbert.embeddings.LayerNorm.weight'. Storing uncompressed.
Skipping compression for layer 'distilbert.embeddings.LayerNorm.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.q_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.q_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.k_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.k_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.transformer.layer.0.attention.v_lin.weight...
Skipping compression for layer 'distilbert.transformer.layer.0.attention.v_lin.bias'. Storing uncompressed.
Compressing layer: distilbert.tr

Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.71it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:02<00:00,  2.62it/s]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.23s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.20s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.23s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.23s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.25s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.25s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.25s/it]


Releasing models from memory...

Loading models: google/vit-base-patch16-224-in21k and nateraw/vit-base-patch16-224-cifar10


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'vit.embeddings.cls_token'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.position_embeddings'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'vit.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.query.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.query.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.key.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.key.bias'. Storing uncompressed.
Compressing layer: vit.encoder.layer.0.attention.attention.value.weight...
Skipping compression for layer 'vit.encoder.layer.0.attention.attention.value.bias'. Stori

Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.25s/it]
Evaluating Accuracy: 100%|██████████| 7/7 [00:08<00:00,  1.24s/it]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.18it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.21it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipping compression for layer 'swin.en

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.20it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:06<00:00,  1.16it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipp

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.21it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.relative_position_index...
Compressing layer: swin.encoder.layers.0.blocks.0.attention.self.query.weight...
Skipp

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.22it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_index'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.at

Evaluating Accuracy: 100%|██████████| 7/7 [00:06<00:00,  1.12it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


Releasing models from memory...

Loading models: microsoft/swin-tiny-patch4-window7-224 and rs127/swin-tiny-patch4-window7-224-finetuned-cifar10


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataloader for dataset: cifar10
Starting model compression using transform: DWT...
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.patch_embeddings.projection.bias'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.weight'. Storing uncompressed.
Skipping compression for layer 'swin.embeddings.norm.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.weight'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.layernorm_before.bias'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table'. Storing uncompressed.
Skipping compression for layer 'swin.encoder.layers.0.blocks.0.attention.self.relative_position_index'. Storing uncompressed.
Compressing layer: swin.encoder.layers.0.blocks.0.at

Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.18it/s]
Evaluating Accuracy: 100%|██████████| 7/7 [00:05<00:00,  1.20it/s]

Releasing models from memory...


All experiments have been completed.





### Table

In [47]:
# Print the results for DWT experiments
dwt_all_results
if dwt_all_results:
    results_df = pd.DataFrame(dwt_all_results)
    print("\n--- DWT Experiment Results ---")
    display(results_df)
else:
    print("No results to display for DWT experiments. Please check for errors in the previous cell.")



--- DWT Experiment Results ---


Unnamed: 0,model_name,transform,dwt_coeffs_kept,jpeg_quant,patch_size,bit_strategy,original_accuracy,reconstructed_accuracy,accuracy_drop,original_delta_mb,compressed_delta_mb,compression_ratio
0,roberta-base-SST-2,dwt,ll_lh_hl,False,16,"[(2, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,104.315926,4.558184
1,roberta-base-SST-2,dwt,ll_lh_hl,False,16,"[(4, 0.5), (0, 0.5)]",0.945,0.94,0.005,475.491219,104.315926,4.558184
2,roberta-base-SST-2,dwt,ll_lh_hl,False,32,"[(2, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,94.621223,5.025207
3,roberta-base-SST-2,dwt,ll_lh_hl,False,32,"[(4, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,94.621223,5.025207
4,roberta-base-SST-2,dwt,ll_lh_hl,False,64,"[(2, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,92.232292,5.155366
5,roberta-base-SST-2,dwt,ll_lh_hl,False,64,"[(4, 0.5), (0, 0.5)]",0.945,0.935,0.01,475.491219,92.232292,5.155366
6,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_lh_hl,False,16,"[(2, 0.5), (0, 0.5)]",0.91,0.9,0.01,255.413094,56.835213,4.493923
7,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_lh_hl,False,16,"[(4, 0.5), (0, 0.5)]",0.91,0.9,0.01,255.413094,56.835213,4.493923
8,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_lh_hl,False,32,"[(2, 0.5), (0, 0.5)]",0.91,0.89,0.02,255.413094,51.648018,4.945264
9,distilbert-base-uncased-finetuned-sst-2-english,dwt,ll_lh_hl,False,32,"[(4, 0.5), (0, 0.5)]",0.91,0.89,0.02,255.413094,51.648018,4.945264


### Find the Best DWT Configs

In [49]:
# --- Step 1: Define your criteria for "best" ---
ACCURACY_DROP_THRESHOLD = 0.01  # 1%


# --- Step 2: The script with tiered logic ---

# Calculate accuracy drop
for result in dwt_all_results:
    result['accuracy_drop'] = result['original_accuracy'] - result['reconstructed_accuracy']

# Group results
grouped_results = defaultdict(list)
for result in dwt_all_results:
    grouped_results[result['model_name']].append(result)

# Find the best configuration for each group using the new tiered logic
best_configs = {}
for model_name, results_list in grouped_results.items():
    
    # Tier 1: Find configs that IMPROVED accuracy (accuracy_drop < 0)
    improving_configs = [config for config in results_list if config['accuracy_drop'] < 0]
    if improving_configs:
        print(f"Info: Found {len(improving_configs)} accuracy-improving config(s) for '{model_name}'.")
        # From the improvers, pick the one with the highest compression ratio
        best_config = max(improving_configs, key=lambda x: x['compression_ratio'])
        best_configs[model_name] = best_config
        continue # Move to the next model

    # Tier 2: If no improvers, find configs with acceptable accuracy drop (0 <= drop <= threshold)
    acceptable_configs = [config for config in results_list if 0 <= config['accuracy_drop'] <= ACCURACY_DROP_THRESHOLD]
    if acceptable_configs:
        print(f"Info: Found {len(acceptable_configs)} acceptable config(s) for '{model_name}'.")
        # From the acceptable ones, pick the one with the highest compression ratio
        best_config = max(acceptable_configs, key=lambda x: x['compression_ratio'])
        best_configs[model_name] = best_config
        continue

    # Tier 3: If none of the above, find the 'least harmful' option (minimum positive drop)
    print(f"Warning: No config for '{model_name}' met the ideal criteria. Finding the 'least harmful' option.")
    least_harmful_config = min(results_list, key=lambda x: x['accuracy_drop'])
    best_configs[model_name] = least_harmful_config


# --- Step 4: Print the results cleanly ---
print("\n--- Best DWT Configuration per Model (Improved Logic) ---")
pprint.pprint(best_configs)

Info: Found 6 acceptable config(s) for 'roberta-base-SST-2'.
Info: Found 6 acceptable config(s) for 'vit-base-patch16-224-cifar10'.
Info: Found 4 acceptable config(s) for 'swin-tiny-patch4-window7-224-finetuned-cifar10'.

--- Best DWT Configuration per Model (Improved Logic) ---
{'distilbert-base-uncased-finetuned-sst-2-english': {'accuracy_drop': 0.010000050067901611,
                                                     'bit_strategy': '[(2, '
                                                                     '0.5), '
                                                                     '(0, '
                                                                     '0.5)]',
                                                     'compressed_delta_mb': 56.83521270751953,
                                                     'compression_ratio': 4.493923421757553,
                                                     'dwt_coeffs_kept': 'll_lh_hl',
                                               