# Compressions

## 1) Quantization

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/finetuned-bertweet/pytorch/default/1/finetuned_from_old")
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [12]:


print(" non-Quantized model size:", sum(p.numel() for p in model.parameters()))

 non-Quantized model size: 52099072


In [8]:
from torch.ao.quantization import quantize_dynamic
model.eval().cpu()
qmodel = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
model = qmodel

print("\nQuantization complete. Model is ready for inference.")
print("Quantized model size:", sum(p.numel() for p in qmodel.parameters()))


Quantization complete. Model is ready for inference.
Quantized model size: 52099072


In [13]:
import os, shutil, torch, torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.ao.quantization import quantize_dynamic

SAVE_DIR = "/kaggle/working/compressed_quant_dynamic"
os.makedirs(SAVE_DIR, exist_ok=True)

# 1) Save quantized weights as a regular PyTorch file
torch.save(qmodel.state_dict(), os.path.join(SAVE_DIR, "quantized_state_dict.pt"))

# 2) Save tokenizer (for later preprocessing)
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
tok.save_pretrained(SAVE_DIR)

# 3) Zip for download
shutil.make_archive(SAVE_DIR, "zip", SAVE_DIR)
print("Saved + zipped to:", SAVE_DIR + ".zip")


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Saved + zipped to: /kaggle/working/compressed_quant_dynamic.zip


## 3) Pruning

In [19]:

from torch.nn.utils import prune
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/finetuned-bertweet/pytorch/default/1/finetuned_from_old")


model_to_prune = model
parameters_to_prune = [
    (module, 'weight')
    for module in model_to_prune.modules() if isinstance(module, nn.Linear)]

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.4)

print("\nPruning applied: 40% of weights zeroed out in Linear layers.")
print("Pruned model non-zero parameters:", sum((p != 0).sum().item() for p in model_to_prune.parameters()))


Pruning applied: 40% of weights zeroed out in Linear layers.
Pruned model non-zero parameters: 355364864


In [20]:
import os, shutil, torch, torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.ao.quantization import quantize_dynamic

SAVE_DIR = "/kaggle/working/compressed_prune"
os.makedirs(SAVE_DIR, exist_ok=True)

# 1) Save quantized weights as a regular PyTorch file
torch.save(qmodel.state_dict(), os.path.join(SAVE_DIR, "pruned_state_dict.pt"))

# 2) Save tokenizer (for later preprocessing)
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
tok.save_pretrained(SAVE_DIR)

# 3) Zip for download
shutil.make_archive(SAVE_DIR, "zip", SAVE_DIR)
print("Saved + zipped to:", SAVE_DIR + ".zip")



Saved + zipped to: /kaggle/working/compressed_prune.zip
