Create compressed versions of the GPT-2 and BERT models to save memory =)

## SST-2 example (variables below are still named `cola`)

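The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 0 to 5. **Note:** this paragraph describes STS-B, but the code below actually loads the GLUE `sst2` configuration, whose examples are single sentences with a binary label (0 or 1).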

In [1]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import pickle

In [2]:
from transformers import Trainer
import evaluate as ev
import numpy as np

from transformers import TrainingArguments, Trainer, EvalPrediction

from datasets import load_dataset
import pandas as pd

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

In [3]:
# NOTE: despite the `cola` in the variable name, this loads the GLUE `sst2` configuration.
dataset_cola = load_dataset('glue', 'sst2')
# Last expression of the cell: shows the number of rows per split.
dataset_cola.num_rows

{'train': 67349, 'validation': 872, 'test': 1821}

In [4]:
dataset_cola['train'][5]

{'sentence': "that 's far too tragic to merit such superficial treatment ",
 'label': 0,
 'idx': 5}

In [5]:
label_list = dataset_cola["train"].features["label"].names
num_labels = len(label_list)


In [6]:
set(pd.DataFrame(dataset_cola['train']).label)

{0, 1}

In [7]:
import torch
torch.manual_seed(0)
task_num_labels = num_labels

from transformers import AutoConfig, BertConfig, AutoModelForSequenceClassification, AutoTokenizer

path_name = r"bert-base-uncased"

def load_model(path_name, num_labels):
    """Load a sequence-classification model and its tokenizer from one checkpoint.

    Args:
        path_name: Checkpoint name or path handed to every ``from_pretrained`` call.
        num_labels: Number of labels written into the model config.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_config = AutoConfig.from_pretrained(path_name, num_labels=num_labels)
    classifier = AutoModelForSequenceClassification.from_pretrained(path_name, config=model_config)
    text_tokenizer = AutoTokenizer.from_pretrained(path_name)
    return classifier, text_tokenizer

In [8]:
model, tokenizer = load_model(path_name, num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#model.load_state_dict(torch.load("/home/jovyan/shares/SR004.nfs2/chekalina/FisherKronecker/bert_cola/bert_cola_lora.pth"))

In [10]:
part_of = ["bert.encoder.layer.1.intermediate.dense","bert.encoder.layer.2.intermediate.dense","bert.encoder.layer.3.intermediate.dense","bert.encoder.layer.4.intermediate.dense","bert.encoder.layer.5.intermediate.dense", "bert.encoder.layer.6.intermediate.dense"]
#peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, target_modules=part_of, lora_alpha=16, lora_dropout=0.1)
#model = get_peft_model(model, peft_config)
#model.print_trainable_parameters()

In [11]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys['sst2']

In [12]:
sentence1_key, sentence2_key

('sentence', None)

In [13]:
#torch.manual_seed(0)
def preprocess_function(examples):
    """Tokenize a batch of examples and carry the labels through.

    Uses the notebook-level ``tokenizer`` and picks single-sentence or
    sentence-pair tokenization depending on whether the notebook-level
    ``sentence2_key`` is ``None``. Sequences are truncated and padded to a
    fixed length of 128.

    Args:
        examples: Batch mapping that contains the ``sentence1_key`` column
            (and ``sentence2_key`` when it is set) plus a ``"label"`` column.

    Returns:
        The tokenizer output with an added ``"label"`` entry.
    """
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])

    encoded = tokenizer(*texts, max_length=128, truncation=True, padding="max_length")
    encoded["label"] = examples["label"]
    return encoded

In [14]:
dataset_cola['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [15]:
tokenized_dataset = dataset_cola.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=False
        )

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [17]:
metric = ev.load("glue", 'sst2')

Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Mon Jun 24 18:32:28 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


In [18]:
import numpy as np
import matplotlib.pyplot as plt

In [19]:
def compute_metrics(p: EvalPrediction):
    """Turn raw evaluation predictions into the metric dictionary.

    The logits are arg-maxed over axis 1 and scored with the notebook-level
    ``metric`` object; the mean of all returned metric values is added under
    ``"combined_score"``.

    The previous ``if True: ... else: ...`` wrapper was removed: its ``else``
    branch (a hand-rolled accuracy) could never run.

    Args:
        p: Evaluation output exposing ``predictions`` (an array, or a tuple
            whose first element is the array) and ``label_ids``.

    Returns:
        The dict produced by ``metric.compute`` plus ``"combined_score"``.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_labels = np.argmax(logits, axis=1)

    result = metric.compute(predictions=predicted_labels, references=p.label_ids)
    result["combined_score"] = np.mean(list(result.values())).item()
    return result

## collect lora grads

In [20]:
# LoRA targets: the feed-forward "intermediate" dense layer of encoder layers 1..11
# (layer 0 is not in the list). Generated instead of typed out to avoid copy/paste slips.
part_of = [f"bert.encoder.layer.{layer_idx}.intermediate.dense" for layer_idx in range(1, 12)]
# Rank-1 adapters (r=1) on exactly those modules.
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=1, target_modules=part_of, lora_alpha=16, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 43,778 || all params: 109,527,556 || trainable%: 0.0400


In [21]:
from functools import reduce
def get_module_by_name(module, access_string):
    """Resolve a dotted attribute path starting from ``module``.

    Args:
        module: Object to start the lookup from.
        access_string: Attribute names joined by ``.`` (e.g. ``"a.b.c"``).

    Returns:
        The object reached after following every attribute in order.

    Raises:
        AttributeError: If any attribute along the path does not exist.
    """
    target = module
    for attribute in access_string.split('.'):
        target = getattr(target, attribute)
    return target

In [22]:
class CustomTrainer(Trainer):
    """Trainer that records the LoRA factor gradients of the layers in ``part_of``.

    After every backward pass, the gradients of ``lora_A.default.weight`` and
    ``lora_B.default.weight`` of each targeted module are copied to host
    memory as NumPy arrays and appended to ``self.grads_A[name]`` /
    ``self.grads_B[name]``. ``make_grad_bank`` must be called before
    training so that those containers exist.
    """

    def make_grad_bank(self):
        """Create one empty gradient list per targeted layer and reset the step counter."""
        self.grads_A = dict()
        self.grads_B = dict()
        for name, _ in self.model.base_model.model.named_modules():
            if name in part_of:
                print("Init ::",name)
                self.grads_A[name] = []
                self.grads_B[name] = []
        self.avg_counter = 0

    def training_step(
        self, model, inputs, num_items_in_batch=None
    ) -> torch.Tensor:
        """Run one forward/backward pass and store the LoRA gradients it produced.

        Args:
            model: Model to train on this step.
            inputs: Batch; passed through ``self._prepare_inputs``.
            num_items_in_batch: Accepted for signature compatibility; not used.

        Returns:
            The detached loss divided by ``gradient_accumulation_steps``.
        """
        model.train()
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()

        inputs = self._prepare_inputs(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs).mean()

        del inputs

        if self.use_apex:
            # NOTE(review): `amp` is not imported anywhere in the visible notebook,
            # so this branch would raise NameError if apex were enabled.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)

        # Gradients are read right after backward, while they are still populated.
        # The module yielded by named_modules() is used directly instead of
        # re-resolving it by its dotted name.
        for name, module in self.model.base_model.model.named_modules():
            if name in part_of:
                # .copy() detaches the stored array from the live gradient buffer
                # (on CPU, .cpu().numpy() would otherwise share memory with .grad).
                self.grads_A[name].append(module.lora_A.default.weight.grad.detach().cpu().numpy().copy())
                self.grads_B[name].append(module.lora_B.default.weight.grad.detach().cpu().numpy().copy())

        # Count collected steps: one increment per training step, not one per module.
        self.avg_counter += 1

        return loss.detach() / self.args.gradient_accumulation_steps

In [23]:
# Hyper-parameters for the single-epoch LoRA run used to collect gradients.
training_args = TrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=1,
    # NOTE(review): newer transformers releases call this argument `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    skip_memory_metrics = False,
    eval_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    save_steps=1000,
    overwrite_output_dir=True,
    # NOTE: the directory name says "stsb", but the dataset loaded in this notebook is SST-2.
    output_dir="./bert_stsb_128",
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    seed=297104,
    report_to='none',
    )



In [24]:
trainer_collect = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )
trainer_collect.make_grad_bank()

[2025-03-24 00:16:25,133] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/jovyan/.mlspace/envs/kurkin_clone2/bin/../lib/gcc/x86_64-conda-linux-gnu/12.4.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/jovyan/.mlspace/envs/kurkin_clone2/bin/../lib/gcc/x86_64-conda-linux-gnu/12.4.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Init :: bert.encoder.layer.1.intermediate.dense
Init :: bert.encoder.layer.2.intermediate.dense
Init :: bert.encoder.layer.3.intermediate.dense
Init :: bert.encoder.layer.4.intermediate.dense
Init :: bert.encoder.layer.5.intermediate.dense
Init :: bert.encoder.layer.6.intermediate.dense
Init :: bert.encoder.layer.7.intermediate.dense
Init :: bert.encoder.layer.8.intermediate.dense
Init :: bert.encoder.layer.9.intermediate.dense
Init :: bert.encoder.layer.10.intermediate.dense
Init :: bert.encoder.layer.11.intermediate.dense


In [None]:
trainer_collect.train()
    

In [26]:
# Snapshot the trained LoRA factors of every targeted layer as host-side NumPy arrays.
list_A = {}
list_B = {}
for name, module in trainer_collect.model.base_model.model.named_modules():
    if name in part_of:
        # `module` is already the targeted layer, so no second lookup by name is needed.
        list_A[name] = module.lora_A.default.weight.detach().cpu().numpy()
        list_B[name] = module.lora_B.default.weight.detach().cpu().numpy()

In [27]:
trainer_collect.evaluate()

{'eval_loss': 0.24791806936264038,
 'eval_accuracy': 0.908256880733945,
 'eval_combined_score': 0.908256880733945}

## collect factor grads

In [28]:

with open('/home/jovyan/shares/SR004.nfs2/chekalina/FisherKronecker/bert_cola/lora/lora_grads_A.pickle', 'wb') as f:
    pickle.dump(trainer_collect.grads_A, f)

with open('/home/jovyan/shares/SR004.nfs2/chekalina/FisherKronecker/bert_cola/lora/lora_grads_B.pickle', 'wb') as f:
    pickle.dump(trainer_collect.grads_B, f)

## collect factor weights

In [29]:
with open('/home/jovyan/shares/SR004.nfs2/chekalina/FisherKronecker/bert_cola/lora/lora_A.pickle', 'wb') as f:
    pickle.dump(list_A, f)

with open('/home/jovyan/shares/SR004.nfs2/chekalina/FisherKronecker/bert_cola/lora/lora_B.pickle', 'wb') as f:
    pickle.dump(list_B, f)

In [30]:
w_before_merge = trainer_collect.model.base_model.model.bert.encoder.layer[7].intermediate.dense.base_layer.weight
w_before_merge.shape

torch.Size([3072, 768])

In [31]:
w_before_merge[:5, :5]

tensor([[ 0.0360, -0.0048, -0.0168,  0.0016,  0.0768],
        [-0.1223, -0.0474,  0.0138,  0.0363, -0.0208],
        [ 0.0302,  0.0256, -0.0531,  0.0263,  0.0047],
        [ 0.0569, -0.0259, -0.0287, -0.0365, -0.0070],
        [ 0.0372, -0.0503, -0.0198,  0.0090,  0.0521]], device='cuda:0')

In [32]:
#merged_model = trainer_collect.model.unload()
#trainer_collect.evaluate()

In [33]:
# `merged_model` is not referenced again in the visible cells; the evaluation below
# still goes through `trainer_collect`.
# NOTE(review): if merge_and_unload() folds the LoRA update into the base weights in
# place, the later replace_layer(...) call adds its dW on top of already-merged
# weights — confirm this is intended.
merged_model = trainer_collect.model.merge_and_unload()
trainer_collect.evaluate()

{'eval_loss': 0.24791808426380157,
 'eval_accuracy': 0.908256880733945,
 'eval_combined_score': 0.908256880733945}

In [40]:
def get_factor_left(list_of_lora_factors):
    """Average outer product of the row sums of a list of gradient matrices.

    For each gradient ``G`` of shape ``(rows, cols)`` the row-sum vector
    ``s = G @ 1`` is formed; the result is the mean over all gradients of
    ``s s^T / cols**2``. This is the same quantity the earlier implementation
    obtained via a flatten + Fortran-order reshape (a transpose) followed by
    ``G @ ones @ G.T``, without building the dense all-ones matrix and
    without a Python loop.

    Args:
        list_of_lora_factors: Non-empty sequence of equally shaped 2-D arrays.

    Returns:
        A flat float64 array of length ``rows * rows`` (the matrix is
        symmetric, so C- and Fortran-order reshapes of it agree).

    Raises:
        ValueError: If the sequence is empty or the arrays are not 2-D /
            not equally shaped.
    """
    if len(list_of_lora_factors) == 0:
        raise ValueError("list_of_lora_factors must contain at least one gradient")

    grads = np.stack([np.asarray(grad) for grad in list_of_lora_factors])
    k, rows, cols = grads.shape  # fails with ValueError unless every gradient is 2-D
    # Same debug line as before: k, column count, row count of the input gradients.
    print ("k, m, n", k, cols, rows)

    # Accumulate in float64, as the previous float64 ones-matrix product did.
    row_sums = grads.sum(axis=2, dtype=np.float64)      # shape (k, rows)
    factor = (row_sums.T @ row_sums) / (k * cols * cols)  # shape (rows, rows)
    return factor.ravel()

def get_factor_right(list_of_lora_factors):
    """Average outer product of the column sums of a list of gradient matrices.

    For each gradient ``G`` of shape ``(rows, cols)`` the column-sum vector
    ``c = G.T @ 1`` is formed; the result is the mean over all gradients of
    ``c c^T / rows**2``. This is the same quantity the earlier implementation
    obtained via a flatten + Fortran-order reshape (a transpose) followed by
    ``G.T @ ones @ G``, without building the dense all-ones matrix and
    without a Python loop.

    Args:
        list_of_lora_factors: Non-empty sequence of equally shaped 2-D arrays.

    Returns:
        A flat float64 array of length ``cols * cols`` (the matrix is
        symmetric, so C- and Fortran-order reshapes of it agree).

    Raises:
        ValueError: If the sequence is empty or the arrays are not 2-D /
            not equally shaped.
    """
    if len(list_of_lora_factors) == 0:
        raise ValueError("list_of_lora_factors must contain at least one gradient")

    grads = np.stack([np.asarray(grad) for grad in list_of_lora_factors])
    k, rows, cols = grads.shape  # fails with ValueError unless every gradient is 2-D
    # Same debug line as before: k, column count, row count of the input gradients.
    print ("k, m, n", k, cols, rows)

    # Accumulate in float64, as the previous float64 ones-matrix product did.
    col_sums = grads.sum(axis=1, dtype=np.float64)      # shape (k, cols)
    factor = (col_sums.T @ col_sums) / (k * rows * rows)  # shape (cols, cols)
    return factor.ravel()

In [35]:
def is_pos_def(x):
    """Return whether every eigenvalue of ``x`` is strictly positive.

    Real symmetric input is handled with ``np.linalg.eigvalsh``, which always
    returns real eigenvalues and is cheaper than the general solver. Any
    other input falls back to ``np.linalg.eigvals``, as before.

    Args:
        x: Square matrix (array-like).

    Returns:
        NumPy boolean: ``True`` when all eigenvalues are ``> 0``.

    Raises:
        numpy.linalg.LinAlgError: If ``x`` is not a square matrix.
    """
    x = np.asarray(x)
    is_real_symmetric = (
        x.ndim == 2
        and x.shape[0] == x.shape[1]
        and np.isrealobj(x)
        and np.allclose(x, x.T)
    )
    if is_real_symmetric:
        eigenvalues = np.linalg.eigvalsh(x)
    else:
        eigenvalues = np.linalg.eigvals(x)
    return np.all(eigenvalues > 0)

In [36]:
def replace_layer(model_base, layer_name, list_factors_A, list_factors_B, A, B):
    """Add a re-factored low-rank update to one BERT dense weight, in place.

    Two square factor matrices are built from the collected gradients
    (``get_factor_right`` on ``list_factors_A``, ``get_factor_left`` on
    ``list_factors_B``). Each is blended with the identity until
    ``is_pos_def`` accepts it, then Cholesky-factorised. The Cholesky factors
    are applied to ``B`` and ``A``, and the product is rebuilt through
    QR + SVD into ``dW``, which is added to the selected weight.

    Args:
        model_base: Model exposing ``bert.encoder.layer[i]`` and ``device``.
        layer_name: Dotted module name. The fourth dot-separated field is
            parsed as the encoder layer index; the substring
            ``"intermediate"`` selects which dense weight is modified.
        list_factors_A: Sequence of 2-D gradient arrays (the caller passes
            ``trainer_collect.grads_A[layer_name]``).
        list_factors_B: Sequence of 2-D gradient arrays (the caller passes
            ``trainer_collect.grads_B[layer_name]``).
        A: 2-D array (the caller passes the saved ``lora_A`` weight).
        B: 2-D array (the caller passes the saved ``lora_B`` weight);
            ``B @ A`` must be defined.

    Returns:
        None. The model weight is modified in place and diagnostics are printed.
    """
    # Plain low-rank update, kept for the printed comparison and the else-branch below.
    dW_old = B@A
    print ("\n\n\n")
    
    # Only `n` of the A-gradient shape and `m` of the B-gradient shape are used
    # as the side lengths of the square factor matrices.
    m, n = list_factors_A[0].shape
    R = get_factor_right(list_factors_A).reshape(n, n, order='F')
    m, n = list_factors_B[0].shape
    LT = get_factor_left(list_factors_B).reshape(m, m, order='F')
    
    # Blend LT with the identity in steps of 0.1 until is_pos_def accepts it,
    # so that the Cholesky factorisation below can run.
    alpha = 0.0
    LT_new = LT
    while (not is_pos_def(LT_new)):
        alpha += 0.1
        print (alpha)
        LT_new  = (1 - alpha)*LT  + alpha*np.eye(len(np.diag(LT)))
    

    # Same identity blending for R.
    alpha = 0.0
    R_new = R
    while (not is_pos_def(R_new)):
        alpha += 0.1
        print (alpha)
        R_new = (1 - alpha)*R  + alpha*np.eye(len(np.diag(R)))
    
    LT_square = np.linalg.cholesky(LT_new)
    R_square = np.linalg.cholesky(R_new)
    # QR of the two weighted factors, then an SVD of the small core rb @ ra.T.
    qb, rb = np.linalg.qr(LT_square@B)
    qa, ra = np.linalg.qr(R_square.T@A.T)
    print (rb.shape)
    print (ra.T.shape)
    U, S, Vt = np.linalg.svd(rb@ra.T)
    U_new = qb@U
    Vt_new = Vt@(qa.T)
    # NOTE(review): the inverse-factor step is disabled here; the two lines below
    # are the original commented-out alternative.
    #Vt_new = Vt @ np.linalg.inv(R_square)
    #U_new = np.linalg.inv(LT_square) @ U

    # NOTE(review): no singular values are dropped, so algebraically
    # dW == LT_square @ B @ A @ R_square (the weighted update, not un-weighted)
    # — confirm this is intended.
    dW = U_new[:,:len(S)]@np.diag(S)@Vt_new
    print ("dw old", dW_old[:5, :5])
    print ("dw new", dW[:5, :5])
    # Layer index is the 4th field of e.g. "bert.encoder.layer.6.intermediate.dense".
    i = int(layer_name.split('.')[3])
    with torch.no_grad():
        if "intermediate" in layer_name:
            # NOTE(review): dW comes from float64 NumPy math; the in-place add relies on
            # torch casting it to the weight's dtype — TODO confirm.
            model_base.bert.encoder.layer[i].intermediate.dense.weight += torch.tensor(dW).to(model_base.device)
        else:
            # NOTE(review): this branch adds dW_old (plain B @ A), not the re-factored dW.
            model_base.bert.encoder.layer[i].output.dense.weight += torch.tensor(dW_old).to(model_base.device)

In [37]:
len(trainer_collect.grads_A["bert.encoder.layer.7.intermediate.dense"])

753

In [43]:
reduced_part_of = ["bert.encoder.layer.7.intermediate.dense", "bert.encoder.layer.5.intermediate.dense", "bert.encoder.layer.6.intermediate.dense", "bert.encoder.layer.2.intermediate.dense", "bert.encoder.layer.4.intermediate.dense", "bert.encoder.layer.3.intermediate.dense", "bert.encoder.layer.1.intermediate.dense"]
#part_of = ["bert.encoder.layer.1.intermediate.dense","bert.encoder.layer.2.intermediate.dense","bert.encoder.layer.3.intermediate.dense","bert.encoder.layer.4.intermediate.dense","bert.encoder.layer.5.intermediate.dense", "bert.encoder.layer.6.intermediate.dense", "bert.encoder.layer.7.intermediate.dense", "bert.encoder.layer.8.intermediate.dense", "bert.encoder.layer.9.intermediate.dense", "bert.encoder.layer.10.intermediate.dense", "bert.encoder.layer.11.intermediate.dense", "bert.encoder.layer.1.output.dense","bert.encoder.layer.2.output.dense","bert.encoder.layer.3.output.dense","bert.encoder.layer.4.output.dense","bert.encoder.layer.5.output.dense", "bert.encoder.layer.6.output.dense", "bert.encoder.layer.7.output.dense", "bert.encoder.layer.8.output.dense", "bert.encoder.layer.9.output.dense", "bert.encoder.layer.10.output.dense", "bert.encoder.layer.11.output.dense"]

# Apply the re-factored update; the list literal restricts this run to encoder layer 6.
for layer_name in ["bert.encoder.layer.6.intermediate.dense"]:
    print ("layer_name", layer_name)
    # Per-step gradients recorded by CustomTrainer for this layer.
    list_factors_A = trainer_collect.grads_A[layer_name]
    list_factors_B = trainer_collect.grads_B[layer_name]
    # Saved LoRA factors: P is the lora_A weight, Q is the lora_B weight.
    P = list_A[layer_name]
    Q = list_B[layer_name]
    # Modifies trainer_collect.model in place (see replace_layer).
    replace_layer(trainer_collect.model, layer_name, list_factors_A, list_factors_B, P, Q)

layer_name bert.encoder.layer.6.intermediate.dense




k, m, n 753 768 1
k, m, n 753 1 3072
0.1
0.1
(1, 1)
(1, 1)
dw old [[ 3.4540356e-04 -1.9559047e-04 -5.4897678e-06  2.9698669e-04
   5.4308380e-06]
 [-3.1800839e-04  1.8007752e-04  5.0543549e-06 -2.7343165e-04
  -5.0000995e-06]
 [-5.0901499e-04  2.8823817e-04  8.0901718e-06 -4.3766393e-04
  -8.0033287e-06]
 [ 5.4320780e-04 -3.0760039e-04 -8.6336249e-06  4.6706377e-04
   8.5409474e-06]
 [-4.2654601e-06  2.4153871e-06  6.7794275e-08 -3.6675501e-06
  -6.7066544e-08]]
dw new [[ 3.42352887e-05 -1.90654874e-05 -1.90534513e-06  2.97840165e-05
  -7.59697035e-07]
 [-3.15627614e-05  1.75771682e-05  1.75660717e-06 -2.74589711e-05
   7.00392406e-07]
 [-5.05961476e-05  2.81767804e-05  2.81589923e-06 -4.40176364e-05
   1.12275213e-06]
 [ 5.37904071e-05 -2.99556501e-05 -2.99367390e-06  4.67965784e-05
  -1.19363424e-06]
 [-5.00203512e-07  2.78561219e-07  2.78385362e-08 -4.35167052e-07
   1.10997494e-08]]


In [42]:
#trainer = Trainer(
#        model=model,
#        args=training_args,
#        train_dataset=tokenized_dataset["train"],
#        eval_dataset=tokenized_dataset["validation"],
#        compute_metrics = compute_metrics,
#)
trainer_collect.evaluate()

{'eval_loss': 0.2479955404996872,
 'eval_accuracy': 0.908256880733945,
 'eval_combined_score': 0.908256880733945}

In [None]:
model.bert.encoder.layer[1].intermediate.dense.weight[:5,:5]