Create compressed versions of the GPT-2 and BERT models to save memory =)

## CoLA example

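The Semantic Textual Similarity Benchmark (STS-B; Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 0 to 5.

Note: the code below actually loads the GLUE `cola` task (single sentences with binary labels `{0, 1}`, evaluated with the Matthews correlation), not STS-B.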

In [1]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict this process to the GPU with index 0.
# NOTE(review): presumably this has to run before any CUDA-using library
# initializes the device — keep this cell first and confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [2]:
#!pip install Partial State

In [3]:
import transformers
print (transformers.__version__)

4.31.0


In [4]:
from sparse_grad_matrix_sparse import replace_bert_layers

In [5]:
def sparse_grad_linear(model, UV_dict):
    """Run ``replace_bert_layers`` on ``model`` and report progress on stdout.

    A status message is printed immediately before and immediately after the
    replacement call.

    Args:
        model: Model handed to ``replace_bert_layers``.
        UV_dict: Mapping forwarded unchanged to ``replace_bert_layers``.

    Returns:
        Whatever ``replace_bert_layers(model, UV_dict)`` returns.
    """
    print ("create bert with sparse grads")
    patched_model = replace_bert_layers(model, UV_dict)
    print ("created bert with sparse grads")
    return patched_model

In [6]:
def create_space_matrices(model):
    """Call ``create_UV()`` on the two dense sub-layers of every encoder layer.

    The encoder is looked up as ``model.bert.encoder`` first and as
    ``model.encoder`` second. For each entry of ``encoder.layer`` the method
    ``create_UV()`` is invoked on ``intermediate.dense`` and then on
    ``output.dense``.

    Args:
        model: Object exposing ``bert.encoder`` or ``encoder``.

    Returns:
        The same ``model`` object that was passed in.

    Raises:
        ValueError: If neither ``model.bert.encoder`` nor ``model.encoder`` exists.
    """
    has_bert_encoder = hasattr(model, "bert") and hasattr(model.bert, "encoder")
    if not has_bert_encoder and not hasattr(model, "encoder"):
        raise ValueError("Expected model to have attribute 'encoder' or 'bert.encoder'.")

    encoder = model.bert.encoder if has_bert_encoder else model.encoder

    for block in encoder.layer:
        # Same order as before: intermediate first, then output.
        for part_name in ("intermediate", "output"):
            getattr(block, part_name).dense.create_UV()

    return model

In [7]:
from datasets import load_dataset
import pandas as pd

dataset_cola = load_dataset('glue', 'cola')
dataset_cola.num_rows

Found cached dataset glue (/root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

{'train': 8551, 'validation': 1043, 'test': 1063}

In [8]:
label_list = dataset_cola["train"].features["label"].names
num_labels = len(label_list)


In [9]:
config

Available objects for config:
     AliasManager
     DisplayFormatter
     HistoryManager
     IPCompleter
     IPKernelApp
     InlineBackend
     LoggingMagics
     MagicsManager
     OSMagics
     PrefilterManager
     ScriptMagics
     StoreMagics
     ZMQInteractiveShell


In [10]:
set(pd.DataFrame(dataset_cola['train']).label)

{0, 1}

In [11]:
# NOTE(review): task_num_labels is not read anywhere else in the visible notebook.
task_num_labels = num_labels

from transformers import AutoConfig, BertConfig, AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint identifier shared by the config, model and tokenizer loads below.
path_name = r"bert-base-uncased"

# Config carrying the label count derived from the dataset's label feature.
config = AutoConfig.from_pretrained(
    path_name,
    num_labels=num_labels,
)

# Sequence-classification model built from that config; the recorded output reports
# classifier.weight / classifier.bias as newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    path_name,
    config=config,
)

# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(path_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
#model=model.to('cuda:2')

In [13]:
# Maps a GLUE task name to the dataset column(s) holding its text:
# (first_text_column, second_text_column or None for single-sentence tasks).
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
# Column names for this notebook's task; read by preprocess_function.
sentence1_key, sentence2_key = task_to_keys['cola']

In [14]:

def preprocess_function(examples):
    """Tokenize a batch of examples and carry their labels through.

    Reads the module-level ``tokenizer``, ``sentence1_key`` and
    ``sentence2_key``. Every sequence is truncated/padded to 128 tokens.

    Args:
        examples: Batch mapping column names to lists; must contain the text
            column(s) named by ``sentence1_key`` / ``sentence2_key`` and
            ``"label"``.

    Returns:
        The tokenizer output with an added ``"label"`` entry copied from
        ``examples["label"]``.
    """
    # Single-sentence tasks contribute one text column, pair tasks two.
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])

    # Call the tokenizer directly instead of ``batch_encode_plus``: the latter
    # accepts one positional batch argument, so a second text column would be
    # bound to ``add_special_tokens`` and the pair text silently ignored.
    # ``tokenizer(text, text_pair, ...)`` handles both the single and pair case.
    result = tokenizer(*texts, max_length=128, truncation=True, padding="max_length")

    result["label"] = examples["label"]
    return result

In [15]:
dataset_cola['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

In [16]:
# Apply preprocess_function to every split in batches. load_from_cache_file=False
# asks `datasets` to recompute rather than reuse a previously cached result.
tokenized_dataset = dataset_cola.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=False
        )

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
tokenized_dataset['validation']

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1043
})

In [18]:
from transformers import Trainer
import evaluate as ev
import numpy as np

from transformers import TrainingArguments, Trainer, EvalPrediction

2023-09-20 21:07:24.667922: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [19]:
metric = ev.load("glue", 'cola')

In [20]:
import numpy as np
import matplotlib.pyplot as plt

In [21]:
def compute_metrics(p: EvalPrediction):
    """Score predictions with the module-level ``metric`` and add a combined score.

    Args:
        p: Evaluation output; ``p.predictions`` is either the logits array or
            a tuple whose first element is the logits, and ``p.label_ids``
            holds the reference labels.

    Returns:
        The dict returned by ``metric.compute`` with an extra
        ``"combined_score"`` key: the mean of all values that dict held before
        the key was added.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_labels = np.argmax(logits, axis=1)

    result = metric.compute(predictions=predicted_labels, references=p.label_ids)
    # The former ``if True: ... else: <accuracy>`` wrapper was removed: its
    # else-branch could never execute.
    result["combined_score"] = np.mean(list(result.values())).item()
    return result

In [22]:
# Settings common to both training phases; each phase adds one extra option below.
shared_training_kwargs = dict(
    learning_rate=5e-5,
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    save_steps=1000,
    overwrite_output_dir=True,
    output_dir="./bert_stsb_128",
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    seed=297104,
    report_to='none',
)

# Used by the TrainerBert1 run: capped at 31 steps (per the Transformers docs,
# a positive max_steps overrides num_train_epochs).
training_args1 = TrainingArguments(max_steps=31, **shared_training_kwargs)

# Used by the TrainerDoubleOpt run: a full epoch with memory metrics reported.
training_args2 = TrainingArguments(skip_memory_metrics=False, **shared_training_kwargs)

In [23]:
from trainers_custom import TrainerBert1, TrainerBert2, TrainerDoubleOpt

In [24]:
# First-phase trainer: project-local TrainerBert1 (from trainers_custom) on the
# tokenized train/validation splits, configured by training_args1.
trainer = TrainerBert1(
        model=model,
        args=training_args1,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics = compute_metrics,
)
# NOTE(review): make_grad_bank() is defined in trainers_custom; presumably it sets up
# the storage behind trainer.grads1 / trainer.grads2 that a later cell reads — confirm.
trainer.make_grad_bank()

In [25]:
print ("len small dataset", len(trainer.train_dataset))
print ("lets collect gradients")
# Short first-phase run (training_args1); per the messages printed here its purpose
# is to collect gradients.
train_result = trainer.train()
print ("finishing collecting gradients")
#model = create_space_matrices(model)

len small dataset 8551
lets collect gradients




Step,Training Loss,Validation Loss


finishing collecting gradients


In [31]:
model.bert.encoder.layer[3].output.dense.U

tensor([[ 0.0144,  0.0006, -0.0006,  ...,  0.0047,  0.0002,  0.0098],
        [ 0.0174,  0.0055,  0.0066,  ...,  0.0164,  0.0148, -0.0201],
        [ 0.0213, -0.0012,  0.0024,  ..., -0.0002, -0.0104, -0.0359],
        ...,
        [ 0.0161, -0.0041, -0.0022,  ..., -0.0105,  0.0021,  0.0048],
        [ 0.0155, -0.0002,  0.0011,  ...,  0.0104, -0.0026,  0.0055],
        [ 0.0161,  0.0026,  0.0022,  ...,  0.0130, -0.0106,  0.0281]],
       device='cuda:0')

In [27]:
from sparse_grad_matrix_sparse import Tucker_Decomposition
import torch

# Decomposition factors keyed by layer kind ("output" / "interm"); each value is (U, VT).
UV_dict = {}

# At most the first 160 entries of each gradient list are stacked into one tensor
# and passed to Tucker_Decomposition.
grads1 = torch.stack(trainer.grads1[:160])
u1, VT, U = Tucker_Decomposition(grads1)
UV_dict["output"] = (U, VT)

grads2 = torch.stack(trainer.grads2[:160])
u1, VT, U = Tucker_Decomposition(grads2)
UV_dict["interm"] = (U, VT)

In [28]:
UV_dict['output']

(tensor([[ 0.0144,  0.0006, -0.0006,  ...,  0.0047,  0.0002,  0.0098],
         [ 0.0174,  0.0055,  0.0066,  ...,  0.0164,  0.0148, -0.0201],
         [ 0.0213, -0.0012,  0.0024,  ..., -0.0002, -0.0104, -0.0359],
         ...,
         [ 0.0161, -0.0041, -0.0022,  ..., -0.0105,  0.0021,  0.0048],
         [ 0.0155, -0.0002,  0.0011,  ...,  0.0104, -0.0026,  0.0055],
         [ 0.0161,  0.0026,  0.0022,  ...,  0.0130, -0.0106,  0.0281]],
        device='cuda:0'),
 tensor([[-0.0234, -0.0092,  0.0484,  ...,  0.0119,  0.0275,  0.0418],
         [ 0.0206, -0.0178,  0.0377,  ..., -0.0446, -0.0132,  0.0470],
         [-0.0031,  0.0054,  0.0305,  ...,  0.0307,  0.0023,  0.0369],
         ...,
         [ 0.0202, -0.0089, -0.0503,  ...,  0.0668, -0.0315,  0.0320],
         [-0.0169, -0.0400,  0.0032,  ...,  0.0014,  0.0363,  0.0411],
         [-0.0054, -0.0183, -0.0064,  ..., -0.0158,  0.0601,  0.0399]],
        device='cuda:0'))

In [29]:
model = sparse_grad_linear(model, UV_dict)

create bert with sparse grads
created bert with sparse grads


In [None]:
trainer.train()

In [30]:
del grads1, grads2
torch.cuda.empty_cache()

In [None]:
#trainer = TrainerBert2(
#        model=model,
#        args=training_args2,
#        train_dataset=tokenized_dataset["train"],
#        eval_dataset=tokenized_dataset["validation"],
#        compute_metrics = compute_metrics,
#)


#trainer.create_scheduler(num_training_steps = trainer.max_steps)

#trainer.make_grad_bank()

## BERT sparse

In [None]:
trainer.train()

In [34]:
model.bert.encoder.layer[3].output.dense.weight.grad

tensor(indices=tensor([], size=(2, 0)),
       values=tensor([], size=(0,)),
       device='cuda:0', size=(768, 3072), nnz=0, layout=torch.sparse_coo)

In [35]:
del trainer
torch.cuda.empty_cache()

In [36]:
## Needed only to check the sparsity of out_grads in UV; not needed in this branch

def hook_fn_backward(module, inp_grad, out_grad):
    """Backward hook that stashes the output gradients on the module.

    ``out_grad`` is stored unchanged as ``module.out_grads``; ``inp_grad`` is
    ignored and nothing is returned. (A projection by ``module.VT.T`` was
    previously sketched here and is disabled.)
    """
    setattr(module, "out_grads", out_grad)

# Attach hook_fn_backward to both dense sub-layers of every encoder layer.
# NOTE(review): PyTorch documents Module.register_backward_hook as deprecated in favor
# of register_full_backward_hook — confirm the replaced layers work with the newer API
# before switching. Re-running this cell registers the hooks again.
modules = model.bert.encoder.layer
for module in modules:
    module.intermediate.dense.register_backward_hook(hook_fn_backward)
    module.output.dense.register_backward_hook(hook_fn_backward)

In [42]:
# Second-phase trainer: project-local TrainerDoubleOpt (from trainers_custom) on the
# same splits, configured by training_args2 (full epoch, memory metrics enabled).
trainer = TrainerDoubleOpt(
        model=model,
        args=training_args2,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics = compute_metrics,
)


#trainer.create_scheduler(num_training_steps = trainer.max_steps)

# NOTE(review): make_grad_bank() comes from trainers_custom; its effect is not visible
# in this notebook — confirm it is still required for this trainer.
trainer.make_grad_bank()

In [43]:
trainer.train()

params ['bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.intermediate.dense.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.2.intermediate.dense.weight', 'bert.encoder.layer.2.output.dense.weight', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.3.output.dense.weight', 'bert.encoder.layer.4.intermediate.dense.weight', 'bert.encoder.layer.4.output.dense.weight', 'bert.encoder.layer.5.intermediate.dense.weight', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.6.output.dense.weight', 'bert.encoder.layer.7.intermediate.dense.weight', 'bert.encoder.layer.7.output.dense.weight', 'bert.encoder.layer.8.intermediate.dense.weight', 'bert.encoder.layer.8.output.dense.weight', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.9.output.dense.weight', 'bert.encoder.layer.10.intermediate.dense.weight', '

Step,Training Loss,Validation Loss,Matthews Correlation,Combined Score
100,No log,0.496959,0.43381,0.43381
200,No log,0.44478,0.477905,0.477905
300,No log,0.44448,0.502897,0.502897
400,No log,0.424684,0.515291,0.515291
500,0.449600,0.409058,0.572701,0.572701




TrainOutput(global_step=535, training_loss=0.4473402807645709, metrics={'train_runtime': 326.1663, 'train_samples_per_second': 26.217, 'train_steps_per_second': 1.64, 'total_flos': 562465658595840.0, 'train_loss': 0.4473402807645709, 'init_mem_cpu_alloc_delta': 8192, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1757184, 'train_mem_gpu_alloc_delta': 684841984, 'train_mem_cpu_peaked_delta': 20480, 'train_mem_gpu_peaked_delta': 2035547136, 'before_init_mem_cpu': 9428717568, 'before_init_mem_gpu': 2082969600, 'epoch': 1.0})

## Memory measurement

In [38]:
from transformers import  TrainerCallback
import torch
import numpy as np
import time

class ProfCallback(TrainerCallback):
    """Trainer callback that advances a profiler once per training step."""

    def __init__(self, prof):
        """Keep a reference to the profiler to be stepped.

        Args:
            prof: Object exposing a ``step()`` method (a ``torch.profiler.profile``
                instance in this notebook).
        """
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        """Call ``step()`` on the stored profiler; the callback arguments are unused."""
        self.prof.step()


In [39]:
# Destination of the Chrome-trace file written by the profiling code.
profiler_result_path = 'file.json'
def train_on_epoch(generator, model, loss_function, optimizer):
    """Train ``model`` for one pass over ``generator``, profiling each forward pass.

    For every batch the forward pass and loss computation run inside a fresh
    ``torch.autograd.profiler.profile`` context; backward and the optimizer
    step run outside it. After the loop, the profile of the *last* batch is
    exported to ``profiler_result_path`` as a Chrome trace.

    Changes from the previous version:
      * the backward pass no longer goes through ``amp.scale_loss`` — ``amp``
        was never imported, so the first batch raised ``NameError``;
      * an empty ``generator`` no longer fails on an unbound ``prof``;
      * CUDA profiling is requested only when CUDA is available;
      * the collected per-batch losses are returned instead of discarded.

    Args:
        generator: Iterable yielding ``(batch_of_x, batch_of_y)`` pairs.
        model: Module called as ``model(batch_of_x)``.
        loss_function: Callable ``loss_function(output, batch_of_y)`` returning
            a scalar tensor.
        optimizer: Optimizer over the model's parameters.

    Returns:
        list[float]: One loss value per batch (empty if no batch was yielded).
    """
    model.train()
    epoch_loss = []
    prof = None  # stays None when the generator yields nothing

    profiler_kwargs = dict(record_shapes=True, with_flops=True, profile_memory=True)
    if torch.cuda.is_available():
        profiler_kwargs["use_cuda"] = True

    for batch_of_x, batch_of_y in generator:
        optimizer.zero_grad()
        with torch.autograd.profiler.profile(**profiler_kwargs) as prof:
            output = model(batch_of_x)
            loss = loss_function(output, batch_of_y)
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())

    # Only export when at least one batch was actually profiled.
    if prof is not None:
        prof.export_chrome_trace(profiler_result_path)
    return epoch_loss

In [None]:
# Profile a training run on CPU and CUDA. Schedule, per the torch.profiler docs: skip the
# first 3 steps, then cycle through 1 idle step, 1 warm-up step and 2 recorded steps,
# for 2 cycles. Each finished cycle is handed to the TensorBoard trace handler.
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU,
                                        torch.profiler.ProfilerActivity.CUDA], 
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler('hf-training-trainer_torch_sparse'),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:
    
    # ProfCallback calls prof.step() at the end of every training step.
    # NOTE(review): re-running this cell presumably adds another callback to the same
    # trainer each time — confirm, or recreate the trainer first.
    trainer.add_callback(ProfCallback(prof=prof))
    train_result = trainer.train()
    # NOTE(review): exporting a Chrome trace while a schedule and on_trace_ready handler
    # are active may not be supported by every PyTorch version — verify.
    prof.export_chrome_trace(profiler_result_path)

In [None]:
trainer.evaluate()

In [None]:
trainer.log_metrics("train", train_result.metrics)

In [None]:
# NOTE(review): this class is a verbatim redefinition of the ProfCallback declared earlier
# in the notebook (Memory measurement section); one of the two copies is redundant.
class ProfCallback(TrainerCallback):
    """Trainer callback that advances a profiler once per training step."""

    def __init__(self, prof):
        """Keep a reference to ``prof``, an object exposing ``step()``."""
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        """Call ``step()`` on the stored profiler; the callback arguments are unused."""
        self.prof.step()
