Create compression version of GPT-2 nad BERT model and save the memory =)

## cola example

The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 0 to 5.

In [1]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [2]:
#!pip install Partial State

In [3]:
import transformers
print (transformers.__version__)

4.33.2


In [4]:
from datasets import load_dataset
import pandas as pd

dataset_cola = load_dataset('glue', 'cola')
dataset_cola.num_rows

{'train': 8551, 'validation': 1043, 'test': 1063}

In [5]:
label_list = dataset_cola["train"].features["label"].names
num_labels = len(label_list)


In [6]:
set(pd.DataFrame(dataset_cola['train']).label)

{0, 1}

In [7]:
task_num_labels = num_labels

from transformers import AutoConfig, BertConfig, AutoModelForSequenceClassification, AutoTokenizer

path_name = r"bert-base-uncased"

config = AutoConfig.from_pretrained(
    path_name,
    num_labels=num_labels,
)

model = AutoModelForSequenceClassification.from_pretrained(
    path_name,
    config=config,
)

tokenizer = AutoTokenizer.from_pretrained(path_name)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
#model=model.to('cuda:2')

In [9]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys['cola']

In [10]:

def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        
        
        result = tokenizer.batch_encode_plus(*args, max_length=128, truncation=True, padding="max_length")

        result["label"] = examples["label"]
        return result

In [11]:
dataset_cola['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

In [12]:
tokenized_dataset = dataset_cola.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=False
        )

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [13]:
tokenized_dataset['validation']

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1043
})

In [14]:
from transformers import Trainer
import evaluate as ev
import numpy as np

from transformers import TrainingArguments, Trainer, EvalPrediction

In [15]:
metric = ev.load("glue", 'cola')

In [16]:
import numpy as np
import matplotlib.pyplot as plt

In [17]:
def compute_metrics(p: EvalPrediction):
        preds_ = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds_ = np.argmax(preds_, axis=1)
        
        result = metric.compute(predictions=preds_, references=p.label_ids)
        if True:
            result["combined_score"] = np.mean(list(result.values())).item()
            return result
        else:
            return {"accuracy": (preds_ == p.label_ids).astype(np.float32).mean().item()}

In [18]:
training_args2 = TrainingArguments(
    learning_rate=5e-5,
    num_train_epochs=1,
    evaluation_strategy="steps",
    skip_memory_metrics = False,
    eval_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    save_steps=1000,
    overwrite_output_dir=True,
    output_dir="./bert_stsb_128",
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    seed=297104,
    report_to='none',
    )

In [19]:
from accelerate import Accelerator, DataLoaderConfiguration


In [20]:
trainer = Trainer(
        model=model,
        args=training_args2,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics = compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [21]:
trainer.train()

Step,Training Loss,Validation Loss,Matthews Correlation,Combined Score
100,No log,0.493607,0.406097,0.406097
200,No log,0.449868,0.469314,0.469314
300,No log,0.426578,0.535594,0.535594
400,No log,0.428814,0.525898,0.525898
500,0.476100,0.397602,0.57335,0.57335


TrainOutput(global_step=535, training_loss=0.47203331171909224, metrics={'train_runtime': 60.074, 'train_samples_per_second': 142.341, 'train_steps_per_second': 8.906, 'total_flos': 562465658595840.0, 'train_loss': 0.47203331171909224, 'init_mem_cpu_alloc_delta': 11403264, 'init_mem_gpu_alloc_delta': 439076352, 'init_mem_cpu_peaked_delta': 93233152, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 252129280, 'train_mem_gpu_alloc_delta': 898868736, 'train_mem_cpu_peaked_delta': 4096, 'train_mem_gpu_peaked_delta': 1661910528, 'before_init_mem_cpu': 1107165184, 'before_init_mem_gpu': 0, 'epoch': 1.0})

In [22]:
trainer.evaluate()

{'eval_loss': 0.40134167671203613,
 'eval_matthews_correlation': 0.5752615459764325,
 'eval_combined_score': 0.5752615459764325,
 'eval_runtime': 1.783,
 'eval_samples_per_second': 584.953,
 'eval_steps_per_second': 5.048,
 'epoch': 1.0,
 'eval_mem_cpu_alloc_delta': 8192,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 20480,
 'eval_mem_gpu_peaked_delta': 554128896}

# SVD 

In [23]:
fc_w = trainer.model.bert.encoder.layer[5].intermediate.dense.weight.data.cpu().data.numpy()
fc_b = trainer.model.bert.encoder.layer[5].intermediate.dense.bias.data.cpu().data.numpy()

In [35]:
import torch, torch.nn as nn

def factorize_to_svd(fc_w, fc_b, rank):
    U, S, Vt = np.linalg.svd(fc_w, full_matrices=False)
    # truncate SVD and fuse Sigma matrix
    w1 = np.dot(np.diag(np.sqrt(S[0:rank])),Vt[0:rank, :])
    w2 = np.dot(U[:,0:rank,], np.diag(np.sqrt(S[0:rank])))

    # create new layers and insert weights
    out_features, in_features = fc_w.shape
    is_bias = fc_b is not None

    linear1 = nn.Linear(in_features = in_features,
                      out_features = rank,
                      bias = False)
    linear1.weight = nn.Parameter(torch.FloatTensor(w1))

    linear2 = nn.Linear(in_features = rank,
                      out_features = out_features,
                      bias=is_bias)
    linear2.weight = nn.Parameter(torch.FloatTensor(w2))
    linear2.bias = nn.Parameter(torch.FloatTensor(fc_b))
    print (w1.shape, w2.shape)
    # create factorized layer
    factorized_layer = nn.Sequential(linear1,linear2)
    return factorized_layer

In [42]:
trainer.model.bert.encoder.layer[5].intermediate.dense = factorize_to_svd(fc_w, fc_b, 1).to(model.device)

(1, 768) (3072, 1)


In [43]:
trainer.evaluate()

{'eval_loss': 0.4393269717693329,
 'eval_matthews_correlation': 0.49038894064526567,
 'eval_combined_score': 0.49038894064526567,
 'eval_runtime': 1.8041,
 'eval_samples_per_second': 578.133,
 'eval_steps_per_second': 4.989,
 'epoch': 1.0,
 'eval_mem_cpu_alloc_delta': 2650112,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 20480,
 'eval_mem_gpu_peaked_delta': 554128896}

## Kronecker

In [24]:
B1 = np.load("B1_module5_last.npy")#np.linalg.cholesky(B1)
C1 = np.load("C1_module5_last.npy")#np.linalg.cholesky(C1)

In [25]:
B1.shape

(768, 768)

In [26]:
C1.shape

(3072, 3072)

In [28]:
import numpy as np

def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0)

is_pos_def(C1)

False

In [29]:
alpha = 0.0

С_new = (1 - alpha)*C1  + alpha*np.eye(len(np.diag(C1)))

while (not is_pos_def(С_new)):
    alpha += 0.1
    С_new = (1 - alpha)*C1  + alpha*np.eye(len(np.diag(C1)))

alpha

0.5

In [30]:
alpha = 0.0

B_new = (1 - alpha)*B1  + alpha*np.eye(len(np.diag(B1)))

while (not is_pos_def(B_new)):
    alpha += 0.05
    B_new = (1 - alpha)*B1  + alpha*np.eye(len(np.diag(B1)))

alpha

0.05

In [31]:
B1_square = np.linalg.cholesky(B_new)
C1_square = np.linalg.cholesky(С_new)

In [33]:
B1_square.shape

(768, 768)

In [34]:
fc_w.shape

(3072, 768)

In [43]:
U, S, Vt = np.linalg.svd(C1_square.T@fc_w@B1_square, full_matrices=False)

U1 = np.linalg.inv(C1_square.T)@U
V1 = np.linalg.inv(B1_square.T)@Vt.T

print((U1.T@C1_square@U1)[:3,:3])
print((V1.T@B1_square@V1)[:3,:3])

[[ 1.41360458e+00  1.23775957e-03 -3.50525431e-04]
 [-6.68278341e-04  1.41401431e+00  9.87319228e-05]
 [-7.98418208e-04  3.83257461e-04  1.41234212e+00]]
[[ 4.47309451e+00  1.27141594e-03 -7.95427501e-04]
 [-2.51848897e-03  4.47617637e+00 -3.48994681e-04]
 [ 5.83625934e-04  5.66016832e-04  4.47330798e+00]]


In [52]:
rank =1
w1 = np.diag(np.sqrt(S[:rank]))@V1.T[:rank, :]
w2 = U1[:,0:rank] @ np.diag(np.sqrt(S[:rank]))

In [53]:
import torch, torch.nn as nn

out_features, in_features = fc_w.shape
is_bias = fc_b is not None

linear1 = nn.Linear(in_features = in_features,
                      out_features = rank,
                      bias = False)
linear1.weight = nn.Parameter(torch.FloatTensor(w1))

linear2 = nn.Linear(in_features = rank,
                      out_features = out_features,
                      bias=is_bias)
linear2.weight = nn.Parameter(torch.FloatTensor(w2))
linear2.bias = nn.Parameter(torch.FloatTensor(fc_b))

# create factorized layer
factorized_layer = nn.Sequential(linear1,linear2)

In [54]:
trainer.model.bert.encoder.layer[5].intermediate.dense = factorized_layer.to(model.device)

In [55]:
trainer.evaluate()

{'eval_loss': 0.42159855365753174,
 'eval_matthews_correlation': 0.5393878269002617,
 'eval_combined_score': 0.5393878269002617,
 'eval_runtime': 1.7581,
 'eval_samples_per_second': 593.244,
 'eval_steps_per_second': 5.119,
 'epoch': 1.0,
 'eval_mem_cpu_alloc_delta': 1720320,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 20480,
 'eval_mem_gpu_peaked_delta': 554128896}