In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
import os
import random
import sys
import transformers

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertForSequenceClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    EvalPrediction,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import wandb
import copy
import torch.nn as nn
import math
from tqdm import tqdm, trange
from sklearn.metrics import f1_score
from transformers import EarlyStoppingCallback

# import loralib as lora
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    prepare_model_for_kbit_training,
)
from collections import Counter
import glob
import time
import datasets
import loralib as lora

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore
    torch.use_deterministic_algorithms(True)


seed_everything(42)

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [4]:
def recon_error(original_weight, approx_weight):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return torch.linalg.norm(original_weight.to(device) - approx_weight.to(device), "fro")

In [27]:
model_id = "microsoft/deberta-v2-xxlarge"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
config = LoraConfig(r=4, lora_alpha=8, target_modules=["query_proj", "value_proj"], task_type="SEQ_CLS")
model = get_peft_model(model, config)
print_trainable_parameters(model)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xxlarge and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1182722 || all params: 1568096260 || trainable%: 0.075424068672927


In [6]:
model.base_model.model.deberta.encoder.layer[0].attention.self.query_proj.lora_A

ModuleDict(
  (default): Linear(in_features=1536, out_features=4, bias=False)
)

In [7]:
lora_A = model.base_model.model.deberta.encoder.layer[0].attention.self.query_proj.lora_A.default

In [8]:
lora_A = model.base_model.model.deberta.encoder.layer[0].attention.self.query_proj.lora_A.default.weight.transpose(0,1)
lora_A, lora_A.shape

(tensor([[-0.0037, -0.0002, -0.0136, -0.0114],
         [-0.0118, -0.0211, -0.0214,  0.0007],
         [-0.0003, -0.0050, -0.0224,  0.0027],
         ...,
         [-0.0077, -0.0008, -0.0219, -0.0033],
         [-0.0131, -0.0245, -0.0117,  0.0163],
         [-0.0070, -0.0254, -0.0136,  0.0109]], grad_fn=<TransposeBackward0>),
 torch.Size([1536, 4]))

In [10]:
lora_A_norm = torch.norm(lora_A)
lora_A_norm

tensor(1.1551, grad_fn=<LinalgVectorNormBackward0>)

In [None]:
lora_B = model.base_model.model.deberta.encoder.layer[0].attention.self.query_proj.lora_B.default
lora_B.weight, lora_B.weight.shape, torch.norm(lora_B.weight)

(Parameter containing:
 tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]], requires_grad=True),
 torch.Size([1536, 4]),
 tensor(0., grad_fn=<LinalgVectorNormBackward0>))

### query SVD

In [None]:
q_original_weight = model.deberta.encoder.layer[0].attention.self.query_proj.weight.data.T
q_original_weight, q_original_weight.shape, torch.norm(q_original_weight)

(tensor([[ 0.0712,  0.0211, -0.0921,  ..., -0.0111,  0.0479,  0.0247],
         [-0.0326,  0.0605, -0.1219,  ...,  0.0445, -0.0812, -0.0392],
         [ 0.0090,  0.0134,  0.0068,  ...,  0.0231, -0.0630,  0.0749],
         ...,
         [ 0.0343,  0.0161, -0.0853,  ..., -0.0633, -0.0336,  0.0295],
         [ 0.1448, -0.0611, -0.0674,  ..., -0.0059,  0.0809, -0.0400],
         [-0.0075,  0.0023, -0.0746,  ..., -0.0186, -0.0690, -0.0521]]),
 torch.Size([1536, 1536]),
 tensor(88.6837))

In [None]:
q_proj_u, q_proj_s, q_proj_v = torch.linalg.svd(q_original_weight)
new_lora_A = q_proj_u[:, :4] @ torch.diag(q_proj_s[:4]).sqrt()
new_lora_A, new_lora_A.shape, torch.norm(new_lora_A)

(tensor([[ 0.2236, -0.0308,  0.0417,  0.0626],
         [ 0.1297, -0.1951,  0.0092,  0.0092],
         [ 0.0193,  0.0311, -0.0281, -0.0416],
         ...,
         [-0.0423,  0.0485, -0.1052, -0.0067],
         [ 0.0470, -0.0709, -0.0392,  0.0321],
         [-0.0780,  0.0835, -0.0392, -0.0022]]),
 torch.Size([1536, 4]),
 tensor(5.8884))

In [None]:
q_reconstructed = q_proj_u @ torch.diag(q_proj_s) @ q_proj_v
recon_error(q_original_weight, q_reconstructed)

tensor(0.0002, device='cuda:0')

In [None]:
q_reconstructed = q_proj_u[:, :4] @ torch.diag(q_proj_s[:4]) @ q_proj_v[:4, :]
recon_error(q_original_weight, q_reconstructed)

tensor(86.9137, device='cuda:0')

In [None]:
q_proj_u[:, :4]

tensor([[ 0.0683, -0.0099,  0.0151,  0.0244],
        [ 0.0396, -0.0627,  0.0033,  0.0036],
        [ 0.0059,  0.0100, -0.0102, -0.0162],
        ...,
        [-0.0129,  0.0156, -0.0380, -0.0026],
        [ 0.0144, -0.0228, -0.0142,  0.0125],
        [-0.0238,  0.0268, -0.0142, -0.0009]])

In [None]:
q_proj_u[:, :4].shape

torch.Size([1536, 4])

In [None]:
q_proj_s[:4]

tensor([10.7285,  9.6862,  7.6498,  6.6093])

In [None]:
torch.diag(q_proj_s[:4]), torch.diag(q_proj_s[:4]).shape

(tensor([[10.7285,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  9.6862,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  7.6498,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  6.6093]]),
 torch.Size([4, 4]))

In [None]:
new_lora_A[:,0],new_lora_A[:,0].shape, torch.norm(new_lora_A[:,0])

(tensor([ 0.2236,  0.1297,  0.0193,  ..., -0.0423,  0.0470, -0.0780]),
 torch.Size([1536]),
 tensor(3.2754))

In [None]:
lora_A[:, 0], lora_A[:, 0].shape, torch.norm(lora_A[:, 0])

(tensor([-0.0037, -0.0118, -0.0003,  ..., -0.0077, -0.0131, -0.0070],
        grad_fn=<SelectBackward0>),
 torch.Size([1536]),
 tensor(0.5674, grad_fn=<LinalgVectorNormBackward0>))

In [15]:
for i in range(4):
    lora_A = model.base_model.model.deberta.encoder.layer[0].attention.self.query_proj.lora_A.default.weight.T
    print(f"{i}th col",lora_A[:, i].shape, torch.norm(lora_A[:, i]))

0th col torch.Size([1536]) tensor(0.5674, grad_fn=<LinalgVectorNormBackward0>)
1th col torch.Size([1536]) tensor(0.5873, grad_fn=<LinalgVectorNormBackward0>)
2th col torch.Size([1536]) tensor(0.5792, grad_fn=<LinalgVectorNormBackward0>)
3th col torch.Size([1536]) tensor(0.5762, grad_fn=<LinalgVectorNormBackward0>)


In [26]:
x = [0,1,2,3,4]
for i in range(5):
    new_x = 5+i
    x[i]= new_x
print(x)

[5, 6, 7, 8, 9]


In [None]:
len_of_layers = len(model.deberta.encoder.layer)
q_new_lora_A_list = []
v_new_lora_A_list = []

q_new_lora_B_list = []
v_new_lora_B_list = []

approx_rank = 4
for layer_idx in range(len_of_layers):
    q_original_weight = model.deberta.encoder.layer[layer_idx].attention.self.query_proj.weight.data.T
    v_original_weight = model.deberta.encoder.layer[layer_idx].attention.self.value_proj.weight.data.T

    q_og_lora_A = model.base_model.model.deberta.encoder.layer[
        layer_idx
    ].attention.self.query_proj.lora_A.default.weight.T
    v_og_lora_A = model.base_model.model.deberta.encoder.layer[
        layer_idx
    ].attention.self.value_proj.lora_A.default.weight.T

    q_proj_u, q_proj_s, q_proj_v = torch.linalg.svd(q_original_weight)
    q_new_lora_A = q_proj_u[:, :approx_rank] @ torch.diag(q_proj_s[:approx_rank]).sqrt()
    q_new_lora_B = torch.diag(q_proj_s[:approx_rank]).sqrt() @ q_proj_v[:approx_rank, :]
    
    v_proj_u, v_proj_s, v_proj_v = torch.linalg.svd(v_original_weight)
    v_new_lora_A = v_proj_u[:, :approx_rank] @ torch.diag(v_proj_s[:approx_rank]).sqrt()
    v_new_lora_B = torch.diag(v_proj_s[:approx_rank]).sqrt() @ v_proj_v[:approx_rank, :]
    
    q_new_lora_B_list.append(q_new_lora_B)
    v_new_lora_B_list.append(v_new_lora_B)

    for i in range(4):
        print(f"Before Scale, q {i}th col Norm", torch.norm(q_new_lora_A[:, i]))

        q_og_lora_A_icol_norm = torch.norm(q_og_lora_A[:, i])
        q_new_lora_A_icol_norm = torch.norm(q_new_lora_A[:, i])

        q_scale = q_og_lora_A_icol_norm / q_new_lora_A_icol_norm

        q_new_lora_A[:, i] = q_new_lora_A[:, i] * q_scale

        print(f"After Scale, q {i}th col Norm", torch.norm(q_new_lora_A[:, i]))
        print("####################")

        print(f"Before Scale, v {i}th col Norm", torch.norm(v_new_lora_A[:, i]))

        v_og_lora_A_icol_norm = torch.norm(v_og_lora_A[:, i])
        v_new_lora_A_icol_norm = torch.norm(v_new_lora_A[:, i])

        v_scale = v_og_lora_A_icol_norm / v_new_lora_A_icol_norm

        v_new_lora_A[:, i] = v_new_lora_A[:, i] * v_scale

        print(f"After Scale, v {i}th col Norm", torch.norm(v_new_lora_A[:, i]))
        print("####################")

    q_new_lora_A_list.append(q_new_lora_A)
    v_new_lora_A_list.append(v_new_lora_A)

    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.query_proj.lora_A.default.weight.data = (
        q_new_lora_A_list[layer_idx].transpose(0, 1).contiguous()
    )
    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.value_proj.lora_A.default.weight.data = (
        v_new_lora_A_list[layer_idx].transpose(0, 1).contiguous()
    )
    
    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.query_proj.lora_B.default.weight.data = (
        q_new_lora_B_list[layer_idx].transpose(0, 1).contiguous()
    )
    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.value_proj.lora_B.default.weight.data = (
        v_new_lora_B_list[layer_idx].transpose(0, 1).contiguous()
    )

In [28]:
len_of_layers = len(model.deberta.encoder.layer)
q_new_lora_A_list = []
v_new_lora_A_list = []

for layer_idx in range(len_of_layers):
    q_original_weight = model.deberta.encoder.layer[layer_idx].attention.self.query_proj.weight.data.T
    v_original_weight = model.deberta.encoder.layer[layer_idx].attention.self.value_proj.weight.data.T

    q_og_lora_A = model.base_model.model.deberta.encoder.layer[
        layer_idx
    ].attention.self.query_proj.lora_A.default.weight.T
    v_og_lora_A = model.base_model.model.deberta.encoder.layer[
        layer_idx
    ].attention.self.value_proj.lora_A.default.weight.T

    q_proj_u, q_proj_s, q_proj_v = torch.linalg.svd(q_original_weight)
    q_new_lora_A = q_proj_u[:, :4] @ torch.diag(q_proj_s[:4]).sqrt()

    v_proj_u, v_proj_s, v_proj_v = torch.linalg.svd(v_original_weight)
    v_new_lora_A = v_proj_u[:, :4] @ torch.diag(v_proj_s[:4]).sqrt()

    for i in range(4):
        print(f"Before Scale, q {i}th col Norm", torch.norm(q_new_lora_A[:, i]))

        q_og_lora_A_icol_norm = torch.norm(q_og_lora_A[:, i])
        q_new_lora_A_icol_norm = torch.norm(q_new_lora_A[:, i])

        q_scale = q_og_lora_A_icol_norm / q_new_lora_A_icol_norm

        q_new_lora_A[:, i] = q_new_lora_A[:, i] * q_scale

        print(f"After Scale, q {i}th col Norm", torch.norm(q_new_lora_A[:, i]))
        print("####################")

        print(f"Before Scale, v {i}th col Norm", torch.norm(v_new_lora_A[:, i]))

        v_og_lora_A_icol_norm = torch.norm(v_og_lora_A[:, i])
        v_new_lora_A_icol_norm = torch.norm(v_new_lora_A[:, i])

        v_scale = v_og_lora_A_icol_norm / v_new_lora_A_icol_norm

        v_new_lora_A[:, i] = v_new_lora_A[:, i] * v_scale

        print(f"After Scale, v {i}th col Norm", torch.norm(v_new_lora_A[:, i]))
        print("####################")

    q_new_lora_A_list.append(q_new_lora_A)
    v_new_lora_A_list.append(v_new_lora_A)

    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.query_proj.lora_A.default.weight.data = (
        q_new_lora_A_list[layer_idx].transpose(0, 1).contiguous()
    )
    model.base_model.model.deberta.encoder.layer[layer_idx].attention.self.value_proj.lora_A.default.weight.data = (
        v_new_lora_A_list[layer_idx].transpose(0, 1).contiguous()
    )

Before Scale, q 0th col Norm tensor(3.2754)
After Scale, q 0th col Norm tensor(0.5823, grad_fn=<LinalgVectorNormBackward0>)
####################
Before Scale, v 0th col Norm tensor(2.0830)
After Scale, v 0th col Norm tensor(0.5780, grad_fn=<LinalgVectorNormBackward0>)
####################
Before Scale, q 1th col Norm tensor(3.1123, grad_fn=<LinalgVectorNormBackward0>)
After Scale, q 1th col Norm tensor(0.5741, grad_fn=<LinalgVectorNormBackward0>)
####################
Before Scale, v 1th col Norm tensor(1.9221, grad_fn=<LinalgVectorNormBackward0>)
After Scale, v 1th col Norm tensor(0.5697, grad_fn=<LinalgVectorNormBackward0>)
####################
Before Scale, q 2th col Norm tensor(2.7658, grad_fn=<LinalgVectorNormBackward0>)
After Scale, q 2th col Norm tensor(0.5851, grad_fn=<LinalgVectorNormBackward0>)
####################
Before Scale, v 2th col Norm tensor(1.9160, grad_fn=<LinalgVectorNormBackward0>)
After Scale, v 2th col Norm tensor(0.5672, grad_fn=<LinalgVectorNormBackward0>)
###

In [None]:
lora_A_norms = []
new_lora_A_norms = []
for i in range(48):
    lora_A = model.base_model.model.deberta.encoder.layer[i].attention.self.query_proj.lora_A.default.weight.T
    lora_A_norms.append(torch.norm(lora_A[:, 0]))

    q_original_weight = model.deberta.encoder.layer[i].attention.self.query_proj.weight.data.T
    q_proj_u, q_proj_s, q_proj_v = torch.linalg.svd(q_original_weight)
    new_lora_A = q_proj_u[:, :4] @ torch.diag(q_proj_s[:4]).sqrt()
    new_lora_A_norms.append(torch.norm(new_lora_A[:, 0]))

lora_A_norms, new_lora_A_norms

([tensor(0.5674, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5857, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5796, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5761, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5753, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5796, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5784, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5738, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5734, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5654, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5826, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5692, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5683, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5814, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5670, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5818, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5741, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0.5749, grad_fn=<LinalgVectorNormBackward0>),
  tensor(0

In [None]:
new_lora_A_norms[0]

tensor(3.2754)

In [None]:
for i in range(48):
    scale = lora_A_norms[i] / new_lora_A_norms[i]
    scaled_lora = model.base_model.model.deberta.encoder.layer[i].attention.self.query_proj.lora_A.default.weight.T * scale
    scaled_norm = torch.norm(scaled_lora[:, 0])