In [None]:
import torch
from transformers import AutoModelForCausalLM

# Load the "target" model on CPU. Its attention projection weights are what the
# later cells compare against `base_model` to extract low-rank factors.
# No torch_dtype is passed here, so the library's default loading dtype is used.
target_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map = "cpu"
)


In [None]:
def find_all_linear_names(
    model,
    target_module = "self_attn",
    target_module_suffixes = ["q_proj", "k_proj", "v_proj", "o_proj"]
):
    """
    Collect the qualified names of the projection modules to decompose.

    Matching is done purely on module names (module types are not inspected):
    a module is selected when its qualified name contains ``target_module`` and
    *ends with* one of ``target_module_suffixes``. Requiring a true suffix keeps
    children nested under a projection (for example ``...q_proj.lora_A``) out of
    the result and guarantees every name is reported at most once.

    :param model: Any object exposing ``named_modules()`` yielding ``(name, module)`` pairs.
    :param target_module: Substring that must appear in the qualified module name.
    :param target_module_suffixes: Name endings that identify the wanted modules.
    :return: List of matching qualified names, in ``named_modules()`` order.
    """
    # str.endswith accepts a tuple of candidates; an empty tuple matches nothing.
    suffixes = tuple(target_module_suffixes)

    return [
        name
        for name, _ in model.named_modules()
        if target_module in name and name.endswith(suffixes)
    ]

# Qualified names of the modules in target_model selected by the helper's
# default filters; the decomposition loop later iterates over this list.
linear_module_names = find_all_linear_names(target_model)

In [None]:
import torch

def _low_rank_decomposition(
    weight,
    reduced_rank=64,
    torch_dtype = torch.bfloat16,
    full_matrices = False
):
    """
    Decompose a 2D matrix into low-rank matrices A and B using SVD.

    The factors are built from the leading singular triplets so that
    ``B @ A`` is the best rank-``reduced_rank`` approximation of ``weight``.

    :param weight: The matrix to decompose, of shape (H, W)
    :param reduced_rank: The final rank of the decomposition (clamped to the
        number of singular values available)
    :param torch_dtype: dtype the returned factors are cast to; ``None`` keeps
        the dtype produced by the SVD
    :param full_matrices: Forwarded to ``torch.linalg.svd``
    :return: A tuple of tensors (A, B) with shapes (r, W) and (H, r)
    :raises ValueError: if ``weight`` is not 2D or ``reduced_rank`` is not positive
    """
    if weight.dim() != 2:
        raise ValueError(f"Only support 2D matrix, but your input has {weight.dim()} dimensions.")
    if reduced_rank < 1:
        raise ValueError(f"reduced_rank must be a positive integer, got {reduced_rank}.")

    # torch.linalg.svd does not accept half-precision input, so upcast first.
    if weight.dtype in (torch.float16, torch.bfloat16):
        weight = weight.to(torch.float32)

    # SVD Decomposition
    U, S, Vh = torch.linalg.svd(weight, full_matrices=full_matrices)

    # Never ask for more components than there are singular values; with
    # full_matrices=True, U has more columns than S has entries.
    rank = min(reduced_rank, S.shape[0])

    # Truncated matrices. Scaling the columns of U by S is equivalent to
    # multiplying by diag(S) without materialising the diagonal matrix.
    A = Vh[:rank, :]
    B = U[:, :rank] * S[:rank]

    if torch_dtype is not None:
        A = A.to(torch_dtype)
        B = B.to(torch_dtype)

    return A, B

def decompose_delta_weight(new_weight, base_weight, alpha, reduced_rank, device=None):
    """
    Decompose the delta weight into low-rank matrices A and B, considering the LoRA scaling factor.

    A LoRA layer adds ``B @ A * (alpha / r)`` to the base weight, so the delta is
    multiplied by ``reduced_rank / alpha`` before factorisation. With that
    pre-scaling, an adapter configured with ``lora_alpha=alpha`` and
    ``r=reduced_rank`` reproduces (the low-rank approximation of)
    ``new_weight - base_weight`` itself.

    :param new_weight: The weight matrix the adapter should reproduce.
    :param base_weight: The weight matrix the adapter will be applied on top of.
    :param alpha: The ``lora_alpha`` value the adapter will be configured with.
    :param reduced_rank: The rank for the low-rank decomposition (the adapter's ``r``).
    :param device: Device to run the decomposition on; defaults to CUDA when
        available, otherwise CPU.
    :return: A tuple of tensors (A, B)
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    delta_weight = new_weight.to(device) - base_weight.to(device)

    # Undo the alpha / r factor the LoRA layer will apply at inference time.
    adjusted_delta_weight = delta_weight * (reduced_rank / alpha)

    # Drop the unscaled delta before the SVD to keep peak memory down.
    del delta_weight

    return _low_rank_decomposition(adjusted_delta_weight, reduced_rank=reduced_rank)

In [None]:
from peft.tuners.tuners_utils import replicate_layers
import copy

# Deep-copy the target model (after moving it to CPU) so that the layer
# replication below modifies base_model only and leaves target_model untouched.
base_model = copy.deepcopy(target_model.to("cpu"))

# Each pair selects a run of layer indices from the copied model; the runs are
# concatenated in the order listed to form the new layer stack.
# NOTE(review): per PEFT's replicate_layers documentation the second index is
# presumably exclusive and replicated layers share weights — confirm.
layer_map = [
    [0, 8],
    [13, 14],
    [10, 12],
    [13, 16],
    [13, 14],
    [14, 28],
    [13, 14],
    [13, 14],
    [30, 32]
]


# Rebuild base_model's layer list in place according to layer_map.
replicate_layers(model = base_model,layer_map = layer_map)
# Last expression of the cell: displays the resulting module structure.
base_model

In [None]:
from tqdm.notebook import tqdm
from collections import OrderedDict

# Maps adapter state-dict keys ("base_model.model.<module>.lora_{A,B}.weight")
# to the decomposed factors, in module order.
loras = OrderedDict()

# A lower rank captures less of the original model; a rank of 32 is probably
# reasonable for a small delta (task-specific finetunes and such).
alpha = 16
rank = 64

# state_dict() rebuilds the full name -> tensor mapping on every call, so build
# both mappings once instead of twice per module inside the loop.
target_state_dict = target_model.state_dict()
base_state_dict = base_model.state_dict()

for module in tqdm(linear_module_names):
    weight_key = module + ".weight"

    target_tensor = target_state_dict[weight_key]
    base_tensor = base_state_dict[weight_key]

    lora_A, lora_B = decompose_delta_weight(target_tensor, base_tensor, alpha, rank)

    # Keep the results on CPU so they do not accumulate on the compute device.
    loras[f"base_model.model.{module}.lora_A.weight"] = lora_A.to('cpu')
    loras[f"base_model.model.{module}.lora_B.weight"] = lora_B.to('cpu')

In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig
import os

# Directory the adapter config and weights are written to.
LORA_OUT_DIR = "./lora"

# Adapter configuration; target_modules lists the same four attention
# projections that find_all_linear_names selects by default.
lora_config = LoraConfig(
        # PEFT scales the adapter output by lora_alpha / r, so this value has to
        # agree with the scaling assumed when the A/B factors were computed.
        # NOTE(review): an earlier note here reported better results when
        # lora_alpha is set to the decomposition rank instead of `alpha`; the
        # code passes `alpha` — confirm which is intended.
        lora_alpha=alpha,
        lora_dropout=0, # TODO: experiment with dropout
        r=rank,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Persist the configuration into LORA_OUT_DIR.
lora_config.save_pretrained(LORA_OUT_DIR)

In [None]:
import torch

# Normalise every factor to a contiguous CPU tensor before serialising.
for key, factor in list(loras.items()):
    loras[key] = factor.to('cpu').contiguous()

# Write the whole mapping as the adapter weight file inside the output directory.
adapter_weights_path = os.path.join(LORA_OUT_DIR, 'adapter_model.bin')
torch.save(loras, adapter_weights_path)

In [None]:
# Wrap base_model with freshly initialised LoRA modules so that PEFT writes a
# complete adapter directory (config and metadata) for it.
peft_model = get_peft_model(base_model, lora_config)

# Save to disk. save_pretrained also writes the *freshly initialised* adapter
# weights; force the .bin format so they land in the file that is replaced
# with the decomposed factors right below.
peft_model.save_pretrained(LORA_OUT_DIR, safe_serialization=False)

# A safetensors file left over from an earlier run would take precedence over
# adapter_model.bin when the adapter is loaded, so make sure none remains.
stale_safetensors_path = os.path.join(LORA_OUT_DIR, "adapter_model.safetensors")
if os.path.exists(stale_safetensors_path):
    os.remove(stale_safetensors_path)

# Put the decomposed factors back in place: without this, the adapter on disk
# would hold the untrained initial weights written by save_pretrained.
torch.save(loras, os.path.join(LORA_OUT_DIR, "adapter_model.bin"))

del peft_model

In [None]:
import os
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Re-declared so this cell can be read on its own; same values as used earlier.
LORA_OUT_DIR = "./lora"
target_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Adapter config read back from disk. NOTE(review): `config` is not used by
# any of the visible cells below — confirm whether it is still needed.
config = PeftConfig.from_pretrained(os.path.abspath(LORA_OUT_DIR))
# Attach the saved adapter to base_model.
# NOTE(review): base_model was already passed to get_peft_model earlier, so it
# presumably still carries LoRA modules from that call — confirm this is intended.
model = PeftModel.from_pretrained(base_model, os.path.abspath(LORA_OUT_DIR), device_map = "cuda")
# The tokenizer comes from the original target checkpoint.
tokenizer = AutoTokenizer.from_pretrained(target_model_id, use_fast=True)

In [None]:

# Smoke test: generate a reply with the adapter-equipped model.

# Test input
messages = [
    {"role": "user", "content": "Hey what's up?"},
]

# NOTE(review): "[PAD]" may not exist in this tokenizer's vocabulary; padding is
# not exercised by the single un-batched prompt below — confirm before batching.
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"

# Tokenize and format the chat for the model
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

# Cast the whole model to bfloat16 and move it to the GPU before generating.
model = model.bfloat16().cuda()

# Generate a response
# Sampling is enabled (do_sample=True), so the output differs between runs
# unless a seed is fixed beforehand.
outputs = model.generate(tokenized_chat, max_new_tokens=256, do_sample = True, top_p = 0.95, temperature = 0.6)  # Adjust max_new_tokens if needed
# Decodes the full returned sequence; special tokens are not stripped.
print(tokenizer.decode(outputs[0]))