In [1]:
import torch
from transformers import AutoModelForCausalLM

target_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map = "cpu"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def find_all_linear_names(
    model,
    target_module = "self_attn",
    target_module_suffixes = ["q_proj", "k_proj", "v_proj", "o_proj"]
):

    names = []
    for name, _ in model.named_modules():
        if target_module in name:
            for suffix in target_module_suffixes:
                if suffix in name:
                    names.append(name)

    return names

linear_module_names = find_all_linear_names(target_model)

In [3]:
import torch

def _low_rank_decomposition(
    weight,
    reduced_rank=128,
    torch_dtype = torch.bfloat16,
    full_matrices = False
):
    """
    Decompose a 2D matrix into low-rank matrices A and B using SVD.a

    :param weight: The matrix to decompose, of shape (H, W)
    :param reduced_rank: The final rank of the decomposition
    :return: A tuple of tensors (A, B)
    """
    if weight.dim() != 2:
        raise ValueError(f"Only support 2D matrix, but your input has {weight.dim()} dimensions.")

    # SVD Decomposition
    U, S, Vh = torch.linalg.svd(weight, full_matrices=full_matrices)

    # Truncated matrices
    A = Vh[:reduced_rank, :]
    B = U[:, :reduced_rank] @ torch.diag(S[:reduced_rank])

    if torch_dtype:
        A = A.to(torch_dtype)
        B = B.to(torch_dtype)

    return A, B

def decompose_delta_weight(new_weight, base_weight, alpha, reduced_rank, device=None):
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    new_weight = new_weight.to(device)
    base_weight = base_weight.to(device)

    """
    Decompose the delta weight into low-rank matrices A and B, considering the alpha scaling factor.

    :param new_weight: The updated weight matrix after applying LoRA.
    :param base_weight: The original weight matrix before LoRA.
    :param alpha: The alpha scaling factor used in LoRA.
    :param reduced_rank: The rank for the low-rank decomposition.
    :return: A tuple of tensors (A, B)
    """
    delta_weight = new_weight - base_weight

    del new_weight
    del base_weight

    # Check if alpha is applied uniformly
    # Adjust the implementation if alpha is applied differently
    adjusted_delta_weight = delta_weight / alpha

    del delta_weight

    A, B = _low_rank_decomposition(adjusted_delta_weight, reduced_rank=reduced_rank)

    return A, B

In [4]:
from peft.tuners.tuners_utils import replicate_layers
import copy

base_model = copy.deepcopy(target_model.to("cpu"))

"""
Replace the following layers:
22	layer_29	0.159546
11	layer_19	0.137939
8	layer_16	0.084167
7	layer_15	0.083313
27	layer_5	0.082794
21	layer_28	0.081604
26	layer_4	0.080139
14	layer_21	0.079468
"""

layer_map = [
    [0, 4],
    [3,4],
    [6,7],
    [6, 15],
    [14, 15],
    [17, 18],
    [17, 19],
    [18, 19],
    [20, 21],
    [20, 21],
    [22, 28],
    [27, 28],
    [30, 31],
    [30, 31],
    [31, 32]
]

replicate_layers(model = base_model,layer_map = layer_map)
base_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [5]:
from tqdm.notebook import tqdm
from collections import OrderedDict

loras = OrderedDict()

# lower rank captures less of the original model, a rank of 32 is probably reasonable for small delta (task specific finetunes and such)
alpha = 16
rank = 128

for module in tqdm(linear_module_names):
  
  target_tensor = target_model.state_dict()[module+".weight"]
  base_tensor = base_model.state_dict()[module+".weight"]

  lora_A, lora_B = decompose_delta_weight(target_tensor, base_tensor, alpha, rank)
  loras[f"base_model.model.{module}.lora_A.weight"] = lora_A.to('cpu')
  loras[f"base_model.model.{module}.lora_B.weight"] = lora_B.to('cpu')

  0%|          | 0/128 [00:00<?, ?it/s]

In [6]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig
import os

LORA_OUT_DIR = "./lora"

lora_config = LoraConfig(
        lora_alpha=alpha, # Setting the alpha to the to decomposition rank value (instead of alpha value used) seems to give better performance. Further testing would be needed to understand what is the optimal alpha value to use
        lora_dropout=0.1, # Dropout = 0.1 makes a huge diff compared to 0.0!
        r=rank,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["q_proj", "k_proj", "v_proj", "o_proj"],
)

lora_config.save_pretrained(LORA_OUT_DIR)

In [7]:
import torch

for key in loras.keys():
    loras[key] = loras[key].to('cpu').contiguous()

torch.save(loras, os.path.join(LORA_OUT_DIR, 'adapter_model.bin'))

In [8]:
peft_model = get_peft_model(base_model, lora_config)

# Save to disk
peft_model.save_pretrained(LORA_OUT_DIR)

del peft_model

In [9]:
import os
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

LORA_OUT_DIR = "./lora"
target_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

config = PeftConfig.from_pretrained(os.path.abspath(LORA_OUT_DIR))
model = PeftModel.from_pretrained(base_model, os.path.abspath(LORA_OUT_DIR), device_map = "cuda")
tokenizer = AutoTokenizer.from_pretrained(target_model_id, use_fast=True)

In [10]:
# Test input

placeholder = "You are a helpful assistant. Please answer the questions below in a sound and truthful way:\n{question}"
instruction = placeholder.format(
    question = "What are great things to do in Catania, Sicily?"
)

messages = [
    {"role": "user", "content": instruction},
]

tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"

# Tokenize and format the chat for the model
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda")

model = model.bfloat16().cuda()

# Generate a response
outputs = model.generate(tokenized_chat, max_new_tokens=2048, do_sample = True, temperature = 0.1, top_p = 0.95, no_repeat_ngram_size = 3)  # Adjust max_new_tokens if needed
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] You are a helpful assistant. Please answer the questions below in a sound and truthful way:
What are great things to do in Catania, Sicily? [/INST] Catania is a beautiful city with many great things worth doing. Here are some of the top options:

1. Visit the Basilica of Santa Maria Lucia, which is a UNESCO World Heritage site and the largest church in the Mediterranean.
2. Take a stroll on the beach of the city, where you can enjoy the beautiful views of the Mediterranean and relax.
3. Visit one of the many historic sites, such as the Church of St. Maria da Assisi, the Church on the Hill, or the Church in the Woods.
4. Take in the stunning views of Old Aquatitaino Cemetery, which dates back to the 13th century.
5. Take part in a wine-tasting tour of the area, where visitors can sample the famous Sicolian wine.
6. Take advantage of the rich cultural heritage of the region, and visit the local museums and art galleries.
7. Take to the beautiful beaches of the island, where th

In [11]:
merged = model.merge_and_unload()

In [12]:
total_params = sum(p.numel() for p in merged.parameters())
total_params / 1000000000

5.496836096

In [13]:
torch.save(merged.state_dict(), "models/merged/state_dict.bin")