In [None]:
import torch
from transformers import AutoModelForCausalLM
import os
from huggingface_hub import list_repo_files, snapshot_download

# Identifier of the target model; the same string is passed to from_pretrained() in later cells.
target_model_id = "mistralai/Mistral-7B-v0.1"

def download_transformers_model(repo_id, cache_dir=None):
    """
    Download a snapshot of a repository, skipping ``*.bin`` files when safetensors exist.

    The repository file listing is inspected first; if at least one file ends
    with ``.safetensors``, ``*.bin`` files are excluded from the download.

    :param repo_id: Repository identifier passed to ``list_repo_files`` and ``snapshot_download``.
    :param cache_dir: Optional cache directory forwarded to ``snapshot_download``.
    :return: A tuple ``(absolute_local_path, has_safetensors)``.
    """
    # Stop scanning as soon as one safetensors file is found
    has_safetensors = False
    for filename in list_repo_files(repo_id):
        if filename.endswith('.safetensors'):
            has_safetensors = True
            break

    # Only filter out the PyTorch pickles when a safetensors alternative is available
    if has_safetensors:
        skip_patterns = ["*.bin"]
    else:
        skip_patterns = None

    local_path = snapshot_download(
        repo_id=repo_id,
        cache_dir=cache_dir,
        ignore_patterns=skip_patterns,
    )

    print(f"Model downloaded to: {local_path}")
    if has_safetensors:
        print("Note: PyTorch .bin files were ignored due to the presence of .safetensors files.")

    return os.path.abspath(local_path), has_safetensors


In [None]:
from transformers import AutoModelForCausalLM
import torch

# Target model: loaded in bfloat16 and kept on the CPU.
target_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cpu",
)

# Base model: read from "models/reduced" with the library's default dtype, also on the CPU.
base_model = AutoModelForCausalLM.from_pretrained(
    "models/reduced",
    device_map="cpu",
)

In [None]:
# Write the target model to "models/target"; a later cell reloads that path
# (without weights) to enumerate its linear modules.
target_model.save_pretrained("models/target")

In [None]:
# This code has been modified from its original version on the Axolotl project.
# Copyright 2023 Axolotl contributors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import torch
import torch
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from peft.tuners.lora import QuantLinear


def get_linear_embedding_layers(model_type):
    """
    Return the names of the input/output embedding layers for a model architecture.

    :param model_type: Architecture identifier compared against "gpt_neox" and "falcon".
    :return: A two-element list of layer names; any other architecture gets
             ``["embed_tokens", "lm_head"]``.
    """
    architecture_overrides = (
        ("gpt_neox", ["embed_in", "embed_out"]),
        ("falcon", ["word_embeddings", "lm_head"]),
    )
    for architecture, layer_names in architecture_overrides:
        if model_type == architecture:
            return layer_names

    # Fallback naming used by every architecture not listed above
    return ["embed_tokens", "lm_head"]


def find_all_linear_names(model):
    """
    Collect the full names of every linear-like submodule of ``model``.

    A submodule is selected when it is an instance of one of the known linear
    classes, or when its class name contains "Linear" and is not in the
    exclusion list.

    :param model: Object exposing ``named_modules()``.
    :return: List of module names, in ``named_modules()`` order.
    """
    linear_types = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear)
    excluded_class_names = ("LlamaLinearScalingRotaryEmbedding",)

    def _is_linear(candidate):
        # Type match wins outright; the class-name heuristic is only the fallback
        if isinstance(candidate, linear_types):
            return True
        class_name = candidate.__class__.__name__
        return "Linear" in class_name and class_name not in excluded_class_names

    return [
        qualified_name
        for qualified_name, submodule in model.named_modules()
        if _is_linear(submodule)
    ]

def get_linear_module_names(model_id):
    """
    List the linear module names of the causal LM identified by ``model_id``.

    :param model_id: Identifier or path forwarded to ``AutoModelForCausalLM.from_pretrained``.
    :return: The list produced by ``find_all_linear_names`` for that model.
    """
    # The weights are not needed here, only the module structure: pass an empty
    # state_dict and place the model on the "meta" device to avoid loading them.
    return find_all_linear_names(
        AutoModelForCausalLM.from_pretrained(model_id, state_dict={}, device_map="meta")
    )

# Names of the linear modules of the model saved under "models/target";
# later cells iterate over this list to build one LoRA pair per module.
linear_module_names = get_linear_module_names("models/target")

In [None]:
import torch

def _low_rank_decomposition(weight, reduced_rank=16):
    """
    Decompose a 2D matrix into low-rank matrices A and B using SVD.a

    :param weight: The matrix to decompose, of shape (H, W)
    :param reduced_rank: The final rank of the decomposition
    :return: A tuple of tensors (A, B)
    """
    if weight.dim() != 2:
        raise ValueError(f"Only support 2D matrix, but your input has {weight.dim()} dimensions.")

    # SVD Decomposition
    U, S, Vh = torch.linalg.svd(weight, full_matrices=False)

    # Truncated matrices
    A = Vh[:reduced_rank, :]
    B = U[:, :reduced_rank] @ torch.diag(S[:reduced_rank])

    return A, B

def decompose_delta_weight(new_weight, base_weight, alpha, reduced_rank, device=None):
    """
    Decompose the delta weight into low-rank matrices A and B, considering the alpha scaling factor.

    The delta ``new_weight - base_weight`` is divided by ``alpha`` and the result
    is factored with ``_low_rank_decomposition``.

    :param new_weight: The updated weight matrix after applying LoRA.
    :param base_weight: The original weight matrix before LoRA.
    :param alpha: The alpha scaling factor used in LoRA.
    :param reduced_rank: The rank for the low-rank decomposition.
    :param device: Device on which the computation runs. Defaults to 'cuda'
                   when available, otherwise 'cpu'.
    :return: A tuple of tensors (A, B), located on ``device``.
    :raises ValueError: If the two weights do not have the same shape.
    """
    # Fail fast instead of letting broadcasting silently produce a meaningless delta
    if new_weight.shape != base_weight.shape:
        raise ValueError(
            f"Shape mismatch: new_weight is {tuple(new_weight.shape)} "
            f"but base_weight is {tuple(base_weight.shape)}."
        )

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    new_weight = new_weight.to(device)
    base_weight = base_weight.to(device)

    delta_weight = new_weight - base_weight

    # Check if alpha is applied uniformly
    # Adjust the implementation if alpha is applied differently
    adjusted_delta_weight = delta_weight / alpha

    A, B = _low_rank_decomposition(adjusted_delta_weight, reduced_rank=reduced_rank)

    return A, B

In [None]:
# Display the collected module names for inspection.
linear_module_names

In [None]:
from tqdm.notebook import tqdm
from collections import OrderedDict

# Maps PEFT-style parameter names to the extracted LoRA factors
loras = OrderedDict()

# lower rank captures less of the original model, a rank of 32 is probably reasonable for small delta (task specific finetunes and such)
alpha = 1
rank = 64

# state_dict() rebuilds its mapping on every call, so take both snapshots once
# instead of once per module inside the loop.
target_state = target_model.state_dict()
base_state = base_model.state_dict()

for module in tqdm(linear_module_names):
    weight_key = module + ".weight"
    target_tensor = target_state[weight_key]
    base_tensor = base_state[weight_key]

    lora_A, lora_B = decompose_delta_weight(target_tensor, base_tensor, alpha, rank)

    # Keep the results on the CPU so accelerator memory is not held across modules
    loras[f"base_model.model.{module}.lora_A.weight"] = lora_A.to('cpu')
    loras[f"base_model.model.{module}.lora_B.weight"] = lora_B.to('cpu')

In [None]:
def get_module_peft_name(module_name):
    """
    Return the last dot-separated component of a module name.

    :param module_name: Dotted module path such as ``"a.b.c"``.
    :return: The text after the final ``'.'``, or the whole string when it has no dot.
    """
    _parent, _dot, leaf_name = module_name.rpartition('.')
    return leaf_name

In [None]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from peft import get_peft_model, LoraConfig

LORA_OUT_DIR = "./lora"

lora_config = LoraConfig(
    # Setting the alpha to the decomposition rank value (instead of the alpha value used
    # during decomposition) seems to give better performance. Further testing would be
    # needed to understand what the optimal alpha value is.
    lora_alpha=rank,
    lora_dropout=0,
    # Must match the rank used for the decomposition, otherwise the saved
    # lora_A / lora_B shapes will not fit the adapter described by this config.
    r=rank,
    bias="none",
    task_type="CAUSAL_LM",
    # Sorted so the written adapter config is deterministic across runs
    target_modules=sorted({get_module_peft_name(e) for e in linear_module_names}),
)

model = AutoModelForCausalLM.from_pretrained("models/reduced")

peft_model = get_peft_model(model, lora_config)

# Save to disk. This is only a skeleton (config + freshly initialised weights);
# a later cell overwrites adapter_model.bin with the extracted factors.
# safe_serialization=False keeps PEFT from also writing adapter_model.safetensors,
# which would take precedence over adapter_model.bin when the adapter is loaded.
peft_model.save_pretrained(LORA_OUT_DIR, safe_serialization=False)

del peft_model

In [None]:
import torch

# Make sure every factor lives on the CPU in contiguous memory before serialising
for key, lora_tensor in loras.items():
    loras[key] = lora_tensor.to('cpu').contiguous()

# Write the factors as the adapter weights inside the LoRA output directory
torch.save(
    loras,
    os.path.join(LORA_OUT_DIR, 'adapter_model.bin'),
)

In [None]:
import os
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Repeated here so this cell does not depend on the constants defined in earlier cells
LORA_OUT_DIR = "./lora"
target_model_id = "mistralai/Mistral-7B-v0.1"

# Adapter configuration as written to the output directory
config = PeftConfig.from_pretrained(os.path.abspath(LORA_OUT_DIR))

# Base model in bfloat16 on the GPU, then wrapped with the saved adapter
model = AutoModelForCausalLM.from_pretrained(
    "models/reduced",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
model = PeftModel.from_pretrained(
    model,
    os.path.abspath(LORA_OUT_DIR),
    device_map="cuda",
)

# The tokenizer comes from the target model id, not from the local base directory
tokenizer = AutoTokenizer.from_pretrained(target_model_id, use_fast=True)

In [None]:

# Single-turn conversation used as the test prompt
messages = [{"role": "user", "content": "Hey what's up?"}]

# Padding configuration for the tokenizer
tokenizer.padding_side = "left"
tokenizer.pad_token = "[PAD]"

# Make sure the model is in bfloat16 and on the GPU before generating
model = model.bfloat16().cuda()

# Render the conversation with the tokenizer's chat template, tokenize it,
# and move the resulting tensor to the GPU
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
).to("cuda")

# Generate a response with nucleus sampling
outputs = model.generate(
    tokenized_chat,
    max_new_tokens=256,  # Adjust max_new_tokens if needed
    do_sample=True,
    top_p=0.95,
    temperature=0.6,
)

In [None]:
# Display the tokenized prompt tensor that was passed to generate().
tokenized_chat