In [1]:
%load_ext autoreload
%autoreload 2

In [145]:
import torch
import transformers
from finetune_peft import get_peft_config, PEFTArguments
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig, set_peft_model_state_dict
import peft

from peft.tuners.lora import Linear
import torch.nn.functional as F
from peft.utils.other import transpose

import time

In [3]:
model_path = "/home/ubuntu/llama-weights/7B/llama-7b"
tokenizer_path = model_path

torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = transformers.LlamaForCausalLM.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 33/33 [00:11<00:00,  2.90it/s]


In [4]:
model = PeftModel.from_pretrained(model, "jondurbin/airoboros-7b-gpt4-1.2-peft", adapter_name="airoboros")

In [5]:
model = PeftModel.from_pretrained(model.base_model.model, "trl-lib/llama-7b-se-rl-peft", adapter_name="se-rl")

In [160]:
model.base_model.model.model.layers[0].self_attn.q_proj

Linear(
  in_features=4096, out_features=4096, bias=False
  (lora_dropout): ModuleDict(
    (airoboros): Dropout(p=0.05, inplace=False)
    (se-rl): Dropout(p=0.05, inplace=False)
  )
  (lora_A): ModuleDict(
    (airoboros): Linear(in_features=4096, out_features=64, bias=False)
    (se-rl): Linear(in_features=4096, out_features=16, bias=False)
  )
  (lora_B): ModuleDict(
    (airoboros): Linear(in_features=64, out_features=4096, bias=False)
    (se-rl): Linear(in_features=16, out_features=4096, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [170]:
type(list(model.base_model.model.model.layers[0].self_attn.q_proj.named_modules())[0][1])

peft.tuners.lora.Linear

In [194]:
for name, module in model.named_modules():
    if isinstance(module, peft.tuners.lora.Linear):
        module
        break
        # Do whatever operation you need to perform here
        # For example, if you want to add lora_A and lora_B to the parameter, you can do:
        # parameter.data = parameter.data + self.lora_A + self.lora_B

In [202]:
module.lora_A['airoboros']

Linear(in_features=4096, out_features=64, bias=False)

In [None]:
merged = torch.nn.Linear()
module.lora_A['merged']

In [143]:
from peft.tuners.lora import Linear

def forward(self, x: torch.Tensor):
    previous_dtype = x.dtype
    if self.active_adapter not in self.lora_A.keys():
        return F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
    if self.disable_adapters:
        if self.r[self.active_adapter] > 0 and self.merged:
            self.unmerge()
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
    elif self.r[self.active_adapter] > 0 and not self.merged:
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

        x = x.to(self.lora_A[self.active_adapter].weight.dtype)

        result += (
            self.lora_B[self.active_adapter](
                self.lora_A[self.active_adapter](self.lora_dropout[self.active_adapter](x))
            )
            * self.scaling[self.active_adapter]
        )
    else:
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

    result = result.to(previous_dtype)

    return result


Linear.forward = forward

In [16]:
torch.set_default_tensor_type(torch.cuda.FloatTensor)

tokenizer = transformers.LlamaTokenizer.from_pretrained(tokenizer_path)
batch = tokenizer("The LLaMA language model is", return_tensors="pt")

with torch.no_grad():
    out1 = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=torch.ones_like(batch["input_ids"]),
        max_length=200,
    )
print(tokenizer.decode(out1[0]))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


 ⁇  The LLaMA language model is a large-scale neural network trained on a large corpus of text. It can be used for various tasks such as text classification, text generation, and natural language understanding.

## Architecture

The LLaMA model consists of an encoder-decoder architecture with attention mechanism. The encoder part is a bidirectional Gated Recurrent Unit (GRU) network, while the decoder part is a simple feed-forward network. The attention mechanism is used to focus on important parts of the input sequence during decoding.

## Training

The LLaMA model is trained using the negative sampling technique. In this approach, the model is trained on a large corpus of text and then used to generate new sentences. To generate new sentences, the model first generates a set of candidate sentences using the encoder part. Then, it selects the best candidate sentence based on its probability score. The probability score is calculated by


In [None]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = get_peft_model(model, peft_config)

In [29]:
bs = 2
seq_len = 10
ctx_dim = 4096
rank = 8
scaling = 4.0
fan_in_fan_out = False
bias = None

In [139]:
# blora
x0 = torch.randn([bs, seq_len, ctx_dim])
weight = torch.randn(([ctx_dim, ctx_dim]))

lora_a0 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b0 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

lora_a1 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b1 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

lora1 = torch.cat([lora_a0.weight, lora_a1.weight], dim=1)
lora2 = torch.cat([lora_b0.weight, lora_b1.weight], dim=0)

# forward pass
def lora_forward(x0, weight, lora1, lora2):
    result = F.linear(x0, transpose(weight, fan_in_fan_out), bias=bias)
    x = x0.reshape(seq_len, -1)
    x = x.to(lora1.dtype)

    out = F.linear(x, transpose(lora1, fan_in_fan_out), bias=bias)
    out = scaling * F.linear(out, transpose(lora2, fan_in_fan_out), bias=bias)
    out = out.reshape(bs, seq_len, -1)
    result += out
    return result

start = time.time()
lora_forward(x0, weight, lora1, lora2)
lora_forward(x0, weight, lora1, lora2)
print(f"lora_forward: {(time.time() - start)*1e6} microseceonds")

lora_forward: 424.1466522216797 microseceonds


In [86]:
class BLora(torch.nn.Module):
    def __init__(
        self,
        lora1: list,
        lora2: list,
        weight: torch.Tensor,
    ):
        super().__init__()
        self.lora_a = torch.nn.Parameter(torch.cat([lora1[0].weight, lora2[0].weight], dim=1))
        self.lora_b = torch.nn.Parameter(torch.cat([lora1[1].weight, lora2[1].weight], dim=0))
        self.weight = torch.nn.Parameter(weight)

    def forward(self, x: torch.Tensor):
        result = F.linear(x, transpose(self.weight, fan_in_fan_out), bias=bias)
        x = x.reshape(seq_len, -1)
        x = x.to(self.lora_a.dtype)

        out = F.linear(x, transpose(self.lora_a, fan_in_fan_out), bias=bias)
        out = scaling * F.linear(out, transpose(self.lora_b, fan_in_fan_out), bias=bias)
        out = out.reshape(bs, seq_len, -1)
        result += out
        return result
    
blora = BLora(lora1=[lora_a0, lora_b0], lora2=[lora_a1, lora_b1], weight=weight)

start = time.time()
blora(x0)
print(f"lora_forward: {(time.time() - start)*1e6} microseceonds")

lora_forward: 279.1881561279297 microseceonds


In [None]:
def forward(self, x: torch.Tensor):
    previous_dtype = x.dtype
    if self.active_adapter not in self.lora_A.keys():
        return F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
    if self.disable_adapters:
        if self.r[self.active_adapter] > 0 and self.merged:
            self.unmerge()
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
    elif self.r[self.active_adapter] > 0 and not self.merged:
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

        x = x.to(self.lora_A[self.active_adapter].weight.dtype)

        result += (
            self.lora_B[self.active_adapter](
                self.lora_A[self.active_adapter](self.lora_dropout[self.active_adapter](x))
            )
            * self.scaling[self.active_adapter]
        )
    else:
        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

    result = result.to(previous_dtype)

    return result

In [None]:
# depreciated blora
x0 = torch.randn([bs, seq_len, ctx_dim])
weight = torch.randn(([ctx_dim, ctx_dim]))

lora_a0 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b0 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

lora_a1 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b1 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

lora1 = torch.nn.Linear(in_features=bs * ctx_dim, out_features=rank, bias=False)
lora2 = torch.nn.Linear(in_features=rank, out_features=bs * ctx_dim, bias=False)

lora1.weight = torch.nn.Parameter(torch.cat([lora_a0.weight, lora_a1.weight], dim=1))
lora2.weight = torch.nn.Parameter(torch.cat([lora_b0.weight, lora_b1.weight], dim=0))

# forward pass
result1 = F.linear(x0, transpose(weight, fan_in_fan_out), bias=bias)
x1 = x0.reshape(seq_len, -1)
x1 = x1.to(lora1.weight.dtype)
out1 = lora2(lora1(x1)) * scaling
out1 = out1.reshape(bs, seq_len, -1)
result1 += out1

In [None]:
path = "/home/ubuntu/airoboros-7b-gpt4-1.2-peft/adapter_model.bin"
peft_model_state_dict = torch.load(path)

peft_config_path = "/home/ubuntu/airoboros-7b-gpt4-1.2-peft/adapter_config.json"
peft_config = PeftConfig.from_json_file(peft_config_path)
peft_config = {'default' : peft_config}
model.peft_config = peft_config
set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default")