In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import transformers
from finetune_peft import get_peft_config, PEFTArguments
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig, set_peft_model_state_dict
import peft

from peft.tuners.lora import Linear
import torch.nn.functional as F
from peft.utils.other import transpose

import time
import numpy as np

import inspect
from functools import wraps

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Local directory holding the LLaMA-7B checkpoint; the tokenizer is read from the same place.
model_path = "/home/ubuntu/llama-weights/7B/llama-7b"
tokenizer_path = model_path

# Switch the default tensor type to CUDA half precision before loading.
# NOTE(review): presumably this is what makes from_pretrained build the weights
# as fp16 directly on the GPU — confirm.
torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = transformers.LlamaForCausalLM.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 33/33 [00:11<00:00,  2.88it/s]


In [4]:
# Attach the first LoRA adapter to the base model under the name "airoboros".
model = PeftModel.from_pretrained(model, "jondurbin/airoboros-7b-gpt4-1.2-peft", adapter_name="airoboros")
# Wrap the inner model of the first PeftModel again, this time loading "se-rl".
# NOTE(review): presumably the "airoboros" LoRA weights injected above stay on the
# inner modules so both adapters end up available — confirm.
model = PeftModel.from_pretrained(model.base_model.model, "trl-lib/llama-7b-se-rl-peft", adapter_name="se-rl")

In [5]:
# Tag every submodule with one adapter name per batch row.
# NOTE(review): presumably read by the patched Linear.forward from the local
# `blora` module (row 0 -> "airoboros", row 1 -> "se-rl") — confirm there.
for name, module in model.named_modules():
    module.batch_lora_ids = ["airoboros", "se-rl"]

# set(type(module) for _, module in model.named_modules())

In [6]:
# Tokenize a single prompt, then duplicate it along dim 0 so the same input
# occupies two batch rows; the result is moved to the GPU.
tokenizer = transformers.LlamaTokenizer.from_pretrained(tokenizer_path)
batch = tokenizer("The LLaMA language model is", return_tensors="pt")
b = torch.cat([batch['input_ids'], batch['input_ids']], dim=0).cuda()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [11]:
# Replace forward() on peft's LoRA Linear class with the implementation from the
# local `blora` module; this affects every instance of that class.
from blora import forward
Linear.forward = forward

# From here on, newly created tensors default to CUDA float32.
torch.set_default_tensor_type(torch.cuda.FloatTensor)

# Generate for both batch rows in one call, with gradients disabled.
with torch.no_grad():
    out1 = model.generate(
        input_ids=b,
        attention_mask=torch.ones_like(b),
        max_length=200,
        # batch_lora_ids=["airoboros", "se-rl"],
    )
# Decode the first batch row.
print(tokenizer.decode(out1[0]))

 ⁇  The LLaMA language model is a large-scale neural network trained on a large corpus of text. It can be used for various tasks such as text classification, text generation, and natural language understanding.

## Architecture

The LLaMA model consists of an encoder-decoder architecture with attention mechanism. The encoder part is a bidirectional Gated Recurrent Unit (GRU) network, while the decoder part is a simple feed-forward network. The attention mechanism is used to focus on important parts of the input sequence during decoding.

## Training

The LLaMA model is trained using the negative sampling technique. In this approach, the model is trained on a large corpus of text and then used to generate new sentences. To generate new sentences, the model first generates a set of candidate sentences using the encoder part. Then, it selects the best candidate sentence based on its probability score. The probability score is calculated by


In [14]:
# Decode and print batch row 0 of the generated token ids.
print(tokenizer.decode(out1[0]))

 ⁇  The LLaMA language model is a large-scale neural network trained on a large corpus of text. It can be used for various tasks such as text classification, text generation, and natural language understanding.

## Architecture

The LLaMA model consists of an encoder-decoder architecture with attention mechanism. The encoder part is a bidirectional Gated Recurrent Unit (GRU) network, while the decoder part is a simple feed-forward network. The attention mechanism is used to focus on important parts of the input sequence during decoding.

## Training

The LLaMA model is trained using the negative sampling technique. In this approach, the model is trained on a large corpus of text and then used to generate new sentences. To generate new sentences, the model first generates a set of candidate sentences using the encoder part. Then, it selects the best candidate sentence based on its probability score. The probability score is calculated by


In [15]:
# Decode and print batch row 1 of the generated token ids.
print(tokenizer.decode(out1[1]))

 ⁇  The LLaMA language model is a large-scale language model that is trained on a large corpus of text. The LLaMA language model is a large-scale language model that is trained on a large corpus of text.
The LLaMA language model is a large-scale language model that is trained on a large corpus of text. The LLaMA language model is a large-scale language model that is trained on a large corpus of text.
The LLaMA language model is a large-scale language model that is trained on a large corpus of text. The LLaMA language model is a large-scale language model that is trained on a large corpus of text.
The LLaMA language model is a large-scale language model that is trained on a large corpus of text. The LLaMA language model is a large-scale language model that is trained on a large corpus of text.
The LLa


In [None]:
# Scratch setup for checking that two LoRA adapters can be applied in a single
# matmul by arranging their factors block-diagonally.
b, l, c = 2, 10, 1024
r = 8

# Two inputs, joined once along rows (X) and once along features (XX).
X1 = torch.randn(l, c)
X2 = torch.randn(l, c)
X = torch.cat((X1, X2), dim=0)
XX = torch.cat((X1, X2), dim=1)

weight = torch.randn(c, c)

# Down-projections: A is the plain side-by-side concat, AA is [[A1, 0], [0, A2]].
A1 = torch.randn(c, r)
A2 = torch.randn(c, r)
A = torch.cat((A1, A2), dim=1)
AA = torch.block_diag(A1, A2)
Y = X @ A

# Up-projections, arranged the same way: BB is [[B1, 0], [0, B2]].
B1 = torch.randn(c, r)
B2 = torch.randn(c, r)
BB = torch.block_diag(B1, B2)

# Manual checks (each difference should be ~0):
# ((XX @ AA) @ BB.T)[:,:c] - ((X1 @ A1) @ B1.T)
# ((XX @ AA) @ BB.T)[:,c:] - ((X2 @ A2) @ B2.T)

In [None]:
class BLora(torch.nn.Module):
    """Linear layer that applies two different LoRA adapters to a batch of two.

    Row 0 of the input batch gets adapter ``lora0`` and row 1 gets ``lora1``.
    Both adapter updates are computed with a single pair of matmuls by laying
    the adapter factors out block-diagonally: ``[[A0, 0], [0, A1]]`` and
    ``[[B0, 0], [0, B1]]``.

    Args:
        lora0: ``[A, B]`` for batch row 0, with ``A`` of shape
            ``(in_features, r)`` and ``B`` of shape ``(out_features, r)``.
        lora1: ``[A, B]`` for batch row 1, same layout as ``lora0``.
        weight: Base weight of shape ``(out_features, in_features)``, as
            expected by ``F.linear``.
        scaling: Multiplier applied to the LoRA update.
        device: Device the layer's tensors are placed on. Defaults to
            ``"cuda:0"`` (the previously hard-coded value).
    """

    def __init__(
        self,
        lora0: list,
        lora1: list,
        weight: torch.Tensor,
        scaling: float = 4.0,
        device="cuda:0",
    ):
        super().__init__()
        # Registered as buffers (not plain attributes) so Module.to(), .half()
        # and state_dict() see them; they are still reachable as self.<name>.
        self.register_buffer("lora_a", torch.block_diag(lora0[0], lora1[0]).to(device))
        self.register_buffer("lora_b", torch.block_diag(lora0[1], lora1[1]).to(device))
        self.register_buffer("weight", weight.to(device))
        self.scaling = scaling
        # Output width of one batch row; used to split the fused LoRA output.
        self.ctx_size = self.weight.shape[0]

    @torch.no_grad()
    def forward(self, x: torch.Tensor):
        """Run the base projection plus the per-row LoRA update.

        Args:
            x: Tensor of shape ``(2, seq_len, in_features)``.

        Returns:
            Tensor of shape ``(2, seq_len, out_features)``.

        Raises:
            ValueError: If ``x`` is not 3-D with a leading dimension of 2.
        """
        if x.dim() != 3 or x.shape[0] != 2:
            raise ValueError(
                f"BLora expects input of shape (2, seq_len, in_features), got {tuple(x.shape)}"
            )

        # The weight is already (out_features, in_features); the former
        # transpose(weight, False) call returned it unchanged.
        result = F.linear(x, self.weight, bias=None)

        # Put the two batch rows side by side along the feature axis so the
        # block-diagonal factors route each row through its own adapter.
        fused_in = torch.cat([x[0], x[1]], dim=1)
        fused_out = self.scaling * (fused_in @ self.lora_a) @ self.lora_b.T

        # Split the fused output back into one slice per batch row.
        result += torch.stack([fused_out[:, :self.ctx_size], fused_out[:, self.ctx_size:]])
        return result


b, l, c = 2, 10, 1024
r = 8

# Two per-adapter inputs stacked into a single (2, l, c) batch.
X1 = torch.randn(l, c)
X2 = torch.randn(l, c)
X = torch.stack([X1, X2])
weight = torch.randn(([c, c]))

A1 = torch.randn(c, r)
A2 = torch.randn(c, r)

B1 = torch.randn(c, r)
B2 = torch.randn(c, r)

AA = torch.cat([torch.cat([A1, torch.zeros_like(A1)]), torch.cat([torch.zeros_like(A2), A2])], dim=1)
blora = BLora(lora0=[A1, B1], lora1=[A2, B2], weight=weight)

# Tensor.to() returns a new tensor rather than moving in place, so the result
# has to be assigned back; otherwise X silently stays where it was created.
X = X.to('cuda:0')

starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 300
warmup = 10
timings = np.zeros((repetitions, 1))

with torch.no_grad():
    # Untimed warm-up so one-off CUDA initialisation does not skew the numbers.
    for _ in range(warmup):
        blora(X)
    torch.cuda.synchronize()

    for rep in range(repetitions):
        starter.record()
        _ = blora(X)
        ender.record()
        # Wait for the GPU to finish so both events are complete before reading them.
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

# Mean / standard deviation of the per-iteration time in milliseconds.
mean_syn = np.mean(timings)
std_syn = np.std(timings)
print(mean_syn, std_syn)

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

# Profile a single batched forward pass on both CPU and CUDA, recording input
# shapes, under the label "model_inference".
with profile(
    activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
    record_shapes=True,
) as prof, record_function("model_inference"):
    blora(X)

# Show the ten most expensive ops by total CPU time.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

In [None]:
def lora_forward(x, weight, lora_A, lora_B, scaling):
    """Reference single-adapter LoRA forward pass.

    Computes ``x @ weight.T + ((x @ lora_A) @ lora_B.T) * scaling``.

    Args:
        x: Input of shape ``(..., in_features)``.
        weight: Base weight of shape ``(out_features, in_features)``.
        lora_A: Down-projection of shape ``(in_features, r)``.
        lora_B: Up-projection of shape ``(out_features, r)``.
        scaling: Multiplier applied to the LoRA update.

    Returns:
        Tensor of shape ``(..., out_features)``. The previous version computed
        this value but never returned it, so callers always received ``None``.
    """
    # The former transpose(weight, False) call returned the weight unchanged,
    # so the weight is passed to F.linear directly.
    result = F.linear(x, weight)
    result += (x @ lora_A) @ lora_B.T * scaling
    return result


# Baseline timing: run the two adapters one after the other, 300 times, and
# measure each pair of calls with CUDA events.
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
repetitions = 300
timings = np.zeros((repetitions, 1))

with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        lora_forward(X1, weight, A1, B1, 4.0)
        lora_forward(X2, weight, A2, B2, 4.0)
        ender.record()
        # Block until the GPU has reached both events before reading them.
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

# Average and spread of the per-iteration timings.
mean_syn = timings.sum() / repetitions
std_syn = timings.std()
print(mean_syn, std_syn)

In [None]:
# Display the q_proj submodule of layers[0].self_attn to inspect how it is wrapped.
model.base_model.model.model.layers[0].self_attn.q_proj

In [None]:
# Fresh LoRA configuration for causal LM: rank 8, alpha 32, dropout 0.1,
# with inference_mode turned off.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# NOTE(review): if the earlier cells were run, `model` is already a PeftModel
# carrying loaded adapters — confirm that wrapping it again here is intended.
model = get_peft_model(model, peft_config)

In [None]:
# Shared settings for the synthetic LoRA experiments in the cells below.
bs = 2  # batch size of the random inputs
seq_len = 10  # sequence length of the random inputs
ctx_dim = 4096  # feature width; used as both in_features and out_features
rank = 8  # LoRA rank of each adapter
scaling = 4.0  # multiplier applied to the LoRA update
fan_in_fan_out = False  # forwarded to peft's transpose() helper
bias = None  # bias argument passed to F.linear
device = 'cuda:0'  # device on which the test tensors and layers are created

In [None]:
# blora
# Random inputs of shape (bs, seq_len, ctx_dim) and a dense square base weight,
# all created on `device`.
x0 = torch.randn([bs, seq_len, ctx_dim], device=device)
x1 = torch.randn([bs, seq_len, ctx_dim], device=device)
weight = torch.randn(([ctx_dim, ctx_dim]), device=device)

# Adapter 0: bias-free down-projection (ctx_dim -> rank) and up-projection (rank -> ctx_dim).
lora_a0 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False, device=device)
lora_b0 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False, device=device)

# Adapter 1: same layout as adapter 0.
lora_a1 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False, device=device)
lora_b1 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False, device=device)

# Concatenated adapter weights: A along dim 1 -> (rank, 2 * ctx_dim),
# B along dim 0 -> (2 * ctx_dim, rank).
loraa = torch.cat([lora_a0.weight, lora_a1.weight], dim=1)
lorab = torch.cat([lora_b0.weight, lora_b1.weight], dim=0)

# forward pass
def lora_forward(x, weight, lora1, lora2, scaling):
    """Base linear projection plus a LoRA update.

    Reads the notebook globals ``fan_in_fan_out``, ``bias``, ``seq_len`` and ``bs``.

    Args:
        x: Input tensor to project.
        weight: Base weight passed to ``F.linear`` (after ``transpose``).
        lora1: Down-projection, either a ``torch.nn.Linear`` module or a raw
            weight tensor.
        lora2: Up-projection, of the same kind as ``lora1``.
        scaling: Multiplier applied to the LoRA update.

    Returns:
        The base projection of ``x`` with the scaled LoRA update added.
    """
    # Project the argument `x`. The previous version read the global `x0`
    # here, so every call used x0 for the base term whatever was passed in.
    result = F.linear(x, transpose(weight, fan_in_fan_out), bias=bias)

    if isinstance(lora1, torch.nn.Linear):
        # Module path: apply the two Linear layers directly.
        x = x.to(lora1.weight.dtype)
        result += scaling * lora2(lora1(x))
    else:
        # Raw-tensor path for concatenated adapter weights.
        # NOTE(review): reshape(seq_len, -1) reinterprets the flat buffer; for a
        # (bs, seq_len, dim) input it does not place x[0, t] and x[1, t] side by
        # side in row t — confirm this layout is what the concatenated weights expect.
        x = x.reshape(seq_len, -1)
        x = x.to(lora1.dtype)
        out = F.linear(x, transpose(lora1, fan_in_fan_out), bias=bias)
        out = scaling * F.linear(out, transpose(lora2, fan_in_fan_out), bias=bias)
        out = out.reshape(bs, seq_len, -1)
        result += out
    return result

# CUDA kernels are launched asynchronously: without synchronising, the wall
# clock would only capture the launch overhead rather than the actual compute.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.time()
r0 = lora_forward(x0, weight, lora_a0, lora_b0, scaling)
r1 = lora_forward(x1, weight, lora_a1, lora_b1, scaling)
if torch.cuda.is_available():
    # Wait for both forward passes to finish before stopping the clock.
    torch.cuda.synchronize()
print(f"lora_forward: {(time.time() - start)*1e6} microseceonds")

In [None]:
class BLora(torch.nn.Module):
    """Linear layer plus a LoRA update built from two concatenated adapters.

    Relies on the notebook globals ``fan_in_fan_out``, ``bias``, ``seq_len``,
    ``bs`` and ``scaling``.

    NOTE(review): this redefines the name ``BLora`` used by an earlier cell with a
    different constructor signature — whichever cell ran last wins.
    """
    def __init__(
        self,
        lora1: list,
        lora2: list,
        weight: torch.Tensor,
    ):
        """Build the fused adapter weights.

        Args:
            lora1: ``[A, B]`` pair of modules exposing ``.weight`` for the first adapter.
            lora2: ``[A, B]`` pair of modules exposing ``.weight`` for the second adapter.
            weight: Base weight used for the plain linear projection.
        """
        super().__init__()
        # A weights are joined along dim 1, B weights along dim 0; each result
        # is wrapped in a new Parameter.
        self.lora_a = torch.nn.Parameter(torch.cat([lora1[0].weight, lora2[0].weight], dim=1))
        self.lora_b = torch.nn.Parameter(torch.cat([lora1[1].weight, lora2[1].weight], dim=0))
        self.weight = torch.nn.Parameter(weight)

    def forward(self, x: torch.Tensor):
        """Return the base projection of ``x`` plus the scaled LoRA update."""
        result = F.linear(x, transpose(self.weight, fan_in_fan_out), bias=bias)
        # assumes x is (bs, seq_len, dim) — TODO confirm.
        # NOTE(review): reshape(seq_len, -1) reinterprets the flat buffer, so row t
        # is not x[0, t] next to x[1, t]; and because lora_a is a plain concat (not
        # block-diagonal) both adapters share the same rank-sized intermediate.
        # Confirm whether the output is meant to match per-adapter LoRA.
        x = x.reshape(seq_len, -1)
        x = x.to(self.lora_a.dtype)

        out = F.linear(x, transpose(self.lora_a, fan_in_fan_out), bias=bias)
        out = scaling * F.linear(out, transpose(self.lora_b, fan_in_fan_out), bias=bias)
        # Undo the reshape so the update lines up with the base projection.
        out = out.reshape(bs, seq_len, -1)
        result += out
        return result
    
blora = BLora(lora1=[lora_a0, lora_b0], lora2=[lora_a1, lora_b1], weight=weight)

# CUDA kernels are launched asynchronously: without synchronising, the wall
# clock would only capture the launch overhead rather than the actual compute.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.time()
r2 = blora(x0)
if torch.cuda.is_available():
    # Wait for the forward pass to finish before stopping the clock.
    torch.cuda.synchronize()
print(f"lora_forward: {(time.time() - start)*1e6} microseceonds")

In [None]:
def forward(self, x: torch.Tensor):
    """LoRA-aware linear forward pass for the currently active adapter.

    Returns the plain linear projection when the active adapter is unknown,
    adapters are disabled, the rank is zero, or the weights are already merged;
    otherwise adds the scaled ``lora_B(lora_A(dropout(x)))`` update. Except in
    the unknown-adapter case, the result is cast back to the input's dtype.
    """
    input_dtype = x.dtype
    adapter = self.active_adapter

    def base_linear(inp):
        # Plain projection with the (possibly transposed) base weight.
        return F.linear(inp, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

    # Unknown adapter: plain projection, returned without the dtype cast.
    if adapter not in self.lora_A.keys():
        return base_linear(x)

    rank = self.r[adapter]
    if self.disable_adapters:
        # Undo a previous merge before using the base weight on its own.
        if rank > 0 and self.merged:
            self.unmerge()
        out = base_linear(x)
    elif rank > 0 and not self.merged:
        out = base_linear(x)

        x = x.to(self.lora_A[adapter].weight.dtype)

        dropped = self.lora_dropout[adapter](x)
        update = self.lora_B[adapter](self.lora_A[adapter](dropped))
        out += update * self.scaling[adapter]
    else:
        out = base_linear(x)

    return out.to(input_dtype)

In [None]:
# deprecated blora
# No device is passed in this cell, so tensors and layers are created wherever
# the current default places them.
x0 = torch.randn([bs, seq_len, ctx_dim])
weight = torch.randn(([ctx_dim, ctx_dim]))

# Two independent bias-free adapters.
lora_a0 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b0 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

lora_a1 = torch.nn.Linear(in_features=ctx_dim, out_features=rank, bias=False)
lora_b1 = torch.nn.Linear(in_features=rank, out_features=ctx_dim, bias=False)

# Wide layers sized for bs * ctx_dim features ...
lora1 = torch.nn.Linear(in_features=bs * ctx_dim, out_features=rank, bias=False)
lora2 = torch.nn.Linear(in_features=rank, out_features=bs * ctx_dim, bias=False)

# ... whose weights are then replaced by the concatenated adapter weights.
lora1.weight = torch.nn.Parameter(torch.cat([lora_a0.weight, lora_a1.weight], dim=1))
lora2.weight = torch.nn.Parameter(torch.cat([lora_b0.weight, lora_b1.weight], dim=0))

# forward pass
# NOTE(review): x1 and out1 reuse names bound in earlier cells (the second random
# input and the generate() output); running this cell overwrites them.
result1 = F.linear(x0, transpose(weight, fan_in_fan_out), bias=bias)
x1 = x0.reshape(seq_len, -1)
x1 = x1.to(lora1.weight.dtype)
out1 = lora2(lora1(x1)) * scaling
out1 = out1.reshape(bs, seq_len, -1)
result1 += out1

In [None]:
# Load the adapter weights from a local checkpoint file.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only use it on checkpoints from a trusted source.
path = "/home/ubuntu/airoboros-7b-gpt4-1.2-peft/adapter_model.bin"
peft_model_state_dict = torch.load(path)

# Read the matching adapter config and register it on the model under the
# adapter name "default", then copy the loaded weights into that adapter.
peft_config_path = "/home/ubuntu/airoboros-7b-gpt4-1.2-peft/adapter_config.json"
peft_config = PeftConfig.from_json_file(peft_config_path)
peft_config = {'default' : peft_config}
model.peft_config = peft_config
set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default")