In [2]:
%load_ext autoreload
%autoreload 2

import torch
import transformers
from transformers import TextStreamer
from finetune_peft import get_peft_config, PEFTArguments
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftConfig, set_peft_model_state_dict, PeftModel
import peft

from peft.tuners.lora import Linear
import torch.nn.functional as F
from peft.utils.other import transpose

import time
import numpy as np

import inspect
from functools import wraps

from blora_utils import forward, BatchStreamer, StreamingPeftModel
import scipy.stats as stats
import matplotlib.pyplot as plt

from IPython.display import clear_output

# Monkey-patch the LoRA `Linear` layer imported from peft.tuners.lora so that
# every instance uses the `forward` implementation imported from blora_utils.
Linear.forward = forward

# Global default: tensors created without an explicit type/device become CUDA
# float32 tensors. A later cell switches the default to torch.cuda.HalfTensor
# before the base model is loaded.
torch.set_default_tensor_type(torch.cuda.FloatTensor)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# LoRA adapter checkpoints that are loaded side by side; each entry is later
# passed to StreamingPeftModel.from_pretrained.
loras = [
    "jondurbin/airoboros-7b-gpt4-1.2-peft",
    "trl-lib/llama-7b-se-rl-peft",
    "winddude/wizardLM-LlaMA-LoRA-7B",
]

# Adapter names are the checkpoint ids with every "." replaced by "_".
# NOTE(review): presumably needed because adapter names end up as module keys,
# which cannot contain dots — confirm.
adapters = [repo_id.replace(".", "_") for repo_id in loras]

In [27]:
# The base weights and the tokenizer are read from the same checkpoint
# directory, so both path variables point at one location.
model_path = tokenizer_path = "/home/ubuntu/llama-weights/7B/llama-7b"

# Switch the global default tensor type to CUDA half precision *before* the
# model is instantiated; the ordering of these two statements matters.
torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = transformers.LlamaForCausalLM.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 33/33 [00:12<00:00,  2.75it/s]


In [29]:
# Attach every LoRA adapter to the model, one `from_pretrained` call per
# adapter. The first call wraps the plain base model; each later call is handed
# `model.base_model.model` from the previous wrapper.
# NOTE(review): presumably `model.base_model.model` is the underlying
# transformers model inside the PEFT wrapper — confirm against blora_utils.
for position, (lora, adapter) in enumerate(zip(loras, adapters)):
    wrap_target = model if position == 0 else model.base_model.model
    model = StreamingPeftModel.from_pretrained(wrap_target, lora, adapter_name=adapter)

In [28]:
tokenizer = transformers.LlamaTokenizer.from_pretrained(tokenizer_path)

# Build one copy of the prompt per adapter so the batch size always matches the
# adapter list assigned to `batch_lora_ids` below (previously a hard-coded 3).
prompt = "The LLaMA language model is"
batch = tokenizer([prompt] * len(adapters), return_tensors="pt")

# Tag every submodule of the model with the list of adapter names.
# NOTE(review): presumably the patched LoRA `forward` reads `batch_lora_ids` to
# pick the adapter for each batch row — confirm in blora_utils.
for module in model.modules():
    module.batch_lora_ids = adapters

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [21]:
# Time streamed generation for the whole batch and live-render the text decoded
# so far for each LoRA.
# NOTE(review): the ValueError recorded under this cell shows that a stock
# `generate` rejects `stream_output`; presumably StreamingPeftModel.generate is
# what accepts it, so the adapter-loading cell must run first — confirm.
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
repetitions = 1
# One row per repetition; values are what `Event.elapsed_time` returns
# (milliseconds), converted to seconds only when printed at the end.
timings = np.zeros((repetitions, 1))

outputs = []

with torch.no_grad():
    for rep in range(repetitions):
        # Start each repetition with an empty token history. Without this
        # reset, a second repetition would decode and display its tokens
        # appended to those of the previous repetition.
        outputs = []
        starter.record()
        for out in model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=200,
            stream_output=True,
        ):
            outputs.append(out)
            # Stack every streamed step as one column, giving one row per
            # batch entry. assumes each `out` holds one token id per batch
            # row — TODO confirm.
            tokens_so_far = torch.cat([step.reshape(-1, 1) for step in outputs], dim=1)
            batch_decoded = tokenizer.batch_decode(tokens_so_far)
            clear_output(wait=True)
            print("\n\n".join([lora + ":\n" + decoded for lora, decoded in zip(loras, batch_decoded)]))
        ender.record()
        # Both events must have completed before the elapsed time is read.
        # The measured interval includes the decode/print work done above.
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

mean_syn = timings.mean()
std_syn = timings.std()
print(mean_syn / 1000, std_syn / 1000)

ValueError: The following `model_kwargs` are not used by the model: ['stream_output'] (note: typos in the generate arguments will also show up in this list)

In [None]:
# Benchmarked batch sizes: the powers of two from 1 to 64.
x = 2 ** np.arange(7)
# Timings for batched LoRA generation, one per batch size in `x`
# (seconds, per the axis label used when plotting).
y1 = np.array([
    14.13,   # batch size 1
    16.05,   # batch size 2
    22.43,   # batch size 4
    35.18,   # batch size 8
    62.25,   # batch size 16
    114.11,  # batch size 32
    219.20,  # batch size 64
])
# Sequential baseline: the batch-size-1 time scaled linearly with batch size.
y2 = y1[0] * x

# Draw both timing curves on the current axes: measured batched-LoRA times as a
# solid blue line, the linear sequential baseline as a dashed red line.
ax = plt.gca()
ax.plot(x, y1, 'b-', x, y2, 'r--')

ax.set_xlabel('Batch Size')
ax.set_ylabel('Time (s)')

# Legend entries follow the order in which the two lines were added.
ax.legend(['Batched Lora', 'Sequential'], loc='upper left')
ax.set_title('Generating 200 tokens with Llama-7B using Batched Lora vs Sequential on A100-80gb')

In [None]:
# Least-squares line through the batched timings (y1) as a function of batch
# size (x). The slope is the fitted extra time per unit of batch size and the
# intercept is the fitted time at batch size zero.
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y1)
print(slope, intercept, sep="\n")

# NOTE: `stats` used above is `scipy.stats`, imported in the first cell.