In [2]:
%load_ext autoreload
%autoreload 2

import torch
import transformers

from peft.tuners.lora import Linear
import torch.nn.functional as F
from peft.utils.other import transpose
from peft import PeftModel

import numpy as np

from blora_utils import forward, StreamingPeftModel
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output


# Monkey-patch peft's LoRA `Linear` layer class so that every instance uses the
# `forward` imported from blora_utils instead of the stock implementation.
# NOTE(review): presumably the replacement reads the per-module `batch_lora_ids`
# attribute set later in this notebook - confirm in blora_utils.
Linear.forward = forward
# Make newly created tensors default to CUDA float32.
# (A later cell switches the default again to CUDA half precision before loading the model.)
torch.set_default_tensor_type(torch.cuda.FloatTensor)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# LoRA adapters to attach to the single shared base model
# (these look like Hugging Face Hub repo ids - confirm).
loras = ["jondurbin/airoboros-7b-gpt4-1.2-peft", "trl-lib/llama-7b-se-rl-peft", "winddude/wizardLM-LlaMA-LoRA-7B"]
# NOTE(review): machine-specific absolute path; adjust to the local LLaMA-7B checkpoint.
model_path = "/home/ubuntu/llama-weights/7B/llama-7b"

# Switch the default tensor type to CUDA half precision before the weights are created.
torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = transformers.LlamaForCausalLM.from_pretrained(model_path)
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
# `pad_token` is a string-valued attribute, so assigning the integer 0 to it does not
# select token id 0 (and newer transformers releases reject non-string values).
# Set the id directly so that padding uses vocabulary entry 0.
# NOTE(review): batched generation with decoder-only models usually wants
# tokenizer.padding_side = "left" - confirm what StreamingPeftModel.generate expects.
tokenizer.pad_token_id = 0

Loading checkpoint shards: 100%|██████████| 33/33 [00:10<00:00,  3.04it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [7]:
def load_loras(model, loras):
    """Attach every adapter in `loras` to `model` and return the wrapped model.

    Args:
        model: base model handed to `StreamingPeftModel.from_pretrained`.
        loras: indexable sequence of adapter identifiers (strings).

    Returns:
        A `(model, lora_map)` tuple, where `model` is the result of the last
        `StreamingPeftModel.from_pretrained` call and `lora_map` maps each
        original identifier to the adapter name it was registered under.
    """
    # peft throws error if lora name contains a dot, so dots become underscores
    adapter_names = [repo_id.replace(".", "_") for repo_id in loras]
    lora_map = dict(zip(loras, adapter_names))

    # The first adapter wraps the model that was passed in ...
    peft_model = StreamingPeftModel.from_pretrained(
        model, loras[0], adapter_name=adapter_names[0]
    )
    # ... every further adapter is loaded on top of the unwrapped inner model.
    for idx in range(1, len(loras)):
        peft_model = StreamingPeftModel.from_pretrained(
            peft_model.base_model.model, loras[idx], adapter_name=adapter_names[idx]
        )
    return peft_model, lora_map

# Wrap the base model with all adapters listed in `loras`; `lora_map` translates the
# original adapter ids into the names they were registered under.
# NOTE(review): this rebinds `model`, so re-running the cell passes the already
# wrapped model back into load_loras.
model, lora_map = load_loras(model, loras)

In [8]:
# Demo prompts; they are tokenized together as one padded batch in a later cell,
# and each one is paired with a randomly chosen adapter.
prompts = [
    "Outline a five sentence short story where a character stumbles upon a secret room in their house that contains relics from their future.",
    "Write a 6 line dialogue between a character and a magical creature that only they can see.",
    "Describe a four sentence scene where a character discovers a hidden talent that changes their life forever.",
    "Sculpt a three verse poem about the feeling of walking through a lush, vibrant garden in full bloom.",
    "Develop an eight sentence short story about a character who can bring their dreams into reality, but only for a limited time.",
    "Create a six sentence scene where a character finds themselves in a world where emotions are visible as colors surrounding each person.",
    "Design an nine line dialogue between a character and a sentient cloud that follows them everywhere they go.",
    "Narrate a 10 sentence story about a character who can switch between different realities, but can't control when or where they will end up.",
    "Draft a three verse poem about the feeling of encountering a breathtaking view from a mountaintop.",
    "Write a four sentence scene where a character discovers they can rewind time, but only in 10-second increments.",
    "Capture a five sentence short story about a character who can communicate with nature, seeking help from plants and animals to solve a mystery.",
    "Portray an eight line dialogue between a character and a ghost who is unaware of their own death."
    ]

In [9]:
# Pair every prompt with a randomly drawn adapter id (unseeded, so the assignment
# differs from run to run).
inputs = [(prompt, random.choice(loras)) for prompt in prompts]
# Per-prompt adapter names, in the same order as `prompts`.
inp_loras = [lora_map[lora_id] for _, lora_id in inputs]

# Tokenize all prompts as one padded batch of PyTorch tensors.
batch = tokenizer(prompts, padding=True, return_tensors="pt")

# Attach the per-row adapter assignment to every module of the wrapped model.
for module in model.modules():
    module.batch_lora_ids = inp_loras

In [48]:
# Time streamed batched generation with CUDA events; Event.elapsed_time() reports
# milliseconds, hence the division by 1000 when printing seconds below.
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 1
timings = np.zeros((repetitions, 1))

outputs = []

with torch.no_grad():
    for rep in range(repetitions):
        # Start each repetition with an empty history; otherwise tokens streamed in
        # earlier repetitions would be concatenated into this run's decoded text.
        outputs = []
        starter.record()
        # **batch forwards the tokenizer output as keyword arguments.
        for out in model.generate(
            **batch,
            max_length=200,
            stream_output=True
        ):
            outputs.append(out)
            # Turn every streamed step into a column and join them along dim 1.
            # NOTE(review): assumes each `out` carries one token id per prompt - confirm
            # against StreamingPeftModel.generate in blora_utils.
            step_columns = [step.reshape(-1, 1) for step in outputs]
            batch_decoded = tokenizer.batch_decode(torch.cat(step_columns, dim=1))
            # Redraw the whole display on every step for a live "streaming" effect.
            clear_output(wait=True)
            print("\n\n".join([lora + ":\n" + prompt + '\n' + decoded for lora, prompt, decoded in zip(inp_loras, prompts, batch_decoded)]))
        ender.record()
        # Wait for the GPU so that both events have completed before reading the timer.
        # Note that the timed span also covers the per-step decode and display above.
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
# Mean and standard deviation of the wall time per repetition, in seconds.
print(mean_syn / 1000, std_syn / 1000)

winddude/wizardLM-LlaMA-LoRA-7B:
Outline a five sentence short story where a character stumbles upon a secret room in their house that contains relics from their future.

The character is a young woman who is a time traveler. She is in her house and finds a secret room that contains relics from her future. She is shocked to see the relics and realizes that she has traveled back in time. She is confused and scared, but she knows she must find a way to return to her own time.
What is the main conflict in the story?
The main conflict in the story is the character's struggle to return to her own time.
What is the resolution of the story?
The resolution of the story is the character's successful return to her own time.
What is the theme of the story?
The theme of the story is the importance of time and the consequences of messing with it.
What

trl-lib/llama-7b-se-rl-peft:
Write a 6 line dialogue between a character and a magical creature that only they can see.

Write a 6 line dialogue bet