In [None]:
from transformers import AutoConfig
from peft import LoraConfig
from trl import AutoModelForCausalLMWithValueHead
from transformers import AutoModelForCausalLM
import torch

# Hugging Face Hub id of the base checkpoint; reused further down to load the tokenizer.
model_id = "mistralai/Mistral-7B-v0.1"

# Load the causal-LM weights in bfloat16 and let `device_map="auto"` decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


In [None]:
from peft.tuners.tuners_utils import replicate_layers

def compress(target_model, base_model_num_layers = 16):
    """Rebuild ``target_model``'s layer stack from its first ``base_model_num_layers`` layers.

    The layer map handed to ``replicate_layers`` consists of one ``[0, N]`` range
    followed by ``N`` copies of the single-layer range ``[N - 1, N]`` (with
    ``N = base_model_num_layers``). Under PEFT's layer-replication convention
    (half-open ``[start, end)`` ranges stacked in order) this keeps layers
    ``0 .. N-1`` and then repeats layer ``N-1`` another ``N`` times, for ``2 * N``
    entries in total.

    NOTE(review): PEFT documents replicated layers as sharing the underlying
    weights; confirm this is acceptable before merging adapters into the result.

    Args:
        target_model: Model whose layers are rewired. ``replicate_layers`` is
            called for its side effect on this object (its return value is unused).
        base_model_num_layers: Number of leading layers to keep; must be >= 1 and,
            when ``target_model.config.num_hidden_layers`` is available, no larger
            than that value.

    Returns:
        The same ``target_model`` object that was passed in.

    Raises:
        ValueError: If ``base_model_num_layers`` is not positive or exceeds the
            layer count reported by the model config.
    """
    if base_model_num_layers < 1:
        # A non-positive count would silently produce an empty layer map.
        raise ValueError(
            f"base_model_num_layers must be >= 1, got {base_model_num_layers}"
        )

    # Fail early with a readable message instead of an index error deep inside PEFT.
    config = getattr(target_model, "config", None)
    available_layers = getattr(config, "num_hidden_layers", None)
    if available_layers is not None and base_model_num_layers > available_layers:
        raise ValueError(
            f"base_model_num_layers={base_model_num_layers} exceeds the "
            f"{available_layers} layers reported by the model config"
        )

    last_kept = base_model_num_layers - 1
    layer_map = [[0, base_model_num_layers]]
    # Build a fresh pair per repeat; `[[a, b]] * n` would alias one list n times.
    layer_map.extend(
        [last_kept, base_model_num_layers] for _ in range(base_model_num_layers)
    )

    replicate_layers(model=target_model, layer_map=layer_map)

    return target_model

In [None]:
# compress() returns the object it was given, so `compressed` and `model` name the same module.
compressed = compress(target_model = model)

In [None]:
import torch

# Ask PyTorch's CUDA caching allocator to release memory it holds but is not currently using.
torch.cuda.empty_cache()

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32,
# 5% dropout on the adapter path, and no bias parameters trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
from transformers import TrainingArguments, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer

# Tokenizer for the same checkpoint as the model; prepare_dialogue reads its eos_token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Raw dataset from the Hub; its "train" split is re-split and reformatted below.
dataset = load_dataset("mhenrichsen/alpaca_2k_test")

def prepare_dialogue(example, eos_token=None):
    """Render one dataset row into the ``<|user|>`` / ``<|assistant|>`` training format.

    ``example["text"]`` may be either:

    * a sequence of messages, alternating user / assistant starting with the
      user (the original behaviour), or
    * a plain string. Iterating a string would treat every *character* as a
      separate turn, so in that case the dialogue is rebuilt from the row's
      ``instruction`` / ``input`` / ``output`` fields when they are present;
      otherwise the string is treated as a single user message.

    Args:
        example: Mapping for one row. Its ``"text"`` entry is overwritten in place.
        eos_token: Token appended after every message. Defaults to the
            notebook-level ``tokenizer.eos_token``.

    Returns:
        The same ``example`` mapping with ``"text"`` replaced by the rendered dialogue.
    """
    if eos_token is None:
        # Fall back to the notebook-level tokenizer, as the original implementation did.
        eos_token = tokenizer.eos_token

    raw = example["text"]
    if isinstance(raw, str):
        if "instruction" in example and "output" in example:
            prompt = example["instruction"]
            extra_input = example.get("input")
            if extra_input:
                # Optional context goes on its own line under the instruction.
                prompt = f"{prompt}\n{extra_input}"
            messages = [prompt, example["output"]]
        else:
            messages = [raw]
    else:
        messages = raw

    parts = []
    for idx, msg in enumerate(messages):
        # Even positions are user turns, odd positions are assistant turns.
        role = "<|user|>" if idx % 2 == 0 else "<|assistant|>"
        parts.append(f"{role}\n{msg}{eos_token}\n")

    example["text"] = "".join(parts)
    return example

# Hold out 10% of the original "train" split for evaluation. The fixed seed keeps the
# split identical across kernel restarts so eval numbers are comparable between runs.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
# Rewrite "text" with prepare_dialogue, then drop the raw "input" / "output" columns.
dataset = dataset.map(prepare_dialogue, num_proc=4, remove_columns=["input", "output"])

In [None]:
# Short run: 150 training steps at batch size 1, logging every 30 steps
# and evaluating every 100 steps. Outputs are written under models/mini.
args = TrainingArguments(
    output_dir="models/mini",
    max_steps=150,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_steps=30,
    evaluation_strategy="steps",
    eval_steps=100,
)

# Supervised fine-tuning on the "text" column; lora_config is passed as peft_config
# so the trainer works with LoRA adapters on top of the layer-replicated model.
trainer = SFTTrainer(
    model=compressed,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    peft_config=lora_config,
)
trainer.train()

In [None]:
# Echo the model object so the notebook shows its repr (useful for checking the layer stack).
compressed

In [None]:
# Fold the trained adapters into the underlying model and keep the result as `merged`.
# NOTE(review): the layer stack was built with peft's replicate_layers (see compress()).
# PEFT refuses to merge when replication is configured via LoraConfig.layer_replication,
# presumably because replicated layers share weights -- verify that `merged` reproduces
# the outputs of the un-merged trainer.model before relying on it.
merged = trainer.model.merge_and_unload()

In [None]:
from transformers import pipeline

# Text-generation pipeline over the merged model, using the notebook's tokenizer.
pipe = pipeline(task="text-generation", model=merged, tokenizer=tokenizer)

In [None]:
# Prompt template using the same <|user|> / <|assistant|> role tags that
# prepare_dialogue writes into the training text (the assistant tag previously
# lacked its closing "|" and therefore never matched the trained format).
text = """<|user|>
You are a helpful assistant. Please answer the question below in a truthful way: {question}
<|assistant|>
"""

# top_p / top_k / temperature only take effect when sampling is enabled, so
# do_sample=True is passed explicitly; without it generation stays greedy.
pipe(
    text.format(question="What's ML?"),
    do_sample=True,
    top_p=0.95,
    top_k=20,
    temperature=0.1,
    max_new_tokens=100,
)