In [None]:
from transformers import AutoConfig
from peft import LoraConfig
from trl import AutoModelForCausalLMWithValueHead
from transformers import AutoModelForCausalLM
import torch

# Checkpoint identifier; also used later in the notebook to load the tokenizer.
model_id = "mistralai/Mistral-7B-v0.1"

# Loading options forwarded unchanged to `from_pretrained`.
model_kwargs = {
    "attn_implementation": "flash_attention_2",
    "torch_dtype": torch.bfloat16,
    "use_cache": False,
    "device_map": "auto",
}

model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

In [None]:
from peft.tuners.tuners_utils import replicate_layers

def compress(target_model, base_model_num_layers = 16):
    """Rebuild ``target_model``'s layer stack via ``replicate_layers``.

    The layer map handed to ``replicate_layers`` is the range
    ``[0, base_model_num_layers]`` followed by ``base_model_num_layers``
    copies of the range ``[base_model_num_layers - 1, base_model_num_layers]``.

    Args:
        target_model: Model passed to ``replicate_layers``.
        base_model_num_layers: Upper bound of the leading range, and also the
            number of times the trailing range is repeated.

    Returns:
        The same ``target_model`` object that was passed in.
    """
    # NOTE(review): presumably each entry is a half-open (start, end) range of
    # layer indices, making the trailing range a single layer -- confirm
    # against the peft documentation.
    leading_range = [0, base_model_num_layers]
    trailing_range = [base_model_num_layers - 1, base_model_num_layers]

    layer_map = [leading_range] + [trailing_range] * base_model_num_layers
    replicate_layers(model=target_model, layer_map=layer_map)

    return target_model

In [None]:
# `compress` returns the object it was given, so `compressed` and `model`
# refer to the same model object from here on.
compressed = compress(model)

In [None]:
import torch

# Ask PyTorch's CUDA caching allocator to release cached memory blocks that are currently unused.
torch.cuda.empty_cache()

In [None]:
from transformers import TrainingArguments, AutoTokenizer
from datasets import load_dataset, DatasetDict
from trl import SFTTrainer

from datasets import load_dataset

# Load the full dataset, then keep only a small slice of each SFT split.
raw_datasets = load_dataset("HuggingFaceH4/ultrachat_200k")

split_sizes = {"train": 1000, "test": 100}
dataset_dict = {
    split: raw_datasets[f"{split}_sft"].select(range(size))
    for split, size in split_sizes.items()
}

raw_datasets = DatasetDict(dataset_dict)


tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Cap the maximum length at 2048 when the tokenizer reports a value above 100_000.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

# Chat template: each message is rendered as a role marker, the content and
# the EOS token; an assistant marker is appended when a generation prompt is requested.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

In [None]:
import random

def apply_chat_template(example, tokenizer):
    """Render one conversation into a single string stored under ``"text"``.

    Args:
        example: Mapping with a ``"messages"`` entry holding a list of
            ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` mapping, with the rendered conversation added
        under the ``"text"`` key. The ``"messages"`` list itself is left
        untouched.
    """
    # Work on a shallow copy so the caller's message list is never modified in place.
    messages = list(example["messages"])
    # We add an empty system message if there is none. An empty conversation
    # has no first message to inspect, so it is handled the same way.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# Render every example to text and drop all of the original columns.
column_names = list(raw_datasets["train"].features)
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
    num_proc=8,
    desc="Applying chat template",
)

# create the splits
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

# Print three randomly chosen training examples as a visual sanity check.
sample_indices = random.sample(range(len(train_dataset)), 3)
for index in sample_indices:
    print(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}")

In [None]:
# Training configuration for a single-epoch supervised fine-tuning run in bf16.
args = TrainingArguments(
    bf16=True,
    num_train_epochs=1,
    output_dir="models/mini",
    do_eval=True,
    gradient_checkpointing=True,
    learning_rate=2.0e-05,
    # "adamw_hf" selects transformers' own AdamW implementation, which is
    # deprecated; use the PyTorch AdamW implementation instead.
    optim="adamw_torch",
    max_steps=-1,
    overwrite_output_dir=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # No intermediate checkpoints are written during training.
    save_strategy="no",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    log_level="info",
    evaluation_strategy="epoch",
    save_total_limit=None,
    seed=42,
    torch_compile=True,
)

# Train on the pre-rendered "text" column, using the splits created earlier in
# the notebook (`train_dataset` / `eval_dataset`).
trainer = SFTTrainer(
    args=args,
    model=compressed,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True,
    max_seq_length=tokenizer.model_max_length,
)

train_results = trainer.train()

In [None]:
# Add the size of the training split to the metrics returned by
# `trainer.train()`, then log them and persist them with the trainer state.
max_train_samples = len(train_dataset)
metrics = train_results.metrics
metrics.update(train_samples=max_train_samples)

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Write the trained model to the same directory that was configured as `output_dir`.
trainer.save_model(output_dir="models/mini")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Reload the tokenizer and model from the directory written by `trainer.save_model`.
# NOTE(review): the trained model's layer stack was built with `replicate_layers`;
# confirm that the saved checkpoint restores those layers as intended when reloaded here.
checkpoint_dir = "models/mini"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_dir,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

In [None]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]

# prepare the messages for the model
# add_generation_prompt=True appends the assistant marker defined by the chat
# template, so the model is cued to answer rather than continue the user turn.
# The inputs are placed on whatever device the model lives on.
input_ids = tokenizer.apply_chat_template(
    messages,
    truncation=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# The model was loaded onto the GPU in bfloat16, the same precision used for
# fine-tuning, so it is used as loaded instead of being cast to float16.

# inference
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    num_return_sequences=1,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])