In [None]:
! nvidia-smi
! rm -r phi-2-sft

In [None]:
! pip install -U --quiet datasets evaluate torch transformers accelerate trl peft

In [None]:
#! pip install flash-attn --no-build-isolation

### **Dataset**

In [None]:
from datasets import load_dataset

# Load the "train" split of the dataset identified by its Hub repo id.
OpenHermes = load_dataset(
    "rasyosef/OpenHermes-SLM-384k",
    split="train",
)
# Bare expression: the notebook renders the dataset summary as the cell output.
OpenHermes

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id of the base checkpoint that gets fine-tuned in this notebook.
model_id = "microsoft/phi-2"

# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # request float16 weights
    device_map="cuda",  # place the model on the GPU
    # Optional: enable together with the (commented-out) flash-attn install cell.
    #attn_implementation="flash_attention_2",
  )

In [None]:
from trl import setup_chat_format

# Set up the chat format with TRL's default 'chatml' format.
# Both `model` and `tokenizer` are rebound to the objects returned by the call.
# NOTE(review): because the inputs are overwritten, re-running this cell applies
# the setup to an already set-up model/tokenizer - reload them first if unsure.
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# Render a tiny two-turn conversation as text (tokenize=False) to inspect
# what the chat template produces.
tokenizer.apply_chat_template(
    [
        {"role": "user", "content": "Hello, there!"},
        {"role": "assistant", "content": "Hi!"},
    ],
    tokenize=False,
)

In [None]:
# Print the model's module tree; presumably used to look up the layer names
# that the LoRA `target_modules` list refers to later on.
print(model)

### **Filter Dataset**

In [None]:
# Upper bound (exclusive) on the per-row `phi_token_count` value.
MAX_LENGTH = 448

# Keep only the rows whose `phi_token_count` is strictly below the bound.
OpenHermes_filtered = OpenHermes.filter(
    lambda example: example["phi_token_count"] < MAX_LENGTH
)
OpenHermes_filtered

In [None]:
from collections import Counter

# Histogram of the `length` column after filtering
# (presumably the number of messages per conversation - confirm against the dataset card).
num_messages = OpenHermes_filtered["length"]
length_histogram = Counter(num_messages)
print(dict(length_histogram))

In [None]:
# Number of filtered rows to keep for this fine-tuning run.
NUM_SAMPLES = 128_000

# Never request more rows than survived the length filter: `select` raises an
# IndexError when the requested range runs past the end of the dataset.
num_selected = min(NUM_SAMPLES, len(OpenHermes_filtered))
if num_selected < NUM_SAMPLES:
    print(f"Only {num_selected} rows available; requested {NUM_SAMPLES}.")

# Shuffle with a fixed seed, keep the first `num_selected` rows, then hold out
# 1% of them as the "test" split (also seeded, so the split is repeatable).
OpenHermes_Final = (
    OpenHermes_filtered
    .shuffle(seed=42)
    .select(range(num_selected))
    .train_test_split(test_size=0.01, seed=42)
)
OpenHermes_Final

In [None]:
# Same `length` histogram as before, now restricted to the final train split.
# Relies on `Counter` having been imported by an earlier cell.
num_messages = OpenHermes_Final["train"]["length"]
Counter(num_messages)

In [None]:
# Preview the first five templated conversations of the train split.
# Slice the rows first and then pick the column, so only five rows are read
# instead of the whole `messages_templated` column on every iteration.
for templated_text in OpenHermes_Final["train"][:5]["messages_templated"]:
  print(templated_text)
  print("\n-------------------------------------------\n")

### **LoRA Adapter**

In [None]:
from peft import LoraConfig, get_peft_model, cast_mixed_precision_params

# Module names that receive a LoRA adapter (the author's "all linear layers" list).
lora_target_modules = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "lm_head"]

# Adapter hyper-parameters: rank 16, alpha 16, 5% dropout, no bias terms.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters; `model` is rebound to the PEFT wrapper.
model = get_peft_model(model, peft_config)
# Apply PEFT's mixed-precision cast helper with float16
# (see the PEFT docs for exactly which parameters it touches).
cast_mixed_precision_params(model, dtype=torch.float16)
# Report how many parameters will be trained.
model.print_trainable_parameters()

In [None]:
## To continue training from a previously saved adapter, uncomment the lines below.

# from peft import PeftModel, cast_mixed_precision_params

# peft_model_id = "rasyosef/phi-2-sft-openhermes-128k-v2"
# model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)
# cast_mixed_precision_params(model, dtype=torch.float16)
# model.print_trainable_parameters()

### **SFT with TRL**

In [None]:
from trl import SFTConfig, SFTTrainer

# `userdata` exists only on Google Colab and is needed solely by the optional
# `hub_token` line below. Fall back to None so this cell also runs elsewhere.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

# Sequence length passed to the trainer.
max_seq_length = 768

batch_size = 4 # On a T4 or P100 GPU, batch_size should be set to 1 to avoid CUDA out-of-memory errors
gradient_accum_steps = 4
epochs = 2

# Also used as the local output directory for checkpoints.
new_model_id = "phi-2-sft"

# Evaluate and log every `eval_steps` steps; checkpoint every second evaluation.
eval_steps = 200
save_steps = eval_steps * 2
logging_steps = eval_steps

print("Eval Steps:", eval_steps)
print("Save Steps:", save_steps)

sft_config = SFTConfig(
    dataset_text_field="messages_templated",  # column holding the already-templated text
    max_seq_length=max_seq_length,
    output_dir=new_model_id,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accum_steps,
    num_train_epochs=epochs,
    learning_rate=4e-5,
    warmup_steps=400,
    lr_scheduler_type="linear", # could also use a cosine scheduler
    fp16=True,
    packing=True,
    logging_strategy="steps",
    logging_steps=logging_steps,
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    neftune_noise_alpha=5, # NEFTune
    seed=42,
    # push_to_hub=True, # Uncomment this line to push model to huggingface hub
    # hub_token=userdata.get("HF_TOKEN"), # Uncomment to pass your Hugging Face write token (Colab secrets only; `userdata` is None elsewhere). This must be set if push_to_hub=True

    # gradient_checkpointing=True,
    # gradient_checkpointing_kwargs={'use_reentrant':False}
  )

In [None]:
# Bundle the PEFT-wrapped model, the SFT configuration, both dataset splits and
# the tokenizer into one trainer object.
# NOTE(review): the `tokenizer=` keyword may be named differently in other TRL
# releases - confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    tokenizer=tokenizer,
    train_dataset=OpenHermes_Final["train"],
    eval_dataset=OpenHermes_Final["test"],
)

In [None]:
import math

# Baseline: evaluate on the held-out split before `trainer.train()` is called,
# and report perplexity as exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
trainer.train()

In [None]:
import math

# Re-evaluate on the held-out split after training and report perplexity as
# exp(eval loss), for comparison with the pre-training figure.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
def chat(messages, max_new_tokens=128):
    """Generate a reply to a conversation and print the decoded sequence.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        messages: List of {"role": ..., "content": ...} dicts, in the form
            accepted by `tokenizer.apply_chat_template`.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 128, the value previously hard-coded).

    Returns:
        None. The full decoded sequence (prompt plus generated tokens,
        special tokens included) is printed.
    """
    # return_dict=True yields input_ids together with an attention mask, so
    # generate() does not have to guess which positions are padding.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    print(tokenizer.decode(outputs[0]))

# Single-turn factual prompt.
messages = [
    {"role": "user", "content": "Who is Leonhard Euler?"},
]
chat(messages)

In [None]:
# Single-turn explanatory prompt.
messages = [
    {"role": "user", "content": "What is quantum computing?"},
]
chat(messages)

In [None]:
# Single-turn creative prompt.
messages = [
    {"role": "user", "content": "Do you have any jokes about hats?"},
]
chat(messages)

In [None]:
# Upload the trained model via the trainer.
# NOTE(review): `hub_token` is commented out in the SFT config, so this
# presumably relies on a Hugging Face login already present in the runtime - confirm.
trainer.push_to_hub()