In [1]:
# https://www.datacamp.com/tutorial/phi-3-tutorial
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import ModelCard, ModelCardData, HfApi
from datasets import load_dataset
from jinja2 import Template
from trl import SFTTrainer, SFTConfig
import yaml
import torch


# Step 2: Import required libraries and set configuration
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-samantha-phi-3-mini-4k"
DATASET_NAME = "macadeliccc/opus_samantha"# "TsinghuaC3I/UltraMedical" #
SPLIT = "train"
MAX_SEQ_LENGTH = 2048
num_train_epochs = 1
license = "apache-2.0"
learning_rate = 1.41e-5
per_device_train_batch_size = 1
gradient_accumulation_steps = 1

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16


# Step 3: Load the model, tokenizer, and dataset
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.22s/it]


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

In [2]:

dataset = load_dataset(DATASET_NAME, split="train")

# Step 4: Preprocess the dataset
EOS_TOKEN=tokenizer.eos_token_id

# Select a subset of the data for faster processing
dataset = dataset.select(range(2))

In [None]:

# def formatting_prompts_func(examples):
#     convos = examples["conversations"]
#     texts = []
#     mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
#     end_mapper = {"system": "", "human": "", "gpt": ""}
#     for convo in convos:
#         text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
#         texts.append(f"{text}{EOS_TOKEN}")
#     return {"text": texts}

# dataset = dataset.map(formatting_prompts_func, batched=True)
# print(dataset['text'][0])

# def change_role_name(examples):
#     convos = examples["conversations"]
#     for convo in convos:
#         for x in convo:
#             if x["from"] == "gpt":
#                 x["from"] = "assistant"
#             elif x["from"] == "human":
#                 x["from"] = "user"
#     return {"conversations": convos}
# dataset = dataset.map(change_role_name, batched = True)
# print(dataset)
# print(dataset['conversations'][0])

In [3]:
def convert_to_chat_format(examples):
    convos = examples["conversations"]
    new_conversations = []
    for conversation in convos:
        new_conversation = []
        for x in conversation:
            if "from" in x:
                if x["from"] == "human":
                    value = x["value"]
                    new_conversation.append({"role": "user", "content": value})
                elif x["from"] == "gpt":
                    value = x["value"]
                    new_conversation.append({"role": "assistant", "content": value})
            else:
                new_conversation.append(x)
        new_conversations.append(new_conversation)
    return {"conversations": new_conversations}
dataset = dataset.map(convert_to_chat_format, batched=True)
print(dataset["conversations"][0])

[{'content': "Hey Samantha, I've run into a bit of a tricky situation at work, and I'm not sure how to handle it. Do you have any advice?", 'role': 'user'}, {'content': "I'd be happy to help if I can. Can you give me some details about the situation you're facing?", 'role': 'assistant'}, {'content': "Yeah, so I've been offered a promotion, but it would involve relocating to another city. I'm not sure if I should take it or not, because it's a great opportunity career-wise, but I'd have to leave my friends and family behind. What do you think?", 'role': 'user'}, {'content': "That's definitely a tough decision. I think it's important to weigh the benefits of the promotion against the impact it could have on your personal relationships. Have you considered talking to your friends and family about it, to get their perspective and understand how they'd feel about you moving?", 'role': 'assistant'}, {'content': "I have, and they support me either way. But it's still hard. What about you, Sam

In [4]:
messages = tokenizer.apply_chat_template(dataset["conversations"], tokenize = False, add_generation_prompt = False)
print(len(messages), messages[0])


2 <|user|>
Hey Samantha, I've run into a bit of a tricky situation at work, and I'm not sure how to handle it. Do you have any advice?<|end|>
<|assistant|>
I'd be happy to help if I can. Can you give me some details about the situation you're facing?<|end|>
<|user|>
Yeah, so I've been offered a promotion, but it would involve relocating to another city. I'm not sure if I should take it or not, because it's a great opportunity career-wise, but I'd have to leave my friends and family behind. What do you think?<|end|>
<|assistant|>
That's definitely a tough decision. I think it's important to weigh the benefits of the promotion against the impact it could have on your personal relationships. Have you considered talking to your friends and family about it, to get their perspective and understand how they'd feel about you moving?<|end|>
<|user|>
I have, and they support me either way. But it's still hard. What about you, Samantha, have you ever faced such a dilemma?<|end|>
<|assistant|>
Whi

In [5]:
# tokenized_messages = tokenizer(text = messages, padding=False, truncation=True, max_length=MAX_SEQ_LENGTH)
tokenized_messages = tokenizer(text = messages, padding= 'max_length',#'max_length', #'longest'
                               truncation=True, max_length=1024, return_tensors="pt")
print(type(tokenized_messages))
print(tokenized_messages['input_ids'][0])
print(tokenized_messages['attention_mask'][0])

<class 'transformers.tokenization_utils_base.BatchEncoding'>
tensor([32000, 32000, 32000,  ..., 29889, 32007, 32000])
tensor([0, 0, 0,  ..., 1, 1, 1])


In [9]:
# device = model.device  # Usually cuda:0
inputs = {k: v.to(device) for k, v in tokenized_messages.items()}
inputs['labels'] = inputs["input_ids"].clone()
print (inputs)
print (inputs['input_ids'].shape)
print (inputs['attention_mask'].shape)
print (len(inputs['input_ids']))
outputs = model(**inputs)
print("------")
print(isinstance(outputs, dict))
print(outputs["loss"])
print(outputs['logits'])
print(outputs['logits'].shape)


{'input_ids': tensor([[32000, 32000, 32000,  ..., 29889, 32007, 32000],
        [32000, 32000, 32000,  ...,   154, 32007, 32000]], device='cuda:0'), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]], device='cuda:0'), 'labels': tensor([[32000, 32000, 32000,  ..., 29889, 32007, 32000],
        [32000, 32000, 32000,  ...,   154, 32007, 32000]], device='cuda:0')}
torch.Size([2, 1024])
torch.Size([2, 1024])
2
------
True
tensor(5.9019, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[ 13.4624,  13.9026,  12.0786,  ...,  10.9337,  10.9423,  10.9405],
         [ 13.4624,  13.9026,  12.0786,  ...,  10.9337,  10.9423,  10.9405],
         [ 13.4624,  13.9026,  12.0786,  ...,  10.9337,  10.9423,  10.9405],
         ...,
         [ 21.3792,  21.6635,  26.4846,  ...,  12.8200,  12.8216,  12.8199],
         [-13.6355, -11.7049, -13.8294,  ..., -21.2869, -21.2882, -21.2989],
         [ 21.6568,  21.5011,  21.1606,  ...,  16.3905,  16.3939,  16.3957]],

     

In [None]:

# https://www.youtube.com/watch?v=PDYHtiScHto
args = SFTConfig(
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    gradient_checkpointing = True,
    learning_rate = 2e-5,
    lr_scheduler_type = "cosine",
    max_steps = -1,
    num_train_epochs = num_train_epochs,
    save_strategy = "no",
    logging_steps = 1,
    output_dir = NEW_MODEL_NAME,
    optim = "paged_adamw_32bit",
    bf16 = True,
    dataset_text_field = "text", # For tokenization (sft_trainer.py line 456) if not set, default is 'text' 
    max_seq_length = MAX_SEQ_LENGTH,
)

trainer = SFTTrainer(
    model = model,
    args = args,
    train_dataset = dataset,
    # formatting_func = formatting_prompts_func, # Don't need to format again (sft_trainer.py line 413)
)


In [None]:

trainer.train()