In [None]:
# https://www.datacamp.com/tutorial/phi-3-tutorial
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import ModelCard, ModelCardData, HfApi
from datasets import load_dataset
from jinja2 import Template
from trl import SFTTrainer, SFTConfig
import yaml
import torch


# Step 2: Import required libraries and set configuration
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-samantha-phi-3-mini-4k"
DATASET_NAME = "macadeliccc/opus_samantha"
SPLIT = "train"
MAX_SEQ_LENGTH = 2048
num_train_epochs = 1
license = "apache-2.0"
learning_rate = 1.41e-5
per_device_train_batch_size = 1
gradient_accumulation_steps = 1

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16


# Step 3: Load the model, tokenizer, and dataset
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split="train")

# Step 4: Preprocess the dataset
EOS_TOKEN=tokenizer.eos_token_id

# Select a subset of the data for faster processing
# dataset = dataset.select(range(100))

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = []
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    end_mapper = {"system": "", "human": "", "gpt": ""}
    for convo in convos:
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset['text'][0])

def change_role_name(examples):
    convos = examples["conversations"]
    for convo in convos:
        for x in convo:
            if x["from"] == "gpt":
                x["from"] = "assistant"
            elif x["from"] == "human":
                x["from"] = "user"
    return {"conversations": convos}
dataset = dataset.map(change_role_name, batched = True)
print(dataset)

# https://www.youtube.com/watch?v=PDYHtiScHto
args = SFTConfig(
    per_device_train_batch_size = 2, # per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    gradient_checkpointing = True,
    learning_rate = 2e-5,
    lr_scheduler_type = "cosine",
    max_steps = -1,
    num_train_epochs = num_train_epochs,
    save_strategy = "no",
    logging_steps = 1,
    output_dir = NEW_MODEL_NAME,
    optim = "paged_adamw_32bit",
    bf16 = True,
    dataset_text_field = "text", # For tokenization (sft_trainer.py line 456) if not set, default is 'text' 
    max_seq_length = MAX_SEQ_LENGTH,
)

trainer = SFTTrainer(
    model = model,
    args = args,
    train_dataset = dataset,
    # formatting_func = formatting_prompts_func, # Don't need to format again (sft_trainer.py line 413)
)

trainer.train()

model.to(torch.bfloat16)  # Convert to bfloat16
model.save_pretrained(NEW_MODEL_NAME, 
                        # save_function=torch.save,  # Use standard PyTorch save
                        # state_dict=model.state_dict(),  # Only save the model weights
                        # safe_serialization=True,  # More efficient serializationsave_optimizer_state=False
                    )
tokenizer.save_pretrained(NEW_MODEL_NAME,
                        # legacy_format=False  # Use newer, more efficient format
                    )

# sft_trainer.py line 445, after dataset.map, dataset becomes Dataset({'text':['<|endoftext|>', '<|endoftext|>', ...]})
# sft_trainer.py line 460, processed only {'input_ids': [32000], 'attention_mask': [1]}


# # Step 5: Set training arguments
# args = SFTConfig(
# evaluation_strategy="steps",
# per_device_train_batch_size=7,
# gradient_accumulation_steps=4,
# gradient_checkpointing=True,
# learning_rate=1e-4,
# fp16 = compute_dtype != torch.bfloat16, #not torch.cuda.is_bf16_supported(),
# bf16 = compute_dtype == torch.bfloat16, #torch.cuda.is_bf16_supported(),
# max_steps=-1,
# num_train_epochs=3,
# save_strategy="epoch",
# logging_steps=10,
# output_dir=NEW_MODEL_NAME,
# optim="paged_adamw_32bit",
# lr_scheduler_type="linear",
# # These two was in SFTTrainer, maybe the tutorial place them to the wrong place
# dataset_text_field="text",
# max_seq_length=128)

# # Step 6: Fine-tune the model
# trainer = SFTTrainer(
# model=model,
# args=args,
# train_dataset=dataset,
# # formatting_func=formatting_prompts_func
# )
# trainer.train()

# Step 7: Fine-tuning result
# TrainOutput(global_step=9, training_loss=0.7428549660576714, metrics={'train_runtime': 570.4105, 'train_samples_per_second': 0.526, 'train_steps_per_second': 0.016, 'total_flos': 691863632216064.0, 'train_loss': 0.7428549660576714, 'epoch': 2.4})

In [None]:
# Check fine-tuned model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# model_name = "microsoft/Phi-3-mini-4k-instruct" 
model_name = "./opus-samantha-phi-3-mini-4k" 

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = False)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code = False, torch_dtype=torch.bfloat16)

print(model.config.torch_dtype)
print(model.dtype)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
content = "Yeah, so I've been offered a promotion, but it would involve relocating to another city. I'm not sure if I should take it or not, because it's a great opportunity career-wise, but I'd have to leave my friends and family behind. What do you think?"
temperature = 0.000001
messages = [{"role": "user", "content": f"{content}"}]
# add_generation_prompt indicates the start of a response
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt = True, return_tensors = "pt").to(device)
# print("inputs:", tokenizer.apply_chat_template(messages, add_generation_prompt = True, tokenize = False))
outputs = model.generate(inputs, max_new_tokens = 1024, do_sample = temperature > 0.00001, temperature = temperature)
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [1]:
# Check two models are the same

import torch

# Load your two models
from transformers import AutoModelForCausalLM
model_path1 = "../../projects/sciences/computing/sheju347/MedicalQA/train/fine_tuned_model_no_mask_entire/checkpoint-307197"
model_path2 = "../../projects/sciences/computing/sheju347/MedicalQA/train/fine_tuned_model_no_mask_entire"
model1 = AutoModelForCausalLM.from_pretrained(model_path1)
model2 = AutoModelForCausalLM.from_pretrained(model_path2)

# Extract the final layer weights (lm_head is usually the output layer)
weight1 = model1.lm_head.weight
weight2 = model2.lm_head.weight

# ✅ Compare: are they exactly the same?
are_equal = torch.equal(weight1, weight2)
print("Weights are exactly equal:", are_equal)

# Or: get the L2 norm of the difference
diff_norm = torch.norm(weight1 - weight2).item()
print("L2 norm of difference:", diff_norm)

# Optional: relative difference
relative_diff = diff_norm / torch.norm(weight1).item()
print("Relative difference:", relative_diff)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.30s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.75s/it]


Weights are exactly equal: True
L2 norm of difference: 0.0
Relative difference: 0.0


In [2]:
from datasets import load_dataset

def check_train_contains_test():
    test_dataset = load_dataset("GBaker/MedQA-USMLE-4-options")["test"]
    train_dataset = load_dataset("TsinghuaC3I/UltraMedical")["train"]

    # If the first n characters match, consider they're the same question
    prefix_count = 100

    train_question_prefixes = set(train["conversations"][0]["value"][:prefix_count] for train in train_dataset)

    similar_count = 0

    progress = 0
    total_progress = len(test_dataset)
    for test in test_dataset:
        test_question_prefix = test["question"][:prefix_count]
        if test_question_prefix in train_question_prefixes:
            print(f"test:\n{test['question']}")
            print(test["options"])
            print("\n")
            for train in train_dataset:
                if train["conversations"][0]["value"][:prefix_count] == test_question_prefix:
                    print(f"train:\n{train['conversations'][0]['value']}")
                    similar_count += 1
                    # break
            # return True
        
        progress += 1
        print(f"{progress}/{total_progress} {progress/total_progress * 100}%")

    # return False

    print("similar count: ", similar_count)

print("train contains test:", check_train_contains_test())

1/1273 0.07855459544383347%
2/1273 0.15710919088766695%
3/1273 0.2356637863315004%
4/1273 0.3142183817753339%
5/1273 0.3927729772191673%
6/1273 0.4713275726630008%
7/1273 0.5498821681068342%
8/1273 0.6284367635506678%
9/1273 0.7069913589945012%
10/1273 0.7855459544383346%
11/1273 0.864100549882168%
12/1273 0.9426551453260016%
13/1273 1.021209740769835%
14/1273 1.0997643362136684%
15/1273 1.178318931657502%
16/1273 1.2568735271013356%
17/1273 1.335428122545169%
18/1273 1.4139827179890023%
19/1273 1.4925373134328357%
20/1273 1.5710919088766693%
21/1273 1.6496465043205029%
22/1273 1.728201099764336%
23/1273 1.8067556952081696%
24/1273 1.8853102906520032%
25/1273 1.9638648860958365%
26/1273 2.04241948153967%
27/1273 2.1209740769835035%
28/1273 2.199528672427337%
29/1273 2.2780832678711707%
30/1273 2.356637863315004%
31/1273 2.4351924587588374%
32/1273 2.513747054202671%
33/1273 2.592301649646504%
34/1273 2.670856245090338%
35/1273 2.7494108405341713%
36/1273 2.8279654359780046%
37/1273 2.9