In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

In [3]:
device = 'cuda:0'
checkpoint = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,
                                             device_map=device,
                                             attn_implementation='flash_attention_2',
                                             cache_dir='/mnt/esperanto/et/huggingface/hub',
                                             )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def cast_lora_fp32(model):
    for n,p in model.named_parameters():
        if 'lora' in n and p.requires_grad:
            p.data = p.data.to(torch.float32)
    
    return model

In [6]:
lora_config = LoraConfig(
        r=4,
        lora_alpha=16,
        lora_dropout=.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules = ["o_proj", "qkv_proj", "gate_proj", "down_proj"]
)

model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
model = cast_lora_fp32(model)
model.print_trainable_parameters()

trainable params: 3,801,088 || all params: 3,824,880,640 || trainable%: 0.0994


In [7]:
def prepare_data(sample, tokenizer):
    system_prompt = "You are an AI model designed to help visually impaired people. Your task is to provide a comprehensive description of the image, locating important objects to guide disabled people through the scene."

    boxes = [row for row in sample['segments_postprocessed'][0]['boxes']]
    boxes = [str([round(x,2) for x in row]) for row in boxes]
    labels = sample['segments_postprocessed'][0]['labels']
    box_prompt = '\n'.join(sorted([a + ' ' + b for a,b in zip(labels, boxes)]))

    title = sample['title']
    question_prompt = f"Below is a description of a {title} scene, along with a list of objects present in the scene along with their coordinates following the format 'object [x_min, y_min, x_max, y_max]'. Provide a descriptive paragraph using a human-like description, do not mention coordinates. Only use the position information and infer from it, do not add any comment or guess. Remain factual and avoid unnecessary embellishments, keep it simple."

    descriptive_text = sample['generated_descriptive_text']

    sample_prompt = f"""<|system|>
{system_prompt}<|end|>
<|user|>
{question_prompt}
{box_prompt}<|end|>
<|assistant|>
{descriptive_text}<|end|>"""

    inputs = tokenizer(sample_prompt)
    
    sample['full_sample'] = sample_prompt
    sample['input_ids'] = inputs.input_ids
    sample['attention_mask'] = inputs.attention_mask
    sample['labels'] = inputs.input_ids.copy()

    return sample

In [8]:
data_files = {'train': "../data/train2014_descriptive_texts.json", 'val': "../data/val2014_descriptive_texts.json"}

datasets = load_dataset("json", data_files=data_files)
datasets = datasets.map(lambda x: prepare_data(x, tokenizer))

train_data, val_data = datasets['train'], datasets['val'].select(list(range(1_000)))

In [10]:
args_output_dir = "../models/phi3-mini-VD-v2"
args_max_steps = 20_000
args_eval_freq_default = 1_000
args_log_freq_default = 1_000
args_save_freq_default = 1_000
args_batch_size = 1
args_learning_rate = 8e-5
args_lr_scheduler_type="cosine"
args_num_warmup_steps = 200
args_gradient_accumulation_steps_default = 1
args_weight_decay = 0.05

training_args = TrainingArguments(
        output_dir=args_output_dir,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        dataloader_drop_last=True,
        max_steps=args_max_steps,
        eval_steps=args_eval_freq_default,
        save_steps=args_save_freq_default,
        logging_steps=args_log_freq_default,
        per_device_train_batch_size=args_batch_size,
        per_device_eval_batch_size=args_batch_size,
        learning_rate=args_learning_rate,
        lr_scheduler_type=args_lr_scheduler_type,
        warmup_steps=args_num_warmup_steps,
        gradient_accumulation_steps=args_gradient_accumulation_steps_default,
        fp16=True,
        weight_decay=args_weight_decay,
        run_name="phi3-mini-VD-v2",
        report_to='wandb',
        #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    )

max_steps is given, it will override any value given in num_train_epochs


In [11]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msade-adrien[0m ([33mesperanto[0m). Use [1m`wandb login --relogin`[0m to force relogin


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss,Validation Loss
1000,0.6677,0.586104
2000,0.5801,0.569576
3000,0.567,0.558359
4000,0.5568,0.551056
5000,0.5544,0.546547
6000,0.5416,0.543065
7000,0.5415,0.537967
8000,0.536,0.53494
9000,0.5377,0.532368
10000,0.5337,0.529203


TrainOutput(global_step=20000, training_loss=0.5386242462158203, metrics={'train_runtime': 4004.749, 'train_samples_per_second': 4.994, 'train_steps_per_second': 4.994, 'total_flos': 2.9345980575246336e+17, 'train_loss': 0.5386242462158203, 'epoch': 1.64866870002473})