## Installing dependencies

In [1]:
!pip install -U "transformers>=4.39.0"
!pip install peft bitsandbytes
!pip install -U "trl>=0.8.3"



## Importing Libraries

In [2]:
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig

2024-05-01 23:07:11.392243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 23:07:11.392301: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 23:07:11.393916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
torch.cuda.is_available()

True

## Load the model (4-bits quantized)

In [4]:
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)
model = LlavaForConditionalGeneration.from_pretrained(model_id,
                                                      quantization_config=quantization_config,
                                                      torch_dtype=torch.float16)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Create a chat template set tokenizer and processor

In [7]:
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Creating a DataCollator

In [9]:
from PIL import Image
class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            messages = example["messages"]
            text = self.processor.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
            images.append(Image.open(example["image"]))

        batch = self.processor(texts, images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch

data_collator = LLavaDataCollator(processor)

In [10]:
from datasets import Dataset
import json
with open("/kaggle/input/dataset-final/dataset_final.json", "r") as f:
    dataset_list = json.load(f)
data = Dataset.from_list(dataset_list)
data

Dataset({
    features: ['image', 'messages'],
    num_rows: 97
})

In [11]:
data[0]

{'image': '/kaggle/input/dataset/dataset/images/23ad93e0-3669-4524-84ab-c81d886e787d.jpg',
 'messages': [{'content': [{'index': None,
     'text': 'What is the main theme depicted in the paintings of Ajanta Caves?',
     'type': 'text'},
    {'index': 0, 'text': None, 'type': 'image'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'The main theme depicted in the paintings of Ajanta Caves is the life of Buddha and various Jataka tales.',
     'type': 'text'}],
   'role': 'assistant'}]}

## Set the training arguments

In [12]:
training_args = TrainingArguments(
    output_dir="llava-1.5-7b-hf-ft-mix-vsft",
    report_to="wandb",
    learning_rate=1.4e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=5,
    push_to_hub=True,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    bf16=False
)

## Set the LoRA config

In [13]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear"
)

In [14]:
!pip install huggingface_hub



In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Create the `SFTTrainer`object

In [16]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data,
    peft_config=lora_config,
    dataset_text_field="text",  # need a dummy field
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)



In [17]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mrshah240[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,2.9192
10,2.7549
15,2.6437
20,2.7362
25,2.5834
30,2.5653
35,2.6564
40,2.441
45,2.2069
50,2.2364




TrainOutput(global_step=245, training_loss=1.5468081941409988, metrics={'train_runtime': 2526.8706, 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.097, 'total_flos': 2118660203089920.0, 'train_loss': 1.5468081941409988, 'epoch': 5.0})

In [20]:
trainer.push_to_hub()



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rshah240/llava-1.5-7b-hf-ft-mix-vsft/commit/a7696094f0293e155b319788a080c90ad2029d80', commit_message='End of training', commit_description='', oid='a7696094f0293e155b319788a080c90ad2029d80', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)
model = LlavaForConditionalGeneration.from_pretrained(model_id,
                                                      quantization_config=quantization_config,
                                                      torch_dtype=torch.float16)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
from peft import PeftModel

In [24]:
peft_lora_adapter_path = "rshah240/llava-1.5-7b-hf-ft-mix-vsft"
peft_lora_adapter = PeftModel.from_pretrained(model, peft_lora_adapter_path, adapter_name="lora_adapter")

adapter_config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

In [26]:
model.load_adapter(peft_lora_adapter_path, adapter_name="lora_adapter")

In [27]:
prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
image = Image.open("/kaggle/input/dataset/dataset/images/0ad007ca-3098-44da-8603-d04253114a97.jpg")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
inputs = processor(text=prompt, images=image, return_tensors="pt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
generate_ids = model.generate(**inputs, max_new_tokens=50)
decoded_response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print("Generated response:", decoded_response)

Generated response: USER:  
What's the content of the image? ASSISTANT: The image features a large, ornate building with a clock tower, which appears to be a historical building. The building is situated in a city, and there are several cars parked in front of it.


In [32]:
model.push_to_hub("rshah240/llava_historical_images")

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rshah240/llava_historical_images/commit/1abc130399b652f015db20a76aa1b5a4a2de3a9b', commit_message='Upload LlavaForConditionalGeneration', commit_description='', oid='1abc130399b652f015db20a76aa1b5a4a2de3a9b', pr_url=None, pr_revision=None, pr_num=None)