In [None]:
# Install Pytorch & other libraries
%pip install "torch==2.4.0" tensorboard pillow

 
# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.45.1" \
  "datasets==3.0.1" \
  "accelerate==0.34.2" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.44.0" \
  "trl==0.11.1" \
  "peft==0.13.0" \
  "qwen-vl-utils"

In [None]:
import json

# Load the fine-tuning samples into memory.
# The encoding is passed explicitly: without it open() decodes the file with
# the platform's locale encoding, which is not guaranteed to be UTF-8.
with open('/kaggle/input/new-data-set/modified_file.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)
    
    


In [None]:
import os

from huggingface_hub import login

# SECURITY: an access token must never be stored in the notebook itself.
# The token that used to be hardcoded in this cell has to be treated as leaked
# and should be revoked in the Hugging Face account settings.
#
# The token is taken from the HF_TOKEN environment variable instead. When the
# variable is unset or empty, token=None is passed so that login() falls back
# to its interactive prompt.
login(
    token=os.environ.get("HF_TOKEN") or None,
    add_to_git_credential=True,
)

In [None]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
 
# Hub id of the checkpoint that is fine-tuned in this notebook
model_id = "Qwen/Qwen2-VL-7B-Instruct"

# 4-bit quantization settings: NF4 quantization type with double
# quantization enabled and bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model with the quantization config above.
# attn_implementation="flash_attention_2" stays disabled (author's note: not
# supported for training).
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Processor belonging to the same checkpoint
processor = AutoProcessor.from_pretrained(model_id)

# Optional: load and activate a previously trained adapter
# model.load_adapter("/kaggle/working/qwen2-7b-instruct-product_descripter_v1")

In [None]:
# Sanity check: render one sample (index 2) through the chat template and
# print the resulting prompt text.
messages = dataset[2]["messages"]

# No system message is added at this point; the alias only marks the place
# where a custom system turn could be prepended.
messages_with_system = messages

# Render to a string only: no tokenization, no generation prompt appended
text = processor.apply_chat_template(
    messages_with_system,
    add_generation_prompt=False,
    tokenize=False,
)
print(text)

In [None]:
from peft import LoraConfig
 
# LoRA adapter settings (author's note: based on the QLoRA paper and
# Sebastian Raschka's experiments)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # adapters go on modules with these names
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
from trl import SFTConfig
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
 
args = SFTConfig(
    # --- output, checkpoints, logging ---
    # NOTE(review): the directory name mentions llama3.1 although this
    # notebook loads a Qwen2-VL checkpoint; kept unchanged.
    output_dir="llama3.1_v1",
    save_strategy="epoch",                  # one checkpoint per epoch
    logging_steps=4,                        # log every 4 steps
    report_to="tensorboard",
    push_to_hub=False,                      # checkpoints are not pushed to the hub

    # --- optimization ---
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,          # micro-batches accumulated per optimizer update
    optim="adamw_torch_fused",
    learning_rate=4e-4,
    lr_scheduler_type="constant",
    # NOTE(review): confirm that the "constant" scheduler applies this warmup;
    # "constant_with_warmup" is the scheduler name that includes a warmup phase.
    warmup_ratio=0.03,
    max_grad_norm=0.3,

    # --- precision and memory ---
    bf16=True,                              # bfloat16 training
    tf32=False,                             # TF32 is switched off
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # non-reentrant checkpointing

    # --- settings required by the custom collator ---
    dataset_text_field="",                  # dummy value; the collator builds the text
    dataset_kwargs={"skip_prepare_dataset": True},
)
# Set after construction, as in the original setup
args.remove_unused_columns = False
 
# Create a data collator to encode text and image pairs
from PIL import Image
import torchvision.transforms as transforms

# Preprocessing applied to each training image, in order:
#   1. resize to a fixed 800x800
#   2. convert to a tensor
_image_transform_steps = [
    transforms.Resize((800, 800)),
    transforms.ToTensor(),
]
image_transform = transforms.Compose(_image_transform_steps)

def collate_fn(examples):
    """Build one training batch from raw rows (system + user + assistant text).

    NOTE: a later cell of this notebook defines ``collate_fn`` again; when the
    cells are run top to bottom, that later definition replaces this one.

    Args:
        examples: Rows read through ``row["messages"]`` (dicts with ``"role"``
            and ``"content"``) and ``row["images"]`` (image file paths).

    Returns:
        The processor output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padding and image token ids are set to -100.
    """
    def _first_content(row, role):
        # Content of the first message with the given role, "" if there is none
        return next((m["content"] for m in row["messages"] if m["role"] == role), "")

    # One text per row: system and user content are joined without a
    # separator, the assistant content follows after a single space.
    texts = [
        f"{_first_content(row, 'system')}{_first_content(row, 'user')} {_first_content(row, 'assistant')}"
        for row in examples
    ]

    # Load every referenced image; unreadable ones are reported and left out
    loaded = []
    for row in examples:
        for image_path in row["images"]:
            try:
                tensor = image_transform(Image.open(image_path).convert("RGB"))
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                continue
            loaded.append(tensor)

    # All remaining images go into a single stacked tensor
    image_inputs = torch.stack(loaded)

    # Tokenize the texts and preprocess the images together
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)

    # Labels start as a copy of the input ids; padding is excluded from the loss
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Image-related token ids are excluded as well (model specific)
    image_tokens = (
        [151652, 151653, 151655]
        if isinstance(processor, Qwen2VLProcessor)
        else [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    )
    for token_id in image_tokens:
        labels[labels == token_id] = -100

    batch["labels"] = labels
    return batch


# def collate_fn(examples):
#     # Get the texts and images, and apply the chat template
#     texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
#     image_inputs = [process_vision_info(example["messages"])[0] for example in examples]
 
#     # Tokenize the texts and process the images
#     batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
 
#     # The labels are the input_ids, and we mask the padding tokens in the loss computation
#     labels = batch["input_ids"].clone()
#     labels[labels == processor.tokenizer.pad_token_id] = -100  #
#     # Ignore the image token index in the loss computation (model specific)
#     if isinstance(processor, Qwen2VLProcessor):
#         image_tokens = [151652,151653,151655]
#     else: 
#         image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
#     for image_token_id in image_tokens:
#         labels[labels == image_token_id] = -100
#     batch["labels"] = labels
 
#     return batch

In [None]:
import torch
from PIL import Image

def collate_fn(examples):
    """Turn a list of dataset rows into one padded training batch.

    Args:
        examples: Rows of the training data. Each row is read through
            ``row["messages"]`` (dicts carrying ``"role"`` and ``"content"``)
            and ``row["images"]`` (image file paths).

    Returns:
        The processor output for the batch plus a ``"labels"`` entry: a copy
        of ``input_ids`` in which padding and image token ids are set to -100
        so that they are ignored in the loss computation.

    Raises:
        ValueError: If the number of usable images differs from the number of
            texts. This includes a batch in which every image failed to load;
            such a batch is rejected instead of being processed as text-only.
    """
    texts = []
    image_inputs = []
    load_failures = 0

    for example in examples:
        messages = example["messages"]

        # Combine the first 'user' and the first 'assistant' content.
        # NOTE(review): this is the raw message content, not the output of
        # processor.apply_chat_template. Confirm the content already contains
        # the image placeholder tokens the processor expects, and that
        # "content" is a plain string rather than a list of parts.
        user_message = next((msg["content"] for msg in messages if msg["role"] == "user"), "")
        assistant_message = next((msg["content"] for msg in messages if msg["role"] == "assistant"), "")
        texts.append(f"{user_message} {assistant_message}")

        for image_path in example["images"]:
            try:
                # The context manager releases the file handle once the RGB
                # copy has been made and transformed.
                with Image.open(image_path) as image_file:
                    image = image_transform(image_file.convert("RGB"))
            except Exception as e:
                # Best effort: report the problem and leave the image out
                print(f"Error loading image {image_path}: {e}")
                load_failures += 1
                continue

            # ToTensor() yields floats in [0, 1], while the image processor
            # rescales its input by 1/255 by default. Converting back to uint8
            # keeps the pixel values from being scaled down twice.
            if torch.is_tensor(image) and image.is_floating_point():
                image = (image * 255).round().clamp(0, 255).to(torch.uint8)
            image_inputs.append(image)

    # Text and image counts must agree whenever the batch is meant to carry
    # images, i.e. also when every load attempt failed.
    if (image_inputs or load_failures) and len(texts) != len(image_inputs):
        raise ValueError(f"Mismatch in texts ({len(texts)}) and images ({len(image_inputs)})")

    # Images are handed over as a list with one entry per image (not as one
    # stacked 4-D tensor) so that each entry is handled as a separate image.
    batch = processor(
        text=texts,
        images=image_inputs if image_inputs else None,
        return_tensors="pt",
        padding=True,
    )

    # Labels are the input ids with padding tokens ignored
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Image token ids are ignored in the loss as well (depends on the processor type)
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652, 151653, 151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch


In [None]:
from trl import SFTTrainer
 
# Assemble the trainer from the model, the LoRA settings, the training
# arguments, the in-memory dataset and the custom collator.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=args,
    train_dataset=dataset,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    dataset_text_field="",  # dummy value, required alongside the custom collator
)

In [None]:
# Configure FSDP auto-wrap policy
# fsdp_plugin = trainer.accelerator.state.fsdp_plugin
# fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

# Run the fine-tuning loop
trainer.train()

# Write the trained model to the output directory configured in `args`
trainer.save_model(output_dir=args.output_dir)

# # start training, the model will be automatically saved to the hub and the output directory
# trainer.train()
 
# # save model 
# trainer.save_model(args.output_dir)