In [None]:
# Install the notebook's dependencies with pip in quiet mode (-q).
!pip install -q datasets
# !pip install -q git+https://github.com/huggingface/transformers.git@add-model-idefics
!pip install -q bitsandbytes sentencepiece accelerate loralib
# PEFT is installed from the GitHub repository rather than a release; -U upgrades an existing install.
!pip install -q -U git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip install -q transforms

In [None]:
import datasets
from datasets import load_dataset
# help(load_dataset)
# Exploratory check: this lists the attributes of load_dataset's *type*
# (its __class__), not the attributes of load_dataset itself.
print(dir(load_dataset.__class__))

['__annotations__', '__builtins__', '__call__', '__class__', '__closure__', '__code__', '__defaults__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__get__', '__getattribute__', '__globals__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__kwdefaults__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']


In [None]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig
from torchvision import transforms as transforms
import torch

In [None]:
# Log in to the Hugging Face Hub from inside the notebook (renders an interactive widget).
# Presumably this supplies the credentials used later by `use_auth_token=True`
# and `push_to_hub` — confirm against the account/repo permissions in use.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Define a class for the CocoCaptionDataset, which encapsulates the functionality for image captioning.
class CocoCaptionDataset:
    """Prepare an image-caption dataset as model-ready batches for IDEFICS.

    The class loads a Hugging Face dataset, splits it into training and
    validation subsets, and attaches an on-the-fly transform that turns each
    batch of ``url`` / ``caption`` columns into processor outputs with
    ``labels`` set to ``input_ids``.

    Args:
        model_ckpt: Checkpoint whose processor is used for preprocessing.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``
            (the default 0.0002 holds out 0.02% of the rows).
        seed: Value forwarded to ``train_test_split`` as ``seed``. ``None``
            keeps the split unseeded; pass an int for a reproducible split.
        device: Device the processed batch is moved to.
    """

    def __init__(
        self,
        model_ckpt="HuggingFaceM4/idefics-9b-instruct",
        dataset_name="cat-state/mscoco-1st-caption",
        test_size=0.0002,
        seed=None,
        device="cuda",
    ):
        self.model_ckpt = model_ckpt
        self.dataset_name = dataset_name
        self.test_size = test_size
        self.seed = seed
        self.device = device

        # Initialize the processor using the AutoProcessor with authentication token enabled.
        self.processor = AutoProcessor.from_pretrained(self.model_ckpt, use_auth_token=True)

        # The torchvision pipeline is built on first use and then reused for every batch.
        self._img_transform = None

    def load_data(self):
        """Load the dataset and split its ``train`` split into train/validation.

        Returns:
            A ``(train_data, val_data)`` tuple taken from the ``"train"`` and
            ``"test"`` parts of the split.
        """
        data = load_dataset(self.dataset_name)
        data = data["train"].train_test_split(test_size=self.test_size, seed=self.seed)
        return data["train"], data["test"]

    def img_convert_to_rgb(self, image):
        """Return ``image`` as RGB, compositing other modes over a white background.

        An image that is already in RGB mode is returned unchanged (same object).
        """
        if image.mode == "RGB":
            return image
        image_rgba = image.convert("RGBA")
        background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
        alpha_composite = Image.alpha_composite(background, image_rgba)
        alpha_composite = alpha_composite.convert("RGB")
        return alpha_composite

    def _get_img_transform(self):
        """Build the image transform pipeline once and return the cached instance."""
        # getattr keeps this usable on instances created without running __init__.
        pipeline = getattr(self, "_img_transform", None)
        if pipeline is None:
            # Image size, mean and standard deviation come from the processor.
            image_processor = self.processor.image_processor
            img_size = image_processor.image_size
            pipeline = transforms.Compose([
                self.img_convert_to_rgb,
                transforms.RandomResizedCrop((img_size, img_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
            ])
            self._img_transform = pipeline
        return pipeline

    def transform_data(self, example):
        """Convert a batch with ``url`` and ``caption`` columns into model inputs.

        Each row becomes a two-element prompt: the image URL followed by the
        question/answer text ending in ``</s>``.

        Returns:
            The processor output moved to ``self.device``, with ``labels`` set
            to the same object as ``input_ids``.
        """
        prompts = [
            [url, f"Question: Explain the picture. Answer: {caption}</s>"]
            for url, caption in zip(example['url'], example['caption'])
        ]

        inputs = self.processor(
            prompts, transform=self._get_img_transform(), return_tensors="pt"
        ).to(self.device)
        # Set 'labels' in inputs to be equal to 'input_ids'.
        inputs["labels"] = inputs["input_ids"]
        return inputs

    def gen_data(self):
        """Load both splits and register ``transform_data`` as their transform.

        Returns:
            The ``(train_dataset, val_dataset)`` tuple with the transform set.
        """
        train_dataset, val_dataset = self.load_data()
        train_dataset.set_transform(self.transform_data)
        val_dataset.set_transform(self.transform_data)
        return train_dataset, val_dataset

In [None]:
# Import the previously defined CocoCaptionDataset class.
# from coco_caption_dataset import CocoCaptionDataset
# Define a class for the ImageCaptioningModel, which encapsulates the functionality for training and pushing the model to the Hugging Face Model Hub.

from google.colab import drive

class ImageCaptioningModel:
    """Fine-tune IDEFICS with LoRA on the captioning data and publish the result.

    On construction the training and validation datasets are produced by
    ``CocoCaptionDataset.gen_data()`` and stored on the instance.
    """

    def __init__(self):
        """Prepare the datasets and remember which checkpoint to fine-tune."""
        dataset_builder = CocoCaptionDataset()
        train_split, val_split = dataset_builder.gen_data()
        self.train_data = train_split
        self.val_data = val_split

        # Checkpoint that load_model() will quantize and load.
        self.model_ckpt = "HuggingFaceM4/idefics-9b-instruct"

    def load_model(self):
        """Load the checkpoint with a 4-bit bitsandbytes quantization config.

        Returns:
            The model returned by ``IdeficsForVisionText2Text.from_pretrained``.
        """
        quantization_options = {
            "load_in_4bit": True,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": torch.float16,
            "llm_int8_skip_modules": ["lm_head", "embed_tokens"],
        }
        quantization = BitsAndBytesConfig(**quantization_options)

        return IdeficsForVisionText2Text.from_pretrained(
            self.model_ckpt,
            quantization_config=quantization,
            device_map="auto",
        )

    def create_lora_model(self, model):
        """Wrap ``model`` with LoRA adapters and print its trainable parameters.

        Args:
            model: The base model to pass to ``get_peft_model``.

        Returns:
            The PEFT model produced by ``get_peft_model``.
        """
        lora_options = {
            "r": 16,
            "lora_alpha": 32,
            "target_modules": ["q_proj", "k_proj", "v_proj"],
            "lora_dropout": 0.05,
            "bias": "none",
        }
        adapted = get_peft_model(model, LoraConfig(**lora_options))

        # Report how many parameters remain trainable after wrapping.
        adapted.print_trainable_parameters()
        return adapted

    def set_training_args(self):
        """Build the ``TrainingArguments`` used by ``train_and_push_to_hub``.

        Returns:
            A ``TrainingArguments`` instance created from the settings below.
        """
        settings = {
            # Where checkpoints and logs are written.
            "output_dir": "idefics-mscoco-captioner",
            # Optimizer learning rate.
            "learning_rate": 2e-4,
            # Mixed-precision (fp16) training.
            "fp16": True,
            # Per-device batch sizes for training and evaluation.
            "per_device_train_batch_size": 2,
            "per_device_eval_batch_size": 2,
            # Number of steps whose gradients are accumulated per update.
            "gradient_accumulation_steps": 8,
            # Do not pin DataLoader memory.
            "dataloader_pin_memory": False,
            # Keep at most this many checkpoints.
            "save_total_limit": 1,
            # Evaluate and save on a step schedule.
            "evaluation_strategy": "steps",
            "save_strategy": "steps",
            # Step intervals for saving, evaluating and logging.
            "save_steps": 50,
            "eval_steps": 50,
            "logging_steps": 50,
            # Total number of training steps.
            "max_steps": 100,
            # Keep dataset columns that the model signature does not name.
            "remove_unused_columns": False,
            # Reload the best checkpoint when training finishes.
            "load_best_model_at_end": True,
            # Optimizer implementation.
            "optim": "paged_adamw_8bit",
            # Names of the label fields in each batch.
            "label_names": ["labels"],
        }
        return TrainingArguments(**settings)

    def train_and_push_to_hub(self):
        """Load the model, add LoRA, train it, and push it to the Hub.

        The trained PEFT model is pushed to the ``idefics-mscoco-captioner``
        repository with ``private=False``. Nothing is returned.
        """
        peft_model = self.create_lora_model(self.load_model())

        trainer = Trainer(
            model=peft_model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.val_data,
        )
        trainer.train()

        # Publish the trained model. A local or Google Drive copy could be
        # written with trainer.save_model(<path>) instead (not done here).
        peft_model.push_to_hub("idefics-mscoco-captioner", private=False)

In [None]:
# Entry point: constructing ImageCaptioningModel prepares the datasets;
# train_and_push_to_hub() then loads the model, trains it, and pushes it to the Hub.
if __name__ == "__main__":
    imagecaptioningmodel = ImageCaptioningModel()
    imagecaptioningmodel.train_and_push_to_hub()

Resolving data files:   0%|          | 0/37 [00:00<?, ?it/s]

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
def check_inference(model, processor, prompts, max_new_tokens=50, device="cuda"):
    """Generate text for ``prompts`` and print the first decoded sequence.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor associated with the model; it supplies the
            tokenizer, encodes ``prompts`` and decodes the generated ids.
        prompts: Input prompts passed to ``processor``.
        max_new_tokens: Maximum number of new tokens to generate (default 50).
        device: Device the encoded inputs are moved to (default ``"cuda"``).

    Returns:
        None. The decoded text is printed.
    """
    tokenizer = processor.tokenizer

    # Image placeholder tokens that generation must not emit.
    bad_words = ["<image>", "<fake_token_around_image>"]
    bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    # Id of the end-of-sequence token used as the stop condition.
    eos_token_id = tokenizer.convert_tokens_to_ids("</s>")

    # Encode the prompts as PyTorch tensors on the requested device.
    inputs = processor(prompts, return_tensors="pt").to(device)

    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )

    # Only the first sequence of the decoded batch is printed.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

In [None]:
# 4-bit quantization settings (same values as ImageCaptioningModel.load_model).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b-instruct", quantization_config=bnb_config, device_map="auto")

# The processor has to exist before it is saved below; on a fresh kernel no
# earlier cell defines it, so load it here from the same checkpoint.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b-instruct", use_auth_token=True)

# Save the model to a directory in Colab (or directly to Google Drive if mounted)
# NOTE(review): no visible cell mounts Google Drive (the drive.mount call is
# commented out) — confirm the target path really is on Drive before relying on it.
model.save_pretrained("/content/drive/MyDrive/my_image_captioning_model")
processor.save_pretrained("/content/drive/MyDrive/my_image_captioning_model")  # if processor is used


In [None]:
# Load the processor for the IDEFICS checkpoint; check_inference() uses it for
# its tokenizer, to encode the prompts, and to decode the generated ids.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b-instruct", use_auth_token=True)

In [None]:
from peft import PeftModel, PeftConfig
# Read the PEFT configuration stored under /content/idefics-mscoco-captioner
# (presumably the Trainer's output_dir resolved from /content — TODO confirm the
# adapter files are in this directory and not only in a checkpoint-* subfolder).
# NOTE(review): `config` is not used by any cell visible here.
config = PeftConfig.from_pretrained("/content/idefics-mscoco-captioner")
# Wrap the already-loaded base `model` with the PEFT weights from the same directory.
# NOTE(review): `model` is rebound, so re-running this cell wraps the wrapped model again.
model = PeftModel.from_pretrained(model,"/content/idefics-mscoco-captioner")

In [None]:
# Example inference: one image URL followed by the same question text used in
# the training prompts, left open after "Answer:" for the model to complete.
url = "https://www.aaronreedphotography.com/images/xl/Sweet-Dreams-2022.jpg"
prompts = [
    url,
    "Question: Explain the picture. Answer:",
]
# Prints the generated caption (at most 50 new tokens).
check_inference(model, processor, prompts, max_new_tokens=50)