## Hugging Face sign-in

In [1]:
# Open the interactive Hugging Face login prompt (rendered as a widget in the cell output).
# NOTE(review): the token is presumably needed for the `push_to_hub` calls further down —
# confirm it has write access.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Installing and importing necessary libraries

In [2]:
!pip install -q bitsandbytes sentencepiece accelerate loralib peft transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, PeftModel
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig
import torchvision.transforms as transforms
from tqdm import tqdm

## Load quantized model
First, load a 4-bit quantized version of the model. This allows us to use the 9B version of IDEFICS on a single 16 GB GPU.



In [4]:
# Prefer the GPU whenever torch can see one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

checkpoint = "HuggingFaceM4/idefics-9b-instruct"

# 4-bit NF4 quantization with double quantization and float16 compute.
# `lm_head` and `embed_tokens` are skipped because they can't be quantized properly.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

# Load the quantized model first, then the matching processor, from the same checkpoint.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/99.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/7.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Inference
Let's define a simple helper function to test the model's inference.

In [5]:
def check_inference(model, processor, prompts, max_new_tokens=100, pre_process = True):
    """Generate a completion for every prompt, print it, and return the decoded texts.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        processor: Object exposing ``tokenizer`` and ``batch_decode``; it is also
            called on ``prompts`` when ``pre_process`` is true.
        prompts: Batch of prompts to run through ``processor`` when
            ``pre_process`` is true. Otherwise, already-prepared inputs that are
            unpacked as keyword arguments into ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        pre_process: Whether ``prompts`` still has to go through ``processor``.

    Returns:
        The decoded generations as returned by ``processor.batch_decode``, so
        callers can compare them programmatically instead of reading the
        printed output.
    """
    if pre_process:
        # Batched mode: the whole prompt list is processed at once and moved to
        # the notebook-level `device`.
        inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device)
    else:
        inputs = prompts

    tokenizer = processor.tokenizer
    # Generation stops as soon as the end-of-utterance token is produced.
    exit_condition = tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
    # The image placeholder tokens must never be generated.
    bad_words_ids = tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids

    generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_new_tokens=max_new_tokens)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    for i, t in enumerate(generated_text):
        print(f"{i}:\n{t}\n")
    return generated_text

## LoRA


In [15]:
# Attach the LoRA adapter published under "WinterSchool/Midefics-lora-v3" to the quantized base model.
# NOTE(review): the same repo id is the target of `peft_model.push_to_hub(...)` later in this
# notebook, so this cell presumably resumes from a previously published adapter — confirm the
# intended execution order on a fresh kernel.
peft_model = PeftModel.from_pretrained(model, "WinterSchool/Midefics-lora-v3")

In [7]:
# Short model name: the part of `checkpoint` after the "/". Used to name the training output directory.
model_name = checkpoint.split("/")[1]
# LoRA settings: rank 16, alpha 32, dropout 0.05, applied to the q/k/v projection modules.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q_proj','k_proj','v_proj'],
    lora_dropout=0.05,
    bias="none",
)
# NOTE(review): `peft_model` is rebuilt from itself, so this cell requires `peft_model` to exist
# already and is not idempotent (re-running it wraps the model again). If a fresh adapter on the
# base model is intended, the first argument should presumably be `model` — confirm.
peft_model = get_peft_model(peft_model, config) #peft_model

In [8]:
# Report the trainable parameter count, the total parameter count and the trainable percentage.
peft_model.print_trainable_parameters()

trainable params: 19,750,912 || all params: 8,949,438,736 || trainable%: 0.22069442098698333


## Finetuning dataset
Prepare the dataset that will be used for fine-tuning.


In [None]:
def convert_to_rgb(image):
    """Return ``image`` in RGB mode, flattening any transparency onto white.

    An image that is already RGB is returned unchanged. Any other mode is
    converted to RGBA and composited over an opaque white background before
    the final conversion to RGB; a plain ``image.convert("RGB")`` would create
    a wrong background for transparent images.
    """
    if image.mode != "RGB":
        rgba = image.convert("RGBA")
        white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255))
        return Image.alpha_composite(white_canvas, rgba).convert("RGB")
    return image

In [None]:
def ds_transforms_custom(example_batch):
    """Turn a raw dataset batch into model inputs for fine-tuning.

    Each conversation becomes one prompt. The first user question is followed
    by the image, and every question/answer pair is rendered as ``User:`` /
    ``Assistant:`` turns terminated by ``<end_of_utterance>``. The prompts are
    run through the notebook-level ``processor`` with the image transform
    built below, moved to ``device``, and returned with ``labels`` set to the
    ``input_ids``.

    Args:
        example_batch: Batch with a ``'conversation'`` column (each entry
            holding a ``'data'`` list of ``question``/``answer`` turns) and a
            parallel ``'image'`` column.

    Returns:
        The processor output with an added ``labels`` entry.
    """
    image_processor = processor.image_processor
    side = image_processor.image_size

    # Light augmentation followed by normalization with the processor's own statistics.
    image_transform = transforms.Compose([
        convert_to_rgb,
        transforms.RandomResizedCrop((side, side), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
    ])

    prompts = []
    for idx, conversation in enumerate(example_batch['conversation']):
        turns = conversation['data']
        opening = turns[0]
        # The image is attached to the opening user turn only.
        prompt = [
            f"User: {opening['question']}",
            example_batch['image'][idx],
            "<end_of_utterance>",
            f"\nAssistant: {opening['answer']}<end_of_utterance>",
        ]
        for turn in turns[1:]:
            prompt += [
                f"User: {turn['question']}<end_of_utterance>",
                f"\nAssistant: {turn['answer']}<end_of_utterance>",
            ]
        prompts.append(prompt)

    inputs = processor(
        prompts,
        transform=image_transform,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    ).to(device)

    inputs["labels"] = inputs["input_ids"]
    return inputs

## Training
Finally, using the Hugging Face Trainer, we can finetune the model!

In [9]:
# Download the fine-tuning dataset from the Hugging Face Hub; it provides a train and a test split.
ds = load_dataset("WinterSchool/MideficsDataset")

Downloading readme:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/441M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/426M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/345M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/484M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/130M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/201 [00:00<?, ? examples/s]

In [None]:
# Use the "train" split for training and the "test" split for evaluation, and register the
# same preprocessing function on both.
train_ds, eval_ds = ds["train"], ds["test"]
for split in (train_ds, eval_ds):
    split.set_transform(ds_transforms_custom)

In [None]:
# Training hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir=f"Med-{model_name}",
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # 16 micro-batches of size 1 per optimizer step
    dataloader_pin_memory=False,
    save_total_limit=3,
    # Neither evaluation nor checkpointing happens during training ...
    evaluation_strategy="no",
    save_strategy="no",
    # ... so the two step intervals below are inactive until a strategy is set to "steps".
    save_steps=40,
    eval_steps=20,
    logging_steps=1,
    num_train_epochs = 1,
    # NOTE(review): presumably required so the raw 'image'/'conversation' columns reach
    # `ds_transforms_custom` — confirm before changing.
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    # With both strategies set to "no" there is never a checkpoint or an eval metric to pick
    # a "best" model from, so asking to reload one at the end was contradictory.
    load_best_model_at_end=False,
    report_to="wandb",
    optim="paged_adamw_8bit",
)

# The Trainer fine-tunes the PEFT-wrapped model on the train split; the eval split is
# registered but not evaluated during training (evaluation_strategy="no").
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

In [None]:
# Run the fine-tuning loop; the training loss is logged at every step (logging_steps=1).
trainer.train()

Step,Training Loss
1,1.726
2,1.3136
3,1.1771
4,1.0932
5,1.0794
6,0.9257
7,0.9385
8,0.8447
9,0.8125
10,0.8017


In [2]:
# Publish the PEFT model to the Hub as a public repository.
# NOTE(review): presumably this uploads only the LoRA adapter weights, not the base model — confirm.
peft_model.push_to_hub("WinterSchool/Midefics-lora-v3", private=False)

In [16]:
# merged_model

In [17]:
# Fold the LoRA weights into the underlying model, drop the PEFT wrapper, and publish the
# merged model as a public Hub repository.
# NOTE(review): the base model was loaded with 4-bit quantization, so the merge presumably
# happens on quantized weights — confirm the uploaded checkpoint loads and behaves as intended.
merged_model = peft_model.merge_and_unload()
merged_model.push_to_hub("WinterSchool/Midefics", private=False)



model.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/WinterSchool/Midefics/commit/351c371c8c178d402e05e1d4396478f60e25bff7', commit_message='Upload IdeficsForVisionText2Text', commit_description='', oid='351c371c8c178d402e05e1d4396478f60e25bff7', pr_url=None, pr_revision=None, pr_num=None)

### Test the fine-tuned model

In [None]:
# Reload the dataset for manual testing.
# NOTE(review): the `ds` loaded for training had `set_transform` applied to its splits, so a
# fresh load is presumably needed here to get raw (untransformed) examples — confirm.
ds = load_dataset("WinterSchool/MideficsDataset")

In [14]:
# Peek at one raw training example: its id, its image and its multi-turn conversation.
ds["train"][0]

{'id': '4bdbe382-2d87-47a3-850e-aec9f5ce0bca',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x459>,
 'conversation': {'data': [{'answer': 'The CT angiography image shows an anomalous arterial supply to the normal lung tissue contiguous with sequestration.',
    'question': 'What does the CT angiography image show?'},
   {'answer': 'The arrow points to the anomalous arterial supply to the normal lung tissue.',
    'question': 'What does the arrow in the image indicate?'},
   {'answer': 'Sequestration refers to a congenital condition where a part of the lung is not connected to the normal airways or pulmonary circulation.',
    'question': 'What is sequestration in this context?'},
   {'answer': 'Anomalous arterial supply can lead to improper oxygenation of the lung tissue and potential complications such as infection or bleeding.',
    'question': 'How can anomalous arterial supply affect the lung tissue?'},
   {'answer': 'Treatment options for this condition may inc

In [23]:
# Multi-turn check: replay the whole conversation of one training example as context, leave
# the final assistant turn open, and compare the generation with the reference answer.
prompts = []
sample = 12
example = ds["train"][sample]
data = example['conversation']['data']

element = []
last_index = len(data) - 1
for j, turn in enumerate(data):
    if j == 0:
        # The image is attached to the opening user turn only.
        element.extend([f"User: {turn['question']}", example['image'], "<end_of_utterance>"])
    else:
        element.append(f"User: {turn['question']}<end_of_utterance>")

    if j == last_index:
        # The last turn is always the one the model must answer. This also covers
        # single-turn conversations, where the reference answer used to be included in the
        # prompt before the same question was asked again.
        element.append("\nAssistant:")
    else:
        element.append(f"\nAssistant: {turn['answer']}<end_of_utterance>")
prompts.append(element)

check_inference(peft_model, processor, prompts, max_new_tokens=300, pre_process = True)
print("**************")
# Reference answer for the final question, for visual comparison with the generation above.
print(data[-1]['answer'])

0:
User: What does the lateral radiograph of the leg in a child with OI show? 
Assistant: The lateral radiograph shows anterior bowing of the tibia. User: What does OI stand for in this context? 
Assistant: OI stands for Osteogenesis Imperfecta, a genetic disorder characterized by fragile bones that break easily. User: Why does the tibia appear bowed in this image? 
Assistant: The anterior bowing of the tibia is a common feature of Osteogenesis Imperfecta due to the abnormal collagen structure in the bones. User: What other symptoms may a child with Osteogenesis Imperfecta present with? 
Assistant: A child with Osteogenesis Imperfecta may present with symptoms such as easy bruising, hearing loss, loose joints, and long bones that bend or break easily. They may also have a soft tissue that is loose and floppy, and they may have a large head relative to their body size.

**************
Children with Osteogenesis Imperfecta may also experience frequent fractures, short stature, blue scler

In [24]:
# Single-turn check: ask one question about the image of one training example and compare
# the generation with the reference answer.
i = 0
sample = 2
example = ds["train"][sample]
turn = example['conversation']['data'][i]

single_prompt = [
    f"User: {turn['question']}",
    example['image'],
    "<end_of_utterance>",
    "\nAssistant:",
]
prompts = [single_prompt]

check_inference(peft_model, processor, prompts, max_new_tokens=300, pre_process = True)
print("**************")
print(turn['answer'])

0:
User: Can you describe what you see in the image? 
Assistant: I see an x-ray image of a person's hand. The hand appears to have a broken bone, specifically a fractured wrist, with a metal plate and screws inserted to hold the bone together. The image also shows a surgical incision on the hand.

**************
I see an image of an upper extremity radiographic study type involving a finger.
