In [2]:
!pip install -q -U accelerate bitsandbytes git+https://github.com/huggingface/transformers.git
!pip install datasets -q
!pip install peft -q

In [3]:
!git clone https://brucewayne0459:your_hf_token@huggingface.co/datasets/joshuachou/SkinCAP  #Replace with your hf token

Cloning into 'SkinCAP'...
Filtering content: 100% (4347/4347), 545.21 MiB | 11.78 MiB/s, done.



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install aiohttp
!pip install nest_asyncio





In [None]:
import torch
import aiohttp
import asyncio
import nest_asyncio
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from PIL import Image
import os
from io import BytesIO
from functools import lru_cache

# Step 1: Load your dataset
jsonl_file_path = "/teamspace/studios/this_studio/Updated_Final_Cleaned_Skin_Diseases_Data_with_question_v2.jsonl"
dataset = load_dataset('json', data_files=jsonl_file_path, split='train')

# Step 2: Split the dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
val_ds = dataset["test"]

# Step 3: Define the base directory where images are stored
image_base_dir = "/teamspace/studios/this_studio/SkinCAP/skincap"



In [None]:
# Step 4: Asynchronous function to load images from URLs
async def async_load_image_from_url(session, url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                img_data = await response.read()
                img = Image.open(BytesIO(img_data)).convert("RGB")
                return img
            else:
                print(f"URL not accessible (Status Code: {response.status}): {url}")
                return None
    except Exception as e:
        print(f"Request failed for {url}: {e}")
        return None

@lru_cache(maxsize=1024)  # Cache up to 1024 images in memory
def load_image_from_cache(img_filename):
    if img_filename.startswith("http"):
        return None  # URLs are handled asynchronously, no need to cache them here
    else:
        img_path = os.path.join(image_base_dir, img_filename)
        if os.path.isfile(img_path):
            try:
                img = Image.open(img_path).convert("RGB")
                return img
            except Exception as e:
                print(f"Error loading image {img_filename} from path {img_path}: {e}")
                return None
        else:
            print(f"File not found: {img_filename} (Path: {img_path})")
            return None

async def fetch_images(examples):
    images = []
    texts = []
    labels = []

    async with aiohttp.ClientSession() as session:
        tasks = []
        for example in examples:
            img_filename = example["image"]
            if img_filename.startswith("http"):
                tasks.append(async_load_image_from_url(session, img_filename))
            else:
                img = load_image_from_cache(img_filename)
                tasks.append(asyncio.sleep(0))  # Dummy task to keep indices aligned
                images.append(img)
                texts.append(example["prefix"])
                labels.append(example["suffix"])

        fetched_images = await asyncio.gather(*tasks)

        valid_images = []
        valid_texts = []
        valid_labels = []

        for i, img in enumerate(fetched_images):
            if img is not None:
                valid_images.append(img)
                valid_texts.append(examples[i]["prefix"])
                valid_labels.append(examples[i]["suffix"])

    return valid_images, valid_texts, valid_labels

def collate_fn(examples):
    try:
        # This allows nesting of asyncio in environments that already have an event loop running
        nest_asyncio.apply()
        images, texts, labels = asyncio.run(fetch_images(examples))
    except RuntimeError:  # If an event loop is already running, use another approach
        loop = asyncio.get_event_loop()
        images, texts, labels = loop.run_until_complete(fetch_images(examples))

    # Use the processor to handle tokenization and image processing
    tokens = processor(text=texts, images=images, suffix=labels,
                       return_tensors="pt", padding="longest")

    # Move tokens to GPU and convert to bfloat16
    tokens = {key: value.to(device) for key, value in tokens.items()}

    return tokens



In [None]:
# Step 5: Load the processor
model_id = "google/paligemma-3b-pt-224"
processor = AutoProcessor.from_pretrained(model_id)

# Step 6: Setup Bits and Bytes Config for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Step 7: Configure LoRA
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# Step 8: Load the model with quantization and apply LoRA
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Show trainable parameters for LoRA

device = "cuda"
image_token = processor.tokenizer.convert_tokens_to_ids("<image>")



In [None]:

# Step 9: Define the training arguments
output_dir = "./pali_gemma_derm"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    learning_rate=2e-5,
    weight_decay=1e-6,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",  # Save the model after every epoch
    evaluation_strategy="epoch",  # Evaluate the model after every epoch
    save_total_limit=3,  # Keeps the last 3 checkpoints
    load_best_model_at_end=True,  # Load the best model at the end
    bf16=True,
    report_to=["tensorboard"],
    dataloader_pin_memory=False,  # Disable pinning memory
    push_to_hub=False,
    remove_unused_columns=False,
)

# Check if there's a checkpoint to resume from
last_checkpoint = None
if os.path.isdir(output_dir) and len(os.listdir(output_dir)) > 0:
    last_checkpoint = max([os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint")], key=os.path.getctime)
    print(f"Resuming from checkpoint: {last_checkpoint}")

# Step 10: Initialize the Trainer
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    args=training_args,
)


In [3]:

# Step 11: Start training or resume from checkpoint
trainer.train(resume_from_checkpoint=last_checkpoint)


Generating train split: 0 examples [00:00, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 11,298,816 || all params: 2,934,765,296 || trainable%: 0.3850
Resuming from checkpoint: ./pali_gemma_derm/checkpoint-5383




Epoch,Training Loss,Validation Loss
7,0.2613,0.251
9,0.2136,0.24325


File not found:  (Path: /teamspace/studios/this_studio/SkinCAP/skincap/)
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=6724
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=8364
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2863
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2856
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4031
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4030
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2862
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2859
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2860




URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=6724
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2863
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4030
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4031
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=8364
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2862
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2856
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2859
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2860




URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2862
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=8364
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4030
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=4031
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=6724
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2863
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2856




URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2859
URL not accessible (Status Code: 404): http://atlasdermatologico.com.br/img?imageId=2860




TrainOutput(global_step=7690, training_loss=0.07154164264664073, metrics={'train_runtime': 16205.8291, 'train_samples_per_second': 11.396, 'train_steps_per_second': 0.475, 'total_flos': 5.774222626298077e+17, 'train_loss': 0.07154164264664073, 'epoch': 9.998050682261209})

In [1]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image

# Load the processor and model
model_id = "google/paligemma-3b-pt-224"
processor = AutoProcessor.from_pretrained(model_id,device_map={"": 0})

# Load the model with LoRA adapters
model = PaliGemmaForConditionalGeneration.from_pretrained("/teamspace/studios/this_studio/pali_gemma_derm/checkpoint-7690",device_map = 'cuda')
model.eval()  # Set model to evaluation mode


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PaliGemmaForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(256, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=1152, out_features=1152, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1152, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1152, bias=False)
                )
                (lora_embedding_

In [13]:
input_text = "Identify the skin disease?"
input_image_path = "/teamspace/studios/this_studio/vitiligo-0011.jpg"  # Replace with your image path
input_image = Image.open(input_image_path).convert("RGB")

# Process the input
inputs = processor(text=input_text, images=input_image, return_tensors="pt", padding="longest").to("cuda")

# Run inference with increased max_length
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=300)  # Adjust max_length as needed

# Decode the output
decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
print("Model Output:", decoded_output)


Model Output: Identify the skin disease?
vitiligo


In [1]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from peft import get_peft_model, LoraConfig, PeftModel
from transformers import BitsAndBytesConfig

# Load the processor
model_id = "google/paligemma-3b-pt-224"
processor = AutoProcessor.from_pretrained(model_id)

# Load the trained model checkpoint
model_checkpoint_path = "/teamspace/studios/this_studio/pali_gemma_derm/checkpoint-7690"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_checkpoint_path, device_map={"": 0})

# Define the LoRA configuration
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# Apply the LoRA configuration to the trained model
peft_model = get_peft_model(model, lora_config)

# Push the LoRA adapters to Hugging Face Hub
peft_model.push_to_hub("brucewayne0459/Lora_pali_gemma", use_auth_token=True)

print("LoRA adapters pushed to Hugging Face Hub successfully.")


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/45.3M [00:00<?, ?B/s]

LoRA adapters pushed to Hugging Face Hub successfully.


# Merging the model

In [3]:
import torch
from transformers import AutoModelForCausalLM,PaliGemmaForConditionalGeneration
from peft import PeftModel

# Load the base model
base_model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-pt-224", device_map="auto")

# Load the PEFT model with LoRA adapters
peft_model_id = "/teamspace/studios/this_studio/pali_gemma_derm/checkpoint-7690"
peft_model = PeftModel.from_pretrained(base_model, peft_model_id)



config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [6]:
# Merge the LoRA adapters with the base model
model_merge = peft_model.merge_and_unload()



In [9]:
# Save the merged model locally
merged_model_path = "/teamspace/studios/this_studio/pali_gemma_merged"
#peft_model.save_pretrained(merged_model_path)

model_merge.save_pretrained(merged_model_path)


In [11]:
# Optionally, push the merged model to Hugging Face Hub
model_merge.push_to_hub("brucewayne0459/paligemma_derm", use_auth_token=True, safe_serialization=True)




README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/brucewayne0459/paligemma_derm/commit/8c92e51a2f4a91c28c78a1797a7301bb1608b33a', commit_message='Upload PaliGemmaForConditionalGeneration', commit_description='', oid='8c92e51a2f4a91c28c78a1797a7301bb1608b33a', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
from transformers import AutoTokenizer,AutoProcessor
model_id = 'google/paligemma-3b-pt-224'
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
merged_model_path = "/teamspace/studios/this_studio/pali_gemma_merged"

# save the tokenizer
processor.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

# push the tokenizer to hub
#tokenizer.push_to_hub(new_hub_model_path, token=True)

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

('/teamspace/studios/this_studio/pali_gemma_merged/tokenizer_config.json',
 '/teamspace/studios/this_studio/pali_gemma_merged/special_tokens_map.json',
 '/teamspace/studios/this_studio/pali_gemma_merged/tokenizer.json')

In [14]:
processor.push_to_hub("brucewayne0459/paligemma_derm", use_auth_token=True, safe_serialization=True)



tokenizer.json:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/brucewayne0459/paligemma_derm/commit/62e892186457108d708cf30c46f985ed28bf3909', commit_message='Upload processor', commit_description='', oid='62e892186457108d708cf30c46f985ed28bf3909', pr_url=None, pr_revision=None, pr_num=None)

# Model Inference

In [3]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image

# Load the model and processor
model_id = "brucewayne0459/paligemma_derm"
processor = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, device_map={"": 0})
model.eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Model Output: What is this skin condition?
vitiligo


In [8]:
# Load a sample image and text input
input_text = "Identify the skin condition?"
input_image_path = "/teamspace/studios/this_studio/ulcer-scc.jpg"  # Replace with your actual image path
input_image = Image.open(input_image_path).convert("RGB")

# Process the input
inputs = processor(text=input_text, images=input_image, return_tensors="pt", padding="longest").to("cuda" if torch.cuda.is_available() else "cpu")

# Set the maximum length for generation
max_length = 512  # You can increase this as needed
max_new_tokens = 50  # Controls how many new tokens are generated

# Run inference
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=max_length, max_new_tokens=max_new_tokens)

# Decode the output
decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
print("Model Output:", decoded_output)

Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Model Output: Identify the skin condition?
basal cell carcinoma
