In [1]:
""" 
Fine tune a image captioning or description model on a artwork description data to
obtain a model that is able to analyze an artwork and generate a description of 
the artwork that is hopefully more art-review-like than a generic image captioning 
model would

The pretrained model used here is the Salesforce Blip Captioniong model: 
https://huggingface.co/Salesforce/blip-image-captioning-base

The fine tuning data set is the data from the SemArt Project: 
https://github.com/noagarcia/SemArt
"""

' \nFine tune a image captioning or description model on a artwork description data to\nobtain a model that is able to analyze an artwork and generate a description of \nthe artwork that is hopefully more art-review-like than a generic image captioning \nmodel would\n\nThe pretrained model used here is the Salesforce Blip Captioniong model: \nhttps://huggingface.co/Salesforce/blip-image-captioning-base\n\nThe fine tuning data set is the data from the SemArt Project: \nhttps://github.com/noagarcia/SemArt\n'

In [2]:
import os
import torch
import pandas as pd
import chardet
from PIL import Image
from torchvision import transforms
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import datasets
from datasets import Dataset as HFDataset, DatasetDict, IterableDataset
from transformers import BlipProcessor, BlipForConditionalGeneration, ViTImageProcessor, VisionEncoderDecoderModel, AutoTokenizer, TrainingArguments, Trainer

In [3]:
def list_files_in_directory(dir_path):
    try:
        files = os.listdir(dir_path)
        return [x for x in files if '.jpg' in x]
    except FileNotFoundError:
        return f"Directory not found: {dir_path}"
    except NotADirectoryError:
        return f"Not a directory: {dir_path}"

In [4]:
import pandas as pd
import torch
from torchvision import transforms
from PIL import Image
import os

# Step 1: Load the CSV file as a dictionary mapping image names to descriptions
semart_dir = f"/Users/rckyi/Documents/Datasette/SemArtData/SemArt" 
images_dir = semart_dir 

description_file_train = semart_dir + '/semart_train.csv'
description_file_test = semart_dir + '/semart_test.csv'

with open(description_file_train, 'rb') as file:
    print(f'file path {description_file_train}')
    result = chardet.detect(file.read())
    encoding = result['encoding']
    print(f'encoding {encoding}')
    df_train = pd.read_csv(description_file_train, encoding=encoding, sep='\t')
    print(type(df_train['IMAGE_FILE'][0]))
# df_test = pd.read_csv(description_file_test, encoding = "utf-8")


image_to_description = dict(zip(df_train["IMAGE_FILE"], df_train["DESCRIPTION"]))
# list(image_to_description.items())[0:5]

# # Define the image directory and transformation
# image_dir = "path/to/images"
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to a fixed shape
    transforms.ToTensor(),  # Convert image to tensor
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize
])


file path /Users/rckyi/Documents/Datasette/SemArtData/SemArt/semart_train.csv
encoding utf-8
<class 'str'>


In [5]:
image_files_in_dir = list_files_in_directory(semart_dir+'/Images/')

In [6]:
from datasets import IterableDataset, DatasetDict
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
import torch

def data_generator():
    image_file_list = list(df_train['IMAGE_FILE'])
    for image_name in image_files_in_dir:
        image_path = f"{semart_dir}/Images/{image_name}"

        if image_name in image_file_list: #image_to_description: # and os.path.exists(image_path):
            image = Image.open(image_path).convert("RGB")
            image_tensor = transform(image))
            yield {"image":image_tensor, "text":image_to_description[image_name]}

In [7]:
hf_dataset = DatasetDict({"train": datasets.Dataset.from_generator(data_generator)})
hf_dataset

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 19244
    })
})

In [8]:
def transforms(example_batch):
    images = np.asarray([x for x in example_batch["image"]])
    captions = [x for x in example_batch["text"]]
    inputs = processor(images=images, text=captions, 
                       padding="max_length",   # ✅ Pads or truncates to max_length
                        truncation=True,        # ✅ Prevents sequences longer than max_length
                        max_length=512  )
    inputs.update({"labels": inputs["input_ids"]})
    return inputs

hf_dataset.set_transform(transforms)

In [9]:
# # Define a custom collator to handle the format expected by the BLIP model
# def collate_fn(batch):
#     images = [item['image'] for item in batch]
#     texts = [item['text'] for item in batch]
#     # Convert the images into tensor and normalize
#     images = torch.stack(images)
#     images = torch.add(images,1.0)
#     text = torch.stack(texts)
#     return processor(images=images, text=texts, padding=True, return_tensors="pt")


In [10]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./blip-semart-finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    save_steps=500,
    evaluation_strategy=None,
#     evaluation_strategy="steps",
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    push_to_hub=False,  # Set to True if uploading to Hugging Face Hub
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    max_steps=1000,
    no_cuda=True,
)



In [11]:
# Load Pretrained BLIP Model & Processor
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

In [12]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= hf_dataset["train"],
    tokenizer=processor,
#     data_collator=collate_fn,  # Use custom collate function to prepare batches
)

  trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Start Fine-Tuning
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mshaddie77[0m ([33mshaddie77-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Step,Training Loss
10,9.3683
20,7.2804
30,6.3271
40,5.4025
50,4.4479
60,3.6291
70,2.5831
80,1.9317
90,1.7249
100,1.3295


In [None]:
# Save the fine-tuned model
trainer.save_model("./blip-semart-finetuned")
tokenizer.save_pretrained("./blip-semart-finetuned")

print("Fine-tuning complete! Model saved at './blip-semart-finetuned'")

In [None]:
# image_files_in_dir

In [None]:
# from PIL import Image
# import requests

# im_path = semart_dir+'/Images/'+ '14152-08abbe.jpg'
# image = Image.open(im_path)

In [None]:
# from accelerate.test_utils.testing import get_backend
# 
# device, _, _ = get_backend()
# inputs = processor(images=image, return_tensors="pt").to(torch.float).to(device)
# pixel_values = inputs.pixel_values.to(torch.float)
# model = model.to(device)
# device

In [None]:
# generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
# generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# print(generated_caption)