In [1]:
import torch
from PIL import Image
from datasets import Dataset
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
from torchvision import transforms


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "nlpconnect/vit-gpt2-image-captioning"

In [3]:
model = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

In [67]:
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
    # output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
    output_ids = model.generate(pixel_values, max_length=16)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption
# def generate_caption(image_path):
#     image = Image.open(image_path).convert("RGB")
#     pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

#     # Set decoder_start_token_id if it's missing
#     if model.config.decoder_start_token_id is None:
#         model.config.decoder_start_token_id = tokenizer.bos_token_id  # or eos_token_id if your tokenizer doesn't have bos

#     # Use greedy decoding or sampling
#     output_ids = model.generate(pixel_values, max_length=16, do_sample=False)

#     caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     return caption


In [8]:

import os
results = []
print("Generating captions for artifact images...\n")
IMAGE_FOLDER = "artifact_images"
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".jpg", ".jpeg", ".png")):
        full_path = os.path.join(IMAGE_FOLDER, filename)
        print(f"Processing {full_path}")
        caption = generate_caption(full_path)
        results.append((filename, caption))
        print(f"{filename} => {caption}")
        


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating captions for artifact images...

Processing artifact_images/stone_axe.png


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `Cache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.


stone_axe.png => a statue of a man with a sword in his hand 
Processing artifact_images/Hieroglph_wall.jpg
Hieroglph_wall.jpg => a series of photos of a variety of animals 
Processing artifact_images/flint_knife_stone_age.png
flint_knife_stone_age.png => a black and white photo of a knife 
Processing artifact_images/mjollnir-6074194_960_720.jpg
mjollnir-6074194_960_720.jpg => a small toy elephant with a bow on it's head 
Processing artifact_images/Samartian-Persian_necklace_and_amulet.png
Samartian-Persian_necklace_and_amulet.png => a large gold and black clock with a face on it 
Processing artifact_images/69bcabd8-5ad1-42fe-ae74-7032455ecfff.jpg
69bcabd8-5ad1-42fe-ae74-7032455ecfff.jpg => a black and white object with a red stripe 
Processing artifact_images/Birckala_1017_spoon.jpg
Birckala_1017_spoon.jpg => a small white object with a black and white pattern 
Processing artifact_images/museum-7995207_1280.jpg
museum-7995207_1280.jpg => a vase with a flower on top of it 
Processing ar

In [9]:
len(results)

13

In [10]:
import pandas as pd
CSV_OUTPUT = "artifact_captions.csv"
df = pd.DataFrame(results, columns=["filename", "generated_caption"])
df["critique_notes"] = ""  # Leave blank for manual review
df.to_csv(CSV_OUTPUT, index=False)

print(f"\nCaptions saved to '{CSV_OUTPUT}'")



Captions saved to 'artifact_captions.csv'


In [11]:
df = pd.read_csv(CSV_OUTPUT)

In [12]:
df.head(10)  # Display the first 10 rows of the DataFrame

Unnamed: 0,filename,generated_caption,critique_notes
0,stone_axe.png,a statue of a man with a sword in his hand,
1,Hieroglph_wall.jpg,a series of photos of a variety of animals,
2,flint_knife_stone_age.png,a black and white photo of a knife,
3,mjollnir-6074194_960_720.jpg,a small toy elephant with a bow on it's head,
4,Samartian-Persian_necklace_and_amulet.png,a large gold and black clock with a face on it,
5,69bcabd8-5ad1-42fe-ae74-7032455ecfff.jpg,a black and white object with a red stripe,
6,Birckala_1017_spoon.jpg,a small white object with a black and white pa...,
7,museum-7995207_1280.jpg,a vase with a flower on top of it,
8,images.jpeg,a black and white vase with a white face,
9,griffin-171407_640.jpg,a statue of a man sitting on top of a table,


## fine tune on manual corrected data

In [13]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
)

In [38]:
df = pd.read_csv("manually_observed.csv")
df = df.dropna(subset=["filename", "critique_notes"])
df["filename"] = df["filename"].astype(str).str.strip()
df["critique_notes"] = df["critique_notes"].astype(str).str.strip()
df = df[df["critique_notes"] != ""]  # Remove empty captions
df.head()

Unnamed: 0,filename,generated_caption,critique_notes
0,stone_axe.png,an ancient stone axe tool used in the stone age,it's an axe from the stone age
1,Hieroglph_wall.jpg,ancient Egyptian hieroglyphs carved into a sto...,Egyptian Hieroglyph wall
2,flint_knife_stone_age.png,a primitive flint knife used in prehistoric times,stone age flint knife
3,mjollnir-6074194_960_720.jpg,a Norse artifact shaped like Thor's hammer,it's Thor's hammer
4,Samartian-Persian_necklace_and_amulet.png,a Persian necklace and amulet with intricate d...,Persian necklace and amulet


In [39]:
class ArtifactDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor, tokenizer, max_length=64):
        self.df = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row["filename"])
        image = Image.open(image_path).convert("RGB")

        caption = row["critique_notes"]
        if not isinstance(caption, str):
            caption = str(caption)
        caption = caption.strip() or "an artifact"

        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze()

        labels = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).input_ids.squeeze()

        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in loss

        return {"pixel_values": pixel_values, "labels": labels}

In [40]:

model_name = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Enable decoder start/pad token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Set generation length
model.config.max_length = 64
model.config.num_beams = 4

In [41]:
image_dir = "./artifact_images"  # Directory containing artifact images
dataset = ArtifactDataset(df, image_dir, processor, tokenizer)

training_args = Seq2SeqTrainingArguments(
    output_dir="./vit_gpt2_finetuned_artifacts",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    num_train_epochs=5,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available()
)




In [42]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
    data_collator=default_data_collator
)


  trainer = Seq2SeqTrainer(


In [43]:

trainer.train()

Step,Training Loss




TrainOutput(global_step=20, training_loss=5.550984191894531, metrics={'train_runtime': 64.5794, 'train_samples_per_second': 1.007, 'train_steps_per_second': 0.31, 'total_flos': 1.173015292280832e+16, 'train_loss': 5.550984191894531, 'epoch': 5.0})

In [44]:
model.save_pretrained("./vit_gpt2_finetuned_artifacts_2")
tokenizer.save_pretrained("./vit_gpt2_finetuned_artifacts_2")
processor.save_pretrained("./vit_gpt2_finetuned_artifacts_2")


['./vit_gpt2_finetuned_artifacts_2/preprocessor_config.json']

In [76]:
# Load from local path
model_path = "./vit_gpt2_finetuned_artifacts_2"

model = VisionEncoderDecoderModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = ViTImageProcessor.from_pretrained(model_path)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model = VisionEncoderDecoderModel.from_pretrained(model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
processor = ViTImageProcessor.from_pretrained(model_path, local_files_only=True)


In [79]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the model and processor
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    output_ids = model.generate(
    pixel_values,
    max_length=32,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    repetition_penalty=1.5,
    no_repeat_ngram_size=3,
    decoder_start_token_id=model.config.decoder_start_token_id
)

    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption


In [80]:
image_path = "/home/hydra/Desktop/Skill_score/skillscore_internship/task2/artifact_images/1024px-The_Curmsun_Disc_-_Obverse.png"  # Replace with your image file path
caption = generate_caption(image_path)
print("Caption:", caption)

Caption: a wooden table topped with a circle shaped food item  edited from scrapbooks, including several pieces of animal figurines and plates filled to the brim , as


In [81]:
IMAGE_FOLDER = "artifact_images"
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".jpg", ".jpeg", ".png")):
        full_path = os.path.join(IMAGE_FOLDER, filename)
        print(f"Processing {full_path}")
        caption = generate_caption(full_path)
        # break
        print(caption)

Processing artifact_images/stone_axe.png
a carved headstone with a lion figure on it young in the saddle wearing a cowboy hat and necktie while kneeling over top of it next to some
Processing artifact_images/Hieroglph_wall.jpg
a wall of photos showing different colored tiles 
very much like an english cartoon character with black and white stripes in front, a monkey holding two scissors next
Processing artifact_images/flint_knife_stone_age.png
a black knife stuck in a bone and orange string guy
Processing artifact_images/mjollnir-6074194_960_720.jpg
a rusted gold guitar sitting on top of a table large sculpture in the middle of the road, with large pieces of glass around it and metal sp
Processing artifact_images/Samartian-Persian_necklace_and_amulet.png
a fake skull on the back of a antique urn in front oe clock face with white background and an arrow pointing down to be fifty five twenty six
Processing artifact_images/69bcabd8-5ad1-42fe-ae74-7032455ecfff.jpg
a bright orange object at