In [1]:
import cv2
import json, os
import glob
import numpy as np

In [2]:
def create_captions_map(datadir):
    captions = list()
    image_files = list()
    imagedir = datadir
    for file_type in (".png", ".jpg"):
        image_files.extend(glob.glob(os.path.join(imagedir, "**/*" + file_type), recursive=True))
    for each_image_file in image_files:
        caption_map = {"file_name":os.path.basename(each_image_file)}
        filebasename, _ = os.path.splitext(each_image_file)
        label_file_name = filebasename.replace("IMAGES", "TEXT_LABELS") + ".gui"
        with open(label_file_name) as f:
            caption_map["text"] = f.read()
        captions.append(caption_map)
    return captions

In [3]:
root = "../D1/IMAGES/"
captions = create_captions_map(root)

In [4]:
with open(root + "metadata.jsonl", 'w') as f:
    for item in captions:
        f.write(json.dumps(item) + "\n")

In [5]:
from datasets import load_dataset 

dataset = load_dataset("imagefolder", data_dir=root, split="train")

Resolving data files:   0%|          | 0/301 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to /Users/ritesh/.cache/huggingface/datasets/imagefolder/default-ce6e299874604f3b/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/301 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /Users/ritesh/.cache/huggingface/datasets/imagefolder/default-ce6e299874604f3b/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


In [6]:
dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 300
})

# Create PyTorch Dataset

In [7]:
from torch.utils.data import Dataset

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        encoding = self.processor(images=item["image"], text=item["text"], padding="max_length", return_tensors="pt")

        # remove batch dimension
        encoding = {k:v.squeeze() for k,v in encoding.items()}

        return encoding

In [8]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/git-base")

2023-04-16 16:56:21.492409: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)okenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
train_dataset = ImageCaptioningDataset(dataset, processor)

In [10]:
item = train_dataset[0]
for k,v in item.items():
  print(k,v.shape)

input_ids torch.Size([512])
attention_mask torch.Size([512])
pixel_values torch.Size([3, 224, 224])


# Create PyTorch DataLoader

In [11]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2)

In [13]:
batch = next(iter(train_dataloader))
for k,v in batch.items():
  print(k,v.shape)

input_ids torch.Size([2, 512])
attention_mask torch.Size([2, 512])
pixel_values torch.Size([2, 3, 224, 224])


In [15]:
from PIL import Image
import numpy as np

MEAN = np.array([123.675, 116.280, 103.530]) / 255
STD = np.array([58.395, 57.120, 57.375]) / 255

unnormalized_image = (batch["pixel_values"][0].numpy() * np.array(STD)[:, None, None]) + np.array(MEAN)[:, None, None]
unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)

# Define model

In [16]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

# Dummy forward pass

In [17]:
outputs = model(input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                pixel_values=batch["pixel_values"],
                labels=batch["input_ids"])
outputs.loss

tensor(11.0119, grad_fn=<NllLossBackward0>)

# Train the model

In [20]:
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()

for epoch in range(1):
  print("Epoch:", epoch)
  for idx, batch in enumerate(train_dataloader):
    input_ids = batch.pop("input_ids").to(device)
    pixel_values = batch.pop("pixel_values").to(device)

    outputs = model(input_ids=input_ids,
                    pixel_values=pixel_values,
                    labels=input_ids)
    
    loss = outputs.loss

    print("Loss:", loss.item(), end="\t")

    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

Epoch: 0
Loss: 5.870874881744385	Loss: 6.350809574127197	Loss: 5.763431549072266	Loss: 5.4379801750183105	Loss: 5.4998626708984375	Loss: 5.20180606842041	Loss: 5.197664737701416	Loss: 5.492949962615967	Loss: 5.431489944458008	Loss: 4.7418107986450195	Loss: 4.775285243988037	Loss: 4.613325595855713	Loss: 4.9958367347717285	Loss: 4.673680782318115	Loss: 4.680605888366699	Loss: 4.9198174476623535	Loss: 4.28180456161499	Loss: 4.108408451080322	Loss: 4.286749362945557	Loss: 3.997328996658325	Loss: 3.8612287044525146	Loss: 3.7262680530548096	Loss: 3.906266450881958	Loss: 3.7782742977142334	Loss: 3.741389274597168	Loss: 3.4223849773406982	Loss: 3.821803569793701	Loss: 3.2485148906707764	Loss: 3.22570538520813	Loss: 3.149899959564209	Loss: 2.985762596130371	Loss: 3.10326886177063	Loss: 3.145672082901001	Loss: 2.7723686695098877	Loss: 2.684929609298706	Loss: 2.649779796600342	Loss: 2.489927291870117	Loss: 2.4839365482330322	Loss: 2.327587604522705	Loss: 2.2354230880737305	Loss: 2.18631935119628

# Inference

In [21]:
# prepare image for the model
example = dataset[0]
image = example["image"]
width, height = image.size
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)

header { btn - active, btn - inactive, btn - inactive, btn - inactive, btn - inactive } row { quadruple { small - title, text, btn - orange } quadruple { small
