In [2]:
# params
test_valid_percentage = 30 # (test - 15, valid - 15)

train_data_percentage = 100
valid_data_percentage = 100
test_data_percentage = 100

max_target_length = 256

random_state = 77

In [3]:
import os
import re
import json
import torch
import numpy as np
import pandas as pd
from PIL import Image

In [4]:
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
os.environ["WANDB_DISABLED"] = "true"

In [5]:
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

## Initialize VisionEncoderDecoderModel

In [6]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(image_encoder_model, text_decode_model)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.9.crossattention.c_proj.weight', 'h.8.crossattention.bias', 'h.3.crossattention.c_proj.weight', 'h.2.crossattention.c_attn.weight', 'h.8.crossattention.c_attn.weight', 'h.8.ln_cross_attn.weight', 'h.6.crossattention.c_proj.weight', 'h.3.crossattention.masked_bias', 'h.1.crossattention.c_attn.weight', 'h.2.crossattention.q_attn.weight', 'h.0.crossattention.c_attn.weight', 'h.9.crossattention.c_attn.weight', 'h.7.crossattention.c_proj.bias', 'h.7.crossattention.bias', 'h.4.crossattention.bias', 'h.9.crossattention.c_proj.bias', 'h.1.crossattention.masked_bias', 'h.6.crossattention.bias', 'h.5.crossattention.bias', 'h.6.crossattention.masked_bias', 'h.3.crossattention.bias', 'h.1.crossattention.c_proj.weight', 'h.8.crossattention.c_proj.weight', 'h.7.crossattention.c_attn.weight', 'h.10.crossattention.masked_bias', 'h.11.crossattention.q_attn.weight', 'h.10.crossattention.

In [7]:
# image feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# text tokenizer
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)

In [8]:
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
tokenizer.pad_token = tokenizer.eos_token

# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [9]:
output_dir = "vit-gpt-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('vit-gpt-model\\tokenizer_config.json',
 'vit-gpt-model\\special_tokens_map.json',
 'vit-gpt-model\\vocab.json',
 'vit-gpt-model\\merges.txt',
 'vit-gpt-model\\added_tokens.json',
 'vit-gpt-model\\tokenizer.json')

## Data Loading and Preparation

In [10]:
# file paths data
image_dir = './all_data/train/radiology/images/'
data_file = './all_data/train/radiology/traindata.csv'

In [11]:
data = pd.read_csv(data_file)
data

Unnamed: 0,id,name,caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...
...,...,...,...
65445,ROCO_81819,PMC3517833_CRIM.HEMATOLOGY2012-490438.001.jpg,Initial CT abdomen with contrast showing a di...
65446,ROCO_81820,PMC5487234_rb-50-03-0190-g13.jpg,44-year-old male patient after surgical amput...
65447,ROCO_81821,PMC2974222_kjr-11-612-g001.jpg,Primary pulmonary tuberculosis in 18-year-old...
65448,ROCO_81822,PMC3532764_AJNS-7-151-g002.jpg,"MRI brain with gadolinium, coronal view, show..."


In [12]:
# Replace column name 'name' with 'image_path'
data['image_path'] = data.pop('name')

# Prepend 'image_dir' to all entries in 'image_path' column
data['image_path'] = image_dir + data['image_path']

data

Unnamed: 0,id,caption,image_path
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...
3,ROCO_00005,Panoramic radiograph after immediate loading.\n,./all_data/train/radiology/images/PMC3745845_I...
4,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...
...,...,...,...
65445,ROCO_81819,Initial CT abdomen with contrast showing a di...,./all_data/train/radiology/images/PMC3517833_C...
65446,ROCO_81820,44-year-old male patient after surgical amput...,./all_data/train/radiology/images/PMC5487234_r...
65447,ROCO_81821,Primary pulmonary tuberculosis in 18-year-old...,./all_data/train/radiology/images/PMC2974222_k...
65448,ROCO_81822,"MRI brain with gadolinium, coronal view, show...",./all_data/train/radiology/images/PMC3532764_A...


In [13]:
# for index, row in data.iterrows():
#     image_path = row['image_path']
#     if not os.path.exists(image_path):
#         data.drop(index, inplace=True)
#     else:
#         try:
#             image = Image.open(image_path)
#         except Exception:
#             data.drop(index, inplace=True)
        
# # Reset the index after dropping rows
# data.reset_index(drop=True, inplace=True)

# data

In [14]:
# LLM output
f = open('./all_data/llm_result.txt', "r")
contents = f.read()
contents = contents.replace("\n", "")
json_data = json.loads(contents)

llm_df = pd.DataFrame(json_data)

llm_df = llm_df.drop('index', axis=1)

llm_df = llm_df[llm_df['relationship'].apply(lambda x: re.search(r'\w', str(x)) is not None)]
llm_df = llm_df.reset_index(drop=True)

llm_df

In [18]:
# Filter ones that have been training by the LLM
data = data.merge(llm_df, on='id')
data

Unnamed: 0,id,caption,image_path,relationship,summary
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...,\nAI: The diagnosis is Obliteration of the lef...,The diagnosis is Obliteration of the left maxi...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
3,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
4,ROCO_00008,A 3-year-old child with visual difficulties. ...,./all_data/train/radiology/images/PMC4805615_1...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
...,...,...,...,...,...
500,ROCO_00651,Fig. 5Fluorescein angiography: early hyperflu...,./all_data/train/radiology/images/PMC5052491_J...,\nAI: For the fluorescence angiography procedu...,For the fluorescence angiography procedure (Di...
501,ROCO_00652,Transverse CT thorax image at the level of th...,./all_data/train/radiology/images/PMC4751134_j...,\nAI: The UMLS semantic types describe the dis...,The UMLS semantic types describe the disease a...
502,ROCO_00653,Computed tomography revealing right upper-lun...,./all_data/train/radiology/images/PMC5585904_1...,\nAI: Computed tomography revealing (Pathology...,Computed tomography revealing (Pathology) pneu...
503,ROCO_00654,Lateral fluoroscopic view in a 77-year-old os...,./all_data/train/radiology/images/PMC3119972_A...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...


In [19]:
for index, row in data.iterrows():
    image_path = row['image_path']
    if not os.path.exists(image_path):
        data.drop(index, inplace=True)
    else:
        try:
            image = Image.open(image_path)
        except Exception:
            data.drop(index, inplace=True)
        
# Reset the index after dropping rows
data.reset_index(drop=True, inplace=True)

data

Unnamed: 0,id,caption,image_path,relationship,summary
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...,\nAI: The diagnosis is Obliteration of the lef...,The diagnosis is Obliteration of the left maxi...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
3,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
4,ROCO_00008,A 3-year-old child with visual difficulties. ...,./all_data/train/radiology/images/PMC4805615_1...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...
...,...,...,...,...,...
500,ROCO_00651,Fig. 5Fluorescein angiography: early hyperflu...,./all_data/train/radiology/images/PMC5052491_J...,\nAI: For the fluorescence angiography procedu...,For the fluorescence angiography procedure (Di...
501,ROCO_00652,Transverse CT thorax image at the level of th...,./all_data/train/radiology/images/PMC4751134_j...,\nAI: The UMLS semantic types describe the dis...,The UMLS semantic types describe the disease a...
502,ROCO_00653,Computed tomography revealing right upper-lun...,./all_data/train/radiology/images/PMC5585904_1...,\nAI: Computed tomography revealing (Pathology...,Computed tomography revealing (Pathology) pneu...
503,ROCO_00654,Lateral fluoroscopic view in a 77-year-old os...,./all_data/train/radiology/images/PMC3119972_A...,\nAI: The UMLS semantic types describe the dia...,The UMLS semantic types describe the diagnosis...


In [20]:
from sklearn.model_selection import train_test_split

# Split data into train, test, and valid datasets
train_data, valid_test_data = train_test_split(data, test_size=test_valid_percentage/100, random_state=random_state)
valid_data, test_data = train_test_split(valid_test_data, test_size=0.5, random_state=random_state)

# Reset index
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [21]:
print("Train data shape: ", train_data.shape)
print("Valid data shape: ", valid_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape:  (353, 5)
Valid data shape:  (76, 5)
Test data shape:  (76, 5)


In [22]:
# Select n% of data
train_data = train_data.sample(frac=train_data_percentage/100, random_state=random_state)
valid_data = valid_data.sample(frac=valid_data_percentage/100, random_state=random_state)
test_data = test_data.sample(frac=test_data_percentage/100, random_state=random_state)

In [23]:
from datasets import Dataset, DatasetDict

# Convert DataFrame to Hugging Face dataset dictionary format
train_data_dict = Dataset.from_pandas(train_data)
valid_data_dict = Dataset.from_pandas(valid_data)
test_data_dict = Dataset.from_pandas(test_data)

dataset_dict = DatasetDict({
    'train': train_data_dict,
    'validation': valid_data_dict,
    'test': test_data_dict
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'caption', 'image_path', 'relationship', 'summary', '__index_level_0__'],
        num_rows: 353
    })
    validation: Dataset({
        features: ['id', 'caption', 'image_path', 'relationship', 'summary', '__index_level_0__'],
        num_rows: 76
    })
    test: Dataset({
        features: ['id', 'caption', 'image_path', 'relationship', 'summary', '__index_level_0__'],
        num_rows: 76
    })
})


In [24]:
# from PIL import Image

# # text preprocessing step
# def tokenization_fn(captions, max_target_length):
#     """Run tokenization on captions."""
#     labels = tokenizer(captions, 
#                       padding="max_length", 
#                       max_length=max_target_length).input_ids

#     return labels

# # image preprocessing step
# def preprocess_images(image_paths):
#     processed_images = []
#     for image_path in image_paths:
#         image = Image.open(image_path)
#         if image.mode != "RGB":
#             image = image.convert("RGB")
#         processed_images.append(image)
#     return processed_images

# def feature_extraction_fn(image_paths, check_image=True):
#     if check_image:
#         images = preprocess_images(image_paths)
#     else:
#         images = [Image.open(image_file) for image_file in image_paths]

#     encoder_inputs = feature_extractor(images=images, return_tensors="np")

#     return encoder_inputs.pixel_values

# # def feature_extraction_fn(image_paths, check_image=True):
# #     """
# #     Run feature extraction on images
# #     If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
# #     Otherwise, an exception will be thrown.
# #     """
# #     model_inputs = {}

# #     if check_image:
# #         images = []
# #         to_keep = []
# #         for image_file in image_paths:
# #             try:
# #                 img = Image.open(image_file)
# #                 images.append(img)
# #                 to_keep.append(True)
# #             except Exception:
# #                 to_keep.append(False)
# #     else:
# #         images = [Image.open(image_file) for image_file in image_paths]

# #     encoder_inputs = feature_extractor(images=images, return_tensors="np")

# #     return encoder_inputs.pixel_values

# def preprocess_fn(examples, max_target_length, check_image = True):
#     """Run tokenization + image feature extraction"""
#     image_paths = examples['image_path']
#     captions = examples['caption']    
    
#     model_inputs = {}
#     # This contains image path column
#     model_inputs['labels'] = tokenization_fn(captions, max_target_length)
#     model_inputs['pixel_values'] = feature_extraction_fn(image_paths, check_image=check_image)

#     return model_inputs

In [25]:
class ImageCaptioningDataset(torch.utils.data.Dataset):
    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        image_path = self.ds[self.ds_type]['image_path'][idx]
        caption = self.ds[self.ds_type]['caption'][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(caption, self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(image_path)
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])
    
    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Run tokenization on caption."""
        labels = tokenizer(caption, 
                          padding="max_length", 
                          max_length=max_target_length,
                          truncation=True).input_ids

        return labels

    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]
#         except Exception:
#             print(Exception)
#             return np.empty((0,))

In [26]:
# processed_dataset = dataset_dict.map(
#     function=preprocess_fn,
#     batched=True,
#     fn_kwargs={"max_target_length": 128},
#     remove_columns=dataset_dict['train'].column_names
# )

In [27]:
# processed_dataset

In [28]:
train_ds = ImageCaptioningDataset(dataset_dict, 'train', max_target_length)
eval_ds = ImageCaptioningDataset(dataset_dict, 'validation', max_target_length)
test_ds = ImageCaptioningDataset(dataset_dict, 'test', max_target_length)

## Define seq2seq training argumentsPermalink

In [29]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Define metric

In [30]:
import evaluate
metric = evaluate.load("rouge")

In [31]:
import numpy as np

ignore_pad_token_for_loss = True

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

## Training

In [32]:
from transformers import default_data_collator

# # instantiate trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     tokenizer=feature_extractor,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=processed_dataset['train'],
#     eval_dataset=processed_dataset['validation'],
#     data_collator=default_data_collator,
# )

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)

#6799

In [33]:
trainer.train()

***** Running training *****
  Num examples = 353
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 267
  Number of trainable parameters = 239195904


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.428813,12.8102,1.199,9.5193,10.039,18.763158
2,No log,0.42134,19.0445,4.455,16.463,16.7536,18.657895
3,No log,0.422779,18.7961,3.8957,16.4756,16.6736,18.486842


***** Running Evaluation *****
  Num examples = 76
  Batch size = 4
***** Running Evaluation *****
  Num examples = 76
  Batch size = 4
***** Running Evaluation *****
  Num examples = 76
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=267, training_loss=0.5580083725604226, metrics={'train_runtime': 248.5096, 'train_samples_per_second': 4.261, 'train_steps_per_second': 1.074, 'total_flos': 1.9111126069621555e+17, 'train_loss': 0.5580083725604226, 'epoch': 3.0})

In [30]:
#trainer.train("./image-captioning-output\checkpoint-6500")

In [34]:
trainer.save_model("./new_image-captioning-output-without-llm")
tokenizer.save_pretrained("./new_image-captioning-output-without-llm")

Saving model checkpoint to ./new_image-captioning-output-without-llm
Configuration saved in ./new_image-captioning-output-without-llm\config.json
Model weights saved in ./new_image-captioning-output-without-llm\pytorch_model.bin
Feature extractor saved in ./new_image-captioning-output-without-llm\preprocessor_config.json
tokenizer config file saved in ./new_image-captioning-output-without-llm\tokenizer_config.json
Special tokens file saved in ./new_image-captioning-output-without-llm\special_tokens_map.json


('./new_image-captioning-output-without-llm\\tokenizer_config.json',
 './new_image-captioning-output-without-llm\\special_tokens_map.json',
 './new_image-captioning-output-without-llm\\vocab.json',
 './new_image-captioning-output-without-llm\\merges.txt',
 './new_image-captioning-output-without-llm\\added_tokens.json',
 './new_image-captioning-output-without-llm\\tokenizer.json')

In [35]:
# Get predictions from the model
predictions = trainer.predict(test_ds)

# Process and evaluate the predictions
preds = predictions.predictions
labels = predictions.label_ids

# Post-process the predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Calculate evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
bleu_scores = []

# Print the actual captions and predicted captions
for actual_caption, predicted_caption in zip(decoded_labels, decoded_preds):
    actual_caption = actual_caption.split("</s>")[0]
    predicted_caption = predicted_caption.split("</s>")[0]
    bleu_score = sentence_bleu([actual_caption.split()], predicted_caption.split())
    bleu_scores.append(bleu_score)
    print("Actual Caption:", actual_caption)
    print("Predicted Caption:", predicted_caption)
    print("Blue score: ", bleu_score)
    print("--------------")

average_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print average BLEU score
print("Average BLEU score:", average_bleu_score)

***** Running Prediction *****
  Num examples = 76
  Batch size = 4


Actual Caption:  Diffusion-weighted image shows acute infarct in left parieto-occipital area

Predicted Caption:  A CT scan of the abdomen shows a large mass of fat in the right abdomen.

Blue score:  1.0832677820940877e-231
--------------
Actual Caption:  Transverse view of lung using computed tomography. Leukemic infiltration is seen.

Predicted Caption:  A CT scan of the abdomen shows a large mass of lymph nodes in the right abdomen.

Blue score:  8.972141065609098e-232
--------------
Actual Caption:  Facial skull radiograph. Note sclerosis of the orbits and sphenoid bones resulting in “Harlequin mask appearance’’

Predicted Caption:  A CT scan of the abdomen shows a large mass of fat and a large amount of fluid in
Blue score:  4.905470711005226e-155
--------------
Actual Caption:  MRI showing that the mass was greatly enhanced, with a clear portion in the nearby tissues.MRI showing that the mass was greatly enhanced, with a clear portion in the nearby tissues

Predicted Caption:  A

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## With knowledge

In [36]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(image_encoder_model, text_decode_model)

# image feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# text tokenizer
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)

# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
tokenizer.pad_token = tokenizer.eos_token

# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

loading configuration file config.json from cache at C:\Users\ACER/.cache\huggingface\hub\models--google--vit-base-patch16-224-in21k\snapshots\7cbdb7ee3a6bcdf99dae654893f66519c480a0f8\config.json
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.24.0"
}

loading weights file pytorch_model.bin from cache at C:\Users\ACER/.cache\huggingface\hub\models--google--vit-base-patch16-224-in21k\snapshots\7cbdb7ee3a6bcdf99dae654893f66519c480a0f8\pytorch_model.bin
All model checkpoint weights were used when initializing 

loading file vocab.json from cache at C:\Users\ACER/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\vocab.json
loading file merges.txt from cache at C:\Users\ACER/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\merges.txt
loading file tokenizer.json from cache at C:\Users\ACER/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\ACER/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_toke

In [37]:
class ImageCaptioningDatasetWithKnowledge(torch.utils.data.Dataset):
    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        image_path = self.ds[self.ds_type]['image_path'][idx]
        caption = self.ds[self.ds_type]['caption'][idx]
        summary = self.ds[self.ds_type]['summary'][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(f'{caption} </s> {summary}', self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(image_path)
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])
    
    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Run tokenization on caption."""
        labels = tokenizer(caption, 
                          padding="max_length", 
                          max_length=max_target_length,
                          truncation=True).input_ids

        return labels

    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]
#         except Exception:
#             print(Exception)
#             return np.empty((0,))

In [38]:
train_ds = ImageCaptioningDatasetWithKnowledge(dataset_dict, 'train', max_target_length)
eval_ds = ImageCaptioningDatasetWithKnowledge(dataset_dict, 'validation', max_target_length)
test_ds = ImageCaptioningDatasetWithKnowledge(dataset_dict, 'test', max_target_length)

In [39]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)

from transformers import default_data_collator

# # instantiate trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     tokenizer=feature_extractor,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=processed_dataset['train'],
#     eval_dataset=processed_dataset['validation'],
#     data_collator=default_data_collator,
# )

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 353
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 267
  Number of trainable parameters = 239195904


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.72089,30.0119,22.6904,29.2407,29.7066,19.0
2,No log,0.692185,30.0214,20.9597,27.4251,29.3785,19.0
3,No log,0.689636,29.8098,21.0805,27.6826,29.2863,19.0


***** Running Evaluation *****
  Num examples = 76
  Batch size = 4
***** Running Evaluation *****
  Num examples = 76
  Batch size = 4
***** Running Evaluation *****
  Num examples = 76
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=267, training_loss=0.8971670200762231, metrics={'train_runtime': 244.401, 'train_samples_per_second': 4.333, 'train_steps_per_second': 1.092, 'total_flos': 1.9111126069621555e+17, 'train_loss': 0.8971670200762231, 'epoch': 3.0})

In [40]:
trainer.save_model("./new_image-captioning-output-with-llm")
tokenizer.save_pretrained("./new_image-captioning-output-with-llm")

Saving model checkpoint to ./image-captioning-output-with-llm
Configuration saved in ./image-captioning-output-with-llm\config.json
Model weights saved in ./image-captioning-output-with-llm\pytorch_model.bin
Feature extractor saved in ./image-captioning-output-with-llm\preprocessor_config.json
tokenizer config file saved in ./image-captioning-output-with-llm\tokenizer_config.json
Special tokens file saved in ./image-captioning-output-with-llm\special_tokens_map.json


('./image-captioning-output-with-llm\\tokenizer_config.json',
 './image-captioning-output-with-llm\\special_tokens_map.json',
 './image-captioning-output-with-llm\\vocab.json',
 './image-captioning-output-with-llm\\merges.txt',
 './image-captioning-output-with-llm\\added_tokens.json',
 './image-captioning-output-with-llm\\tokenizer.json')

In [41]:
# Get predictions from the model
predictions = trainer.predict(test_ds)

# Process and evaluate the predictions
preds = predictions.predictions
labels = predictions.label_ids

# Post-process the predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Calculate evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
bleu_scores = []

# Print the actual captions and predicted captions
for actual_caption, predicted_caption in zip(decoded_labels, decoded_preds):
#     actual_caption = actual_caption.split("</s>")[0]
#     predicted_caption = predicted_caption.split("</s>")[0]
    bleu_score = sentence_bleu([actual_caption.split()], predicted_caption.split())
    bleu_scores.append(bleu_score)
    print("Actual Caption:", actual_caption)
    print("Predicted Caption:", predicted_caption)
    print("Blue score: ", bleu_score)
    print("--------------")

average_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print average BLEU score
print("Average BLEU score:", average_bleu_score)

***** Running Prediction *****
  Num examples = 76
  Batch size = 4


Actual Caption:  Diffusion-weighted image shows acute infarct in left parieto-occipital area
 </s> The UMLS semantic types describe the diagnosis for diffusion MRI (Intellectual Product) showing Acute Infarction in left parieto-occipital area.
Predicted Caption:  The UMLS semantic types describe the diagnosis for MRI procedure (Diagnostic Procedure) using
Blue score:  0.16825572285877524
--------------
Actual Caption:  Transverse view of lung using computed tomography. Leukemic infiltration is seen.
 </s> Transverse view of lung using computed tomography (Diagnostic Procedure) shows Leukemic infiltration seen (Sign or Symptom). The anatomical location affected by the disease is Lung (Body Part, Organ, or Organ Component).
Predicted Caption:  The UMLS semantic types describe the diagnosis for the diagnosis for MRI procedure (Diagnostic
Blue score:  1.5618762825312226e-232
--------------
Actual Caption:  Facial skull radiograph. Note sclerosis of the orbits and sphenoid bones resulting i

# Computing BLEU and ROUGE score

In [39]:
# import evaluate


# bleu = evaluate.load("bleu")
# rouge = evaluate.load("rouge")


# def calculate_bleu_and_rouge(reference: str, hypothesis: str):
#     """
#     Inputs: Reference -> Target caption, Hypothesis -> Generated caption
#     Outputs: Dictionary of bleu score and rouge score
#     Description: This function computes the bleu score as well as rouge1, rouge2, rougeL, and rougeLsum
#     """
    
#     bleu_score = bleu.compute(predictions=[hypothesis], references=[reference])
#     rouge_score = rouge.compute(
#         predictions=[hypothesis], references=[reference]
#     )

#     return {"bleu_score": bleu_score, "rouge_score": rouge_score}