In [1]:
# params
test_valid_percentage = 30 # (test - 15, valid - 15)

train_data_percentage = 60
valid_data_percentage = 60
test_data_percentage = 60

max_target_length = 256

In [2]:
import os
import re
import json
import torch
import numpy as np
import pandas as pd
from PIL import Image

In [3]:
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
os.environ["WANDB_DISABLED"] = "true"

In [4]:
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

## Initialize VisionEncoderDecoderModelPermalink

In [5]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(image_encoder_model, text_decode_model)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.8.crossattention.bias', 'h.6.crossattention.q_attn.weight', 'h.2.crossattention.masked_bias', 'h.5.crossattention.c_proj.bias', 'h.10.crossattention.c_attn.weight', 'h.5.crossattention.c_proj.weight', 'h.4.crossattention.masked_bias', 'h.5.crossattention.bias', 'h.11.crossattention.masked_bias', 'h.5.ln_cross_attn.weight', 'h.8.crossattention.c_proj.weight', 'h.9.crossattention.c_attn.weight', 'h.9.crossattention.c_proj.weight', 'h.0.ln_cross_attn.weight', 'h.4.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.weight', 'h.4.ln_cross_attn.weight', 'h.7.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.weight', 'h.7.crossattention.q_attn.weight', 'h.2.crossattention.bias', 'h.3.ln_cross_attn.weight', 'h.7.ln_cross_attn.weight', 'h.6.crossattention.c_attn.weight', 'h.1.ln_cross_attn.weight', 'h.11.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.bia

In [6]:
# image feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# text tokenizer
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)

In [7]:
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
tokenizer.pad_token = tokenizer.eos_token

# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
output_dir = "vit-gpt-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('vit-gpt-model\\tokenizer_config.json',
 'vit-gpt-model\\special_tokens_map.json',
 'vit-gpt-model\\vocab.json',
 'vit-gpt-model\\merges.txt',
 'vit-gpt-model\\added_tokens.json',
 'vit-gpt-model\\tokenizer.json')

## Data Loading and Preparation

In [9]:
# file paths data
image_dir = './all_data/train/radiology/images/'
data_file = './all_data/train/radiology/traindata.csv'

In [10]:
data = pd.read_csv(data_file)
data

Unnamed: 0,id,name,caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...
...,...,...,...
65445,ROCO_81819,PMC3517833_CRIM.HEMATOLOGY2012-490438.001.jpg,Initial CT abdomen with contrast showing a di...
65446,ROCO_81820,PMC5487234_rb-50-03-0190-g13.jpg,44-year-old male patient after surgical amput...
65447,ROCO_81821,PMC2974222_kjr-11-612-g001.jpg,Primary pulmonary tuberculosis in 18-year-old...
65448,ROCO_81822,PMC3532764_AJNS-7-151-g002.jpg,"MRI brain with gadolinium, coronal view, show..."


In [11]:
# Replace column name 'name' with 'image_path'
data['image_path'] = data.pop('name')

# Prepend 'image_dir' to all entries in 'image_path' column
data['image_path'] = image_dir + data['image_path']

data

Unnamed: 0,id,caption,image_path
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...
3,ROCO_00005,Panoramic radiograph after immediate loading.\n,./all_data/train/radiology/images/PMC3745845_I...
4,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...
...,...,...,...
65445,ROCO_81819,Initial CT abdomen with contrast showing a di...,./all_data/train/radiology/images/PMC3517833_C...
65446,ROCO_81820,44-year-old male patient after surgical amput...,./all_data/train/radiology/images/PMC5487234_r...
65447,ROCO_81821,Primary pulmonary tuberculosis in 18-year-old...,./all_data/train/radiology/images/PMC2974222_k...
65448,ROCO_81822,"MRI brain with gadolinium, coronal view, show...",./all_data/train/radiology/images/PMC3532764_A...


In [12]:
for index, row in data.iterrows():
    image_path = row['image_path']
    if not os.path.exists(image_path):
        data.drop(index, inplace=True)
    else:
        try:
            image = Image.open(image_path)
        except Exception:
            data.drop(index, inplace=True)
        
# Reset the index after dropping rows
data.reset_index(drop=True, inplace=True)

data

Unnamed: 0,id,caption,image_path
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...
3,ROCO_00005,Panoramic radiograph after immediate loading.\n,./all_data/train/radiology/images/PMC3745845_I...
4,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...
...,...,...,...
65414,ROCO_81819,Initial CT abdomen with contrast showing a di...,./all_data/train/radiology/images/PMC3517833_C...
65415,ROCO_81820,44-year-old male patient after surgical amput...,./all_data/train/radiology/images/PMC5487234_r...
65416,ROCO_81821,Primary pulmonary tuberculosis in 18-year-old...,./all_data/train/radiology/images/PMC2974222_k...
65417,ROCO_81822,"MRI brain with gadolinium, coronal view, show...",./all_data/train/radiology/images/PMC3532764_A...


In [13]:
# # LLM output
# f = open('./all_data/llm_result.txt', "r")
# contents = f.read()
# contents = contents.replace("\n", "")
# json_data = json.loads(contents)

# llm_df = pd.DataFrame(json_data)

# llm_df = llm_df.drop('index', axis=1)

# llm_df = llm_df[llm_df['relationship'].apply(lambda x: re.search(r'\w', str(x)) is not None)]
# llm_df = llm_df.reset_index(drop=True)

# llm_df

In [14]:
# # Filter ones that have been training by the LLM
# data = data.merge(llm_df, on='id')
# data

In [15]:
from sklearn.model_selection import train_test_split

# Split data into train, test, and valid datasets
train_data, valid_test_data = train_test_split(data, test_size=test_valid_percentage/100, random_state=42)
valid_data, test_data = train_test_split(valid_test_data, test_size=0.5, random_state=42)

# Reset index
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [16]:
print("Train data shape: ", train_data.shape)
print("Valid data shape: ", valid_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape:  (45793, 3)
Valid data shape:  (9813, 3)
Test data shape:  (9813, 3)


In [17]:
# Select n% of data
train_data = train_data.sample(frac=train_data_percentage/100, random_state=42)
valid_data = valid_data.sample(frac=valid_data_percentage/100, random_state=42)
test_data = test_data.sample(frac=test_data_percentage/100, random_state=42)

In [18]:
from datasets import Dataset, DatasetDict

# Convert DataFrame to Hugging Face dataset dictionary format
train_data_dict = Dataset.from_pandas(train_data)
valid_data_dict = Dataset.from_pandas(valid_data)
test_data_dict = Dataset.from_pandas(test_data)

dataset_dict = DatasetDict({
    'train': train_data_dict,
    'validation': valid_data_dict,
    'test': test_data_dict
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 27476
    })
    validation: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 5888
    })
    test: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 5888
    })
})


In [19]:
# from PIL import Image

# # text preprocessing step
# def tokenization_fn(captions, max_target_length):
#     """Run tokenization on captions."""
#     labels = tokenizer(captions, 
#                       padding="max_length", 
#                       max_length=max_target_length).input_ids

#     return labels

# # image preprocessing step
# def preprocess_images(image_paths):
#     processed_images = []
#     for image_path in image_paths:
#         image = Image.open(image_path)
#         if image.mode != "RGB":
#             image = image.convert("RGB")
#         processed_images.append(image)
#     return processed_images

# def feature_extraction_fn(image_paths, check_image=True):
#     if check_image:
#         images = preprocess_images(image_paths)
#     else:
#         images = [Image.open(image_file) for image_file in image_paths]

#     encoder_inputs = feature_extractor(images=images, return_tensors="np")

#     return encoder_inputs.pixel_values

# # def feature_extraction_fn(image_paths, check_image=True):
# #     """
# #     Run feature extraction on images
# #     If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
# #     Otherwise, an exception will be thrown.
# #     """
# #     model_inputs = {}

# #     if check_image:
# #         images = []
# #         to_keep = []
# #         for image_file in image_paths:
# #             try:
# #                 img = Image.open(image_file)
# #                 images.append(img)
# #                 to_keep.append(True)
# #             except Exception:
# #                 to_keep.append(False)
# #     else:
# #         images = [Image.open(image_file) for image_file in image_paths]

# #     encoder_inputs = feature_extractor(images=images, return_tensors="np")

# #     return encoder_inputs.pixel_values

# def preprocess_fn(examples, max_target_length, check_image = True):
#     """Run tokenization + image feature extraction"""
#     image_paths = examples['image_path']
#     captions = examples['caption']    
    
#     model_inputs = {}
#     # This contains image path column
#     model_inputs['labels'] = tokenization_fn(captions, max_target_length)
#     model_inputs['pixel_values'] = feature_extraction_fn(image_paths, check_image=check_image)

#     return model_inputs

In [20]:
class ImageCapatioingDataset(torch.utils.data.Dataset):
    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        image_path = self.ds[self.ds_type]['image_path'][idx]
        caption = self.ds[self.ds_type]['caption'][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(caption, self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(image_path)
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])
    
    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Run tokenization on caption."""
        labels = tokenizer(caption, 
                          padding="max_length", 
                          max_length=max_target_length,
                          truncation=True).input_ids

        return labels

    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]
#         except Exception:
#             print(Exception)
#             return np.empty((0,))

In [21]:
# processed_dataset = dataset_dict.map(
#     function=preprocess_fn,
#     batched=True,
#     fn_kwargs={"max_target_length": 128},
#     remove_columns=dataset_dict['train'].column_names
# )

In [22]:
# processed_dataset

In [23]:
train_ds = ImageCapatioingDataset(dataset_dict, 'train', max_target_length)
eval_ds = ImageCapatioingDataset(dataset_dict, 'validation', max_target_length)
test_ds = ImageCapatioingDataset(dataset_dict, 'test', max_target_length)

## Define seq2seq training argumentsPermalink

In [24]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Define metric

In [25]:
import evaluate
metric = evaluate.load("rouge")

In [26]:
import numpy as np

ignore_pad_token_for_loss = True

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

## Training

In [27]:
from transformers import default_data_collator

# # instantiate trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     tokenizer=feature_extractor,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=processed_dataset['train'],
#     eval_dataset=processed_dataset['validation'],
#     data_collator=default_data_collator,
# )

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)

#6799

In [28]:
#trainer.train()

In [29]:
trainer.train("./image-captioning-output\checkpoint-6500")

Loading model from ./image-captioning-output\checkpoint-6500.
***** Running training *****
  Num examples = 27476
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 20607
  Number of trainable parameters = 239195904
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 6500
  Will skip the first 0 epochs then the first 6500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/6500 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4021,0.379527,20.9135,5.6592,18.1773,18.5655,17.566576
2,0.3657,0.362218,21.6791,6.6571,19.3032,19.5949,16.492867


***** Running Evaluation *****
  Num examples = 5888
  Batch size = 4
Saving model checkpoint to ./image-captioning-output\checkpoint-7000
Configuration saved in ./image-captioning-output\checkpoint-7000\config.json
Model weights saved in ./image-captioning-output\checkpoint-7000\pytorch_model.bin
Feature extractor saved in ./image-captioning-output\checkpoint-7000\preprocessor_config.json
Saving model checkpoint to ./image-captioning-output\checkpoint-7500
Configuration saved in ./image-captioning-output\checkpoint-7500\config.json
Model weights saved in ./image-captioning-output\checkpoint-7500\pytorch_model.bin
Feature extractor saved in ./image-captioning-output\checkpoint-7500\preprocessor_config.json
Saving model checkpoint to ./image-captioning-output\checkpoint-8000
Configuration saved in ./image-captioning-output\checkpoint-8000\config.json
Model weights saved in ./image-captioning-output\checkpoint-8000\pytorch_model.bin
Feature extractor saved in ./image-captioning-output\ch

RuntimeError: [enforce fail at ..\caffe2\serialize\inline_container.cc:337] . unexpected pos 730413952 vs 730413840

In [None]:
`

In [None]:
trainer.save_model("./image-captioning-output")
tokenizer.save_pretrained("./image-captioning-output")

In [30]:
# Get predictions from the model
predictions = trainer.predict(test_ds)

# Process and evaluate the predictions
preds = predictions.predictions
labels = predictions.label_ids

# Post-process the predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Calculate evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
bleu_scores = []

# Print the actual captions and predicted captions
i = 0
for actual_caption, predicted_caption in zip(decoded_labels, decoded_preds):
    i += 1
    bleu_score = sentence_bleu([actual_caption.split()], predicted_caption.split())
    bleu_scores.append(bleu_score)
    if i % 20 == 0:
        print("Actual Caption:", actual_caption)
        print("Predicted Caption:", predicted_caption)
        print("Blue score: ", bleu_score)
        print("--------------")

average_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print average BLEU score
print("Average BLEU score:", average_bleu_score)

***** Running Prediction *****
  Num examples = 5888
  Batch size = 4
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Actual Caption:  Vascular anatomic variants. The coronal multiplanar reformatted image demonstrates the origin of the right hepatic artery (black arrow) from the superior mesenteric artery (white arrow) and diffuse telangiectasias (black arrowheads) in the peripheral parenchyma.

Predicted Caption:  Coronal CT angiography showing a large right common carotid artery (arrow) and
Blue score:  2.0596184594894264e-232
--------------
Actual Caption:  Identification of anastomosis site by duplex scan. The extent of calcification is easily estimated by the thickness and intensity of echo density. The figure shows the calcified tibial artery, which is not suitable as a distal anastomosis site.

Predicted Caption:  Ultrasound image of the right breast showing a hypoechoic mass with a hypoe
Blue score:  1.8455299742220844e-232
--------------
Actual Caption:  White line: posterior condylar line. Red line: anatomical transepicondylar axis.

Predicted Caption:  Axial T2-weighted image of the knee jo

Actual Caption:  Carotid duplex sonography performed on 1 October 2003. The floating thrombus is observed at the left internal carotid artery (arrow).

Predicted Caption:  Ultrasound image of the right eye showing a hypoechoic lesion (arrow)
Blue score:  4.413959087726801e-232
--------------
Actual Caption:  Measurement of intervertebral space height.The intervertebral space height (h) is determined by the distance from the midpoint of the superior endplate to the midpoint of the inferior endplate.

Predicted Caption:  Lateral radiograph of the knee showing the lateral radiograph of the knee.

Blue score:  2.0628584276138003e-155
--------------
Actual Caption:  M-mode at the level of the mitral leaflet tips; SAM is visible

Predicted Caption:  Transesophageal echocardiography showing a large mass in the left atrium
Blue score:  8.387826279040936e-232
--------------
Actual Caption:  A new hypodense lesion in caudate lobe of the liver suspicious for metastasis.

Predicted Caption:  CT sc

Actual Caption:  CBCT image showing the palatal location of the impacted secondary canine

Predicted Caption:  Axial CT scan of the neck showing the fracture of the right maxillary sinus.

Blue score:  6.830096904817037e-155
--------------
Actual Caption:  Enlarged supraclavicular node.

Predicted Caption:  CT scan of the abdomen showing a large mass in the right parotid gland.

Blue score:  0
--------------
Actual Caption:  Nasal cavity width. Measured in millimeters from the anterior nasal spine to the lateral wall of the nasal base on the side of the impacted canine and canine without impaction

Predicted Caption:  Panoramic radiograph of the patient.

Blue score:  5.6519694558963347e-157
--------------
Actual Caption:  Axial fat-saturated contrast-enhanced MRI image showing loss of normal corticomedullary differentiation along with focal caliectasis, cavity communicating with the calyces, enhancing urothelial thickening (white block arrow) and necrotic (L) para-aortic lymphnodes (w

Actual Caption:  Ultrasound of ill-defined breast lesion showing hypoechoic mass with irregular borders. Ultrasound interpreted as BI-RADS 4, suspicious abnormality with differential diagnoses of phlegmon, malignancy, and calciphylaxis. Surgical consultation recommended.

Predicted Caption:  Ultrasound image of the right breast.

Blue score:  2.9952459934854115e-233
--------------
Actual Caption:  Video fluoroscopic examination showing no tracheal aspiration.

Predicted Caption:  Lateral cephalometric radiograph of the abdomen showing the dilated stomach and the presence
Blue score:  9.594503055152632e-232
--------------
Actual Caption:  Axial fluid-attenuated inversion recovery MRI image demonstrating tumor-related infiltration involving both temporal lobes (Short arrow), and the substantia nigra (Long arrow).

Predicted Caption:  Axial T2-weighted image shows a hyperintense lesion in the left temporal
Blue score:  5.699831074849419e-232
--------------
Actual Caption:  Axial section o

Actual Caption:  Grayscale ultrasound sagittal image of liver in a 37-year-old patient demonstrates a 3.2 cm hypoechoic mass in segment 6.

Predicted Caption:  Ultrasound image of the right kidney.

Blue score:  8.683164396086396e-156
--------------
Actual Caption:  Axial tomography image showing a lateral compression type injury.

Predicted Caption:  CT scan of the pelvis showing the presence of a large mass in the right iliac
Blue score:  1.0832677820940877e-231
--------------
Actual Caption:   Eight days after the first embolization, pelvic angiography shows collateral arteries feeding the uterus from the right external iliac artery.

Predicted Caption:  Angiography of the left kidney showing a large mass in the left renal vein.

Blue score:  7.296382734947757e-232
--------------
Actual Caption:  Presentation M reveals the enlargement of the right ventricle (RV) and an abnormal motion of the ventricular septum (↑↑) – a phenomenon that indicated the possibility of a low-pressure syst

# Computing BLEU and ROUGE score

In [None]:
import evaluate


bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")


def calculate_bleu_and_rouge(reference: str, hypothesis: str):
    """
    Inputs: Reference -> Target caption, Hypothesis -> Generated caption
    Outputs: Dictionary of bleu score and rouge score
    Description: This function computes the bleu score as well as rouge1, rouge2, rougeL, and rougeLsum
    """
    
    bleu_score = bleu.compute(predictions=[hypothesis], references=[reference])
    rouge_score = rouge.compute(
        predictions=[hypothesis], references=[reference]
    )

    return {"bleu_score": bleu_score, "rouge_score": rouge_score}