In [54]:
import torch
from uform.gen_model import VLMForCausalLM, VLMProcessor
from PIL import Image
import PIL
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import io
import urllib
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
import clip
import os
import json
import numpy as np

# Data and Baselne Model Loading

In [29]:
# pretrained model from hugging face
VLM = VLMForCausalLM.from_pretrained("unum-cloud/uform-gen")
VLMprocessor = VLMProcessor.from_pretrained("unum-cloud/uform-gen")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
# load image captions from training data

# Path to .jsonl file
file_path = '/Users/seanhuang/MBAn/Sem2/HODL/Project/archive (1)/childrens-books/metadata.jsonl'

data = {}

# Open the .jsonl file
with open(file_path, 'r', encoding='utf-8') as file:
    # Read and parse each line
    for line in file:
        # Append each JSON object to the list
        img_and_cap = json.loads(line)
        img = img_and_cap['file_name']
        cap = img_and_cap['text']
        data[img] = {"caption": cap}

# Now, 'data' contains all the JSON objects from the .jsonl file
print(f"Loaded {len(data)} records.")

Loaded 1710 records.


In [39]:
data

{'0001956604.jpg': {'caption': 'drawing in black and white of Egypt pyramids with the caption "PYRAMID"'},
 '0027352358.jpg': {'caption': 'drawing of a man with a telescope looking at stars with the title "GALILEO"'},
 '0030367697.jpg': {'caption': 'picture of a black and white owl with yellow eyes and title "MODERN BIOLOGY"'},
 '0030367778.jpg': {'caption': 'image of blue background with the title "MODERN CHEMISTRY"'},
 '0030367867.jpg': {'caption': 'image of blue chemical elements with the title "MODERN CHEMISTRY"'},
 '0030683726.jpg': {'caption': 'drawing of abstract animals with the title "ELEMENTS OF LITERATURE"'},
 '0030683734.jpg': {'caption': 'drawing of white boats by the beach with people and the title "ELEMENTS OF LITERATURE"'},
 '006009110X.jpg': {'caption': 'drawing of a vampire in a black cape in a laboratory with his hunchbacked assistant and the caption "a drop of blood"'},
 '0060099208.jpg': {'caption': 'drawing of a detective in brown clothes holding a magnifying glas

In [41]:
# load images
image_folder_path = '/Users/seanhuang/MBAn/Sem2/HODL/Project/archive (1)/childrens-books'

# Loop through all files in the directory
for filename in os.listdir(image_folder_path):
    # Construct the full file path
    file_path = os.path.join(image_folder_path, filename)
    
    if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        img = Image.open(file_path)
        data[filename]['image'] = img


In [42]:
# this is the full data as a dict of dicts
data

{'0001956604.jpg': {'caption': 'drawing in black and white of Egypt pyramids with the caption "PYRAMID"',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>},
 '0027352358.jpg': {'caption': 'drawing of a man with a telescope looking at stars with the title "GALILEO"',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>},
 '0030367697.jpg': {'caption': 'picture of a black and white owl with yellow eyes and title "MODERN BIOLOGY"',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>},
 '0030367778.jpg': {'caption': 'image of blue background with the title "MODERN CHEMISTRY"',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>},
 '0030367867.jpg': {'caption': 'image of blue chemical elements with the title "MODERN CHEMISTRY"',
  'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>},
 '0030683726.jpg': {'caption': 'drawing of abstract animals with the title "ELEMENTS OF LITERAT

In [None]:
# [cap] Narrate the contents of the image with precision.
# [cap] Summarize the visual content of the image.
# [vqa] What is the main subject of the image?
prompt = "Summarize the visual content of the image"

# Pretrained Model from HuggingFace

In [47]:
texts = {}
count = 1
for file_name, d in data.items():

    print(file_name, count)

    inputs = VLMprocessor(texts=[prompt], images=[d["image"]], return_tensors="pt")
    with torch.inference_mode():
        output = VLM.generate(
            **inputs,
            do_sample=False,
            use_cache=True,
            max_new_tokens=128,
            eos_token_id=32001,
            pad_token_id=VLMprocessor.tokenizer.pad_token_id
        )

    prompt_len = inputs["input_ids"].shape[1]
    decoded_text = VLMprocessor.batch_decode(output[:, prompt_len:])[0]

    texts[file_name] = decoded_text

    count += 1

0001956604.jpg 1
0027352358.jpg 2
0030367697.jpg 3
0030367778.jpg 4
0030367867.jpg 5
0030683726.jpg 6
0030683734.jpg 7
006009110X.jpg 8
0060099208.jpg 9
0060256710.jpg 10
0060271396.jpg 11
0060279028.jpg 12
0060280751.jpg 13
0060283246.jpg 14
0060284870.jpg 15
0060287934.jpg 16
0060291907.jpg 17
0060526335.jpg 18
0060560452.jpg 19
0060560665.jpg 20
0060565098.jpg 21
0060576154.jpg 22
0060589450.jpg 23
0060591374.jpg 24
0060728396.jpg 25
0060732113.jpg 26
0060759046.jpg 27
0060760907.jpg 28
0060782188.jpg 29
0060815566.jpg 30
006085281X.jpg 31
0060877138.jpg 32
0060877197.jpg 33
0060928689.jpg 34
0061123250.jpg 35
0061131679.jpg 36
0061215341.jpg 37
006122779X.jpg 38
0061234796.jpg 39
0061235989.jpg 40
0061240354.jpg 41
0061255122.jpg 42
0061547948.jpg 43
0061688665.jpg 44
0061704040.jpg 45
0061756903.jpg 46
0061804428.jpg 47
0061935069.jpg 48
0061963739.jpg 49
0061996572.jpg 50
0062012932.jpg 51
0062020552.jpg 52
0062086707.jpg 53
0062104950.jpg 54
0062110586.jpg 55
0062110667.jpg 56
0

In [57]:
# write generated texts to jsonl file
file_path = '/Users/seanhuang/MBAn/Sem2/HODL/Project/baseline_outputs.jsonl'

with open(file_path, 'w', encoding='utf-8') as file:
    for entry in texts:
        # Convert the dictionary to a JSON string and write it to the file
        file.write(json.dumps(entry) + ', ' +  json.dumps(texts[entry]) + '\n')

In [49]:
# close all open image files
for file, d in data.items():
    data[file]['image'].close()

In [59]:
# evaluate generated captions using BERT embeddings and average of the cosine similarity matrix
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

score = {}

def encode(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(1)
    return embeddings

for filename in data:
    captions = [data[filename]['caption'], texts[filename]]
    
    embeddings = torch.cat([encode(caption) for caption in captions], dim=0)
    similarity_matrix = cosine_similarity(embeddings)

    # calculate average similarity
    np.fill_diagonal(similarity_matrix, 0) # get rid of self-similarity
    avg_sim = np.sum(similarity_matrix) / (similarity_matrix.size - len(similarity_matrix))
    
    score[filename] = avg_sim

In [60]:
score

{'0001956604.jpg': 0.8084050416946411,
 '0027352358.jpg': 0.7541347146034241,
 '0030367697.jpg': 0.8103808164596558,
 '0030367778.jpg': 0.7008174657821655,
 '0030367867.jpg': 0.7226169109344482,
 '0030683726.jpg': 0.7571567296981812,
 '0030683734.jpg': 0.7617661952972412,
 '006009110X.jpg': 0.7850784659385681,
 '0060099208.jpg': 0.8515745401382446,
 '0060256710.jpg': 0.7705490589141846,
 '0060271396.jpg': 0.7185732126235962,
 '0060279028.jpg': 0.7782298922538757,
 '0060280751.jpg': 0.7374362945556641,
 '0060283246.jpg': 0.8741008043289185,
 '0060284870.jpg': 0.8002466559410095,
 '0060287934.jpg': 0.7901105880737305,
 '0060291907.jpg': 0.7602488398551941,
 '0060526335.jpg': 0.7265545129776001,
 '0060560452.jpg': 0.8297795057296753,
 '0060560665.jpg': 0.8353239297866821,
 '0060565098.jpg': 0.7699826955795288,
 '0060576154.jpg': 0.7608946561813354,
 '0060589450.jpg': 0.7862715125083923,
 '0060591374.jpg': 0.789837658405304,
 '0060728396.jpg': 0.7727282643318176,
 '0060732113.jpg': 0.77045

In [66]:
# min score of basline model
min(score.values())

0.6112775206565857

In [67]:
# max score of baseline model
max(score.values())

0.8801933526992798

In [69]:
# average score of baseline model
np.mean(list(score.values()))

0.7747303398729067