# Rodrigo Barraza's Inscriptions: Blip 2 Mass Captioning
Large amounts of RAM and VRAM are required to load the larger models. RAM should be at least 24-32GB, with 64GB being optimal. VRAM should be at least 16GB.

In [None]:
!pip3 install salesforce-lavis --upgrade
!pip3 install validators

In [None]:
import re
import sys

import requests
import torch
import validators
from lavis.models import load_model_and_preprocess
from PIL import Image

#### Load BLIP2 captioning model

In [None]:
# Select the device to run on. Always build a torch.device so that `device`
# has the same type whether or not CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Available BLIP2 variants. To switch, copy one of these argument lines into the
# load_model_and_preprocess call below (exactly one must be active):
#   name="blip2_opt", model_type="pretrain_opt2.7b", is_eval=True, device=device
#   name="blip2_opt", model_type="pretrain_opt6.7b", is_eval=True, device=device
#   name="blip2_opt", model_type="caption_coco_opt2.7b", is_eval=True, device=device
#   name="blip2_opt", model_type="caption_coco_opt6.7b", is_eval=True, device=device
#   name="blip2_t5", model_type="pretrain_flant5xl", is_eval=True, device=device
#   name="blip2_t5", model_type="caption_coco_flant5xl", is_eval=True, device=device
# This next model is one scary devil in terms of size. It requires at least 32GB of VRAM to run, and will not load on 3090s or 4090s.
#   name="blip2_t5", model_type="pretrain_flant5xxl", is_eval=True, device=device

# We associate a model with its preprocessors to make inference easier.
# The third returned value is not used in this notebook.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_opt", model_type="caption_coco_opt6.7b", is_eval=True, device=device
)

# Show which visual preprocessors were returned (the captioning cell uses "eval").
vis_processors.keys()

#### Auto Caption

In [13]:
import os
from pathlib import Path
from IPython.display import clear_output
from PIL import Image

# Start of Options
imagesDirectory = "/mnt/d/dataset-1080/"  # Root folder; searched recursively for images
useFolderNamesAsTokens = True  # Add the folder names to the caption as comma-separated tokens
tokensStartOrEnd = 'start'  # 'start' or 'end': where the folder-name tokens are placed
minTokenLength = 5  # The minimum number of tokens to generate
maxTokenLength = 72  # The maximum number of tokens to generate
numberOfCaptions = 1  # How many captions to generate (only the first one is written to disk)
useNucleusSampling = False
repetitionPenalty = 1
# End of Options


def count_images(root, extensions=(".jpg", ".png", ".jpeg", ".webp", ".gif")):
    """Count the image files in `root` and all of its sub-directories.

    Args:
        root: Directory to walk recursively. A missing directory yields 0
            because os.walk produces no entries for it.
        extensions: Lower-case filename suffixes, including the leading dot,
            that identify an image. Matching is case-insensitive.

    Returns:
        The number of files whose lower-cased name ends with one of `extensions`.
    """
    # Every suffix carries its dot, so a name such as "notagif" is not counted.
    return sum(
        filename.lower().endswith(extensions)
        for _dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
    )


# Total number of images in the directory and subdirectories (used for progress output)
total_images = count_images(imagesDirectory)

processed_images = 0


# Lower-case filename suffixes (including the leading dot) that are treated as images.
_IMAGE_SUFFIXES = (".jpg", ".png", ".jpeg", ".webp", ".gif")


def _fix_caption_grammar(text):
    """Replace the stand-alone word "laying" with "lying".

    Only whole words are replaced, so words that merely contain "laying"
    (for example "playing" or "displaying") are left untouched.
    """
    return re.sub(r"\blaying\b", "lying", text)


def _folder_tokens(dirpath):
    """Return the folder names between imagesDirectory and `dirpath` usable as tokens.

    Names containing an underscore are skipped, as is the "." that
    os.path.relpath returns when `dirpath` is the root directory itself.
    """
    relpath = os.path.relpath(dirpath, imagesDirectory)
    return [part for part in relpath.split(os.sep) if "_" not in part and part != "."]


def _compose_caption(generatedCaption, captionWords, dirpath):
    """Combine the generated caption with folder-name tokens according to the options.

    Args:
        generatedCaption: The (grammar-corrected) caption text.
        captionWords: Whitespace-separated words of the caption as generated,
            used to avoid repeating a token the caption already contains.
        dirpath: Directory of the image; its path relative to imagesDirectory
            supplies the tokens.

    Returns:
        The caption text, with tokens added when useFolderNamesAsTokens is set.
    """
    if not useFolderNamesAsTokens:
        return generatedCaption

    knownWords = set(captionWords)
    tokens = _folder_tokens(dirpath)

    if tokensStartOrEnd == 'end':
        # Tokens follow the caption; skip any that the caption already mentions.
        caption = generatedCaption
        for token in tokens:
            if token not in knownWords and token not in generatedCaption:
                caption += ", " + token
        return caption

    # Any other value places the tokens in front of the caption.
    caption = ""
    for token in tokens:
        if token.startswith('-'):
            # A leading '-' marks a token that is added (with all hyphens
            # removed) only when the caption does not already contain it.
            strippedToken = token.replace('-', '')
            if strippedToken not in knownWords and strippedToken not in generatedCaption:
                caption += strippedToken + ", "
        else:
            caption += token + ", "
    return caption + generatedCaption


def process_images(dirpath):
    """Caption every image directly inside `dirpath` and save each caption as a .txt file.

    For each image file a caption is generated with the loaded model, optionally
    combined with folder-name tokens, stripped of periods and written next to the
    image using the same file name with a ".txt" suffix. Images that already have
    such a text file are skipped. Progress is printed and the notebook output is
    cleared after every image.

    Reads the notebook-level options (imagesDirectory, useFolderNamesAsTokens,
    tokensStartOrEnd, minTokenLength, maxTokenLength, numberOfCaptions,
    useNucleusSampling, repetitionPenalty) as well as `model`, `vis_processors`,
    `device` and `total_images`, and increments the global `processed_images`.

    Args:
        dirpath: Directory whose image files should be captioned. Sub-directories
            are not descended into here.
    """
    global processed_images
    imageSuspects = [name for name in os.listdir(dirpath) if name.lower().endswith(_IMAGE_SUFFIXES)]

    for imageName in imageSuspects:
        processed_images += 1
        remaining_images = total_images - processed_images
        caption = ""
        print(f"Processed images: {processed_images}/{total_images}")
        print(f"Remaining images: {remaining_images}")

        # os.path.join avoids a doubled separator when dirpath already ends with one
        imageFilePath = os.path.join(dirpath, imageName)
        textFilePath = Path(imageFilePath).with_suffix('.txt')

        # Only caption images that do not have a caption file yet
        if not os.path.exists(textFilePath):
            # convert() returns a new image, so the file handle can be closed right away
            with Image.open(imageFilePath) as imageFile:
                rawImage = imageFile.convert('RGB')
            image = vis_processors["eval"](rawImage).unsqueeze(0).to(device)
            imageCaption = model.generate({"image": image}, min_length=minTokenLength, max_length=maxTokenLength, use_nucleus_sampling=useNucleusSampling, num_captions=numberOfCaptions, repetition_penalty=repetitionPenalty)

            # Only the first generated caption is used
            generatedCaption = imageCaption[0]
            captionWords = generatedCaption.split()

            # Fix a common word-choice error in the generated captions
            modifiedCaption = _fix_caption_grammar(generatedCaption)

            caption = _compose_caption(modifiedCaption, captionWords, dirpath)

            # Remove periods (str.replace returns a new string, so keep the result)
            caption = caption.replace('.', '')

            # Save Caption as .txt file
            with open(textFilePath, 'w', encoding='utf-8') as f:
                f.write(caption)
        clear_output(wait=True)
        print(caption)
        print(imageFilePath)
                     
# Visit the root folder and every nested sub-folder, captioning the images found in each
for walkEntry in os.walk(imagesDirectory):
    # The first element of each os.walk entry is the directory path
    process_images(walkEntry[0])

photograph, elizabeth elder, medium wide shot, a woman in a pink dress with her hands folded
/mnt/d/dataset-1080/photograph/_photographers/elizabeth elder/medium wide shot/0001840.jpg
Processed images: 1839/12275
Remaining images: 10436
