# Rodrigo Barraza's Inscriptions: Blip 2 Mass Captioning
Large amounts of RAM and VRAM are required to load the larger models. RAM should be at least 24-32 GB, with 64 GB being optimal. VRAM should be at least 16 GB.

In [None]:
# Install or upgrade LAVIS, which provides the `lavis.models` module imported below
!pip3 install salesforce-lavis --upgrade
# Install the `validators` package imported below
!pip3 install validators

In [None]:
import sys
import validators
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

#### Load BLIP2 captioning model

In [None]:
# Choose the compute device: the CUDA GPU when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = "cpu"

# The model is loaded together with its preprocessors so that inference uses
# the transforms that belong to the chosen checkpoint.
#
# Alternative checkpoints (replace the name/model_type pair below to use one):
#   name="blip2_opt", model_type="pretrain_opt2.7b"
#   name="blip2_opt", model_type="pretrain_opt6.7b"
#   name="blip2_opt", model_type="caption_coco_opt2.7b"
#   name="blip2_t5",  model_type="pretrain_flant5xl"
#   name="blip2_t5",  model_type="caption_coco_flant5xl"
#   name="blip2_t5",  model_type="pretrain_flant5xxl"
#       This last model is one scary devil in terms of size: it requires at
#       least 32GB of VRAM to run and will not load on 3090s or 4090s.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_opt",
    model_type="caption_coco_opt6.7b",
    is_eval=True,
    device=device,
)

vis_processors.keys()

#### Auto Caption

In [None]:
import os
import re
from pathlib import Path
from collections import OrderedDict
from IPython.display import clear_output
from PIL import Image
###############################################################################
# Start of Options
imagesDirectory = "/mnt/d/datatest/"  # Root folder; subfolders are walked too
useFolderNamesAsTokens = True  # Append the folder names to the beginning of the caption
tokensStartOrEnd = 'start'  # end or start
minTokenLength = 15  # The amount of minimum tokens to generate
maxTokenLength = 20  # The maximum amount of tokens to generate

useNucleusSampling = False
repetitionPenalty = 1

# Settings intended for the extra question prompts appended to the caption
appendStyles = True
promptQuestion = "Describe the style in 1 word"
numberOfAnswers = 5
minAnswerLength = 7
maxAnswerLength = 10

showImages = False
# End of Options
###############################################################################

numberOfCaptions = 1  # How many captions to generate

# Lower-case file extensions treated as images. Every entry carries its
# leading dot so that a name merely ending in "gif" is not miscounted.
imageExtensions = (".jpg", ".png", ".jpeg", ".webp", ".gif")

# Count the total number of images in the directory and subdirectories
totalImages = sum(
    filename.lower().endswith(imageExtensions)
    for _dirpath, _dirnames, filenames in os.walk(imagesDirectory)
    for filename in filenames
)

# Running counter of images visited so far (used for progress output)
processedImages = 0


# (prompt, number of answers requested) pairs, listed in the order in which
# their answers are appended to the caption.
_ANSWER_PROMPTS = (
    ("Describe the person in 1 word. Answer:", 5),
    ("Describe the outfit in 1 word. Answer:", 5),
    ("Describe object in the background in 1 word. Answer:", 5),
    ("Describe the style in 1 word. Answer:", 3),
    ("Describe the theme in 1 word. Answer:", 3),
    ("Describe the medium in 1 word. Answer:", 3),
    ("Describe the color in 1 word. Answer:", 5),
)


def _is_image_file(filename):
    """Return True when *filename* ends with a supported image extension."""
    return filename.lower().endswith((".jpg", ".png", ".jpeg", ".webp", ".gif"))


def _normalize_answer(answer):
    """Clean one generated answer; return '' when it should be discarded."""
    cleaned = answer.lower().lstrip()

    # Answers containing an underscore are dropped entirely.
    if '_' in cleaned:
        return ''

    # Strip a single leading article/conjunction.
    if cleaned.startswith(('a ', 'the ', 'and ')):
        cleaned = cleaned.split(' ', 1)[1]

    # Strip a single trailing punctuation mark.
    if cleaned.endswith(('.', ',', '!', '?')):
        cleaned = cleaned[:-1]

    # The final caption is comma separated, so commas inside one answer are
    # rewritten as "and" to keep that answer a single caption entry.
    return cleaned.replace(', and ', ' and ').replace(', ', ' and ')


def _generate_caption(imageFilePath, dirpath):
    """Build the full caption text for the image at *imageFilePath*.

    Uses the module-level ``model``, ``vis_processors`` and ``device`` and
    the option variables defined in the options section. *dirpath* is the
    folder containing the image; its path relative to ``imagesDirectory``
    supplies the folder-name tokens.
    """
    # Close the file handle as soon as the pixels have been converted.
    with Image.open(imageFilePath) as imageFile:
        rawImage = imageFile.convert('RGB')

    # Display the image as it's being processed
    if showImages:
        display(rawImage)

    image = vis_processors["eval"](rawImage).unsqueeze(0).to(device)
    imageCaption = model.generate(
        {"image": image},
        min_length=minTokenLength,
        max_length=maxTokenLength,
        use_nucleus_sampling=useNucleusSampling,
        num_captions=numberOfCaptions,
        repetition_penalty=repetitionPenalty,
    )

    rawCaption = imageCaption[0]
    captionWords = rawCaption.split()

    # Fix the "laying" -> "lying" wording. Only the whole word is replaced so
    # that words merely containing it (e.g. "displaying") stay intact.
    modifiedCaption = re.sub(r'\blaying\b', 'lying', rawCaption)

    # Append/Prepend folder names to the caption
    if useFolderNamesAsTokens:
        relpath = os.path.relpath(dirpath, imagesDirectory)
        # Skip the "." placeholder, folders containing "_" and folder names
        # the caption already mentions.
        folderTokens = [
            part for part in relpath.split(os.sep)
            if "_" not in part
            and part != "."
            and part.lower() not in captionWords
            and part.lower() not in modifiedCaption
        ]
        if tokensStartOrEnd == 'end':
            captionParts = [modifiedCaption, *folderTokens]
        else:
            captionParts = [*folderTokens, modifiedCaption]
        # Joining a list avoids a dangling ", " when no folder token is left.
        caption = ', '.join(captionParts)
    else:
        caption = modifiedCaption

    # Append answers to the caption
    if appendStyles:
        # A dict keeps insertion order and drops duplicate answers.
        uniqueAnswers = {}
        for prompt, answerCount in _ANSWER_PROMPTS:
            generated = model.generate(
                {"image": image, "prompt": prompt},
                use_nucleus_sampling=False,
                num_captions=answerCount,
                min_length=minAnswerLength,
                max_length=maxAnswerLength,
            )
            for answer in generated:
                cleaned = _normalize_answer(answer)
                if len(cleaned) > 1:
                    uniqueAnswers[cleaned] = None

        # Keep only answers that are not already present as whole words in
        # the caption. re.escape guards against regex metacharacters in the
        # generated text.
        captionLower = caption.lower()
        newAnswers = [
            answer for answer in uniqueAnswers
            if not re.search(rf'\b{re.escape(answer)}\b', captionLower)
        ]
        if newAnswers:
            caption += ', ' + ', '.join(newAnswers)

    # Remove periods (str.replace returns a new string; it must be kept)
    return caption.replace('.', '')


def process_images(dirpath):
    """Caption every image file directly inside *dirpath*.

    For each image that has no sibling ``.txt`` file yet, a caption is
    generated and written to that ``.txt`` file. Images that already have
    one are only counted. Progress is printed using the module-level
    ``totalImages`` and ``processedImages`` counters; ``processedImages``
    is incremented once per image visited.

    Args:
        dirpath: Directory whose image files should be captioned.
            Subdirectories are not descended into.
    """
    global processedImages

    # Directories that happen to carry an image-like name are skipped.
    imageNames = [
        name for name in os.listdir(dirpath)
        if _is_image_file(name) and os.path.isfile(os.path.join(dirpath, name))
    ]

    # Process each image
    for imageName in imageNames:
        processedImages += 1
        remainingImages = totalImages - processedImages
        caption = ""
        print(f"Processed images: {processedImages}/{totalImages}")
        print(f"Remaining images: {remainingImages}")

        imageFilePath = os.path.join(dirpath, imageName)
        textFilePath = Path(imageFilePath).with_suffix('.txt')

        # If the image hasn't already been processed, caption it
        if not os.path.exists(textFilePath):
            caption = _generate_caption(imageFilePath, dirpath)

            # Save Caption as .txt file
            with open(textFilePath, 'w', encoding='utf-8') as f:
                f.write(caption)

        clear_output(wait=True)
        print(caption)
        print(imageFilePath)
                     
# Visit imagesDirectory and each nested folder, captioning one folder at a time
for walkEntry in os.walk(imagesDirectory):
    process_images(walkEntry[0])