# Rodrigo Barraza's Inscriptions: Blip 2 Mass Captioning
Large amounts of RAM and VRAM are required to load the larger models. RAM should be at least 24-32GB, with 64GB being optimal. VRAM should be at least 16GB.

In [None]:
!pip3 install salesforce-lavis --upgrade
!pip3 install validators

In [2]:
import sys
import validators
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

  from .autonotebook import tqdm as notebook_tqdm


#### Load BLIP2 captioning model

In [3]:
# Select the compute device. It is always a torch.device (never a bare string)
# so every later use of `device` sees the same type whether or not CUDA exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BLIP-2 checkpoints that can be loaded here, as (name, model_type):
#   ("blip2_opt", "pretrain_opt2.7b")
#   ("blip2_opt", "pretrain_opt6.7b")
#   ("blip2_opt", "caption_coco_opt2.7b")
#   ("blip2_opt", "caption_coco_opt6.7b")   <- currently selected
#   ("blip2_t5",  "pretrain_flant5xl")
#   ("blip2_t5",  "caption_coco_flant5xl")
#   ("blip2_t5",  "pretrain_flant5xxl")
#       This last model is one scary devil in terms of size: it requires at
#       least 32GB of VRAM to run and will not load on 3090s or 4090s.
modelName = "blip2_opt"
modelType = "caption_coco_opt6.7b"

# The model is loaded together with its preprocessors to make inference easier.
# The third return value is not used by this notebook.
model, vis_processors, _ = load_model_and_preprocess(
    name=modelName, model_type=modelType, is_eval=True, device=device
)

# Show which image preprocessors came with the model
vis_processors.keys()

Position interpolate from 16x16 to 26x26


  with safe_open(checkpoint_file, framework="pt") as f:


#### Auto Caption

In [1]:
import os
import re
from pathlib import Path
from collections import OrderedDict
from IPython.display import clear_output
from PIL import Image
###############################################################################
# Start of Options
imagesDirectory = "/mnt/d/datatest/"  # Root folder that is searched (recursively) for images
useFolderNamesAsTokens = True  # Append the folder names to the beginning of the caption
tokensStartOrEnd = 'start'  # end or start
minTokenLength = 15  # The amount of minimum tokens to generate
maxTokenLength = 20  # The maximum amount of tokens to generate

useNucleusSampling = False
repetitionPenalty = 1

appendStyles = True
# NOTE(review): promptQuestion and numberOfAnswers are not read anywhere in the
# visible code -- confirm whether they are still needed.
promptQuestion = "Describe the style in 1 word"
numberOfAnswers = 5
minAnswerLength = 7
maxAnswerLength = 10

showImages = False

# Lower-case filename suffixes that count as images. Every entry carries its
# leading dot so that a name that merely ends in "gif" is not mistaken for one.
imageExtensions = (".jpg", ".png", ".jpeg", ".webp", ".gif")
# End of Options
###############################################################################

numberOfCaptions = 1  # How many captions to generate
# Count the total number of images in the directory and subdirectories
total_images = 0
for dirpath, dirnames, filenames in os.walk(imagesDirectory):
    total_images += sum(filename.lower().endswith(imageExtensions) for filename in filenames)

# Running counter of images visited so far; process_images() increments it
processed_images = 0


# Lower-case filename suffixes (leading dot included) that mark a file as an image.
_IMAGE_EXTENSIONS = (".jpg", ".png", ".jpeg", ".webp", ".gif")

# (prompt, num_captions) pairs put to the model for every image when
# appendStyles is on. The order here is the order in which tags are appended.
_TAG_PROMPTS = (
    ("Describe the person in 1 word. Answer:", 5),
    ("Describe the outfit in 1 word. Answer:", 5),
    ("Describe object in the background in 1 word. Answer:", 5),
    ("Describe the style in 1 word. Answer:", 3),
    ("Describe the theme in 1 word. Answer:", 3),
    ("Describe the medium in 1 word. Answer:", 3),
    ("Describe the color in 1 word. Answer:", 5),
)


def _fix_caption_grammar(text):
    """Replace the whole word 'laying' with 'lying', leaving words such as 'playing' intact."""
    return re.sub(r'\blaying\b', 'lying', text)


def _normalize_answer(answer):
    """Reduce one raw model answer to a lower-case tag; return '' when it is unusable."""
    tag = answer.lower().lstrip()
    if '_' in tag:
        return ''
    # Drop a single leading article / conjunction
    if tag.startswith('a '):
        tag = tag[2:]
    elif tag.startswith(('the ', 'and ')):
        tag = tag[4:]
    # Drop one trailing punctuation mark
    if tag.endswith(('.', ',', '!', '?')):
        tag = tag[:-1]
    # Commas separate tags in the caption file, so lists inside one tag become "x and y"
    tag = tag.replace(', and ', ' and ').replace(', ', ' and ')
    return tag if len(tag) > 1 else ''


def _add_folder_tokens(text, dirpath):
    """Attach the folder names between imagesDirectory and dirpath to the caption text.

    Folder names containing '_' are skipped. Names go after the caption when
    tokensStartOrEnd is 'end', otherwise in front of it.
    """
    words = text.split()
    relpath = os.path.relpath(dirpath, imagesDirectory)
    parts = [part for part in relpath.split(os.sep) if "_" not in part and part != "."]
    if tokensStartOrEnd == 'end':
        # A substring test also covers the whole-word case, so one check is enough
        return text + "".join(", " + part for part in parts if part.lower() not in text)
    return "".join(part + ", " for part in parts if part.lower() not in words) + text


def _collect_tags(image, caption):
    """Ask the model every prompt in _TAG_PROMPTS and return the tags not already in caption.

    Tags are returned in first-seen order, without duplicates.
    """
    tags = OrderedDict()
    for prompt, answerCount in _TAG_PROMPTS:
        answers = model.generate(
            {"image": image, "prompt": prompt},
            use_nucleus_sampling=False,
            num_captions=answerCount,
            min_length=minAnswerLength,
            max_length=maxAnswerLength,
        )
        for answer in answers:
            tag = _normalize_answer(answer)
            if tag:
                tags[tag] = None

    loweredCaption = caption.lower()
    # re.escape: a tag is model output and may contain regex metacharacters
    return [tag for tag in tags if not re.search(r'\b' + re.escape(tag) + r'\b', loweredCaption)]


def process_images(dirpath):
    """Caption every image file directly inside dirpath.

    For each image that has no sibling .txt file yet, a caption is generated
    with the loaded model, optionally combined with folder-name tokens and
    extra one-word tags, stripped of periods, and written next to the image
    as <image name>.txt. Images that already have a .txt file are only counted.
    Progress is printed and the global processed_images counter is advanced
    once per image.

    Parameters:
        dirpath: Directory whose immediate entries are scanned (not recursive).
    """
    global processed_images
    # Sorted for a reproducible order; isfile() skips directories whose name
    # happens to end in an image extension.
    imageSuspects = sorted(
        filename for filename in os.listdir(dirpath)
        if filename.lower().endswith(_IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(dirpath, filename))
    )

    # Process each image
    for imagePath in imageSuspects:
        processed_images += 1
        remaining_images = total_images - processed_images
        caption = ""
        print(f"Processed images: {processed_images}/{total_images}")
        print(f"Remaining images: {remaining_images}")

        imageFilePath = os.path.join(dirpath, imagePath)
        textFilePath = Path(imageFilePath).with_suffix('.txt')

        # If the image hasn't already been processed, caption it
        if not os.path.exists(textFilePath):
            # convert() returns a new in-memory image, so the file can be closed right away
            with Image.open(imageFilePath) as imageFile:
                rawImage = imageFile.convert('RGB')
            # Display the image as it's being processed
            if showImages:
                display(rawImage)
            image = vis_processors["eval"](rawImage).unsqueeze(0).to(device)
            imageCaption = model.generate(
                {"image": image},
                min_length=minTokenLength,
                max_length=maxTokenLength,
                use_nucleus_sampling=useNucleusSampling,
                num_captions=numberOfCaptions,
                repetition_penalty=repetitionPenalty,
            )

            if useFolderNamesAsTokens:
                caption = _add_folder_tokens(_fix_caption_grammar(imageCaption[0]), dirpath)
            else:
                # Use the caption as is.
                # NOTE(review): the 'laying' -> 'lying' fix is not applied on this path,
                # matching the previous behaviour -- confirm that is intended.
                caption = imageCaption[0]

            # Append answers to the caption
            if appendStyles:
                extraTags = _collect_tags(image, caption)
                if extraTags:
                    caption += ', ' + ', '.join(extraTags)

            # Remove periods (str.replace returns a new string, so it must be reassigned)
            caption = caption.replace('.', '')

            # Save Caption as .txt file
            with open(textFilePath, 'w', encoding='utf-8') as f:
                f.write(caption)

        clear_output(wait=True)
        print(caption)
        print(imageFilePath)
                     
# Visit the root folder and every folder nested below it, captioning each in turn
directoryWalker = os.walk(imagesDirectory)
for dirpath, dirnames, filenames in directoryWalker:
    process_images(dirpath)


/mnt/d/datatest/spacescape/solar eclipse/0018301.jpeg
Processed images: 102/225
Remaining images: 123


NameError: name 'vis_processors' is not defined