__Author__ = "Pradeep Pujari"  
__Competition__ = "Stable Diffusion - Image to Prompts April 2023"   
__Version__ = "0 - Base line model"  
__Paper__ = "BLIP-2 https://arxiv.org/abs/2301.12597"  

### Table Of Contents  
1. Goal of the Competition  
2. Install and Import dependent packages  
3. Config Setup 
4. Load BLIP2 Model  
5. Submission Code

 

### Goal of the Competition  
**The competition aims to create a model that can predict the text prompt given a generated image, instead of generating an image from a text prompt. The dataset contains various (prompt, image) pairs generated by Stable Diffusion 2.0, and the goal is to determine how reversible the latent relationship is.**

**Context**  
The popularity of text-to-image models has spurred an entire new field of prompt engineering. Part art and part unsettled science, ML practitioners and researchers are rapidly grappling with understanding the relationships between prompts and the images they generate. Is adding "4k" to a prompt the best way to make it more photographic? Do small perturbations in prompts lead to highly divergent images? How does the order of prompt keywords impact the resulting generated scene? This competition tasks you with creating a model that can reliably invert the diffusion process that generated a given image.

In order to calculate prompt similarity in a robust way—meaning that "epic cat" is scored as similar to "majestic kitten" in spite of character-level differences—you will submit embeddings of your predicted prompts. Whether you model the embeddings directly or first predict prompts and then convert to embeddings is up to you! Good luck, and may you create "highly quality, sharp focus, intricate, detailed, in the style of unreal robust cross validation" models herein.


### Install and Import dependent packages

In [1]:
#!pip3 install blip
#!pip list
#!pip install git+https://github.com/huggingface/transformers

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import requests
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import torch
from transformers import AutoProcessor, BlipForConditionalGeneration
#from transformers import BlipProcessor, BlipForConditionalGeneration
import os
import sys
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models
#from IPython.display import clear_output
#from transformers import AutoTokenizer

### Config Setup

In [3]:
# Root of the competition data and the notebook's working-directory path.
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')
folder_path = "/kaggle/working/"

# Full paths of every file found below the images directory; file names are
# sorted within each directory that os.walk visits.
# (Alternative source left disabled: '/kaggle/input/900k-diffusion-prompts-dataset/features/'.)
image_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input/stable-diffusion-image-to-prompts/images/')
    for name in sorted(names)
]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Image ids are the sorted file names with everything from the first '.' dropped.
images = sorted(os.listdir(comp_path / 'images'))
imgIds = [file_name.split('.')[0] for file_name in images]

# Number of components written per image in the submission.
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

# One row label per (image, component) pair in image-major order: "<imgId>_<eId>".
imgId_eId = [f'{img_id}_{e_id}' for img_id in imgIds for e_id in eIds]

### The Sample Submission contains correct embeddings (for the example images)
The sample_submission.csv file on the Data page has the correct embeddings for the prompts listed in the prompts.csv file. This is so you can test whether you are calculating embeddings correctly.

In [5]:
# Load the all-MiniLM-L6-v2 sentence-embedding model from a local
# /kaggle/input path (no download by model name).
st_model = SentenceTransformer('/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2')

Load the embedding model all-MiniLM-L6-v2
We're loading this from the attached dataset, which you will also need to attach to your notebooks!

### Stable Diffusion Process with BLIP2 - Here weights are pretrained compared to BLIP

In [6]:
# Checkpoint URL kept for reference only; it is not used below.
#model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'
# Load the processor from the local /kaggle/input directory.
processor = AutoProcessor.from_pretrained("/kaggle/input/salesforceblip-image-caption")

# Alternative load by hub model id, left disabled.
#model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# Load the conditional-generation model from the same local directory.
model = BlipForConditionalGeneration.from_pretrained("/kaggle/input/salesforceblip-image-caption")

# Move the model to the selected device. As the last expression in the cell,
# this also displays the model's module structure.
model.to(device) 


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0): BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): BlipEncoderLayer(
          (self_attn): BlipAtte

In [7]:
def image_to_prompt(raw_image):
    """Generate a text prompt for a single image.

    Args:
        raw_image: The image handed straight to ``processor``.

    Returns:
        The first decoded sequence, with special tokens skipped and
        leading/trailing whitespace stripped.
    """
    # Preprocess into PyTorch tensors and move them to the model's device.
    model_inputs = processor(raw_image, return_tensors="pt").to(device)
    # Cap generation at 32 new tokens.
    token_ids = model.generate(**model_inputs, max_new_tokens=32)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()

In [8]:
# Generated prompts, one per entry of image_files and in the same order.
generated_prompts = []

for file in tqdm(image_files, desc='Generating prompts'):
    # Open the image in a context manager so the underlying file handle is
    # released once the prompt has been generated, instead of staying open.
    # NOTE(review): an explicit .convert('RGB') was previously tried here and
    # left disabled; re-enable it if non-RGB inputs turn out to be a problem.
    with Image.open(file) as image:
        prompt = image_to_prompt(image)
    generated_prompts.append(prompt)


Generating prompts: 100%|██████████| 7/7 [00:53<00:00,  7.60s/it]


### Calculate prompt embeddings

In [9]:
#print(generated_prompts)
#new_generated_prompts = pd.DataFrame(generated_prompts).set_index(['imageId'])
#new_generated_prompts.head()

In [10]:
#tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Encode all generated prompts, then flatten the result into one long vector
# so that it lines up with the per-component row labels in imgId_eId.
prompt_embeddings = st_model.encode(generated_prompts)
prompt_embeddings = prompt_embeddings.flatten()

# Single-column frame: the index holds the row labels, 'val' holds the values.
submission = pd.DataFrame({'val': prompt_embeddings}, index=imgId_eId)
submission = submission.rename_axis('imgId_eId')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
20057f34d_0,0.058347
20057f34d_1,0.081329
20057f34d_2,-0.042682
20057f34d_3,0.033359
20057f34d_4,0.01785


### Ground Truth
Actual prompts used for the images  
NOTE: This file will not be available for the notebook re-run. References to it will create notebook failures.

### The Sample Submission contains correct embeddings (for the example images)
The sample_submission.csv file on the Data page has the correct embeddings for the prompts listed in the prompts.csv file. This is so you can test whether you are calculating embeddings correctly.

In [12]:
#sample_submission = pd.read_csv(comp_path / 'sample_submission.csv', index_col='imgId_eId')
#sample_submission.head()

In [13]:
#assert (sorted(imgId_eId) == sorted(sample_submission.index))

In [14]:
#prompts = pd.read_csv(comp_path / 'prompts.csv', index_col='imgId')
#prompts.head(7)

### Compare calculated embeddings with ground truth (within tolerance)

In [15]:
#prompt_embeddings = st_model.encode(prompts['prompt']).flatten()

In [16]:
#assert np.all(np.isclose(sample_submission['val'].values, prompt_embeddings, atol=1e-07))

In [17]:
# Write the submission frame to submission.csv, relative to the current
# working directory.
submission.to_csv('submission.csv')