Port DF Streamlit demo including GCP deployment to LAVIS OSS #85

Open · wants to merge 3 commits into base: main
53 changes: 53 additions & 0 deletions Dockerfile
@@ -0,0 +1,53 @@
FROM nvcr.io/nvidia/pytorch:21.06-py3

# Staged as requirements_gpu.txt; installed after the CUDA-specific setup below
COPY requirements-app.txt requirements_gpu.txt

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
locales \
wget \
build-essential \
vim \
htop \
curl \
git less ssh cmake \
zip unzip gzip bzip2 \
python3-tk gcc g++ libpq-dev

RUN apt-get install -y --no-install-recommends openssh-server openssh-client
# BLIP-specific commands
RUN apt-get install -y libxtst6
# Swap the container's bundled PyTorch for the CUDA 11.3 builds BLIP expects
RUN pip3 uninstall -y torch
RUN pip3 uninstall -y torchtext
RUN pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
RUN pip3 install omegaconf
RUN pip3 install ipython
RUN pip3 install pycocoevalcap
RUN pip3 install pycocotools
RUN pip3 install timm==0.4.12
RUN pip3 install fairscale==0.4.4
# Java runtimes, needed by the pycocoevalcap SPICE metric
RUN apt-get install -y default-jre
RUN apt-get install -y openjdk-11-jre-headless
RUN apt-get install -y openjdk-8-jre-headless
# Remove conflicting OpenCV builds; -y is required for a non-interactive build
RUN pip uninstall -y opencv-python
RUN pip uninstall -y opencv-contrib-python
RUN pip uninstall -y opencv-contrib-python-headless


RUN pip3 install -r requirements_gpu.txt


COPY . /lavis_app
WORKDIR /lavis_app

# Fetch the Stable Diffusion v1.4 checkpoint directly into place (one layer instead of download then move)
RUN wget https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt -O /lavis_app/stable-diffusion/sd-v1-4.ckpt

ENV PYTHONPATH="${PYTHONPATH}:./:/lavis_app:/lavis_app/stable-diffusion"

EXPOSE 8080
RUN chmod +x /lavis_app/run_scripts/start_lavis_app.sh
ENTRYPOINT ["/lavis_app/run_scripts/start_lavis_app.sh"]


10 changes: 3 additions & 7 deletions app/__init__.py
@@ -1,10 +1,3 @@
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

@@ -24,3 +17,6 @@ def load_demo_image():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_root = "/export/home/.cache/lavis/"
pending_job_path = "app/task_queues/pending_jobs/"
finished_job_path = "app/task_queues/finished_jobs/"
job_output_path = "app/task_queues/outputs/"
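These three queue directories are the contract between the Streamlit frontend and the model backends: the frontend drops a job file into pending_jobs/<job_type>/, the backend polls it, and results land under outputs/<job_type>/. The helpers the backends import from app.utils are not part of this diff; a minimal sketch of what they might look like, inferred only from how the backends call them (the *.txt glob pattern is an assumption), is:

import glob
import os

from app import pending_job_path

def get_pending_jobs(job_type):
    # Oldest-first list of queued job files for one backend.
    # Assumes jobs are written as .txt files (not confirmed by this diff).
    return sorted(glob.glob(os.path.join(pending_job_path, job_type, "*.txt")))

def create_uniq_user_job_name(time_stamp, key):
    # Timestamp plus a user-supplied key, so concurrent jobs
    # write to distinct output files.
    return "{}_{}".format(time_stamp, key.replace(" ", "_"))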
107 changes: 107 additions & 0 deletions app/backend/caption_backend.py
@@ -0,0 +1,107 @@
from app import job_output_path, finished_job_path, pending_job_path
from app.utils import load_model_cache, get_pending_jobs, create_uniq_user_job_name
from lavis.processors import load_processor

import random
import numpy as np
import torch
import os, shutil, time

job_type = 'caption'

if torch.cuda.is_available():
torch.cuda.set_device(0)
device = "cuda"
else:
device = "cpu"

def setup_seed(seed):
    # Make nucleus sampling reproducible for a user-provided seed.
    random.seed(seed)
    np.random.seed(int(seed))
    torch.manual_seed(seed)

if torch.cuda.is_available():
import torch.backends.cudnn as cudnn
cudnn.benchmark = False
cudnn.deterministic = True

def back_end():
vis_processor = load_processor("blip_image_eval").build(image_size=384)
blip_large_model = load_model_cache(
"blip_caption",
model_type=f"large_coco",
is_eval=True,
device=device,
)
blip_base_model = load_model_cache(
"blip_caption",
model_type=f"base_coco",
is_eval=True,
device=device,
)
os.makedirs(os.path.join(finished_job_path, job_type), exist_ok=True)
while True:
pending_jobs = get_pending_jobs(job_type)
for job in pending_jobs:
            # Poll until the frontend has finished writing the job line
            # (five tab-separated fields).
            while True:
                with open(job) as f:
                    content = f.readline().rstrip(' \n')
                if len(content.split('\t')) == 5:
                    break
            time_stamp, blip_type, sampling_method, num_captions, seed = content.split('\t')
            outpath = os.path.join(job_output_path, job_type)
            os.makedirs(outpath, exist_ok=True)
            img_file = outpath + '/{}_raw_image.pt'.format(create_uniq_user_job_name(time_stamp, sampling_method))
            # Wait for the frontend to upload the preprocessed image tensor.
            while not os.path.exists(img_file):
                time.sleep(1)
            img = torch.load(img_file, map_location=torch.device(device))
if blip_type == 'large':
model = blip_large_model
else:
model = blip_base_model
use_nucleus_sampling = False
if sampling_method == 'Nucleus sampling':
use_nucleus_sampling = True
setup_seed(int(seed))
captions = generate_caption(model, img, use_nucleus_sampling, int(num_captions))
            caption_result = outpath + '/{}_result.txt'.format(create_uniq_user_job_name(time_stamp, sampling_method))
            with open(caption_result, 'w') as f:
                for caption in captions:
                    f.write(caption + '\n')
shutil.move(job, os.path.join(finished_job_path, job_type))
os.remove(img_file)


def generate_caption(
    model, image, use_nucleus_sampling=False, num_captions=1, num_beams=3, max_length=40, min_length=5
):
samples = {"image": image}

captions = []
if use_nucleus_sampling:
captions = model.generate(
samples,
use_nucleus_sampling=True,
max_length=max_length,
min_length=min_length,
top_p=0.9,
num_captions=num_captions
)
else:
caption = model.generate(
samples,
use_nucleus_sampling=False,
num_beams=num_beams,
max_length=max_length,
min_length=min_length,
num_captions=1
)
captions.append(caption[0])
    return captions


if __name__ == "__main__":
    back_end()
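As a usage sketch, a frontend submitting work to this backend only has to match what back_end parses: a five-field, tab-separated job line plus a preprocessed image tensor saved under the output path. The helper below is hypothetical (the .txt job filename and the default argument values are assumptions), but every path and field mirrors the backend code above:

import os
import time

import torch

from app import job_output_path, pending_job_path
from app.utils import create_uniq_user_job_name

def submit_caption_job(img_tensor, blip_type="large", sampling_method="Nucleus sampling",
                       num_captions=3, seed=42):
    time_stamp = str(int(time.time()))
    job_name = create_uniq_user_job_name(time_stamp, sampling_method)

    # Save the preprocessed image where the backend polls for it.
    outpath = os.path.join(job_output_path, "caption")
    os.makedirs(outpath, exist_ok=True)
    torch.save(img_tensor, os.path.join(outpath, "{}_raw_image.pt".format(job_name)))

    # Write the five tab-separated fields the backend splits on.
    pending = os.path.join(pending_job_path, "caption")
    os.makedirs(pending, exist_ok=True)
    with open(os.path.join(pending, "{}.txt".format(job_name)), "w") as f:
        f.write("\t".join([time_stamp, blip_type, sampling_method,
                           str(num_captions), str(seed)]) + "\n")

    # The backend writes captions to {job_name}_result.txt when done.
    return os.path.join(outpath, "{}_result.txt".format(job_name))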
198 changes: 198 additions & 0 deletions app/backend/multimodal_search_backend.py
@@ -0,0 +1,198 @@
import os, shutil

import numpy as np
import torch
import torch.nn.functional as F
from app import cache_root, device, job_output_path, finished_job_path
from app.utils import (
getAttMap,
init_bert_tokenizer,
load_blip_itm_model,
read_img,
resize_img,
get_pending_jobs,
create_uniq_user_job_name
)
from lavis.models import load_model
from lavis.processors import load_processor

if torch.cuda.is_available():
torch.cuda.set_device(0)
device = "cuda"
else:
device = "cpu"

job_type = 'search'

def load_feat():
from lavis.common.utils import download_url

dirname = os.path.join(os.path.dirname(__file__), "assets")
filename = "path2feat_coco_train2014.pth"
filepath = os.path.join(dirname, filename)
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/path2feat_coco_train2014.pth"

if not os.path.exists(filepath):
download_url(url=url, root=dirname, filename="path2feat_coco_train2014.pth")

path2feat = torch.load(filepath)
paths = sorted(path2feat.keys())

all_img_feats = torch.stack([path2feat[k] for k in paths], dim=0).to(device)

return path2feat, paths, all_img_feats

def load_feature_extractor_model(device):
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"

model = load_model("blip_feature_extractor", model_type="base", is_eval=True, device=device)
model.load_from_pretrained(model_url)

return model

def search(time_stamp, user_question, feature_extractor, vis_processor, raw_user_question, num_display, itm_model):
sample = {"text_input": user_question}
with torch.no_grad():
text_feature = feature_extractor.extract_features(
sample, mode="text").text_embeds_proj[0, 0]

path2feat, paths, all_img_feats = load_feat()
    all_img_feats = all_img_feats.to(device)
all_img_feats = F.normalize(all_img_feats, dim=1)

num_cols = 4
num_rows = int(num_display) // num_cols

similarities = text_feature @ all_img_feats.T
indices = torch.argsort(similarities, descending=True)[:num_display]

top_paths = [paths[ind.detach().cpu().item()] for ind in indices]
sorted_similarities = [similarities[idx] for idx in indices]
file_root = os.path.join(cache_root, "coco/images/train2014/")
filenames = [os.path.join(file_root, p) for p in top_paths]
outpath = os.path.join(job_output_path, job_type)
os.makedirs(outpath, exist_ok=True)

bsz = 8 # max number of images to avoid cuda oom


tokenizer = init_bert_tokenizer()
queries_batch = [user_question] * bsz
queries_tok_batch = tokenizer(queries_batch, return_tensors="pt").to("cpu")

    num_batches = num_display // bsz  # assumes num_display is a multiple of bsz

avg_gradcams = []
all_raw_images = []
itm_scores = []

for i in range(num_batches):
filenames_in_batch = filenames[i * bsz : (i + 1) * bsz]
raw_images, images = read_and_process_images(filenames_in_batch, vis_processor)
gradcam, itm_output = compute_gradcam_batch(
itm_model, images, queries_batch, queries_tok_batch
)

all_raw_images.extend([resize_img(r_img) for r_img in raw_images])
norm_imgs = [np.float32(r_img) / 255 for r_img in raw_images]

for norm_img, grad_cam in zip(norm_imgs, gradcam):
avg_gradcam = getAttMap(norm_img, grad_cam[0], blur=True)
avg_gradcams.append(avg_gradcam)

with torch.no_grad():
itm_score = torch.nn.functional.softmax(itm_output, dim=1)

itm_scores.append(itm_score)


itm_scores = torch.cat(itm_scores)[:, 1]
    job_name = create_uniq_user_job_name(time_stamp, raw_user_question)
    torch.save(itm_scores, outpath + '/{}_itm.pt'.format(job_name))
    np.save(outpath + '/{}_avg_gradcams.npy'.format(job_name), avg_gradcams, allow_pickle=True)
    np.save(outpath + '/{}_all_raw_images.npy'.format(job_name), all_raw_images, allow_pickle=True)

    search_result = outpath + '/{}_result.txt'.format(job_name)
    with open(search_result, 'w') as f:
        for filename in filenames:
            f.write(filename + '\n')

def back_end():
# === event ===
vis_processor = load_processor("blip_image_eval").build(image_size=384)
text_processor = load_processor("blip_caption")
feature_extractor = load_feature_extractor_model(device)
os.makedirs("{}/{}/".format(finished_job_path, job_type), exist_ok=True)
large_itm_model = load_blip_itm_model(device, model_type='large')
base_itm_model = load_blip_itm_model(device, model_type='base')

while True:
pending_jobs = get_pending_jobs(job_type)
for job in pending_jobs:
            # Poll until the frontend has finished writing the job line
            # (four tab-separated fields).
            while True:
                with open(job) as f:
                    content = f.readline().rstrip(' \n')
                if len(content.split('\t')) == 4:
                    break
            time_stamp, raw_user_question, num_display, blip_type = content.split('\t')
            user_question = text_processor(raw_user_question)
            itm_model = large_itm_model if blip_type == 'large' else base_itm_model
            search(time_stamp, user_question, feature_extractor, vis_processor,
                   raw_user_question, int(num_display), itm_model)
            shutil.move(job, os.path.join(finished_job_path, job_type))


def read_and_process_images(image_paths, vis_processor):
raw_images = [read_img(path) for path in image_paths]
images = [vis_processor(r_img) for r_img in raw_images]
images_tensors = torch.stack(images).to(device)

return raw_images, images_tensors


def compute_gradcam_batch(model, visual_input, text_input, tokenized_text, block_num=6):
model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.save_attention = True

output = model(visual_input, text_input, match_head="itm")
loss = output[:, 1].sum()

model.zero_grad()
loss.backward()
with torch.no_grad():
mask = tokenized_text.attention_mask.view(
tokenized_text.attention_mask.size(0), 1, -1, 1, 1
).to(device=device) # (bsz,1,token_len, 1,1)
token_length = mask.sum() - 2
token_length = token_length.cpu()
# grads and cams [bsz, num_head, seq_len, image_patch]
grads = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attn_gradients()
cams = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attention_map()

# assume using vit large with 576 num image patch
cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask
grads = (
grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24)
* mask
)

gradcam = cams * grads
# [enc token gradcam, average gradcam across token, gradcam for individual token]
# gradcam = torch.cat((gradcam[0:1,:], gradcam[1:token_length+1, :].sum(dim=0, keepdim=True)/token_length, gradcam[1:, :]))
gradcam = gradcam.mean(1).cpu().detach()
gradcam = (
gradcam[:, 1 : token_length + 1, :].sum(dim=1, keepdim=True) / token_length
)

return gradcam, output

if __name__ == '__main__':
back_end()
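The GradCAM here follows the BLIP ITM demo: gradients of the ITM "match" logit are taken at the block-6 cross-attention layer, multiplied with the attention maps, then averaged over heads and query tokens. A hypothetical single-pair driver, reusing only helpers this file already imports (the image path and query are placeholders, not part of this diff), might look like:

import numpy as np
import torch

from app.backend.multimodal_search_backend import compute_gradcam_batch
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model, read_img
from lavis.processors import load_processor

device = "cuda" if torch.cuda.is_available() else "cpu"

def gradcam_for_pair(image_path, query):
    vis_processor = load_processor("blip_image_eval").build(image_size=384)
    itm_model = load_blip_itm_model(device, model_type="base")
    tokenizer = init_bert_tokenizer()

    raw_img = read_img(image_path)
    img = vis_processor(raw_img).unsqueeze(0).to(device)
    query_tok = tokenizer([query], return_tensors="pt")

    # Batch helper from this module, used with a batch of one.
    gradcam, itm_output = compute_gradcam_batch(itm_model, img, [query], query_tok)
    itm_score = torch.nn.functional.softmax(itm_output.detach(), dim=1)[:, 1].item()

    # Overlay the token-averaged attention map on the raw image,
    # as search() does for each retrieved image.
    norm_img = np.float32(raw_img) / 255
    overlay = getAttMap(norm_img, gradcam[0][0], blur=True)
    return overlay, itm_score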