Port DF Streamlit demo including GCP deployment to LAVIS OSS #85

Open · wants to merge 3 commits into base: main
53 changes: 53 additions & 0 deletions Dockerfile
@@ -0,0 +1,53 @@
FROM nvcr.io/nvidia/pytorch:21.06-py3

# Staged as requirements_gpu.txt; installed after the CUDA-specific setup below
COPY requirements-app.txt requirements_gpu.txt

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
locales \
wget \
build-essential \
vim \
htop \
curl \
git less ssh cmake \
zip unzip gzip bzip2 \
python3-tk gcc g++ libpq-dev

RUN apt-get install -y --no-install-recommends openssh-server openssh-client
# BLIP-specific commands
RUN apt-get install -y libxtst6
# Swap the container's bundled PyTorch for the CUDA 11.3 builds BLIP expects
RUN pip3 uninstall -y torch
RUN pip3 uninstall -y torchtext
RUN pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
RUN pip3 install omegaconf
RUN pip3 install ipython
RUN pip3 install pycocoevalcap
RUN pip3 install pycocotools
RUN pip3 install timm==0.4.12
RUN pip3 install fairscale==0.4.4
# Java runtimes, needed by the pycocoevalcap SPICE metric
RUN apt-get install -y default-jre
RUN apt-get install -y openjdk-11-jre-headless
RUN apt-get install -y openjdk-8-jre-headless
# Remove conflicting OpenCV builds; -y is required for a non-interactive build
RUN pip uninstall -y opencv-python
RUN pip uninstall -y opencv-contrib-python
RUN pip uninstall -y opencv-contrib-python-headless


RUN pip3 install -r requirements_gpu.txt


COPY . /lavis_app
WORKDIR /lavis_app

# Fetch the Stable Diffusion v1.4 checkpoint directly into place (one layer instead of download then move)
RUN wget https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt -O /lavis_app/stable-diffusion/sd-v1-4.ckpt

ENV PYTHONPATH="${PYTHONPATH}:./:/lavis_app:/lavis_app/stable-diffusion"

EXPOSE 8080
RUN chmod +x /lavis_app/run_scripts/start_lavis_app.sh
ENTRYPOINT ["/lavis_app/run_scripts/start_lavis_app.sh"]


10 changes: 3 additions & 7 deletions app/__init__.py
@@ -1,10 +1,3 @@
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

@@ -24,3 +17,6 @@ def load_demo_image():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_root = "/export/home/.cache/lavis/"
pending_job_path = "app/task_queues/pending_jobs/"
finished_job_path = "app/task_queues/finished_jobs/"
job_output_path = "app/task_queues/outputs/"
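These three queue directories are the contract between the Streamlit frontend and the model backends: the frontend drops a job file into pending_jobs/<job_type>/, the backend polls it, and results land under outputs/<job_type>/. The helpers the backends import from app.utils are not part of this diff; a minimal sketch of what they might look like, inferred only from how the backends call them (the *.txt glob pattern is an assumption), is:

import glob
import os

from app import pending_job_path

def get_pending_jobs(job_type):
    # Oldest-first list of queued job files for one backend.
    # Assumes jobs are written as .txt files (not confirmed by this diff).
    return sorted(glob.glob(os.path.join(pending_job_path, job_type, "*.txt")))

def create_uniq_user_job_name(time_stamp, key):
    # Timestamp plus a user-supplied key, so concurrent jobs
    # write to distinct output files.
    return "{}_{}".format(time_stamp, key.replace(" ", "_"))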
107 changes: 107 additions & 0 deletions app/backend/caption_backend.py
@@ -0,0 +1,107 @@
from app import job_output_path, finished_job_path, pending_job_path
from app.utils import load_model_cache, get_pending_jobs, create_uniq_user_job_name
from lavis.processors import load_processor

import random
import numpy as np
import torch
import os, shutil, time

job_type = 'caption'

if torch.cuda.is_available():
torch.cuda.set_device(0)
device = "cuda"
else:
device = "cpu"

def setup_seed(seed):
    # Make nucleus sampling reproducible for a user-provided seed.
    random.seed(seed)
    np.random.seed(int(seed))
    torch.manual_seed(seed)

if torch.cuda.is_available():
import torch.backends.cudnn as cudnn
cudnn.benchmark = False
cudnn.deterministic = True

def back_end():
vis_processor = load_processor("blip_image_eval").build(image_size=384)
blip_large_model = load_model_cache(
"blip_caption",
model_type=f"large_coco",
is_eval=True,
device=device,
)
blip_base_model = load_model_cache(
"blip_caption",
model_type=f"base_coco",
is_eval=True,
device=device,
)
os.makedirs(os.path.join(finished_job_path, job_type), exist_ok=True)
while True:
pending_jobs = get_pending_jobs(job_type)
for job in pending_jobs:
            # Poll until the frontend has finished writing the job line
            # (five tab-separated fields).
            while True:
                with open(job) as f:
                    content = f.readline().rstrip(' \n')
                if len(content.split('\t')) == 5:
                    break
            time_stamp, blip_type, sampling_method, num_captions, seed = content.split('\t')
            outpath = os.path.join(job_output_path, job_type)
            os.makedirs(outpath, exist_ok=True)
            img_file = outpath + '/{}_raw_image.pt'.format(create_uniq_user_job_name(time_stamp, sampling_method))
            # Wait for the frontend to upload the preprocessed image tensor.
            while not os.path.exists(img_file):
                time.sleep(1)
            img = torch.load(img_file, map_location=torch.device(device))
if blip_type == 'large':
model = blip_large_model
else:
model = blip_base_model
use_nucleus_sampling = False
if sampling_method == 'Nucleus sampling':
use_nucleus_sampling = True
setup_seed(int(seed))
captions = generate_caption(model, img, use_nucleus_sampling, int(num_captions))
            caption_result = outpath + '/{}_result.txt'.format(create_uniq_user_job_name(time_stamp, sampling_method))
            with open(caption_result, 'w') as f:
                for caption in captions:
                    f.write(caption + '\n')
shutil.move(job, os.path.join(finished_job_path, job_type))
os.remove(img_file)


def generate_caption(
    model, image, use_nucleus_sampling=False, num_captions=1, num_beams=3, max_length=40, min_length=5
):
samples = {"image": image}

captions = []
if use_nucleus_sampling:
captions = model.generate(
samples,
use_nucleus_sampling=True,
max_length=max_length,
min_length=min_length,
top_p=0.9,
num_captions=num_captions
)
else:
caption = model.generate(
samples,
use_nucleus_sampling=False,
num_beams=num_beams,
max_length=max_length,
min_length=min_length,
num_captions=1
)
captions.append(caption[0])
    return captions


if __name__ == "__main__":
    back_end()
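As a usage sketch, a frontend submitting work to this backend only has to match what back_end parses: a five-field, tab-separated job line plus a preprocessed image tensor saved under the output path. The helper below is hypothetical (the .txt job filename and the default argument values are assumptions), but every path and field mirrors the backend code above:

import os
import time

import torch

from app import job_output_path, pending_job_path
from app.utils import create_uniq_user_job_name

def submit_caption_job(img_tensor, blip_type="large", sampling_method="Nucleus sampling",
                       num_captions=3, seed=42):
    time_stamp = str(int(time.time()))
    job_name = create_uniq_user_job_name(time_stamp, sampling_method)

    # Save the preprocessed image where the backend polls for it.
    outpath = os.path.join(job_output_path, "caption")
    os.makedirs(outpath, exist_ok=True)
    torch.save(img_tensor, os.path.join(outpath, "{}_raw_image.pt".format(job_name)))

    # Write the five tab-separated fields the backend splits on.
    pending = os.path.join(pending_job_path, "caption")
    os.makedirs(pending, exist_ok=True)
    with open(os.path.join(pending, "{}.txt".format(job_name)), "w") as f:
        f.write("\t".join([time_stamp, blip_type, sampling_method,
                           str(num_captions), str(seed)]) + "\n")

    # The backend writes captions to {job_name}_result.txt when done.
    return os.path.join(outpath, "{}_result.txt".format(job_name))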
198 changes: 198 additions & 0 deletions app/backend/multimodal_search_backend.py
@@ -0,0 +1,198 @@
import os, shutil

import numpy as np
import torch
import torch.nn.functional as F
from app import cache_root, device, job_output_path, finished_job_path
from app.utils import (
getAttMap,
init_bert_tokenizer,
load_blip_itm_model,
read_img,
resize_img,
get_pending_jobs,
create_uniq_user_job_name
)
from lavis.models import load_model
from lavis.processors import load_processor

if torch.cuda.is_available():
torch.cuda.set_device(0)
device = "cuda"
else:
device = "cpu"

job_type = 'search'

def load_feat():
from lavis.common.utils import download_url

dirname = os.path.join(os.path.dirname(__file__), "assets")
filename = "path2feat_coco_train2014.pth"
filepath = os.path.join(dirname, filename)
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/path2feat_coco_train2014.pth"

if not os.path.exists(filepath):
download_url(url=url, root=dirname, filename="path2feat_coco_train2014.pth")

path2feat = torch.load(filepath)
paths = sorted(path2feat.keys())

all_img_feats = torch.stack([path2feat[k] for k in paths], dim=0).to(device)

return path2feat, paths, all_img_feats

def load_feature_extractor_model(device):
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"

model = load_model("blip_feature_extractor", model_type="base", is_eval=True, device=device)
model.load_from_pretrained(model_url)

return model

def search(time_stamp, user_question, feature_extractor, vis_processor, raw_user_question, num_display, itm_model):
sample = {"text_input": user_question}
with torch.no_grad():
text_feature = feature_extractor.extract_features(
sample, mode="text").text_embeds_proj[0, 0]

path2feat, paths, all_img_feats = load_feat()
    all_img_feats = all_img_feats.to(device)
all_img_feats = F.normalize(all_img_feats, dim=1)

num_cols = 4
num_rows = int(num_display) // num_cols

similarities = text_feature @ all_img_feats.T
indices = torch.argsort(similarities, descending=True)[:num_display]

top_paths = [paths[ind.detach().cpu().item()] for ind in indices]
sorted_similarities = [similarities[idx] for idx in indices]
file_root = os.path.join(cache_root, "coco/images/train2014/")
filenames = [os.path.join(file_root, p) for p in top_paths]
outpath = os.path.join(job_output_path, job_type)
os.makedirs(outpath, exist_ok=True)

bsz = 8 # max number of images to avoid cuda oom


tokenizer = init_bert_tokenizer()
queries_batch = [user_question] * bsz
queries_tok_batch = tokenizer(queries_batch, return_tensors="pt").to("cpu")

    num_batches = num_display // bsz  # assumes num_display is a multiple of bsz

avg_gradcams = []
all_raw_images = []
itm_scores = []

for i in range(num_batches):
filenames_in_batch = filenames[i * bsz : (i + 1) * bsz]
raw_images, images = read_and_process_images(filenames_in_batch, vis_processor)
gradcam, itm_output = compute_gradcam_batch(
itm_model, images, queries_batch, queries_tok_batch
)

all_raw_images.extend([resize_img(r_img) for r_img in raw_images])
norm_imgs = [np.float32(r_img) / 255 for r_img in raw_images]

for norm_img, grad_cam in zip(norm_imgs, gradcam):
avg_gradcam = getAttMap(norm_img, grad_cam[0], blur=True)
avg_gradcams.append(avg_gradcam)

with torch.no_grad():
itm_score = torch.nn.functional.softmax(itm_output, dim=1)

itm_scores.append(itm_score)


itm_scores = torch.cat(itm_scores)[:, 1]
    job_name = create_uniq_user_job_name(time_stamp, raw_user_question)
    torch.save(itm_scores, outpath + '/{}_itm.pt'.format(job_name))
    np.save(outpath + '/{}_avg_gradcams.npy'.format(job_name), avg_gradcams, allow_pickle=True)
    np.save(outpath + '/{}_all_raw_images.npy'.format(job_name), all_raw_images, allow_pickle=True)

    search_result = outpath + '/{}_result.txt'.format(job_name)
    with open(search_result, 'w') as f:
        for filename in filenames:
            f.write(filename + '\n')

def back_end():
# === event ===
vis_processor = load_processor("blip_image_eval").build(image_size=384)
text_processor = load_processor("blip_caption")
feature_extractor = load_feature_extractor_model(device)
os.makedirs("{}/{}/".format(finished_job_path, job_type), exist_ok=True)
large_itm_model = load_blip_itm_model(device, model_type='large')
base_itm_model = load_blip_itm_model(device, model_type='base')

while True:
pending_jobs = get_pending_jobs(job_type)
for job in pending_jobs:
            # Poll until the frontend has finished writing the job line
            # (four tab-separated fields).
            while True:
                with open(job) as f:
                    content = f.readline().rstrip(' \n')
                if len(content.split('\t')) == 4:
                    break
            time_stamp, raw_user_question, num_display, blip_type = content.split('\t')
            user_question = text_processor(raw_user_question)
            itm_model = large_itm_model if blip_type == 'large' else base_itm_model
            search(time_stamp, user_question, feature_extractor, vis_processor,
                   raw_user_question, int(num_display), itm_model)
            shutil.move(job, os.path.join(finished_job_path, job_type))


def read_and_process_images(image_paths, vis_processor):
raw_images = [read_img(path) for path in image_paths]
images = [vis_processor(r_img) for r_img in raw_images]
images_tensors = torch.stack(images).to(device)

return raw_images, images_tensors


def compute_gradcam_batch(model, visual_input, text_input, tokenized_text, block_num=6):
model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.save_attention = True

output = model(visual_input, text_input, match_head="itm")
loss = output[:, 1].sum()

model.zero_grad()
loss.backward()
with torch.no_grad():
mask = tokenized_text.attention_mask.view(
tokenized_text.attention_mask.size(0), 1, -1, 1, 1
).to(device=device) # (bsz,1,token_len, 1,1)
token_length = mask.sum() - 2
token_length = token_length.cpu()
# grads and cams [bsz, num_head, seq_len, image_patch]
grads = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attn_gradients()
cams = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attention_map()

# assume using vit large with 576 num image patch
cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask
grads = (
grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24)
* mask
)

gradcam = cams * grads
# [enc token gradcam, average gradcam across token, gradcam for individual token]
# gradcam = torch.cat((gradcam[0:1,:], gradcam[1:token_length+1, :].sum(dim=0, keepdim=True)/token_length, gradcam[1:, :]))
gradcam = gradcam.mean(1).cpu().detach()
gradcam = (
gradcam[:, 1 : token_length + 1, :].sum(dim=1, keepdim=True) / token_length
)

return gradcam, output

if __name__ == '__main__':
back_end()
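The GradCAM here follows the BLIP ITM demo: gradients of the ITM "match" logit are taken at the block-6 cross-attention layer, multiplied with the attention maps, then averaged over heads and query tokens. A hypothetical single-pair driver, reusing only helpers this file already imports (the image path and query are placeholders, not part of this diff), might look like:

import numpy as np
import torch

from app.backend.multimodal_search_backend import compute_gradcam_batch
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model, read_img
from lavis.processors import load_processor

device = "cuda" if torch.cuda.is_available() else "cpu"

def gradcam_for_pair(image_path, query):
    vis_processor = load_processor("blip_image_eval").build(image_size=384)
    itm_model = load_blip_itm_model(device, model_type="base")
    tokenizer = init_bert_tokenizer()

    raw_img = read_img(image_path)
    img = vis_processor(raw_img).unsqueeze(0).to(device)
    query_tok = tokenizer([query], return_tensors="pt")

    # Batch helper from this module, used with a batch of one.
    gradcam, itm_output = compute_gradcam_batch(itm_model, img, [query], query_tok)
    itm_score = torch.nn.functional.softmax(itm_output.detach(), dim=1)[:, 1].item()

    # Overlay the token-averaged attention map on the raw image,
    # as search() does for each retrieved image.
    norm_img = np.float32(raw_img) / 255
    overlay = getAttMap(norm_img, gradcam[0][0], blur=True)
    return overlay, itm_score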