In [1]:
import argparse
from io import BytesIO

import requests
import torch
from PIL import Image

from share4v.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                               DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
from share4v.conversation import SeparatorStyle, conv_templates
from share4v.mm_utils import (KeywordsStoppingCriteria,
                              get_model_name_from_path, tokenizer_image_token)
from share4v.model.builder import load_pretrained_model
from share4v.utils import disable_torch_init

  from .autonotebook import tqdm as notebook_tqdm


[2024-12-08 23:24:26,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `pthread_rwlock_unlock'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `pthread_spin_init'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `pthread_spin_unlock'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `pthread_rwlock_init'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `dlopen'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11.8.0-vfixfmc/lib64/libcufile.so: undefined reference to `pthread_rwlock_rdlock'
/sw/spack/deltas11-2023-03/apps/linux-rhel8-zen3/gcc-11.4.0/cuda-11

In [5]:

def load_image(image_file, timeout=30):
    """Load an image from a local path or an HTTP(S) URL as an RGB PIL image.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Only used for URLs.

    Returns:
        The image converted to ``'RGB'`` mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the download does not complete within ``timeout``.
    """
    # Match the full scheme so that a local file whose name merely begins
    # with "http" (e.g. "http_cache/x.jpg") is not mistaken for a URL.
    if image_file.startswith(('http://', 'https://')):
        response = requests.get(image_file, timeout=timeout)
        # Fail here on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file

    # convert() returns a fully loaded copy, so the underlying file handle
    # can be released as soon as the conversion is done.
    with Image.open(source) as img:
        return img.convert('RGB')


def eval_model(args):
    """Run one image/query pair through the model, print the answer, and
    return mean-pooled last-layer hidden states of the prompt.

    Attributes read from ``args``:
        model_path, model_base: Forwarded to ``load_pretrained_model``.
        query: The user question; the image token is prepended to it.
        conv_mode: Conversation template name, or ``None`` to use the one
            inferred from the model name. When it is ``None`` (or equal to
            the inferred one) it is overwritten on ``args`` with the
            inferred value.
        image_file: Path or URL passed to ``load_image``.
        temperature, top_p, num_beams, max_new_tokens: Optional generation
            settings. When an attribute is missing (or ``None``) the
            previous hard-coded values are used: temperature 0.2,
            no top_p, 1 beam, 1024 new tokens. A temperature of 0
            selects non-sampling decoding.

    Returns:
        ``hidden_states[-1].mean(dim=1)`` from a separate forward pass over
        the prompt ``input_ids`` and the image. The generated tokens are
        NOT part of that forward pass.
        # assumes hidden states are (batch, seq, hidden) — TODO confirm

    Side effects:
        Prints the decoded model response (and possibly warnings) to stdout.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, _ = load_pretrained_model(
        args.model_path, args.model_base, model_name)

    # Prepend the image placeholder (optionally wrapped in start/end tokens).
    qs = args.query
    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + \
            DEFAULT_IM_END_TOKEN + '\n' + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

    # Infer the conversation template from the model name.
    if 'llama-2' in model_name.lower():
        conv_mode = "share4v_llama_2"
    elif "v1" in model_name.lower():
        conv_mode = "share4v_v1"
    elif "mpt" in model_name.lower():
        conv_mode = "mpt"
    else:
        conv_mode = "share4v_v0"

    # An explicit, differing --conv-mode wins over the inferred one.
    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(
            conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    images = load_image(args.image_file)
    images_tensor = image_processor.preprocess(images, return_tensors='pt')[
        'pixel_values'].half().cuda()

    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(
        keywords, tokenizer, input_ids)

    # Honor the generation settings supplied on `args` instead of silently
    # ignoring them; fall back to the former hard-coded values when absent.
    temperature = getattr(args, 'temperature', None)
    if temperature is None:
        temperature = 0.2
    max_new_tokens = getattr(args, 'max_new_tokens', None) or 1024
    num_beams = getattr(args, 'num_beams', None) or 1
    top_p = getattr(args, 'top_p', None)

    do_sample = temperature > 0
    generate_kwargs = dict(
        images=images_tensor,
        do_sample=do_sample,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
    )
    # Sampling-only knobs are passed only when sampling is enabled.
    if do_sample:
        generate_kwargs['temperature'] = temperature
        if top_p is not None:
            generate_kwargs['top_p'] = top_p

    with torch.inference_mode():
        output_ids = model.generate(input_ids, **generate_kwargs)

    # Sanity check: the output is expected to start with the prompt ids.
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (
        input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(
            f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    outputs = tokenizer.batch_decode(
        output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()
    print(outputs)

    with torch.inference_mode():
        # Separate forward pass over the prompt + image (not the generated
        # answer) to obtain per-layer hidden states.
        output = model(
            input_ids=input_ids,
            images=images_tensor,
            return_dict=True,
            output_hidden_states=True  # Ensure hidden states are returned
        )
    last_hidden_states = output.hidden_states[-1]  # Assuming this is the last layer

    # Mean over dim 1 collapses the per-position states into one vector
    # per batch element.
    pooled_features = last_hidden_states.mean(dim=1)
    return pooled_features

In [6]:
# Inputs for a single inference run: checkpoint, question, and image.
model_path = "Lin-Chen/ShareGPT4V-7B"
prompt = "Please write a paragraph of formal art analysis for this painting"
image_file = '/scratch/bbmr/syang7/art/dataset_new/Manet/EM_15.jpg'

# Attribute bag standing in for parsed command-line arguments.
args = argparse.Namespace(
    model_path=model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path),
    query=prompt,
    conv_mode=None,
    image_file=image_file,
    sep=",",
    temperature=0,
    top_p=None,
    num_beams=1,
    max_new_tokens=512,
)

# Prints the model's answer and keeps the pooled features for later cells.
pooled_features = eval_model(args)

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.65s/it]


Load vision tower from Lin-Chen/ShareGPT4V-7B_Pretrained_vit-large336-l12
This is an impressionist painting that captures a serene scene of a river. The painting is dominated by a palette of greens, blues, and browns, which are used to depict the water, trees, and boats. The artist has employed a loose, sketchy style, focusing on the interplay of light and shadow to create a sense of depth and movement. The boats, painted in a darker shade of brown, are positioned in the foreground, drawing the viewer's attention. The background features a lighter shade of green, with trees and buildings subtly visible, adding to the overall tranquility of the scene. The artist's use of color and light creates a harmonious balance, reflecting the peaceful atmosphere of the river.


In [7]:
# Load a previously saved tensor to compare against `pooled_features`
# (presumably features for the same image, Manet/EM_15 — confirm).
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source; the relative path also depends on the notebook's current
# working directory.
test1 = torch.load('../../../../../GalleryGPT/GalleryGPT/comments_raw_features/Manet/EM_15.pt')

In [8]:
# Display the tensor loaded from disk.
test1

tensor([[-0.2328,  0.1823, -0.2034,  ...,  0.5649, -0.8965,  0.1780]],
       device='cuda:0', dtype=torch.float16)

In [9]:
# Display the features returned by eval_model for visual comparison with test1.
pooled_features

tensor([[ 0.4070, -0.7573,  0.6772,  ..., -0.2795,  0.3252, -0.4282]],
       device='cuda:0', dtype=torch.float16)