In [1]:
# =============================================================================
# LLaVA Custom Vision Encoder/Projector Inference Script
# =============================================================================

# 0. 환경 설정 & 공통 라이브러리
import os, sys, warnings, shutil
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["CUDA_HOME"] = "/usr/local/cuda-12.4"
os.environ["LD_LIBRARY_PATH"] = (
    "/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/lib64:"
    + os.environ.get("LD_LIBRARY_PATH", "")
)

import math
import cv2  # OpenCV
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import requests  # For loading image from URL
from threading import Thread

from transformers import (
    AutoConfig, AutoTokenizer, BitsAndBytesConfig, TextStreamer
)
from llava.constants import (
    IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN,
    DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from llava.model.builder import load_pretrained_model

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
torch.backends.cuda.enable_flash_sdp(False)

# LLaVA 프로젝트 루트가 sys.path 에 없으면 추가
nb_root = os.getcwd()
if nb_root not in sys.path:
    sys.path.insert(0, nb_root)
    print(f"Added to sys.path: {nb_root}")

# 사용자 정의 CLIP 모듈 임포트
from INFERclipregXGATED.model import VisionTransformer as CustomVisionTransformer
print("✔ CustomVisionTransformer imported.")

# =============================================================================
# 1. 사용자 설정
# =============================================================================
MODEL_PATH_LLAVA_CONFIG_AND_PROJECTOR = "./llava-v1.5-7b-local"  # 실제 경로로 수정
MODEL_BASE_LLM = "lmsys/vicuna-7b-v1.5"                        # 사용할 LLM

CUSTOM_VISION_ENCODER_WEIGHTS_PATH = "./models/ViT-L-14-REG-GATED-balanced-ckpt12.safetensors"
CUSTOM_PROJECTOR_FILENAME = "mm_projector.bin"

IMAGE_FILE_TO_PROCESS = "data/car.jpg"
USER_PROMPT = "Describe the car in the image and its surroundings in detail."

MAX_NEW_TOKENS = 256
TEMPERATURE = 0.2
CONV_MODE = None  # None 이면 자동 추론 ("vicuna_v1" 등 지정 가능)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LOAD_8BIT = False
LOAD_4BIT = False

VISION_ENCODER_CONFIG = {
    "image_resolution": 224,
    "patch_size": 14,
    "width": 1024,
    "layers": 24,
    "heads": 16,
    "output_dim": 1024,
    "num_registers": 4
}
mm_vision_select_layer_val = -2
mm_projector_type_val = "mlp2x_gelu"

# =============================================================================
# 2. 유틸리티 함수 정의
# =============================================================================
def load_image(path_or_url: str) -> Image.Image:
    if path_or_url.startswith(("http://", "https://")):
        resp = requests.get(path_or_url)
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content)).convert("RGB")
    if not os.path.exists(path_or_url):
        raise FileNotFoundError(f"Image file not found: {path_or_url}")
    return Image.open(path_or_url).convert("RGB")

def overlay_heatmap(heatmap_np: np.ndarray, base_img: Image.Image) -> np.ndarray:
    bgr = cv2.cvtColor(np.array(base_img), cv2.COLOR_RGB2BGR)
    h, w = bgr.shape[:2]
    hm = cv2.resize(heatmap_np.astype(np.float32), (w, h), interpolation=cv2.INTER_LINEAR)
    hm = (hm - hm.min()) / (hm.ptp() + 1e-8)
    hm_cm = cv2.applyColorMap((hm * 255).astype(np.uint8), cv2.COLORMAP_JET)
    mix = cv2.addWeighted(bgr, 0.6, hm_cm, 0.4, 0)
    return cv2.cvtColor(mix, cv2.COLOR_BGR2RGB)

def infer_conv_mode(model_name: str) -> str:
    name = model_name.lower()
    if "llama-2" in name:    return "llava_llama_2"
    if "mistral" in name:    return "mistral_instruct"
    if "v1.6-34b" in name:   return "chatml_direct"
    if "v1" in name:         return "llava_v1"
    if "mpt" in name:        return "mpt"
    return "llava_v0"

# =============================================================================
# 3. 모델 로드 및 초기화
# =============================================================================
print("▶ Loading LLaVA model...")
disable_torch_init()

model_arch_name = get_model_name_from_path(MODEL_PATH_LLAVA_CONFIG_AND_PROJECTOR)
tokenizer, model, image_processor, context_len = load_pretrained_model(
    MODEL_PATH_LLAVA_CONFIG_AND_PROJECTOR,
    model_base=MODEL_BASE_LLM,
    model_name=model_arch_name,
    load_8bit=LOAD_8BIT,
    load_4bit=LOAD_4BIT,
    device=DEVICE
)
print("✅ Model loaded.")

# 대화 템플릿 선택
conv_key = CONV_MODE or infer_conv_mode(model_arch_name)
if conv_key not in conv_templates:
    raise ValueError(f"Unknown conv mode: {conv_key}")
conv = conv_templates[conv_key].copy()
roles = ("user","assistant") if "mpt" in model_arch_name.lower() else conv.roles
print("Conversation mode →", conv_key)

# =============================================================================
# 4. 이미지 로드 & 전처리
# =============================================================================
pil_img = load_image(IMAGE_FILE_TO_PROCESS)
print("Loaded image:", pil_img.size)
img_tensor = process_images([pil_img], image_processor, model.config)
if isinstance(img_tensor, list):
    img_tensor = [t.to(model.device, dtype=model.dtype) for t in img_tensor]
else:
    img_tensor = img_tensor.to(model.device, dtype=model.dtype)

# =============================================================================
# 5. 프롬프트 구성
# =============================================================================
if model.config.mm_use_im_start_end:
    user_inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + USER_PROMPT
else:
    user_inp = DEFAULT_IMAGE_PATCH_TOKEN + "\n" + USER_PROMPT

conv.append_message(roles[0], user_inp)
conv.append_message(roles[1], None)
full_prompt = conv.get_prompt()

print("\n--- Prompt to tokenizer ---")
print(full_prompt)

input_ids = tokenizer_image_token(
    full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(model.device)

# =============================================================================
# ★★★ 6. 텍스트 생성: 오직 여기만 수정 ★★★
# =============================================================================
# ① attention_mask 생성
attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)

# ② generate() 호출 시 키 이름 변경 및 attention_mask 추가
gen_kwargs = dict(
    inputs=input_ids,               # 변경: input_ids=→ inputs=
    attention_mask=attention_mask,  # 추가
    images=img_tensor,
    image_sizes=[pil_img.size],
    do_sample=TEMPERATURE > 0,
    temperature=TEMPERATURE,
    max_new_tokens=MAX_NEW_TOKENS,
    streamer=TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True),
    use_cache=True,
    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
)

print(f"\n[{roles[0]}] {USER_PROMPT}")
print(f"[{roles[1]}] ", end="", flush=True)

with torch.inference_mode():
    model.generate(**gen_kwargs)

print("\n\n--- Inference Complete. ---")


  from .autonotebook import tqdm as notebook_tqdm


[2025-05-13 04:35:30,297] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Added to sys.path: /home/ubuntu/Projects/regllava
✔ CustomVisionTransformer imported.
▶ Loading LLaVA model...
Loading LLaVA from base model...


Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 16743.73it/s]


/home/ubuntu/Projects/regllava/INFERclipregXGATED 에서 INFERclipregXGATED 모듈을 성공적으로 임포트했습니다.


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]
Some weights of LlavaLlamaForCausalLM were not initialized from the model checkpoint at lmsys/vicuna-7b-v1.5 and are newly initialized: ['model.mm_projector.0.bias', 'model.mm_projector.0.weight', 'model.mm_projector.2.bias', 'model.mm_projector.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CLIPVisionTower 로드 시 사용할 target_dtype: torch.float32 (fp16: False, bf16: False)
로딩 시작: 사용자 정의 Reg-Gated CLIP 모델 (./models/ViT-L-14-REG-GATED-balanced-ckpt12.safetensors)
Checking model parameters... You should see [261, _ ]. If you see [257, _ ], something is wrong.
vision_width, vision_layers, patch_size, grid_size:, new_pos_embed: 1024, 24, 14, 16, 261
사용자 정의 Reg‑Gated CLIP 모델 로드 완료. 입력 해상도: 224, 패치 크기: 14
✅ Model loaded.
Conversation mode → llava_v1
Loaded image: (1300, 954)

--- Prompt to tokenizer ---
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <im_patch>
Describe the car in the image and its surroundings in detail. ASSISTANT:

[USER] Describe the car in the image and its surroundings in detail.
[ASSISTANT] 

RuntimeError: mat1 and mat2 must have the same dtype, but got Float and Half