In [1]:
from __future__ import annotations
import argparse
import os
import sys
from dataclasses import dataclass
from PIL import Image
from pathlib import Path
from typing import Optional, Union, List
import torch


from attr import dataclass

sys.path.append('./minigpt4')
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

from transformers import StoppingCriteriaList

from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Chat, CONV_VISION_Vicuna0, CONV_VISION_LLama2, StoppingCriteriaSub

device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')



Using device: mps




In [2]:
@dataclass
class Args:
    cfg_path: str = './minigpt4/eval_configs/minigpt4_eval.yaml'
    options: list = None
    gpu_id: int = 0

def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True

In [3]:
class MiniGPT4Wrapper:
    def __init__(
        self,
        cfg_path: str = './minigpt4/eval_configs/minigpt4_eval.yaml',
        device: str = None,
        gpu_id: int = 0
    ):
        if device is None:
            self.device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        if self.device == 'cuda':
            self.device = f'cuda:{gpu_id}'

        args = Args(cfg_path=cfg_path, gpu_id=gpu_id)
        cfg = Config(args)

        model_config = cfg.model_cfg
        model_cls = registry.get_model_class(model_config.arch)
        self.model = model_cls.from_config(model_config).to(self.device)

        self.model.visual_encoder.float()
        self.model.ln_vision.float()
        self.model.eval()

        original_forward = self.model.llama_model.forward
        def forward_wrapper(*args, **kwargs):
            kwargs.pop('cache_position', None)
            return original_forward(*args, **kwargs)
        self.model.llama_model.forward = forward_wrapper

        if hasattr(self.model.llama_model, 'generation_config'):
            self.model.llama_model.generation_config.do_sample = False

        conv_dict = {
            'pretrain_vicuna0': CONV_VISION_Vicuna0,
            'pretrain_llama2': CONV_VISION_LLama2
        }
        self.conv_template = conv_dict[model_config.model_type]

        vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
        self.vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

        stop_words_ids = [[835], [2277, 29937]]
        stop_words_ids = [torch.tensor(ids).to(device=self.device) for ids in stop_words_ids]
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

        self.chat = Chat(
            self.model,
            self.vis_processor,
            device=self.device,
            stopping_criteria=self.stopping_criteria
        )

        self.chat_state = None
        self.img_list = []

    def reset(self):
        self.chat_state = self.conv_template.copy()
        self.img_list = []

    def set_image(self, image_path: str):
        self.chat_state = self.conv_template.copy()
        self.img_list = []
        self.chat.upload_img(image_path, self.chat_state, self.img_list)
        self.chat.encode_img(self.img_list)

    def ask(self, message: str):
        self.chat.ask(message, self.chat_state)
        llm_message, _ = self.chat.answer(
            conv=self.chat_state,
            img_list=self.img_list,
            num_beams=1,
            temperature=1.0,
            max_new_tokens=300,
            max_length=2000
        )
        return llm_message

In [4]:
wrapper = MiniGPT4Wrapper(cfg_path='./minigpt4/eval_configs/minigpt4_eval.yaml', device=device)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading Q-Former


BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading Q-Former Done
Load MiniGPT-4 Checkpoint: weights/pretrained_minigpt4.pth


In [5]:
img = Image.open('../AudioCLIP/dataset/italian pasta recipe/0/frames/video_0.jpg')
wrapper.set_image('../AudioCLIP/dataset/italian pasta recipe/0/frames/video_0.jpg')
wrapper.ask("Describe the image in detail.")

  return torch.cuda.amp.autocast(dtype=dtype)


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0