In [None]:
%pip install -U transformers accelerate ctransformers langchain torch pydantic

In [None]:
import os
import torch
import transformers
from transformers import AutoTokenizer
from pydantic import BaseModel, Field
from typing import Type, Optional
import json
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain

In [None]:
from langchain import PromptTemplate, LLMChain

In [None]:
# Location of the local GGUF model file; this value is passed as `model=` when
# the CTransformers LLM is created.
MODEL_PATH = './model/mistral-7b-instruct-v0.1.Q5_0.gguf'  # Replace this with the path to your model

In [None]:
# Settings dictionary handed to the CTransformers wrapper via `config=`.
config = {
    "max_new_tokens": 2048,
    "context_length": 4096,
    "repetition_penalty": 1.1,
    "temperature": 0.5,
    "top_k": 50,
    "top_p": 0.9,
    "stream": True,
    # Use half of the available CPUs, but never fewer than one thread.
    # `os.cpu_count()` may return None (count undetermined), and halving a
    # single-core count would otherwise yield 0.
    "threads": max(1, (os.cpu_count() or 1) // 2),
}

In [None]:
# Local LLM wrapper: loads the model file at MODEL_PATH with the settings in `config`.
llm = CTransformers(model=MODEL_PATH, config=config)

In [None]:
# Tokenizer for the Mistral instruct checkpoint. In this notebook it is used to
# render the few-shot conversation through `apply_chat_template`.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# The processor and the captioning model are loaded from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

In [None]:
from langchain.tools import tool
import requests
from PIL import Image

# Argument schema for the `image_captioner` tool: a single string field holding
# the URL of the image to caption. Documented with comments rather than a class
# docstring so the pydantic model itself is left exactly as it was.
class ImageCaptionerInput(BaseModel):
    image_url: str = Field(description="URL of the image that is to be described")


@tool("image_captioner", return_direct=True, args_schema=ImageCaptionerInput)
def image_captioner(image_url: str) -> str:
    """Provides information about the image"""
    # The docstring above is intentionally left byte-for-byte unchanged: the
    # `@tool` decorator uses it as the tool description, so it is runtime text.
    #
    # Purpose: download the image at `image_url`, caption it with BLIP, and
    # return the caption string.
    # Raises: `requests` exceptions on timeout / connection failure / non-2xx
    # status, and PIL errors if the payload is not a decodable image.
    from io import BytesIO  # local import keeps this cell self-contained

    download_timeout_s = 30  # avoid hanging the kernel on an unresponsive host

    # Fetch the whole body via `.content` (decoded, connection released)
    # instead of handing PIL the undecoded `.raw` socket stream, and fail
    # loudly on HTTP errors rather than trying to parse an error page as an image.
    # The URL comes from model output, so stray surrounding whitespace is trimmed.
    response = requests.get(image_url.strip(), timeout=download_timeout_s)
    response.raise_for_status()

    with Image.open(BytesIO(response.content)) as downloaded:
        raw_image = downloaded.convert('RGB')

    inputs = blip_processor(raw_image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True)

# Tool list given to the agent; the image captioner is the only tool.
tools = [image_captioner]

In [None]:
from langchain.agents import AgentOutputParser
from langchain.agents.conversational_chat.prompt import FORMAT_INSTRUCTIONS
from langchain.output_parsers.json import parse_json_markdown
from langchain.schema import AgentAction, AgentFinish
from langchain.memory import ConversationBufferWindowMemory

# Windowed conversation memory (k=5) that is passed to the agent as `memory=`.
# NOTE(review): "chat_history" and "output" presumably have to match the key
# names the conversational agent prompt/chain uses — confirm before renaming.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history", k=5, return_messages=True, output_key="output"
)

class OutputParser(AgentOutputParser):
    """Parser for model replies shaped as JSON with "action" and "action_input"."""

    def get_format_instructions(self) -> str:
        """Return the imported conversational-chat FORMAT_INSTRUCTIONS unchanged."""
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> AgentAction | AgentFinish:
        """Turn raw model output into an agent step.

        A reply whose JSON has action "Final Answer" finishes the run with
        "action_input" as the output; any other action becomes a tool call.
        If anything goes wrong while decoding or building the step, the raw
        text itself is returned as the final output.
        """
        try:
            payload = parse_json_markdown(text)
            action = payload["action"]
            action_input = payload["action_input"]
            if action != "Final Answer":
                # Any non-final action is treated as a request to run a tool.
                return AgentAction(action, action_input, text)
            return AgentFinish({"output": action_input}, text)
        except Exception:
            # Deliberate best-effort fallback: output that is not the expected
            # JSON is handed back verbatim as the final answer.
            return AgentFinish({"output": text}, text)

    @property
    def _type(self) -> str:
        """Identifier string for this parser."""
        return "conversational_chat"


# Instantiate the custom output parser; it is handed to the agent through
# `agent_kwargs={"output_parser": parser}`.
parser = OutputParser()

from langchain.agents import initialize_agent

# Build the conversational ReAct-style chat agent from the pieces defined in
# this notebook: the local LLM, the single image-captioning tool, the windowed
# memory and the custom JSON output parser.
agent = initialize_agent(
    agent="chat-conversational-react-description",
    tools=tools,
    llm=llm,
    verbose=True,
    early_stopping_method="generate",
    memory=memory,
    agent_kwargs={"output_parser": parser}
)

In [None]:
# Instruction-format markers that are concatenated around the prompt text below.
start_instruct = "<s>[INST] "
end_instruct = "[/INST]"
end_sentence = "</s>"

# System prompt describing the JSON "action"/"action_input" protocol and the
# one available tool. Braces in the JSON example are doubled; presumably this is
# so they survive later prompt-template formatting as literal braces — confirm
# before changing them. (Fixed the grammar slip "a expert" -> "an expert".)
system_message_plain = """Assistant is an expert JSON builder designed to help user describe images.

Assistant is able to respond to the User and use tools using JSON strings that contain "action" and "action_input" parameters.

All of Assistant's communication is performed using this JSON format.

Assistant can also use tools by responding to the user with tool use instructions in the same "action" and "action_input" JSON format. Tools available to Assistant are:

- "image_captioner": Useful when you need to get information about the image
  - To use the image_captioner tool, Assistant should write like so:
    ```json
    {{"action": "image_captioner",
      "action_input": "https://xyz.png"}}
    ```

Here are some previous conversations between the Assistant and User:
"""
# Wrap the plain text in the instruction markers and close it with the
# end-of-sentence marker.
system_message = start_instruct + system_message_plain + end_instruct + end_sentence

In [None]:
# Display the assembled system prompt as the cell output for inspection.
system_message

In [None]:
def _fenced_action(action, action_input):
    """Build an assistant turn: a fenced json block with doubled braces."""
    return (
        '```json\n'
        '{{"action": "' + action + '",\n'
        ' "action_input": "' + action_input + '"}}\n'
        '```'
    )


# Few-shot conversation: user turns alternate with assistant turns written in
# the fenced-JSON action format. After each `image_captioner` action, the next
# user turn carries the caption text and the assistant answers with a
# "Final Answer" action.
messages = [
    {"role": "user", "content": 'Hey how are you today?'},
    {
        "role": "assistant",
        "content": _fenced_action("Final Answer", "I'm good thanks, how are you?"),
    },
    {
        "role": "user",
        "content": "I'm great, what is this image about - https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg ?",
    },
    {
        "role": "assistant",
        "content": _fenced_action(
            "image_captioner",
            "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
        ),
    },
    {"role": "user", "content": "a woman sitting on the beach with her dog"},
    {
        "role": "assistant",
        "content": _fenced_action(
            "Final Answer",
            "This image shows a woman sitting on the beach with her dog",
        ),
    },
    {
        "role": "user",
        "content": "Thanks could you now tell me what is in this image: https://www.adorama.com/alc/wp-content/uploads/2015/05/stories-HRX5WXFyB64-unsplash.jpg",
    },
    {
        "role": "assistant",
        "content": _fenced_action(
            "image_captioner",
            "https://www.adorama.com/alc/wp-content/uploads/2015/05/stories-HRX5WXFyB64-unsplash.jpg",
        ),
    },
    {"role": "user", "content": "a beach with sun setting in the background"},
    {
        "role": "assistant",
        "content": _fenced_action(
            "Final Answer", "The image is of a sunset on the beach"
        ),
    },
]

# Render the few-shot `messages` with the tokenizer's chat template.
# `tokenize=False` is passed because the result is concatenated into the
# prompt as text rather than used as token ids.
conversation_formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Full system prompt: the wrapped system message, a blank line, then the
# rendered few-shot conversation.
# NOTE(review): `system_message` already begins with "<s>"; if the chat template
# also emits a leading "<s>", the marker appears twice in `prompt` — verify.
prompt = system_message + '\n\n' + conversation_formatted

In [None]:
# Ask the agent class to build a prompt that uses the custom system text
# (system message + few-shot conversation) together with the tool list.
new_prompt = agent.agent.create_prompt(
    system_message=prompt,
    tools=tools
)

In [None]:
# Install the customised prompt on the agent's LLM chain.
agent.agent.llm_chain.prompt = new_prompt

# Instruction wrapped in the [INST] markers, followed by the user's input slot.
# `{input}` is left as a literal placeholder for later template formatting.
instruction = f"{start_instruct} Respond to the following in JSON with 'action' and 'action_input' values {end_instruct}"
human_msg = f"{instruction}\nUser: {{input}}"

In [None]:
# Overwrite the template of the prompt's third message with `human_msg`.
# NOTE(review): index 2 is assumed to be the human-message slot produced by
# `create_prompt` — confirm, since a different message order would silently
# patch the wrong entry.
agent.agent.llm_chain.prompt.messages[2].prompt.template = human_msg

In [None]:
# Run the agent on a request that contains an image URL, then display the
# "output" entry of the response as the cell result.
resp = agent("Explain this image: https://images.hindustantimes.com/auto/img/2023/07/23/1600x900/Tesla_Cybertruck_1688887534001_1690087911053.jpeg")
resp['output']

In [None]:
# Display the agent's last output again.
# NOTE(review): this repeats the expression that ends the previous cell.
resp['output']

In [None]:
# Follow-up question that contains no image URL; `resp` is rebound to the new
# response and its "output" entry is displayed.
resp = agent('Where was the Tesla car parked?')
resp['output']