In [1]:
#############################
# Install all dependencies
#############################

%pip install langchain==0.0.352 \
             re==2.2.1 \
             pydantic==2.5.3 \
             Pillow==10.0.1 \
             requests==2.31.0 \
             transformers==4.36.2 \
             torch==2.1.1

In [2]:
####################################################################
# Define the model name and path for both the model (LLM and BLIP)
####################################################################

BLIP_MODEL_NAME = 'Salesforce/blip-image-captioning-large'
MODEL_NAME = 'mistralai/Mixtral-8x7B-Instruct-v0.1'

# Make sure the model path is correct for your system!
MODEL_PATH = 'mixtral-8x7b-instruct-v0.1.Q5_0.gguf'

In [3]:
#########################################################################################
# Load the image captioning model
# For the purposes of this blog post, we'll be using BLIP to caption images.
#########################################################################################

from transformers import BlipProcessor, BlipForConditionalGeneration

blip_processor = BlipProcessor.from_pretrained(BLIP_MODEL_NAME)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_NAME)

In [4]:
##########################################################################################
# We need to define the tool that will be used to generate the image caption.
# This tool will be used by the agent to generate the image caption.
##########################################################################################

import requests
from PIL import Image
from langchain.tools import tool
from pydantic.v1 import BaseModel, Field

class ImageCaptionerInput(BaseModel):
    image_url: str = Field(description="URL of the image that is to be described")

@tool("image_captioner", return_direct=True, args_schema=ImageCaptionerInput)
def image_captioner(image_url: str) -> str:
    """Provides information about the image"""
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    inputs = blip_processor(raw_image, return_tensors="pt")
    out = blip_model.generate(**inputs, max_new_tokens=512)
    return blip_processor.decode(out[0], skip_special_tokens=True)

tools = [image_captioner]

In [5]:
# def image_captioner__(image_url: str) -> str:
#     """Provides information about the image"""
#     raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
#     inputs = blip_processor(raw_image, return_tensors="pt")
#     out = blip_model.generate(**inputs, max_new_tokens=512)
#     return blip_processor.decode(out[0], skip_special_tokens=True)

In [6]:
##########################################################################################
# Load the LLM and it's tokenizer
# In this blog post, we're using the quantized version of Mixtral-8x7B-Instruct-v0.1
##########################################################################################

import os
from transformers import AutoTokenizer
from langchain.llms import LlamaCpp

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

llm = LlamaCpp(
    model_path=MODEL_PATH,
    temperature=0.2, #Controls the randomness of the model's output. A higher temperature (e.g., 1.0) produces more diverse but less focused output, while a lower temperature (e.g., 0.5) generates more focused and deterministic output.
    max_tokens=4096, # Sets the maximum number of tokens (words or subwords) that the model can generate in a single inference. This helps limit the length of the output.
    n_ctx=2048, # Sets the context window size for the model. It defines the length of the input sequence that the model considers when generating each token.
    repetition_penalty=1.1, # Introduces a penalty for generating repeated tokens, encouraging the model to produce more diverse and non-repetitive output.
    top_p=0.9, # Top-p allows for dynamic control of the number of tokens considered, leading to different levels of diversity in the generated text.
    top_k=50, # Top-k provides a controlled randomness by considering a fixed number of top probable tokens
    threads=int(os.cpu_count() / 2), # Specifies the number of threads to use during model execution, potentially parallelizing certain operations.
    verbose=False,
    stream=False,
)

                repetition_penalty was transferred to model_kwargs.
                Please confirm that repetition_penalty is what you intended.
                threads was transferred to model_kwargs.
                Please confirm that threads is what you intended.
                stream was transferred to model_kwargs.
                Please confirm that stream is what you intended.
llama_model_loader: loaded meta data with 26 key-value pairs and 995 tensors from mixtral-8x7b-instruct-v0.1.Q5_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:          blk.0.ffn_gate.0.weight q5_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    2:          blk.0.ffn_down.0.weight q5_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_up.0.weight q5_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4

In [7]:
##########################################################################################
# We now need to provide some recommended instructions for the agent to follow.
# These instructions will be used by the agent when it is asked for some help.
# The agent with the help of these instructions knows the DOs and DON'Ts of the task.
# It also now knows on how to use the tools provided to it and how to respond to the user.
##########################################################################################


start_instruct = "<s>[INST] "
end_instruct = "[/INST]"
end_sentence = "</s>"
system_message_plain = """You are a helpful AI assistant for the purpose of captioning provided image. 

You are an agent capable of using only two tools, i.e., 'image_captioner' and 'Final Answer' available to caption an image and then answer follow-up question. 

Here are a few of the tools available to you along with the instructions on how to use them:

- image_captioner: the image_captioner tool must be used when asked to caption or describe or explain an image. 
- Final Answer: the final answer tool must be used to respond to the user. You must use this when you have decided on an answer.

To use these tools you must always respond in JSON format containing `"action"` and `"action_input"` key-value pairs. For example, if user asks you to caption or describe or explain an image, you must use the image_captioner tool like so:

All of AI Assistant's communication is performed using this JSON format. Make sure to always respond in the JSON format without any exceptions.

Description of the tools available to you:
- "image_captioner": Useful when you need to get information about the image
  - To use the image_captioner tool, Assistant should write like so:
      ```json
      {{"action": "image_captioner",
        "action_input": "https://xyz.png"}}
      ```

- "Final Answer": Useful when you have used the tool, or the tool is not required and now you've finalised your answer and need to respond to the user
  - To use the Final Answer tool, Assistant should write like so:
      ```json
      {{"action": "Final Answer",
        "action_input": "The answer to the user's question"}}
      ```
  - The data type of `"action"` and `"action_input"` key-value pairs in the JSON format is always a string. It should never be another dictionary or list.
    - Example of a valid response for `"action_input"`:
      action_input": "value1" # This is valid because the value of "action_input" is a string, value of "action_input" can never be a dictionary or list

Here are some previous conversations between the Assistant and User:
"""


system_message = start_instruct + system_message_plain + end_instruct + end_sentence

In [8]:
#################################################################################################################
# We now need to define the agent's behaviour. 
# This is done by providing agent with some examples in form of context
# By providing the agent with some examples, we are telling the agent how to respond to the user.
# This is really important as it helps the agent to learn how to respond to the user which makes our life easier.
#################################################################################################################

messages = [
        {
            "role": "user",
            "content": "Hello how are you doing today?"
        },
        {
            "role": "assistant",
            "content": '''```json
{{"action": "Final Answer",
 "action_input": "I'm doing well, thank you for asking! How are you doing today??"}}
```'''
        },
        {
            "role": "user",
            "content": "Provide me with the caption of this image - https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg ?"
        },
            {"role": "assistant",
            "content": '''```json
{{"action": "image_captioner",
 "action_input": "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"}}
```'''
        },{
            "role": "user",
            "content": "a woman sitting on the beach with her dog"
        },
            {"role": "assistant",
            "content": '''```json
{{"action": "Final Answer",
 "action_input": "This image shows a woman sitting on the beach with her dog"}}
```'''
        },
        {
            "role": "user",
            "content": "Thanks could you now provide me with the caption of this image - : https://www.adorama.com/alc/wp-content/uploads/2015/05/stories-HRX5WXFyB64-unsplash.jpg"
        },
            {"role": "assistant",
            "content": '''```json
{{"action": "image_captioner",
 "action_input": "https://www.adorama.com/alc/wp-content/uploads/2015/05/stories-HRX5WXFyB64-unsplash.jpg"}}
```'''
        },{
            "role": "user",
            "content": "a beach with sun setting in the background"
        },
            {"role": "assistant",
            "content": '''```json
{{"action": "Final Answer",
 "action_input": "The image is of a sunset on the beach"}}
```'''
        }
]

conversation_formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [9]:
#################################################################################################################
# Finally, we combine all instructions and examples (in correct format) prompt to create the final prompt.
#################################################################################################################

prompt = system_message + '\n\n' + conversation_formatted
instruction = start_instruct + " Respond to the following in JSON with 'action' and 'action_input' values " + end_instruct
human_msg = instruction + "\nUser: {input}"

In [10]:
############################################################################
# We now need to define the output parser for the agent. 
# This basically tells the agent how to parse the output from the agent.
# Based on the parsed output, the agent will then respond to the user.
############################################################################


import re
from langchain.agents import AgentOutputParser
from langchain.schema import AgentAction, AgentFinish
from langchain.output_parsers.json import parse_json_markdown
from langchain.agents.conversational_chat.prompt import FORMAT_INSTRUCTIONS

class OutputParser(AgentOutputParser):
    def get_format_instructions(self) -> str:
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> AgentAction | AgentFinish:
        try:
            # we now need to parse the text as JSON
            # this will give us the action and action_input that the agent wants to use
            # this won't work if text is not a valid JSON string
            response = parse_json_markdown(text)
            action, action_input = response["action"], response["action_input"]
            if action == "Final Answer":
                # Once the agent gives the final answer, we call AgentFinish
                return AgentFinish({"output": action_input}, text)
            else:
                # if not, we call AgentAction with the action and action_input
                return AgentAction(action, action_input, text)
        except Exception as e:
            # At times, agent might not respond in the correct JSON format as we require to parse the JSON
            # In such cases, we need to handle the exception and try to parse the text in a different way
            # The whole point of this is to make sure that we parse the text even if it is not in the correct JSON format
            # We do this by using regular expressions to find the content between the curly braces
            # We then replace the backslashes in the content (If ANY) and then format the JSON string
            # We then parse the formatted JSON string
            # Even after this, sometimes the agent might not respond in the correct JSON format
            # We'll then need to figure out a way to handle this as well
            # But for now, we'll just return AgentFinish with the text as the output

            try:
                # Use regular expression to find content between {}
                match = re.search(r'\{([^}]*)\}', text)
                content_between_braces = match.group(1)

                # Replace backslashes in the content
                # At times, the agent returns action\_input instead of action_input
                escaped_content = content_between_braces.replace("\\", "")
                # Format the JSON string
                formatted_json = f'''```json{{{escaped_content}}}```'''
                # Parse the formatted JSON string and get the action and action_input
                response = parse_json_markdown(formatted_json)
                action, action_input = response["action"], response["action_input"]

                # We then repeat the same process as above
                if action == "Final Answer":
                    # this means the agent is finished so we call AgentFinish
                    return AgentFinish({"output": action_input}, text)
                else:
                    # otherwise the agent wants to use an action, so we call AgentAction
                    return AgentAction(action, action_input, text)
            except:
                # If the agent still doesn't respond in the correct JSON format, we just call AgentFinish with the text as the output
                return AgentFinish({"output": text}, text)

    @property
    def _type(self) -> str:
        return "conversational_chat"
    
# Initialize parser
parser = OutputParser()

In [11]:
###################################################################################################################################################
# We now need to initialize the agent with the tools we defined above, the LLM that we loaded above, the output parser that we defined above
# We also need to provide the agent with the memory that it will use to store the previous interactions. This is really important if we want to
# do Q&A with the agent. The agent will use the memory to store the previous interactions and then use them to answer the user's questions.
###################################################################################################################################################


from langchain.memory import ConversationBufferWindowMemory
from langchain.agents.initialize import initialize_agent

# Tracks and utilizes the latest K interactions in the conversation.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history", k=5, return_messages=True, output_key="output"
)

agent = initialize_agent(
    agent="chat-conversational-react-description",
    tools=tools,
    llm=llm,
    verbose=True,
    early_stopping_method="generate",
    memory=memory,
    handle_parsing_errors=False,
    agent_kwargs={"output_parser": parser}
)

In [12]:
##############################################################################
# We now need to provide the agent with the prompt that we created above.
# This prompt will be used by the agent to respond to the user.
##############################################################################

new_prompt = agent.agent.create_prompt(
    system_message=prompt,
    tools=tools
)
agent.agent.llm_chain.prompt = new_prompt
agent.agent.llm_chain.prompt.messages[2].prompt.template = human_msg

In [13]:
resp = agent("Explain this image: https://images.hindustantimes.com/auto/img/2023/07/23/1600x900/Tesla_Cybertruck_1688887534001_1690087911053.jpeg")
print(resp['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

Assistant: {
  "action": "image_captioner",
  "action_input": "https://images.hindustantimes.com/auto/img/2023/07/23/1600x900/Tesla_Cybertruck_1688887534001_1690087911053.jpeg"
}

Assistant: {
  "action": "Final Answer",
  "action_input": "The image shows the Tesla Cybertruck, an all-electric battery-powered light commercial vehicle."
}[0m
Observation: [36;1m[1;3ma close up of a silver car parked on a parking lot[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
a close up of a silver car parked on a parking lot


In [14]:
resp = agent('What is the color of the car?')
print(resp['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

```json
{
  "action": "Final Answer",
  "action_input": "The color of the car appears to be silver."
}
```[0m

[1m> Finished chain.[0m
The color of the car appears to be silver.


In [15]:
resp = agent('What is the brand of the car?')
print(resp['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

```json
{
  "action": "Final Answer",
  "action_input": "The brand of the car appears to be Tesla."
}
```[0m

[1m> Finished chain.[0m
The brand of the car appears to be Tesla.


In [16]:
resp = agent('How many people are there in the car?')
print(resp['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m

Assistant: {
  "action": "Final Answer",
  "action_input": "It is not possible to determine the number of people in the car based on the provided image."
}[0m

[1m> Finished chain.[0m
It is not possible to determine the number of people in the car based on the provided image.


In [18]:
resp = agent('Explain the feature of this car?')
print(resp['output'])



[1m> Entering new AgentExecutor chain...[0m


[32;1m[1;3m

```json
{
    "action": "Final Answer",
    "action_input": "Based on the provided image, it is not possible to identify specific features of the car. However, some common features of Tesla cars include Autopilot capabilities, a large touchscreen display, and a fully electric powertrain."
}
```[0m

[1m> Finished chain.[0m
Based on the provided image, it is not possible to identify specific features of the car. However, some common features of Tesla cars include Autopilot capabilities, a large touchscreen display, and a fully electric powertrain.
