### How to chat with GPT using the OpenAI API

In [1]:
import os
import openai
from openai import OpenAI
from rich.console import Console
from typing import List, Dict, Optional

# Record the installed openai package version next to the notebook output.
print("openai version:[%s]" % (openai.__version__))

openai version:[1.3.7]


### Locate where your key is

In [2]:
# Location of the plain-text file that holds the OpenAI API key.
key_path = "../key/rilab_key.txt"
print("key_path:[%s]" % (key_path))

# Strip surrounding whitespace: key files usually end with a newline, which
# would otherwise become part of the credential sent to the API.
with open(key_path, "r", encoding="utf-8") as f:
    OPENAI_API_KEY = f.read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

key_path:[../key/rilab_key.txt]


### Query function

In [3]:
def query_gpt(
    messages: List[Dict],
    gpt_model="gpt-4-vision-preview",
    max_tokens: Optional[int] = None,
):
    """
    Send one chat-completion request through the module-level `client`.

    gpt_model: 'gpt-4-vision-preview'
    refer to : https://platform.openai.com/docs/guides/vision

    Args:
        messages: Chat history as a list of {"role": ..., "content": ...} dicts.
        gpt_model: Name of the model to query.
        max_tokens: Optional cap on the number of generated tokens. When None
            (the default) the argument is omitted and the API default applies;
            in the recorded run of this notebook that default cut the reply
            off after 16 completion tokens.

    Returns:
        (content, status_code, response) where `content` is the text of the
        first choice, `status_code` is that choice's `finish_reason`, and
        `response` is the raw object returned by the client.
    """
    # Only forward max_tokens when the caller asked for it, so the default
    # call issues exactly the same request as before.
    request_kwargs = {"model": gpt_model, "messages": messages}
    if max_tokens is not None:
        request_kwargs["max_tokens"] = max_tokens

    # Call the OpenAI API
    response = client.chat.completions.create(**request_kwargs)

    # Extract the response content and finish reason from the first choice
    first_choice = response.choices[0]
    content = first_choice.message.content
    status_code = first_choice.finish_reason
    return content, status_code, response


print("Ready.")

Ready.


`messages`, the input to GPT, is a list in which each item is a dictionary consisting of a `role` and a `content`. A `role` can be one of:
* `system`: which defines the identity of the agent
* `user`: which states the input of a user
* `assistant`: which stores messages previously generated by the agent

More information can be found [here](https://platform.openai.com/docs/guides/gpt/chat-completions-api).

In [4]:
# System prompt, user question and image location for the single-turn demo.
role_msg = """You are a helpful agent with vision capabilities; do not respond to objects not depicted in images."""
question = "What’s in this image?"
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# Build the payload from the variables above so the question and the image
# URL are defined in exactly one place. The user turn is a list of typed
# parts: one text part followed by one image part.
messages = [
    {"role": "system", "content": role_msg},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    },
]

print(messages)

[{'role': 'system', 'content': 'You are a helpful agent with vision capabilities; do not respond to objects not depicted in images.'}, {'role': 'user', 'content': [{'type': 'text', 'text': 'What’s in this image?'}, {'type': 'image_url', 'image_url': {'url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'}}]}]


### Now let's use `GPT`

In [6]:
# One request to the API; unpacks the reply text, the first choice's
# finish_reason, and the raw response object returned by query_gpt.
content, status_code, response = query_gpt(messages=messages, gpt_model="gpt-4-vision-preview")

In [7]:
# Reply text of the first choice only.
print(content)

The image depicts a tranquil natural setting featuring a wooden boardwalk stretching out into the


In [8]:
# finish_reason of the first choice. In the recorded run this is None and the
# stop cause appears under finish_details in the raw response instead.
print(status_code)

None


In [9]:
# Raw response object; its repr includes the resolved model name and token usage.
print(response)

ChatCompletion(id='chatcmpl-8T4Z9LJ2nnajDUnnlVMkez2ensmM2', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='The image depicts a tranquil natural setting featuring a wooden boardwalk stretching out into the', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'max_tokens'})], created=1701939755, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=16, prompt_tokens=1141, total_tokens=1157))


### Helper Class for implementing efficient chat with GPT

In [10]:
from PIL import Image
import requests
import re
import base64, io
from IPython.display import Markdown, display


def printmd(string):
    """Render `string` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)


class GPT4VchatClass:
    """
    Stateful multi-turn chat helper for a vision-capable OpenAI chat model.

    The instance keeps the running conversation in `self.messages` (starting
    with a single system message) and a pristine copy of that starting state
    in `self.init_messages`, which `chat(..., RESET_CHAT=True)` restores.

    Images are referenced from the user text with `<img>` / `<imgN>` tags.
    Tags are matched to the `images` list by order of appearance, not by the
    number inside the tag.
    """

    # Capturing group so that re.split keeps the tags in its result.
    _IMG_TAG_PATTERN = re.compile(r"(<img\d*>)")

    def __init__(
        self,
        gpt_model: str = "gpt-4-vision-preview",
        role_msg: str = "You are a helpful agent with vision capabilities; do not respond to objects not depicted in images.",
        image_longaxis_max_size: int = 512,
        VERBOSE: bool = True,
    ):
        """
        Args:
            gpt_model: Model name passed to the chat-completions endpoint.
            role_msg: Content of the system message that opens every chat.
            image_longaxis_max_size: Images are shrunk to fit in a square of
                this side length (in pixels) before being sent.
            VERBOSE: When true, print setup information and raw responses.
        """
        self.gpt_model = gpt_model
        self.role_msg = role_msg
        self.messages = [{"role": "system", "content": f"{role_msg}"}]
        self.init_messages = [{"role": "system", "content": f"{role_msg}"}]
        self.VERBOSE = VERBOSE
        # Always create the console: chat() prints through it whenever
        # PRINT_USER_MSG / PRINT_GPT_OUTPUT are set, regardless of VERBOSE.
        self.console = Console()
        self.response = None
        self.image_longaxis_max_size = image_longaxis_max_size

        self._setup_client()

    def _setup_client(self, key_path: str = "../"):
        """
        Read the API key from disk and create the OpenAI client.

        Args:
            key_path: Either the key file itself, or a directory that contains
                `key/rilab_key.txt`. The default "../" resolves to
                "../key/rilab_key.txt".

        Raises:
            OSError: If the key file cannot be opened.
        """
        if os.path.isdir(key_path):
            key_path = os.path.join(key_path, "key", "rilab_key.txt")
        if self.VERBOSE:
            self.console.print("[bold cyan]key_path:[%s][/bold cyan]" % (key_path))

        # Strip the trailing newline most key files end with; otherwise it
        # becomes part of the credential.
        with open(key_path, "r", encoding="utf-8") as f:
            OPENAI_API_KEY = f.read().strip()
        self.client = OpenAI(api_key=OPENAI_API_KEY)

        if self.VERBOSE:
            self.console.print(
                "[bold cyan]Chat agent using [%s] initialized with the follow role:[%s][/bold cyan]"
                % (self.gpt_model, self.role_msg)
            )

    def _encode_image(self, image_pil: "Image.Image") -> str:
        """Return the image, converted to RGB and saved as PNG, as a base64 string."""
        image_pil_rgb = image_pil.convert("RGB")
        # change pil to base64 string
        img_buf = io.BytesIO()
        image_pil_rgb.save(img_buf, format="PNG")
        # Encode bytes to base64 string
        img_base64 = base64.b64encode(img_buf.getvalue()).decode("utf-8")
        return img_base64

    def _divide_by_img_tag(self, text: str) -> List[str]:
        """
        Split `text` into image tags and the text between them, dropping
        segments that are empty or whitespace-only.

        Both numbered (`<img1>`) and bare (`<img>`) tags are recognised.

        Example:
        Input: "<img1> <img2> What is the difference of these two images?"
        Output: ['<img1>', '<img2>', ' What is the difference of these two images?']
        """
        segments = self._IMG_TAG_PATTERN.split(text)
        return [seg for seg in segments if seg.strip() != ""]

    def _add_message(
        self, role="assistant", content: str = "", images: Optional[List] = None
    ):
        """
        Append one turn to the conversation history.

        role: 'assistant' / 'user'
        content: Message text; may contain `<img>` / `<imgN>` tags when
            `images` is given.
        images: Image files to attach, consumed in order, one per tag. When
            None the message is stored as plain text.

        Raises:
            IndexError: If `content` contains more image tags than there are
                entries in `images`.
        """
        if images is None:
            self.messages.append({"role": role, "content": content})
            return

        new_content = []
        image_num = 0
        max_side = self.image_longaxis_max_size
        for segment in self._divide_by_img_tag(content):
            if self._IMG_TAG_PATTERN.fullmatch(segment) is None:
                # this is text
                new_content.append({"type": "text", "text": segment})
                continue

            # this is image
            if image_num >= len(images):
                raise IndexError(
                    "message references more image tags than the %d image(s) provided"
                    % len(images)
                )
            # Close the file handle as soon as the pixels have been encoded.
            with Image.open(images[image_num]) as image_pil:
                # thumbnail() shrinks in place and keeps the aspect ratio
                image_pil.thumbnail((max_side, max_side))
                base64_image = self._encode_image(image_pil)
            new_content.append(
                {
                    "type": "image_url",
                    # _encode_image produces PNG bytes, so label them as PNG.
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                }
            )
            image_num += 1
        self.messages.append({"role": role, "content": new_content})

    def _get_response_content(self):
        """Text of the first choice of the last response, or None if there is none."""
        if self.response:
            return self.response.choices[0].message.content
        else:
            return None

    def _get_response_status(self):
        """`finish_reason` of the first choice of the last response, or None."""
        if self.response:
            # finish_reason is a field of the choice, not of its message.
            return self.response.choices[0].finish_reason
        else:
            return None

    def chat(
        self,
        user_msg: str = "<img> what's in this image?",
        images: List[str] = ["../img/cat.png"],
        PRINT_USER_MSG=True,
        PRINT_GPT_OUTPUT=True,
        RESET_CHAT=False,
        RETURN_RESPONSE=True,
        MAX_TOKENS=512,
    ):
        """
        Send one user turn (text plus optional images) and record the reply.

        Args:
            user_msg: User text; image positions are marked with `<img>` /
                `<imgN>` tags.
            images: Image files matched to the tags in order, or None for a
                text-only turn. The default list is never modified.
            PRINT_USER_MSG: Echo the user message as Markdown.
            PRINT_GPT_OUTPUT: Show the reply as Markdown.
            RESET_CHAT: Restore the history to the initial system message
                after this turn.
            RETURN_RESPONSE: Return the reply text instead of None.
            MAX_TOKENS: Upper bound on generated tokens for this turn.

        Returns:
            The reply text when RETURN_RESPONSE is true, otherwise None.
        """
        self._add_message(role="user", content=user_msg, images=images)
        try:
            self.response = self.client.chat.completions.create(
                model=self.gpt_model, messages=self.messages, max_tokens=MAX_TOKENS
            )
        except Exception:
            # Drop the unanswered user turn so a retry does not send it twice.
            self.messages.pop()
            raise
        if self.VERBOSE:
            print(self.response)
        # Backup response for continuous chatting
        self._add_message(role="assistant", content=self._get_response_content())
        if PRINT_USER_MSG:
            self.console.print("[deep_sky_blue3][USER_MSG][/deep_sky_blue3]")
            printmd(user_msg)
        if PRINT_GPT_OUTPUT:
            self.console.print("[spring_green4][GPT_OUTPUT][/spring_green4]")
            printmd(self._get_response_content())
        # Reset
        if RESET_CHAT:
            # Copy instead of aliasing, so later turns cannot grow init_messages.
            self.messages = [dict(msg) for msg in self.init_messages]
        # Return
        if RETURN_RESPONSE:
            return self._get_response_content()
        return None


print("Ready.")

Ready.


### Now let's chat

<img src="../img/cat.png" width="256">

In [11]:
# Fresh agent, so this demo starts from a conversation that holds only the
# system message.
GPT = GPT4VchatClass(
    role_msg="You are a helpful agent with vision capabilities; do not respond to objects not depicted in images.",
    gpt_model="gpt-4-vision-preview",
)

# Switches for this turn: echo both sides, keep the history, return nothing.
PRINT_USER_MSG, PRINT_GPT_OUTPUT = True, True
RESET_CHAT, RETURN_RESPONSE = False, False

# The "<img1>" tag marks where the image is inserted into the user message.
GPT.chat(
    "<img1> Describe the image.",
    ["../img/cat.png"],
    PRINT_USER_MSG=PRINT_USER_MSG,
    PRINT_GPT_OUTPUT=PRINT_GPT_OUTPUT,
    RESET_CHAT=RESET_CHAT,
    RETURN_RESPONSE=RETURN_RESPONSE,
    MAX_TOKENS=256,
)

ChatCompletion(id='chatcmpl-8T4ZdC1vLXPgHJ31gLQIrEEFVWnmq', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content="The image shows an adorable kitten standing on its hind legs with one front paw raised as if it is reaching for something above. The kitten has a fluffy coat with a combination of light brown, cream, and white colors and distinctive tabby markings on its face, along with a white chin and chest. It has large, round, and expressive eyes and appears curious or playful. The backdrop is a soft blue color, providing a calm and complimentary background that contrasts nicely with the kitten's warm tones.", role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})], created=1701939785, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=99, prompt_tokens=289, total_tokens=388))


<img1> Describe the image.

The image shows an adorable kitten standing on its hind legs with one front paw raised as if it is reaching for something above. The kitten has a fluffy coat with a combination of light brown, cream, and white colors and distinctive tabby markings on its face, along with a white chin and chest. It has large, round, and expressive eyes and appears curious or playful. The backdrop is a soft blue color, providing a calm and complimentary background that contrasts nicely with the kitten's warm tones.

### Multi Image

<img src="../img/cat.png" height="256"> <img src="../img/dog.png" height="256">

In [15]:
# Fresh agent for the two-image comparison.
GPT = GPT4VchatClass(
    role_msg="You are a helpful agent with vision capabilities; do not respond to objects not depicted in images.",
    gpt_model="gpt-4-vision-preview",
)

# Switches for this turn: echo both sides, reset the history afterwards,
# return nothing.
PRINT_USER_MSG, PRINT_GPT_OUTPUT = True, True
RESET_CHAT, RETURN_RESPONSE = True, False

# Tags are paired with the image list in order: first tag -> cat, second -> dog.
GPT.chat(
    "<img1> <img2> What is the difference between two images?",
    ["../img/cat.png", "../img/dog.png"],
    PRINT_USER_MSG=PRINT_USER_MSG,
    PRINT_GPT_OUTPUT=PRINT_GPT_OUTPUT,
    RESET_CHAT=RESET_CHAT,
    RETURN_RESPONSE=RETURN_RESPONSE,
)

ChatCompletion(id='chatcmpl-8T4dk5RTf7AItnoA64MEquviWtv2F', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='The first image features a cat, while the second image depicts a dog. The cat appears to be indoors with a solid light-colored background and is reaching upwards with its front paw, giving it a playful or curious expression. The dog, on the other hand, is outdoors, possibly on a beach, judging by the sandy ground, with a natural, bright sky background. The dog is seated and looking upward with a calm or inquisitive expression.\n\nIn summary, the differences include the species (cat vs. dog), the setting (indoor vs. outdoor), and the pose and expressions of the animals.', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})], created=1701940040, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=122, prompt_tok

<img1> <img2> What is the difference between two images?

The first image features a cat, while the second image depicts a dog. The cat appears to be indoors with a solid light-colored background and is reaching upwards with its front paw, giving it a playful or curious expression. The dog, on the other hand, is outdoors, possibly on a beach, judging by the sandy ground, with a natural, bright sky background. The dog is seated and looking upward with a calm or inquisitive expression.

In summary, the differences include the species (cat vs. dog), the setting (indoor vs. outdoor), and the pose and expressions of the animals.