In [2]:
import os
from dotenv import load_dotenv

# Set environment variables in the .env file.
# load_dotenv() is expected to copy the entries of that file into os.environ.
load_dotenv()

# Azure OpenAI connection settings. Each value is read with os.environ[...],
# so a missing variable raises KeyError here rather than in a later cell.
# NOTE(review): OPENAI_API_TYPE is read but not referenced by the cells visible below.
OPENAI_API_TYPE = os.environ["OPENAI_API_TYPE"]
# Passed as `api_version` to the SDK client and as the `api-version` query parameter of the REST URLs.
OPENAI_API_VERSION = os.environ["OPENAI_API_VERSION"]
# Base URL of the Azure OpenAI resource; used as `azure_endpoint` and as the prefix of the REST URLs.
OPENAI_API_BASE = os.environ["OPENAI_API_BASE"]
# Sent as the SDK `api_key` and as the `api-key` header of the REST calls.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Deployment name; used as `model` in SDK calls and as a path segment in the REST URLs.
OPENAI_DEPLOYMENT_NAME = os.environ["OPENAI_DEPLOYMENT_NAME"]

# Azure AI Vision endpoint and key; used by the enhancement data sources and the video index calls.
AZURE_AI_VISION_ENDPOINT = os.environ["AZURE_AI_VISION_ENDPOINT"]
AZURE_AI_VISION_KEY = os.environ["AZURE_AI_VISION_KEY"]

# OpenAI Vision Only

## Image from Public Web
If the image is available via an image URL, the OpenAI SDK may be used.

In [8]:
from openai import AzureOpenAI

# Azure OpenAI SDK client built from the endpoint, key and API version read from the environment.
client = AzureOpenAI(
    azure_endpoint=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_API_VERSION,
)

# One user turn made of two content parts: a text question and an image referenced by URL.
question_part = {"type": "text", "text": "What is in this image?"}
image_part = {
    "type": "image_url",
    "image_url": {
        "url": "https://media.wired.com/photos/64ed0bc52da6c6d86e70e575/master/w_1280,c_limit/WI100123_FF_OpenAI_01.jpg",
    },
}
user_message = {"role": "user", "content": [question_part, image_part]}

response = client.chat.completions.create(
    model=OPENAI_DEPLOYMENT_NAME,
    messages=[user_message],
    max_tokens=300,
)

# Show the first (and only requested) choice, including its metadata.
print(response.choices[0])

Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='The image shows four individuals standing against a purple background. From left to right, the first person is wearing a purple t-shirt with dark pants, the second individual is dressed in a dark purple sweater with black pants, the third person is wearing a denim jacket over a black top paired with black pants, and the fourth person is in a light purple long-sleeve shirt with light pink pants. All four individuals appear to be casually posing for the photo.', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'}, content_filter_results={'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}})


## Image from Local Machine (REST API)
Below is the code sample from OpenAI and Azure OpenAI as of 2024-01-02. It calls the REST API directly instead of going through the SDK.

In [7]:
import os
import requests
import base64

# Configuration
IMAGE_PATH = "../sampledata/image-barbeque.png"

# Read the image inside a context manager so the file handle is closed
# deterministically, then base64-encode it for embedding in a data URL.
with open(IMAGE_PATH, "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode("ascii")

headers = {
    "Content-Type": "application/json",
    "api-key": OPENAI_API_KEY,
}

system_prompt = """You are an assistant helps the blind. In addition to answering questions, you help the blind understand what is in the images provided by the user.

Image outputs should include:
- Detailed description
- Suggested tags
- Key-value pairs (if the image is a form, receipt, invoice, etc.)"""

# Payload for the request: a system prompt plus a single user turn that carries
# only the image. The data URL media type matches the PNG file being sent.
payload = {
    "messages": [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_prompt
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{encoded_image}"
                    }
                }
            ]
        }
    ],
    "temperature": 0.7,
    "top_p": 0.95,
    "max_tokens": 800
}

GPT4V_ENDPOINT = f"{OPENAI_API_BASE}/openai/deployments/{OPENAI_DEPLOYMENT_NAME}/extensions/chat/completions?api-version={OPENAI_API_VERSION}"

# Send request
try:
    response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload)
    response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

# Handle the response as needed (e.g., print or process)
print(response.json())

{'id': 'chatcmpl-8cUYbQWlME7BVEbiyWiZRKkfUCKPr', 'object': 'chat.completion', 'created': 1704184617, 'model': 'gpt-4', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'choices': [{'finish_details': {'type': 'stop', 'stop': '<|fim_suffix|>'}, 'index': 0, 'message': {'role': 'assistant', 'content': 'This is an outdoor image showing a family scene. A person, whose face is blurred, is wearing a white t-shirt and a bright yellow apron, standing in front of a charcoal grill with what appears to be chicken on it. The individual is holding a pair of tongs in one hand and a piece of bread or bun in the other. In the background, two children are playing; one is twirling a hula hoop around her arm while the other appears to be watching. There is a red and white socc

## Image from Local Machine (SDK)
Below is working code using the OpenAI SDK. This code is not in the official samples as of 2024-01-02.

In [4]:
from openai import AzureOpenAI
import base64

# Azure OpenAI SDK client built from the endpoint, key and API version read from the environment.
client = AzureOpenAI(
    azure_endpoint=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_API_VERSION
)

# Configuration
IMAGE_PATH = "../sampledata/image-barbeque.png"

# Read the image inside a context manager so the file handle is closed
# deterministically, then base64-encode it for embedding in a data URL.
with open(IMAGE_PATH, "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode("ascii")

system_prompt = """You are an assistant helps the blind. In addition to answering questions, you help the blind understand what is in the images provided by the user.

Image outputs should include:
- Detailed description
- Suggested tags
- Key-value pairs (if the image is a form, receipt, invoice, etc.)"""

# The local image travels inline as a base64 data URL; the media type matches
# the PNG file being sent.
response = client.chat.completions.create(
    model=OPENAI_DEPLOYMENT_NAME,
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_prompt
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{encoded_image}"
                    },
                },
            ],
        }
    ],
    max_tokens=300,
)

print(response.choices[0])

Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content="Description:\nThe image shows a lively outdoor scene where a man is standing and grilling chicken on a barbecue grill. He is wearing a white T-shirt and a bright yellow apron and is holding barbecue tongs in one hand with what appears to be several pieces of chicken on the grill. His face is blurred for privacy. In the background, there are two children playing with hula hoops in a grassy area with trees around. The child closer to the man is hula hooping, while the younger one is holding a hula hoop and watching, with her face also blurred. Close to the grilling area, there is a deflated red and black soccer ball on the ground.\n\nSuggested Tags:\nOutdoor, Family, BBQ, Grilling, Cooking, Children, Playtime, Recreation, Trees, Nature, Leisure, Activity, Food.\n\nPlease note that faces have been intentionally blurred to protect the individuals' privacy.", role='assistant', function_call=None, tool_calls=None), fi

# OpenAI Vision + Azure AI Vision (REST API)
To use the Azure AI Vision enhancements, the Azure OpenAI REST API is **required**. The enhancements go through the Chat Completions _Extensions_ API, which the official SDK does not expose.

## Image

In [8]:
import os
import requests
import base64

# Configuration
IMAGE_PATH = "../sampledata/image-barbeque.png"

# Read the image inside a context manager so the file handle is closed
# deterministically, then base64-encode it for embedding in a data URL.
with open(IMAGE_PATH, "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode("ascii")

headers = {
    "Content-Type": "application/json",
    "api-key": OPENAI_API_KEY,
}

system_prompt = """You are an assistant helps the blind. In addition to answering questions, you help the blind understand what is in the images provided by the user.

Image outputs should include:
- Detailed description
- Suggested tags
- Key-value pairs (if the image is a form, receipt, invoice, etc.)"""

# Payload for the request. "enhancements" switches on OCR and grounding, and
# "dataSources" supplies the Azure AI Vision resource used for them.
# The data URL media type matches the PNG file being sent.
payload = {
    "enhancements": {
        "ocr": {
            "enabled": True
        },
        "grounding": {
            "enabled": True
        }
    },
    "dataSources": [
        {
            "type": "AzureComputerVision",
            "parameters": {
                "endpoint": AZURE_AI_VISION_ENDPOINT,
                "key": AZURE_AI_VISION_KEY
            }
        }
    ],
    "messages": [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_prompt
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{encoded_image}"
                    }
                }
            ]
        }
    ],
    "temperature": 0.7,
    "top_p": 0.95,
    "max_tokens": 800
}

GPT4V_ENDPOINT = f"{OPENAI_API_BASE}/openai/deployments/{OPENAI_DEPLOYMENT_NAME}/extensions/chat/completions?api-version={OPENAI_API_VERSION}"

# Send request
try:
    response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload)
    response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

# Handle the response as needed (e.g., print or process)
print(response.json())

{'id': 'chatcmpl-8cUbceCQC3SQ6WNS2vxA1jfe5x6cL', 'object': 'chat.completion', 'created': 1704184804, 'model': 'gpt-4', 'choices': [{'finish_details': {'type': 'stop', 'stop': '<|fim_suffix|>'}, 'index': 0, 'message': {'role': 'assistant', 'content': "Description:\nThe image captures an outdoor setting with a man standing in the foreground, actively grilling chicken on a small charcoal grill. The man is wearing a white t-shirt and a bright yellow apron. His face is blurred for privacy. He appears to be holding a pair of tongs in one hand and a piece of bread or bun in the other. In the background, two children are playing; one is playing with a hula hoop, and the other is standing nearby, possibly waiting for her turn. There's a red soccer ball on the ground beside them. The environment is lush with green grass and trees, suggesting a park or a backyard garden. A pile of dirt or sand and a few scattered fallen branches are also visible in the background.\n\nSuggested Tags:\nOutdoor, Fam

## Video

In [11]:
import os
import time
import requests

# Configuration
# Chat Completions "extensions" URL for the deployment; used by the chat request at the end of this cell.
GPT4V_ENDPOINT = f"{OPENAI_API_BASE}/openai/deployments/{OPENAI_DEPLOYMENT_NAME}/extensions/chat/completions?api-version={OPENAI_API_VERSION}"

## ingest the video
# NOTE(review): this URL embeds a SAS signature (`sig=...`) with an expiry (`se=...`) in the query string.
# It is effectively a credential checked into the notebook; consider loading it from .env like the other settings.
VIDEO_FILE_SAS_URL = "https://raztypestore.blob.core.windows.net/temp/UberCognitiveServices.mp4?sv=2023-01-03&st=2024-01-02T08%3A35%3A43Z&se=2025-01-03T08%3A35%3A00Z&sr=b&sp=r&sig=7WRhu0YdOQAxJ7Uj8OKKBeZtPn6m4RhkqXCz03RP7IY%3D"
VIDEO_INDEX_NAME = "RazGpt4VisionVideoIndex8" # this needs to be unique, append number. To delete old indices, use the REST API https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/reference-video-search
# Document id given to the video at ingestion and referenced again as `acv_document_id` in the chat payload.
VIDEO_DOCUMENT_ID = "AOAIChatDocument"

def create_video_index(vision_api_endpoint, vision_api_key, index_name):
    """Issue the PUT request that creates a video retrieval index.

    Args:
        vision_api_endpoint: Base URL of the Azure AI Vision resource (no trailing path).
        vision_api_key: Subscription key sent in the `Ocp-Apim-Subscription-Key` header.
        index_name: Name of the index, used as the last path segment of the URL.

    Returns:
        The response object returned by `requests.put`, unmodified. No status
        check is performed here; callers inspect `status_code` themselves.
    """
    request_headers = {
        "Ocp-Apim-Subscription-Key": vision_api_key,
        "Content-Type": "application/json",
    }
    # The index is created with a single "vision" feature in the "surveillance" domain.
    index_definition = {"features": [{"name": "vision", "domain": "surveillance"}]}
    return requests.put(
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}?api-version=2023-05-01-preview",
        headers=request_headers,
        json=index_definition,
    )

def add_video_to_index(vision_api_endpoint, vision_api_key, index_name, video_url, video_id,
                       ingestion_name="my-ingestion"):
    """Issue the PUT request that starts ingesting one video into an existing index.

    Args:
        vision_api_endpoint: Base URL of the Azure AI Vision resource (no trailing path).
        vision_api_key: Subscription key sent in the `Ocp-Apim-Subscription-Key` header.
        index_name: Name of the target index.
        video_url: URL the service downloads the video from (sent as `documentUrl`).
        video_id: Identifier assigned to the video inside the index (sent as `documentId`).
        ingestion_name: Name of the ingestion resource in the URL path. Defaults to
            "my-ingestion", the value that used to be hard-coded, so existing callers
            are unaffected; pass a different name to run another ingestion on the same index.

    Returns:
        The response object returned by `requests.put`, unmodified. No status
        check is performed here; callers inspect `status_code` themselves.
    """
    url = (
        f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}"
        f"/ingestions/{ingestion_name}?api-version=2023-05-01-preview"
    )
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key, "Content-Type": "application/json"}
    data = {
        'videos': [{'mode': 'add', 'documentId': video_id, 'documentUrl': video_url}]
    }
    response = requests.put(url, headers=headers, json=data)
    return response

def wait_for_ingestion_completion(vision_api_endpoint, vision_api_key, index_name, max_retries=30,
                                  poll_interval=10, request_timeout=30, ingestion_name=None):
    """Poll the ingestion list of an index until an ingestion reaches a terminal state.

    Args:
        vision_api_endpoint: Base URL of the Azure AI Vision resource (no trailing path).
        vision_api_key: Subscription key sent in the `Ocp-Apim-Subscription-Key` header.
        index_name: Name of the index whose ingestions are listed.
        max_retries: Maximum number of polls before giving up.
        poll_interval: Seconds to sleep before each poll (default 10, the former fixed value).
        request_timeout: Per-request timeout in seconds passed to `requests.get`, so a
            stalled connection cannot defeat the `max_retries` bound.
        ingestion_name: When given, only the ingestion with this `name` is considered.
            When None (default), the first listed ingestion is used, as before.

    Returns:
        True when the selected ingestion reports state 'Completed'; False when it
        reports 'Failed' or when `max_retries` polls pass without a terminal state.

    Raises:
        requests.RequestException: Propagated from `requests.get` (for example on timeout).
    """
    url = f"{vision_api_endpoint}/computervision/retrieval/indexes/{index_name}/ingestions?api-version=2023-05-01-preview"
    headers = {"Ocp-Apim-Subscription-Key": vision_api_key}
    for _ in range(max_retries):
        # Sleep before polling: the ingestion has only just been submitted when this is called.
        time.sleep(poll_interval)
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code != 200:
            # Non-200 answers count as "not ready yet" and consume one retry.
            continue
        state_data = response.json()
        # An empty or missing 'value' list means no ingestion is visible yet; keep
        # polling instead of failing with IndexError/KeyError.
        ingestions = state_data.get('value') or []
        if ingestion_name is not None:
            ingestions = [item for item in ingestions if item.get('name') == ingestion_name]
        if not ingestions:
            continue
        state = ingestions[0].get('state')
        if state == 'Completed':
            print(state_data)
            print('Ingestion completed.')
            return True
        if state == 'Failed':
            print(state_data)
            print('Ingestion failed.')
            return False
    return False


# Step 1: Create an Index
response = create_video_index(AZURE_AI_VISION_ENDPOINT, AZURE_AI_VISION_KEY, VIDEO_INDEX_NAME)
print(response.status_code, response.text)

# Step 2: Add a video file to the index
response = add_video_to_index(AZURE_AI_VISION_ENDPOINT, AZURE_AI_VISION_KEY, VIDEO_INDEX_NAME, VIDEO_FILE_SAS_URL, VIDEO_DOCUMENT_ID)
print(response.status_code, response.text)

# Step 3: Wait for ingestion to complete
# Stop the cell when ingestion failed or timed out: the chat request below
# queries this index, so continuing would only hit an index that is not ready.
if not wait_for_ingestion_completion(AZURE_AI_VISION_ENDPOINT, AZURE_AI_VISION_KEY, VIDEO_INDEX_NAME):
    raise SystemExit("Ingestion failed or did not complete within the expected time.")


## Chat with GPT-4V

headers = {"Content-Type": "application/json", "api-key": OPENAI_API_KEY}

system_prompt = """You are an assistant helps the blind. In addition to answering questions, you help the blind understand what is in the images provided by the user.

Image outputs should include:
- Detailed description
- Suggested tags
- Key-value pairs (if the image is a form, receipt, invoice, etc.)"""

# Data source pointing the chat request at the video index populated earlier in this cell.
video_index_source = {
    "type": "AzureComputerVisionVideoIndex",
    "parameters": {
        "computerVisionBaseUrl": f"{AZURE_AI_VISION_ENDPOINT}/computervision",
        "computerVisionApiKey": AZURE_AI_VISION_KEY,
        "indexName": VIDEO_INDEX_NAME,
        "videoUrls": [VIDEO_FILE_SAS_URL],
    },
}

system_message = {
    "role": "system",
    "content": [{"type": "text", "text": system_prompt}],
}

# The user turn references the ingested video by its document id; the text part is a single space.
user_message = {
    "role": "user",
    "content": [
        {"type": "acv_document_id", "acv_document_id": VIDEO_DOCUMENT_ID},
        {"type": "text", "text": " "},
    ],
}

# Payload for the request
payload = {
    "dataSources": [video_index_source],
    "enhancements": {"video": {"enabled": True}},
    "messages": [system_message, user_message],
    "temperature": 0.7,
    "top_p": 0.95,
    "max_tokens": 800,
}

# Send request; any transport error or non-success status aborts the cell.
try:
    response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload)
    response.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

# Handle the response as needed (e.g., print or process)
print(response.json())

201 {"name":"razgpt4visionvideoindex8","userData":{},"features":[{"name":"vision","modelVersion":"2023-05-31","domain":"surveillance"}],"eTag":"\"2023c2a0a59e4e9f858692c726a94887\"","createdDateTime":"2024-01-02T09:51:26.1308243Z","lastModifiedDateTime":"2024-01-02T09:51:26.1308243Z"}
202 {"name":"my-ingestion","state":"Running","batchName":"9e5f0b9f-fa60-4269-b5cf-722d5031c540","createdDateTime":"2024-01-02T09:51:27.2089472Z","lastModifiedDateTime":"2024-01-02T09:51:27.4120723Z"}
{'value': [{'name': 'my-ingestion', 'state': 'Completed', 'batchName': '9e5f0b9f-fa60-4269-b5cf-722d5031c540', 'createdDateTime': '2024-01-02T09:51:27.2089472Z', 'lastModifiedDateTime': '2024-01-02T09:51:48.4121019Z'}]}
Ingestion completed.
{'id': 'chatcmpl-8cVjMGc7O5WsYdLMiSNdwie19I9dq', 'object': 'chat.completion', 'created': 1704189128, 'model': 'gpt-4', 'choices': [{'finish_details': {'type': 'stop', 'stop': '<|fim_suffix|>'}, 'index': 0, 'message': {'role': 'assistant', 'content': 'The images provided ar