# Multimodal Models

In [0]:
!pip install -qU openai
!pip install pdf2image
!apt-get install -y poppler-utils
dbutils.library.restartPython()

In [0]:
import os
import io
import time
import openai
import base64
import requests
import textwrap
from pdf2image import convert_from_path
import PIL

from IPython.display import Image, Audio, Markdown, Math
from openai import OpenAI
from openai import AzureOpenAI

In [0]:
# Azure OpenAI connection settings. They are stored as attributes on the
# `openai` module and then handed explicitly to the client constructors
# (here and in later cells).
openai.api_type = "azure"
openai.api_key = os.environ["OPENAI_API_KEY"]  # KeyError if the variable is not set
openai.api_version = "2023-07-01-preview"
#openai.api_base = "https://rg-rbi-aa-aitest-dsacademy.openai.azure.com/"
openai.api_base = "https://chatgpt-summarization.openai.azure.com/"

llm_model_name = "gpt-4o"
llm_deploy_name = "pioneers-gpt-4o"  # passed as `model=` in the chat completion calls

# Shared client used by the chat, speech and image requests in this notebook.
client = AzureOpenAI(api_key=openai.api_key,
                     api_version=openai.api_version,
                     azure_endpoint=openai.api_base,
                     )

## Dealing with images

#### Opening Image

In [0]:
# Sample image used by the next cells; rendered inline at 700 px width.
filename = "../../Data/handwritten.jpg"
Image(filename=filename, width=700)

#### Encoding Image (base64)

In [0]:
def encode_image(input_image, pdf_page=0):
    """Return an image as a base64-encoded UTF-8 string.

    Args:
        input_image: A file path (``str``) or a ``PIL.Image.Image``.
            * A path ending in ``.pdf`` (case-insensitive) is rendered with
              pdf2image at 300 dpi and the selected page is encoded as PNG.
            * Any other path is read and encoded byte-for-byte, so the
              payload keeps the file's own format (no PNG conversion).
            * A PIL image is serialized to PNG in memory.
        pdf_page: Zero-based page index, only used for PDF paths. Negative
            values index from the end of the document.

    Returns:
        The base64 text of the encoded bytes.

    Raises:
        IndexError: ``pdf_page`` is outside the document.
        ValueError: ``input_image`` is neither a path nor a PIL image.
    """
    if isinstance(input_image, str):
        if not input_image.lower().endswith('.pdf'):
            with open(input_image, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")

        pdf_path = input_image
        if pdf_page >= 0:
            # Render only the requested page instead of the whole document;
            # pdf2image page numbers are 1-based.
            pages = convert_from_path(pdf_path, dpi=300, poppler_path='/usr/bin',
                                      first_page=pdf_page + 1,
                                      last_page=pdf_page + 1)
            page_index = 0
        else:
            # Negative indices need the full page list to count from the end.
            pages = convert_from_path(pdf_path, dpi=300, poppler_path='/usr/bin')
            page_index = pdf_page

        try:
            input_image = pages[page_index]
        except IndexError:
            raise IndexError(
                f"PDF page index {pdf_page} is out of range for {pdf_path}"
            ) from None

    if isinstance(input_image, PIL.Image.Image):
        with io.BytesIO() as buffer:
            input_image.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")

    raise ValueError("Input must be a file path or a PIL image object")

In [0]:
# Base64 text of the sample image; the bare name echoes the (long) string as cell output.
myimage = encode_image(filename)
myimage

#### Sending request

In [0]:
prompt = "Can you transcribe this text?"

# The instruction travels in the system message; the user message carries only
# the image. `myimage` is the byte-for-byte encoding of handwritten.jpg, so the
# data URL is labelled as JPEG to match the actual payload.
messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": f"{prompt}"
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{myimage}"
          }
        }
      ]
    },
  ]

response = client.chat.completions.create(model=llm_deploy_name,
                                          messages=messages,
                                          temperature=0.5,
                                          max_tokens=256,
                                          top_p=1,
                                          frequency_penalty=0,
                                          presence_penalty=0,
                                          #stop=None,
                                          )


print(response.choices[0].message.content)

## PDF Files

#### Encoding the Image

In [0]:
# Encode page index 1 of the PDF; the returned base64 string is echoed as cell output.
filename = "../../Data/pdf/NASDAQ_AMZN_2019.pdf"
encode_image(filename, pdf_page=1)

#### Displaying the PDF file (sanity check)

In [0]:
filename = "../../Data/pdf/NASDAQ_AMZN_2019.pdf"
# Render every page at 300 dpi; `pages` is reused by the request cell below.
pages = convert_from_path(filename, dpi=300, poppler_path='/usr/bin')

# PNG-encode page index 1 in memory for the inline preview. (The previous
# `encode_image(pages[1])` call here discarded its result and was dropped.)
with io.BytesIO() as buffer:
    pages[1].save(buffer, format="PNG")
    byte_data = buffer.getvalue()

Image(byte_data, width=1200)

#### Sending request

In [0]:
# PNG/base64 encoding of the rendered PDF page at index 1.
base64_encoded = encode_image(pages[1])
prompt = "Can you transcribe and translate to German?"
# Instruction goes in the system message, the page image in the user message.
messages=[
    {"role": "system", "content": [{"type": "text", "text": f"{prompt}"}]},
    {"role": "user", "content": [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_encoded}"}}]},
          ]

# Any failure of the request is reported as text instead of a traceback.
try:
    response = client.chat.completions.create(
        model=llm_deploy_name,
        messages=messages,
        temperature=1,
        # max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    print(response.choices[0].message.content)
except Exception as e:
    print(f"An error occurred: {e}")

#### Wrapping everything in a function  

Example usage:

+ **For image file path**  
image_file_path = 'path/to/image.png'  
prompt = "Can you transcribe this text?"  
result = process_image(image_file_path, prompt)  
print(result)  

+ **For PDF file path**  
pdf_file_path = 'path/to/document.pdf'  
result = process_image(pdf_file_path, prompt)  
print(result)  

+ **For PIL image object**  
image_obj = PIL.Image.open('path/to/image.png')  
result = process_image(image_obj, prompt)  
print(result)  

In [0]:
def process_image(input_image, prompt, pdf_page=0, detail="low"):
    """Send one image plus an instruction to the chat model and return the reply.

    Args:
        input_image: Image path, PDF path, or PIL image (anything
            ``encode_image`` accepts).
        prompt: Instruction text; sent as the system message.
        pdf_page: Zero-based page to use when ``input_image`` is a PDF path.
        detail: Image detail level passed to the API ("low", "high" or "auto").

    Returns:
        The text of the first completion choice, or ``None`` if the request
        raised (the error is printed).
    """
    import mimetypes  # local import keeps this cell self-contained

    # Forward the page selection; previously the argument was accepted but ignored.
    base64_encoded = encode_image(input_image, pdf_page=pdf_page)

    # encode_image yields PNG for PDFs and PIL images, but the untouched file
    # bytes for any other path, so derive the label from the file extension.
    mime_type = "image/png"
    if isinstance(input_image, str) and not input_image.lower().endswith(".pdf"):
        guessed_type = mimetypes.guess_type(input_image)[0]
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type

    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": f"{prompt}"
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{base64_encoded}",
                        "detail": detail  # You can set this to "low", "high", or "auto"
                    }
                }
            ]
        },
    ]

    try:
        response = client.chat.completions.create(
            model=llm_deploy_name,
            messages=messages,
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

#### Depending on the handwriting, the results may not be perfect

In [0]:
# Second handwriting sample, rendered inline at 500 px width.
filename = "../../Data/handwritten2.jpg"
Image(filename=filename, width=500)

In [0]:
# Transcription request at "high" image detail.
prompt = "Can you transcribe this text?"
print(process_image(filename, prompt, detail="high"))

### Translation

In [0]:
# Same image as above, this time asking for a translation instead of a transcription.
filename = "../../Data/handwritten2.jpg"
prompt = "Can you translate this text to English?"
print(process_image(filename, prompt, detail="high"))

### Extraction

In [0]:
# Question about a PDF page; pdf_page=1 is passed to process_image as the page selector.
prompt = "What is the name of the brand in this PDF Page?"
filename = "../../Data/pdf/NASDAQ_AMZN_2019.pdf"
print(process_image(filename, prompt, pdf_page=1))

### Description

In [0]:
# Image for the description example, rendered inline at 500 px width.
filename = "../../Data/fossils.jpeg"
Image(filename=filename, width=500)

In [0]:
# Free-form description request using the default detail level of process_image.
prompt = "Can you describe this image?"
print(process_image(filename, prompt))

### Interpretation

In [0]:
# Image for the interpretation example, rendered inline at 500 px width.
filename = "../../Data/Sign.png"
Image(filename=filename, width=500)

In [0]:
# Asks about background content of the picture rather than its main subject.
prompt = "What is the type of tree behind the sign? Answer in one line"
print(process_image(filename, prompt))

### Technical Aspects

In [0]:
# Image for the technical-settings example, rendered inline at 500 px width.
filename = "../../Data/Lightroom.jpeg"
Image(filename=filename, width=500)

In [0]:
# The prompt previously asked for "Lighthouse" settings; the example image
# (Lightroom.jpeg) is about the Lightroom photo editor, so name it correctly.
prompt = "Tell me what my Lightroom settings should be to get this type of filter. I need numeric values. It is ok to approximate. Think step by step"
print(process_image(filename, prompt))

#### A Math problem

In [0]:
# Image of the math problem, rendered inline at 500 px width.
filename = "../../Data/triangle.png"
Image(filename=filename, width=500)

In [0]:
# Base64 text of triangle.png (a PNG file, so the data URL label below matches).
base64_image = encode_image(filename)

# Here the question is part of the user message, next to the image; the system
# message only sets the assistant's role and output format.
response = client.chat.completions.create(
    model=llm_deploy_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
        {"role": "user", "content": [
            {"type": "text", "text": "What's the area of the triangle?"},
            {"type": "image_url", "image_url": {
                "url": f"data:image/png;base64,{base64_image}"}
            }
        ]}
    ],
    temperature=0.0,
)

# Render the reply as Markdown in the cell output.
Markdown(response.choices[0].message.content)
#displayHTML(f"$${response.choices[0].message.content}$$")

#### Using a URL instead of an image:  
(no need to encode)

In [0]:
# Display the remote image directly from its URL (nothing is read from disk here).
#Image(url="https://upload.wikimedia.org/wikipedia/commons/e/e2/The_Algebra_of_Mohammed_Ben_Musa_-_page_82b.png")
Image(url="https://i.ytimg.com/vi/nTB6pdf2ae4/hq720.jpg")

In [0]:
# Same request shape as the base64 variant, but the image_url points at a
# plain https URL instead of a data URL.
response = client.chat.completions.create(
    model=llm_deploy_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
        {"role": "user", "content": [
            {"type": "text", "text": "Can you solve this problem?"},
            {"type": "image_url", "image_url": {
                "url": "https://i.ytimg.com/vi/nTB6pdf2ae4/hq720.jpg"}
            }
        ]}
    ],
    temperature=0.0,
)

print(response.choices[0].message.content)

### A full pipeline: Working with many images

#### A function to resize the images

In [0]:
def resize_image(image, max_dimension):
    """Shrink ``image`` so that neither side exceeds ``max_dimension``.

    Palette-mode ("P") images are first converted to true color: RGBA when
    the image carries transparency information, RGB otherwise. The aspect
    ratio is kept; the longer side becomes exactly ``max_dimension`` and the
    shorter side is scaled down (never below 1 pixel).

    Args:
        image: A PIL image.
        max_dimension: Maximum allowed width/height in pixels.

    Returns:
        The (possibly converted) original image when it already fits,
        otherwise a new resized image.
    """
    width, height = image.size

    if image.mode == "P":
        target_mode = "RGBA" if "transparency" in image.info else "RGB"
        image = image.convert(target_mode)

    if width <= max_dimension and height <= max_dimension:
        return image

    # Pin the longer side to max_dimension and scale the other one. The
    # max(1, ...) guard keeps very elongated images from requesting a
    # zero-pixel side, which PIL cannot resize to.
    if width > height:
        new_size = (max_dimension,
                    max(1, int(height * (max_dimension / width))))
    else:
        new_size = (max(1, int(width * (max_dimension / height))),
                    max_dimension)
    return image.resize(new_size, PIL.Image.LANCZOS)

#### A function to convert to PNG format

In [0]:
def convert_to_png(image):
    """Serialize ``image`` to PNG in memory and return the encoded bytes."""
    buffer = io.BytesIO()
    try:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
    finally:
        buffer.close()
    return png_bytes

#### Processing = converting, resizing and encoding

In [0]:
def process_image(path, max_size):
    """Load an image file and return ``(base64_png, longest_original_side)``.

    A PNG whose width and height are both within ``max_size`` is passed
    through untouched (its file bytes are encoded as-is). Everything else is
    resized with ``resize_image`` and re-encoded with ``convert_to_png``.
    The second tuple element is always the longer side of the image as
    stored on disk, i.e. measured before any resizing.

    NOTE: this definition rebinds the name ``process_image`` and therefore
    replaces the earlier prompt-based helper of the same name for every cell
    executed after it.
    """
    with PIL.Image.open(path) as source:
        width, height = source.size
        longest_side = max(width, height)
        is_small_png = (
            source.get_format_mimetype() == "image/png"
            and width <= max_size
            and height <= max_size
        )
        if is_small_png:
            with open(path, "rb") as raw_file:
                payload = raw_file.read()
        else:
            payload = convert_to_png(resize_image(source, max_size))
        return (base64.b64encode(payload).decode('utf-8'), longest_side)

#### A function to decide if the image will be treated as low or high definition

In [0]:
def create_image_content(image, maxdim, detail_threshold):
    """Build one ``image_url`` content part for a chat message.

    Args:
        image: Base64 text of a PNG image.
        maxdim: Longest side of the image in pixels.
        detail_threshold: Images whose ``maxdim`` is below this value are
            sent with detail "low"; all others with detail "high".

    Returns:
        A dict with ``type`` "image_url" and the data URL plus detail level.
    """
    detail = "low" if maxdim < detail_threshold else "high"
    return {
        "type": "image_url",
        # The image pipeline in this notebook always yields PNG bytes, so the
        # data URL is labelled image/png (it used to claim image/jpeg).
        "image_url": {"url": f"data:image/png;base64,{image}", "detail": detail}
    }

#### Setting the System and User messages

In [0]:
def set_system_message(sysmsg):
    """Return a one-element message list holding the system prompt.

    The ``content`` field is the plain prompt string, which is the form the
    chat completions API accepts for system messages. (Earlier this returned
    ``{"text": sysmsg}`` and relied on a later fix-up step; that step only
    rewrites dict-valued content, so it simply skips this message now.)
    """
    return [{
        "role": "system",
        "content": sysmsg
    }]

def set_user_message(user_msg_str,
                     file_path_list=None,    # A list of file paths to images.
                     max_size_px=1024,       # Shrink images for lower API consumption
                     file_names_list=None,   # You can set original upload names to show AI
                     tiled=False,            # True sets an explicit per-image detail level
                     detail_threshold=700):  # any images below this get "low" detail mode
    """Build the user message (text plus images) for a chat completion call.

    Args:
        user_msg_str: The user's text.
        file_path_list: List of image paths; anything that is not a list is
            treated as "no images".
        max_size_px: Maximum side length passed to ``process_image``.
        file_names_list: Optional display names; used only when its length
            matches ``file_path_list``, otherwise the path basenames are used.
        tiled: When true, each image part carries an explicit "low"/"high"
            detail level chosen via ``create_image_content``; when false the
            detail level is left unset.
        detail_threshold: Size threshold forwarded to ``create_image_content``.

    Returns:
        ``[{"role": "user", "content": [...]}]`` where ``content`` is a list
        of typed parts: one text part (the message plus the list of file
        names) followed by one ``image_url`` part per image.
    """
    if not isinstance(file_path_list, list):  # covers None and other weird input
        file_path_list = []

    if file_names_list and len(file_names_list) == len(file_path_list):
        file_names = file_names_list
    else:
        file_names = [os.path.basename(path) for path in file_path_list]

    # Each entry is (base64_png, longest_side_px).
    encoded_images = [process_image(path, max_size_px) for path in file_path_list]

    uploaded_images_text = ""
    if file_names:
        uploaded_images_text = "\n\n---\n\nUploaded images:\n" + '\n'.join(file_names)

    content = [{"type": "text", "text": user_msg_str + uploaded_images_text}]
    if tiled:
        content += [create_image_content(image, maxdim, detail_threshold)
                    for image, maxdim in encoded_images]
    else:
        content += [{"type": "image_url",
                     "image_url": {"url": f"data:image/png;base64,{image}"}}
                    for image, _ in encoded_images]

    # Return well-formed typed parts directly instead of the former
    # {"messages": [...]} wrapper, which the API does not understand.
    return [{"role": "user", "content": content}]

In [0]:
# System prompt for the multi-image demo.
system_msg = """
You are VisionPal, an AI assistant powered by GPT-4o with
Built-in vision capabilities:
- extract text from image
- describe images
- analyze image contents
- logical problem-solving requiring machine vision
""".strip()

# User request; the questions let us check what the model actually received.
user_msg = """
How many images were received?
Describe the contents.
Describe the quality.
Repeat back the file names sent.
""".strip()

max_size = 512  # downsizes if any dimension above this
image_paths = ["../../Data/handwritten.jpg", "../../Data/handwritten2.jpg", "../../Data/handwritten3.jpg"]

In [0]:
system = set_system_message(system_msg)
chat_hist = []  # list of more user/assistant items
user = set_user_message(user_msg, image_paths, max_size)

params = {
  "model": llm_deploy_name,
  "temperature": 0.5,
  "user": "my_customer",
  "max_tokens": 500,
  "top_p": 0.5,
  "stream": True,
  "messages": system + chat_hist + user,
}

# Normalize message payloads that are not yet in the shape the API expects.
# Content that is already a string / list of typed parts is left untouched.

system_entry = params["messages"][0]
if isinstance(system_entry["content"], dict):
    system_entry["content"] = system_entry["content"]["text"]

# The user message is always the last entry, even when chat_hist is non-empty.
user_entry = params["messages"][-1]
if isinstance(user_entry["content"], dict):
    normalized_parts = []
    # Convert every part, so that all images are sent (not only the first one).
    for part in user_entry["content"]["messages"]:
        if isinstance(part, str):
            normalized_parts.append({"type": "text", "text": part})
        elif "image" in part:
            normalized_parts.append(
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{part['image']}"}}
            )
        else:
            # Already a typed content part.
            normalized_parts.append(part)
    user_entry["content"] = normalized_parts

#### Making the call:

In [0]:
start = time.perf_counter()
reply = ""  # defined up front so the final print works in every mode
try:
    client = AzureOpenAI(api_key=openai.api_key,
                         api_version=openai.api_version,
                         azure_endpoint=openai.api_base,
                     )
    response = client.chat.completions.create(**params)

    if params.get("stream", False):
        print(f"---\nSENT:\n{params['messages'][-1]['content'][0]['text']}\n---")

        # Streaming: the response is an iterator of chunks; accumulate the
        # text deltas while echoing them as they arrive.
        for chunk in response:
            if hasattr(chunk, "choices") and chunk.choices:  # Ensure choices exist
                delta = chunk.choices[0].delta
                if hasattr(delta, "content") and delta.content:
                    reply += delta.content
                    print(delta.content, end="")
            else:
                print("\n[Warning] Empty chunk received:", chunk)
    else:
        # Non-streaming: the response is a single completion object, which
        # must not be iterated like a stream.
        reply = response.choices[0].message.content or ""

    print("\n---\nFinal Reply:\n", reply)

except Exception as e:
    print(f"Error during API call: {e}")
    response = None

print(f"\n[elapsed: {time.perf_counter()-start:.2f} seconds]")

## [Speech to text with Whisper](https://platform.openai.com/docs/guides/speech-to-text)  
[Github](https://github.com/openai/whisper)  

![](https://raw.githubusercontent.com/openai/whisper/main/approach.png)

### 2.1 Transcriptions and Translations

In [0]:
# Client for the Whisper deployment. `azure_endpoint` must be the resource
# base URL only: the SDK itself appends the /openai/deployments/<model>/...
# path and the api-version query, so passing the full operation URL (as was
# done before) yields a malformed request URL.
client_whisper = AzureOpenAI(api_key=openai.api_key,
                             api_version="2024-06-01",
                             azure_endpoint="https://chatgpt-summarization.openai.azure.com/",
                            )

In [0]:
# English sample; the audio widget starts playing as soon as it is rendered.
filename=r"../../Data/sound_english.mp3"
Audio(filename=filename, autoplay=True, rate=22050)

In [0]:
# The context manager closes the audio file once the request has finished
# (the handle used to stay open). A missing file still raises here, outside
# the API error handler.
with open(filename, "rb") as audio_file:
    try:
        transcription = client_whisper.audio.translations.create(
            model="pioneers-whisper",
            file=audio_file,
        )
        print(transcription.text)
    except Exception as e:
        print("API Error:", e)

In [0]:
# German sample; the audio widget starts playing as soon as it is rendered.
filename=r"../../Data/sound_german.mp3"
Audio(filename=filename, autoplay=True, rate=22050)

In [0]:
# The context manager closes the audio file once the request has finished
# (the handle used to stay open). A missing file still raises here, outside
# the API error handler.
with open(filename, "rb") as audio_file:
    try:
        transcription = client_whisper.audio.translations.create(
            model="pioneers-whisper",
            file=audio_file,
        )
        print(transcription.text)
    except Exception as e:
        print("API Error:", e)

### [Translations](https://platform.openai.com/docs/api-reference/audio/createTranslation?lang=python)  
It only supports translation into English at this time.  
[Supported Languages](https://github.com/openai/whisper#available-models-and-languages)  

In [0]:
# Portuguese sample; the audio widget starts playing as soon as it is rendered.
filename=r"../../Data/sound_portuguese.mp3"
Audio(filename=filename, autoplay=True, rate=22050)

In [0]:
# The context manager closes the audio file once the request has finished
# (the handle used to stay open). A missing file still raises here, outside
# the API error handler.
with open(filename, "rb") as audio_file:
    try:
        transcription = client_whisper.audio.translations.create(
            model="pioneers-whisper",
            file=audio_file,
            prompt="Translate audio from original Portuguese to English",
        )
        print(transcription.text)
    except Exception as e:
        print("API Error:", e)

### [Correcting transcriptions with ChatGPT](https://platform.openai.com/docs/guides/speech-to-text/improving-reliability)

```
system_prompt = """You are a helpful assistant for the company ZyntriQix. 
Your task is to correct any spelling discrepancies in the transcribed text. 
Make sure that the names of the following products are spelled correctly: 
ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink 
Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T. 
Only add necessary punctuation such as periods, commas, and capitalization, 
and use only the context provided."""

def generate_corrected_transcript(temperature, system_prompt, audio_file):
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=temperature,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": transcribe(audio_file, "")
            }
        ]
    )
    return response.choices[0].message.content

corrected_text = generate_corrected_transcript(0, system_prompt, fake_company_filepath)
```

## [Text to speech](https://platform.openai.com/docs/guides/text-to-speech)  
The Audio API provides a speech endpoint based on our TTS (text-to-speech) model.   
It comes with 6 built-in voices: ["alloy","echo","fable","onyx","nova","shimmer"] and can be used to:
+ Narrate a written blog post
+ Produce spoken audio in multiple languages
+ Give real time audio output using streaming

The default response format is "mp3", but other formats like "opus", "aac", "flac", "wav", and "pcm" are available.

+ Opus: For internet streaming and communication, low latency.
+ AAC: For digital audio compression, preferred by YouTube, Android, iOS.
+ FLAC: For lossless audio compression, favored by audio enthusiasts for archiving.
+ WAV: Uncompressed WAV audio, suitable for low-latency applications to avoid decoding overhead.
+ PCM: Similar to WAV but containing the raw samples in 24kHz (16-bit signed, little-endian), without the header.

In [0]:
## Model not available in our Region


# Output format; also used as the file extension of the generated audio.
foutput = "mp3"
speech_file_path = f"../../Data/speech.{foutput}"
# Request speech for the given input text with the selected voice.
response = client.audio.speech.create(
    model="tts-1",
    response_format=foutput,
    voice="echo", #["alloy","echo","fable","onyx","nova","shimmer"]
    input="I am very glad to be teaching here, You are wonderful students!",
)


# Save the returned audio to disk.
response.write_to_file(speech_file_path)

In [0]:
# Play back the file written by the text-to-speech cell.
filename=speech_file_path
Audio(filename=filename, autoplay=True, rate=22050)

## [Text to Image (Dall-e)](https://platform.openai.com/docs/guides/images/usage)  

The Images API provides three methods for interacting with images:

+ Creating images from scratch based on a text prompt (DALL·E 3 and DALL·E 2)
+ Creating edited versions of images by having the model replace some areas of a pre-existing image, based on a new text prompt (DALL·E 2 only)
+ Creating variations of an existing image (DALL·E 2 only)

### Generations

The image generations endpoint allows you to create an original image given a text prompt. When using DALL·E 3, images can have a size of 1024x1024, 1024x1792 or 1792x1024 pixels.

By default, images are generated at standard quality, but when using DALL·E 3 you can set quality: "hd" for enhanced detail. Square, standard quality images are the fastest to generate.

You can request 1 image at a time with DALL·E 3 (request more by making parallel requests) or up to 10 images at a time using DALL·E 2 with the n parameter.

In [0]:
## Model not available in our Region

# Generate one image and ask for it to be returned as a URL.
response = client.images.generate(
    model="dall-e-3",
    prompt="A class about Generative AI in the University of Vienna",
    size="1024x1024",
    quality="standard",
    response_format='url', #'b64_json',
    n=1,
)

In [0]:
# Display the generated image from the URL in the response.
Image(url=response.data[0].url)

In [0]:
# Same kind of request, but the image comes back inline as base64 JSON.
response = client.images.generate(
    model="dall-e-3",
    prompt="A futuristic view of Vienna",
    size="1024x1024",
    quality="standard",
    response_format='b64_json',
    n=1,
)

In [0]:
#import base64
# Decode the base64 payload to raw bytes and display it inline.
Image(base64.decodebytes(str.encode(response.data[0].b64_json)))

In [0]:
# Persist the decoded image bytes to disk.
with open("../../Data/imageToSave.png", "wb") as f:
    f.write(base64.decodebytes(str.encode(response.data[0].b64_json)))

### Edits  

Also known as "inpainting", the image edits endpoint allows you to edit or extend an image by uploading an image and mask indicating which areas should be replaced. The transparent areas of the mask indicate where the image should be edited, and the prompt should describe the full new image, not just the erased area. This endpoint can enable experiences like DALL·E image editing in ChatGPT Plus.

In [0]:
# Source picture for the edit request.
Image("../../Data/Vienna_image.png")

In [0]:
# Mask picture for the edit request.
Image("../../Data/Vienna_mask.png")

In [0]:
# Open image and mask in a context manager so both handles are closed after
# the upload (they used to be left open). The prompt typo "whell" is fixed.
with open("../../Data/Vienna_image.png", "rb") as image_file, \
        open("../../Data/Vienna_mask.png", "rb") as mask_file:
    response = client.images.edit(
        model="dall-e-2",
        image=image_file,
        mask=mask_file,
        prompt="A new ferris wheel in the heart of Vienna",
        response_format='b64_json',
        n=1,
        size="512x512"
    )

In [0]:
#Image(url=response.data[0].url)
# The edit was requested as b64_json, so decode the payload before displaying it.
Image(base64.decodebytes(str.encode(response.data[0].b64_json)))

In [0]:
# Persist the decoded edited image to disk.
with open("../../Data/imageToSave2.png", "wb") as f:
    f.write(base64.decodebytes(str.encode(response.data[0].b64_json)))

### Variations (DALL·E 2 only)  
The image variations endpoint allows you to generate a variation of a given image.

In [0]:
# Uses the same ../../Data/ location as every other cell (the path was
# "../Data/..." here) and closes the file handle after the upload.
with open("../../Data/Vienna_image.png", "rb") as image_file:
    response = client.images.create_variation(
        model="dall-e-2",
        image=image_file,
        n=1,
        response_format='b64_json',
        size="512x512"
    )

In [0]:
# Decode and display the returned variation.
Image(base64.decodebytes(str.encode(response.data[0].b64_json)))

## Video Processing  
While it's not possible to directly send a video to the API, GPT-4o can understand videos if you sample frames and then provide them as images. It performs better at this task than GPT-4 Turbo.

Since GPT-4o in the API does not yet support audio-in (as of Jun 2024), we'll use a combination of GPT-4o and Whisper to process both the audio and visual for a provided video, and showcase two use cases:
1. Summarization
2. Question and Answering

### Setup for Video Processing
We'll use two python packages for video processing - opencv-python and moviepy. 

These require [ffmpeg](https://ffmpeg.org/about.html), so make sure to install this beforehand. Depending on your OS, you may need to run `brew install ffmpeg` or `sudo apt install ffmpeg`

In [0]:
%pip install opencv-python --quiet
%pip install --upgrade moviepy
%pip install -U imageio-ffmpeg

### Process the video into two components: frames and audio

In [0]:
import cv2
import imageio
from moviepy.video.io.VideoFileClip import VideoFileClip

# We'll be using the OpenAI DevDay Keynote Recap video. You can review the video here: https://www.youtube.com/watch?v=h02ti0Bl6zk
# Local copy of that video; process_video derives the .mp3 path from this name.
VIDEO_PATH = "../../Data/keynote_recap.mp4"

In [0]:
def process_video(video_path, seconds_per_frame=5):
    """Split a video into sampled JPEG frames and an MP3 audio track.

    Args:
        video_path: Path of the video file.
        seconds_per_frame: Sampling interval; one frame is taken every this
            many seconds of video.

    Returns:
        ``(base64Frames, audio_path)``: the sampled frames as base64-encoded
        JPEG strings, and the path of the extracted audio file (the video
        path with its extension replaced by ``.mp3``).

    Raises:
        ValueError: the video has no audio track.
    """
    base64Frames = []
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Advance by at least one frame per iteration: a zero step (fps
        # reported as 0, or a very small interval) would loop forever.
        frames_to_skip = max(1, int(fps * seconds_per_frame))
        curr_frame = 0

        # Loop through the video and extract frames at specified sampling rate
        while curr_frame < total_frames - 1:
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            curr_frame += frames_to_skip
    finally:
        # Release the capture even if decoding fails part-way.
        video.release()

    # Extract audio from video
    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        clip.audio.write_audiofile(audio_path, bitrate="32k")
        clip.audio.close()
    finally:
        clip.close()

    print(f"Extracted {len(base64Frames)} frames")
    print(f"Extracted audio to {audio_path}")
    return base64Frames, audio_path

# Extract 1 frame every 5 seconds. You can adjust the `seconds_per_frame` parameter to change the sampling rate
base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=5)


In [0]:
# Number of frames that will be sent to the model in the requests below.
print(f"{len(base64Frames)} frames extracted")

In [0]:
from IPython.display import display, update_display, Image, Audio

In [0]:
# Show an empty display at first
display_id = "frame_display"
display(Image(data=b""), display_id=display_id)

# Loop through frames and update display
# (the same output slot is overwritten each time, with a 0.1 s pause per frame)
for img in base64Frames:
    update_display(Image(data=base64.b64decode(img.encode("utf-8")), width=600), display_id=display_id)
    time.sleep(0.1)

In [0]:
# Play the audio
# (audio_path is the MP3 file returned by process_video)
Audio(audio_path)

### Example 1: Summarization
Now that we have both the video frames and the audio, let's run a few different tests to generate a video summary to compare the results of using the models with different modalities. We should expect to see that the summary generated with context from both visual and audio inputs will be the most accurate, as the model is able to use the entire context from the video.

1. Visual Summary
2. Audio Summary
3. Visual + Audio Summary

#### Visual Summary
The visual summary is generated by sending the model only the frames from the video. With just the frames, the model is likely to capture the visual aspects, but will miss any details discussed by the speaker.

In [0]:
# Visual-only summary: one text part followed by one low-detail image part
# per sampled frame.
response = client.chat.completions.create(
    model=llm_deploy_name,
    temperature=0,
    messages=[
        {
            "role": "system",
            "content": "You are generating a video summary. Please provide a summary of the video. Respond in Markdown.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "These are the frames from the video."},
            ] + [
                {"type": "image_url",
                 "image_url": {"url": f'data:image/jpg;base64,{frame}', "detail": "low"}}
                for frame in base64Frames
            ],
        },
    ],
)

print(response.choices[0].message.content)

The results are as expected - the model is able to capture the high level aspects of the video visuals, but misses the details provided in the speech.

#### Audio Summary
The audio summary is generated by sending the model the audio transcript. With just the audio, the model is likely to bias towards the audio content, and will miss the context provided by the presentations and visuals.

`{audio}` input for GPT-4o isn't currently available but will be coming soon! For now, we use our existing `whisper` model to process the audio

In [0]:
# Transcribe the audio
try:
    transcription = client_whisper.audio.translations.create(
        model="pioneers-whisper", 
        file=open("../../Data/keynote_recap.mp3", "rb"),
        prompt="Translate audio from original Portuguese to English",
    )
    print(transcription.text)
except Exception as e:
    print("API Error:", e)

## OPTIONAL: Uncomment the line below to print the transcription
#print("Transcript: ", transcription.text + "\n\n")

response = client.chat.completions.create(
    model=llm_deploy_name,
    messages=[
    {"role": "system", "content":"""You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."""},
    {"role": "user", "content": [
        {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
        ],
    }
    ],
    temperature=0,
)
print(response.choices[0].message.content)

The audio summary is biased towards the content discussed during the speech, but comes out with much less structure than the video summary.

#### Audio + Visual Summary
The Audio + Visual summary is generated by sending the model both the visual and the audio from the video at once. When sending both of these, the model is expected to better summarize since it can perceive the entire video at once.

In [0]:
## Generate a summary with visual and audio
response = client.chat.completions.create(
    model=llm_deploy_name,
    messages=[
        {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown."""},
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},  # Wrap text in a dict
            *map(lambda x: {"type": "image_url", 
                            "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
        ]}
    ],
    temperature=0,
)

print(response.choices[0].message.content)

After combining both the video and audio, we're able to get a much more detailed and comprehensive summary for the event which uses information from both the visual and audio elements from the video.

### Example 2: Question and Answering
For the Q&A, we'll use the same concept as before to ask questions of our processed video while running the same 3 tests to demonstrate the benefit of combining input modalities:
1. Visual Q&A
2. Audio Q&A
3. Visual + Audio Q&A 

In [0]:
# Question reused by the three Q&A requests below.
QUESTION = "Question: Why did Sam Altman have an example about raising windows and turning the radio on?"

In [0]:
# Visual-only Q&A: frames first, the question as the final text part.
qa_visual_response = client.chat.completions.create(
    model=llm_deploy_name,
    temperature=0,
    messages=[
        {
            "role": "system",
            "content": "Use the video to answer the provided question. Respond in Markdown.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "These are the frames from the video."},
            ] + [
                {"type": "image_url",
                 "image_url": {"url": f'data:image/jpg;base64,{frame}', "detail": "low"}}
                for frame in base64Frames
            ] + [
                {"type": "text", "text": QUESTION},
            ],
        },
    ],
)

print("Visual QA:\n" + qa_visual_response.choices[0].message.content)

In [0]:
# Audio-only Q&A: transcript and question are sent together as one plain-text user message.
qa_audio_response = client.chat.completions.create(
    model=llm_deploy_name,
    messages=[
    {"role": "system", "content":"""Use the transcription to answer the provided question. Respond in Markdown."""},
    {"role": "user", "content": f"The audio transcription is: {transcription.text}. \n\n {QUESTION}"},
    ],
    temperature=0,
)
print("Audio QA:\n" + qa_audio_response.choices[0].message.content)

In [0]:
# Combined Q&A: frames, then the transcript, then the question.
qa_both_response = client.chat.completions.create(
    model=llm_deploy_name,
    temperature=0,
    messages=[
        {
            "role": "system",
            "content": "Use the video and transcription to answer the provided question.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "These are the frames from the video."},
            ] + [
                {"type": "image_url",
                 "image_url": {"url": f'data:image/jpg;base64,{frame}', "detail": "low"}}
                for frame in base64Frames
            ] + [
                {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
                {"type": "text", "text": QUESTION},
            ],
        },
    ],
)

print("Both QA:\n" + qa_both_response.choices[0].message.content)


Comparing the three answers, the most accurate answer is generated by using both the audio and visual from the video. Sam Altman did not discuss the raising windows or radio on during the Keynote, but referenced an improved capability for the model to execute multiple functions in a single request while the examples were shown behind him.

## Conclusion
Integrating many input modalities such as audio, visual, and textual, significantly enhances the performance of the model on a diverse range of tasks. This multimodal approach allows for more comprehensive understanding and interaction, mirroring more closely how humans perceive and process information. 

Currently, GPT-4o in the API supports text and image inputs, with audio capabilities coming soon.

## 6 Moderations  
The moderations endpoint is a tool you can use to check whether text is potentially harmful. Developers can use it to identify content that might be harmful and take action, for instance by filtering it.

The model classifies content into the following categories:  
Category	Description  
+ hate	Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.
+ Hateful content aimed at non-protected groups (e.g., chess players) is harassment.  
+ hate/threatening	Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste.  
+ harassment	Content that expresses, incites, or promotes harassing language towards any target.  
+ harassment/threatening	Harassment content that also includes violence or serious harm towards any target.  
+ self-harm	Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders.  
+ self-harm/intent	Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders.     
+ self-harm/instructions	Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts.  
+ sexual	Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness).
+ sexual/minors	Sexual content that includes an individual who is under 18 years old.  
+ violence	Content that depicts death, violence, or physical injury.  
+ violence/graphic	Content that depicts death, violence, or physical injury in graphic detail.  

The moderation endpoint is free to use for most developers. For higher accuracy, try splitting long pieces of text into smaller chunks each less than 2,000 characters.

In [0]:
# Examples from: https://www.politico.eu/article/15-most-offensive-things-trump-campaign-feminism-migration-racism/

# Quoted sample used only as input for the moderation check below.
sample_text = """
“I have black guys counting my money. … I hate it,” Trump told John R. O’Donnell, the former president of Trump Plaza Hotel & Casino, 
according O’Donnell’s account in his 1991 book “Trumped!” “The only guys I want counting my money are short guys that wear yarmulkes all day.”
Trump, according to O’Donnell, went on to say, “‘Laziness is a trait in blacks. It really is, I believe that.”
"""

# Submit the text to the moderations endpoint through the shared client.
response = client.moderations.create(input=sample_text)

In [0]:
# Dump the full moderation response as JSON.
print(response.to_json())