### Demo notebook: image captioning with spoken (text-to-speech) output

In [None]:
# import libraries
import io
from IPython.display import display, Audio, Image, clear_output
import ipywidgets as widgets
import asyncio
import nest_asyncio
from edge_tts import Communicate
import os
import sys
# Patch asyncio so an event loop can be re-entered: the upload handler further
# down calls run_until_complete() from inside a widget callback.
nest_asyncio.apply()
# Add the parent directory to the module search path so that the `app` package
# imported on the next line can be resolved (presumably it lives one level up
# from this notebook -- confirm against the repository layout).
sys.path.append(os.path.abspath(os.path.join("..")))
from app.app import OvisCaptioner

In [2]:
# run model
# OvisCaptioner is already imported in the imports cell at the top of the
# notebook, so it is not imported a second time here.
# The instance is created once and reused by the upload handler below; the
# recorded output shows that construction loads the model checkpoint shards.
captioner = OvisCaptioner()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [16]:
# Lay out the demo's output tree: one sub-folder for the uploaded images and
# one for the generated audio, both beneath a common root.
demo_folder = "../app_outputs_demo/"
img_folder, audio_folder = (
    os.path.join(demo_folder, leaf) for leaf in ("images", "audio")
)
# exist_ok keeps this cell safe to re-run once the folders already exist.
for _target_dir in (img_folder, audio_folder):
    os.makedirs(_target_dir, exist_ok=True)

In [17]:
# Upload control limited to a single image file, plus the output area in which
# the handler renders the image, its caption and the audio player.
upload_widget = widgets.FileUpload(multiple=False, accept='image/*')
output = widgets.Output()


async def generate_audio(caption, audio_path, voice="en-US-JennyNeural"):
    """Synthesize ``caption`` to speech and save it at ``audio_path``.

    Args:
        caption: Text to be spoken.
        audio_path: Destination path passed to ``Communicate.save``.
        voice: Voice name passed to ``Communicate``. Defaults to
            ``"en-US-JennyNeural"``, the voice that used to be hard-coded,
            so existing two-argument calls behave exactly as before.
    """
    communicate = Communicate(caption, voice)
    await communicate.save(audio_path)

def on_upload_change(change):
    """Caption the uploaded image, display it, and read the caption aloud.

    Observer for the ``value`` trait of ``upload_widget``. All rendering
    happens inside the ``output`` widget, which is cleared first. The handler
    saves the upload under ``img_folder``, asks ``captioner`` for a caption,
    shows the image and caption, synthesizes the caption to an MP3 under
    ``audio_folder`` and plays it inline.

    Args:
        change: Change notification supplied by ``observe``. It is not read;
            the current upload is taken from ``upload_widget.value``.
    """
    with output:
        clear_output()
        if not upload_widget.value:
            # Nothing was uploaded: leave the output area empty.
            return

        uploaded_file = upload_widget.value[0]

        # The file name is supplied by the client. Keep only its last path
        # component (treating backslashes as separators too) so the upload
        # cannot be written outside img_folder.
        file_name = os.path.basename(uploaded_file['name'].replace("\\", "/"))
        if file_name in ("", ".", ".."):
            file_name = "upload"

        # The content may arrive as a memoryview rather than bytes (ipywidgets 8
        # documents it that way); normalise so every consumer below receives a
        # plain bytes object.
        file_content = bytes(uploaded_file['content'])

        # save to a temporary image file in demo folder
        temp_img_path = os.path.join(img_folder, file_name)
        with open(temp_img_path, 'wb') as f:
            f.write(file_content)

        # get caption
        result = captioner.describe_image(temp_img_path)
        caption = result["caption"]

        # display image, then the caption
        display(Image(data=file_content))
        print(f"🖼️ Caption: {caption}")

        # show waiting message while the speech is synthesized
        waiting_msg = widgets.HTML(value="<b>⏳ Generating audio, please wait...</b>")
        display(waiting_msg)

        # create audio file in demo folder
        audio_file_name = os.path.splitext(file_name)[0] + ".mp3"
        temp_audio_path = os.path.join(audio_folder, audio_file_name)
        try:
            # Runs the coroutine to completion from inside the widget callback;
            # relies on nest_asyncio.apply() from the imports cell.
            asyncio.get_event_loop().run_until_complete(generate_audio(caption, temp_audio_path))
        finally:
            # Hide the banner whether synthesis succeeded or failed, so it does
            # not keep claiming that audio is still being generated.
            waiting_msg.layout.display = 'none'

        # play audio inline
        display(Audio(filename=temp_audio_path, autoplay=True))

# Subscribe the handler to changes of the widget's `value` trait only.
upload_widget.observe(handler=on_upload_change, names='value')

# Prompt first, then render the upload button with the result area beneath it.
print("⬆️ Upload an image file:")
display(upload_widget, output)


⬆️ Upload an image file:


FileUpload(value=(), accept='image/*', description='Upload')

Output()