<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/multimodality/perception_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
!pip install chex
!pip3 install imageio==2.4.1
!pip3 install ml_collections
!pip install -q mediapy

Import libraries

In [1]:
import abc
from absl import logging
import chex
import colorsys
import copy
import cv2
import imageio
import io
import jax
import matplotlib.pyplot as plt
import mediapy
import moviepy.editor as mvp
import numpy as np
import pathlib
import PIL
import random
import tensorflow as tf
from google.colab.patches import cv2_imshow
from ml_collections import config_dict
from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Type, TypeVar, Union

In [2]:
#@title Load demo TFRecord data from GCP

dataset = "base_oss"  # @param ["base_oss", "cup_games_oss", "grounded_questions_oss", "points_oss"]
split = "train"  # @param ["train", "test"]

def load_a_sequence_example(dataset, split):
  """Loads one tf.train.SequenceExample from the demo TFRecord shards.

  Args:
    dataset: Name of the dataset directory in the bucket, e.g. "base_oss".
    split: Name of the split directory, e.g. "train".

  Returns:
    The tf.train.SequenceExample parsed from one serialised record.

  Raises:
    FileNotFoundError: If no TFRecord shard matches the dataset/split pattern.
  """
  print(f"Loading dataset {dataset}; split {split}")
  tfrecord_uri = f"gs://dm-perception-test/tfrecords/v1/{dataset}/{split}/pt_{split}-*-of-*.tfrecord"

  matches = tf.io.matching_files(tfrecord_uri, name=None)
  filenames = sorted(tf.compat.as_str_any(tensor.numpy()) for tensor in matches)
  if not filenames:
    # Fail with an explicit message instead of an opaque iterator/dataset error.
    raise FileNotFoundError(f"No TFRecord files match {tfrecord_uri}")
  print(f"Files {filenames}")

  ds = tf.data.TFRecordDataset(filenames)
  # Shuffle with a buffer of two records, so the returned example is one of
  # the first two records of the dataset.
  ds = ds.shuffle(2)
  ds_iter = ds.as_numpy_iterator()
  serialised_example = next(ds_iter)
  return tf.train.SequenceExample.FromString(serialised_example)

sequence_example = load_a_sequence_example(dataset, split)

Loading dataset base_oss; split train
Files ['gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00000-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00001-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00002-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00003-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00004-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00005-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00006-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00007-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00008-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00009-of-00100.tfrecord', 'gs://dm-perception-test/tfrecords/v1/base_oss/train/pt_train-00010-of-0010

In [3]:
#@title Peek into SequenceExample
# When True, this cell prints the feature keys and raw values of the loaded
# `sequence_example`; when False it only defines the feature-reading helpers.
show_raw_example = False  #@param {type: "boolean"}


# Placeholder for the Python type that a feature's values are read as.
FT = TypeVar("FT")

# Maps the requested Python type to a reader that pulls the matching value
# list out of a single feature; `str` values are UTF-8 decoded from bytes.
_FEATURE_ACCESS = {
    bytes: lambda feat: feat.bytes_list.value,
    str: lambda feat: [raw.decode("utf-8") for raw in feat.bytes_list.value],
    int: lambda feat: feat.int64_list.value,
    float: lambda feat: feat.float_list.value,
}


def get_features(
    example: tf.train.SequenceExample, dtype: Type[FT], feature: str
) -> List[List[FT]]:
  """Reads every step of one named feature list from an example.

  Args:
    example: The sequence example to read from.
    dtype: One of the keys of `_FEATURE_ACCESS` (bytes, str, int or float);
      any other value raises KeyError.
    feature: Key of the feature list inside `example.feature_lists`.

  Returns:
    One value sequence per step of the feature list, or an empty list when
    the example has no feature list under that key.
  """
  reader = _FEATURE_ACCESS[dtype]
  named_lists = example.feature_lists.feature_list
  selected = named_lists.get(feature, None)
  if selected is None:
    return []
  values = []
  for step in selected.feature:
    values.append(reader(step))
  return values


if show_raw_example:
  def list_features(feature, f_type):
    """Prints the feature name followed by each of its per-step values."""
    print(f"\n{feature}")
    data = get_features(sequence_example, f_type, feature)
    for d in data:
      # Print the single entry; printing `data` here would repeat the whole
      # list once per entry.
      print(d)

  def list_tracks(kind, prefix):
    """Prints id and label of every track stored under `<prefix>/...`."""
    track_ids = get_features(sequence_example, int, f"{prefix}/track_id")
    track_labels = get_features(sequence_example, str, f"{prefix}/label")
    print(f"\nNumber of tracked {kind}: {len(track_ids)}")
    for i, (track_id, label) in enumerate(zip(track_ids, track_labels)):
      print(f"id: {track_id[0]:02} - {label[0]:14} (track {i:02})")

  print("SequenceExample Features")
  for key in sequence_example.feature_lists.feature_list.keys():
    print(key)

  list_tracks("objects", "objects")
  list_tracks("points", "points")

  # Question types are stored as strings (see FEATURE_LIST below), so they are
  # read as `str` and reported as questions rather than as tracked points.
  question_types = get_features(sequence_example, str, "questions/type")
  print(f"\nNumber of questions: {len(question_types)}")
  for i, question_type in enumerate(question_types):
    type_text = ", ".join(question_type)
    print(f"type: {type_text:14} (question {i:02})")

  # NOTE(review): the parser classes in this notebook read question text from
  # "questions/question"; confirm whether the "questions" key below exists.
  FEATURE_LIST = [
      ("questions/type", str),
      ("questions/multi_answer/answer_ids", int),
      ("questions/subcategory", str),
      ("questions/domain", str),
      ("questions/reasoning", str),
      ("questions/task_id", str),
      ("questions/multi_choice/answer_id", int),
      ("questions/tags", str),
      ("questions", str),
      ("actions/track_id", int),
  ]

  for feature, data_type in FEATURE_LIST:
    list_features(feature, data_type)

In [4]:
#@title ParseExample Classes

@chex.dataclass
class ExampleMetadata:
  """Stream-level facts (audio and video) recorded for one example."""
  original_audio_sample_rate: float
  original_audio_start_time: float
  original_audio_num_samples: int

  original_video_frame_rate: float
  original_video_frames: int

  @classmethod
  def parse(cls, example: tf.train.SequenceExample) -> "ExampleMetadata":
    """Reads the metadata fields out of a tf.train.SequenceExample.

    The context features and the audio timestamp list are unpacked as
    single-element sequences, so any other length raises ValueError.
    """
    context = example.context.feature
    [sample_count] = context["WAVEFORM/feature/dimensions"].int64_list.value
    [sample_rate] = context["WAVEFORM/feature/sample_rate"].float_list.value
    [timestamp_values] = get_features(
        example, int, "WAVEFORM/feature/timestamp")
    start_timestamp = timestamp_values[0]
    # The frame count is the number of entries in the image timestamp list.
    frame_count = len(get_features(example, int, "image/timestamp"))
    [frame_rate] = context["image/frame_rate"].float_list.value
    return cls(
        original_audio_sample_rate=sample_rate,
        # Timestamp scaled by 1e-6 (presumably microseconds to seconds).
        original_audio_start_time=start_timestamp / 1e6,
        original_audio_num_samples=sample_count,
        original_video_frame_rate=frame_rate,
        original_video_frames=frame_count)


@chex.dataclass
class PointTrack:
  """Trajectory of one tracked point through a video."""

  points: chex.Array           # [frames, 2] -- y, x
  frames: chex.Array           # [frames]
  human_annotated: chex.Array  # [frames]

  @classmethod
  def parse(cls, example: tf.train.SequenceExample) -> List["PointTrack"]:
    """Builds one PointTrack per point stored in the example.

    Returns an empty list when the example has no point y-coordinates and
    raises ValueError when the point feature lists differ in length.
    """
    ys = get_features(example, float, "points/points/y")
    xs = get_features(example, float, "points/points/x")
    is_human = get_features(example, int, "points/points/is_human_label")
    frames = get_features(example, int, "points/points/frame")
    num_points = len(ys)
    if num_points == 0:
      return []
    if any(len(column) != num_points
           for column in (ys, xs, is_human, frames)):
      raise ValueError("Invalid number of point features.")
    tracks = []
    for y, x, human, frame in zip(ys, xs, is_human, frames):
      # Stack as (y, x) pairs and clamp the coordinates to [0, 1].
      coords = np.stack([y, x], axis=-1).clip(0, 1)
      tracks.append(
          cls(points=coords.astype(np.float32),
              frames=np.asarray(frame, dtype=np.int32),
              human_annotated=np.asarray(human, dtype=bool)))
    return tracks


@chex.dataclass
class BoxTrack:
  """Bounding boxes of one object followed through a video."""

  track_id: int
  label: str
  boxes: chex.Array            # [frames, 4] -- y1, x1, y2, x2
  frames: chex.Array           # [frames]
  human_annotated: chex.Array  # [frames]

  @classmethod
  def parse(cls, example: tf.train.SequenceExample) -> List["BoxTrack"]:
    """Builds one BoxTrack per object stored in the example.

    Returns an empty list when the example has no box coordinates and raises
    ValueError when the object feature lists differ in length.
    """
    track_ids = get_features(example, int, "objects/track_id")
    track_labels = get_features(example, str, "objects/label")
    y1s = get_features(example, float, "objects/bounding_boxes/top_left_y")
    x1s = get_features(example, float, "objects/bounding_boxes/top_left_x")
    y2s = get_features(example, float, "objects/bounding_boxes/bottom_right_y")
    x2s = get_features(example, float, "objects/bounding_boxes/bottom_right_x")
    is_human = get_features(
        example, int, "objects/bounding_boxes/is_human_label")
    frames = get_features(example, int, "objects/bounding_boxes/frame")
    num_boxes = len(y1s)
    if num_boxes == 0:
      return []
    columns = (y1s, x1s, y2s, x2s, is_human, frames, track_ids, track_labels)
    if any(len(column) != num_boxes for column in columns):
      raise ValueError("Invalid number of box features.")
    tracks = []
    for ids, labels, y1, x1, y2, x2, human, frame in zip(
        track_ids, track_labels, y1s, x1s, y2s, x2s, is_human, frames):
      # The id and label features each contribute their first value.
      track_id = ids[0]
      label = labels[0]
      # Stack as (y1, x1, y2, x2) rows and clamp the coordinates to [0, 1].
      corners = np.stack([y1, x1, y2, x2], axis=-1).clip(0, 1)
      tracks.append(
          cls(track_id=track_id,
              label=label,
              boxes=corners.astype(np.float32),
              frames=np.asarray(frame, dtype=np.int32),
              human_annotated=np.asarray(human, dtype=bool)))
    return tracks


@chex.dataclass
class AudioBox:
  """Time interval and label of one sound event in a video."""

  start_time: chex.Numeric  # Time since the start of the video in seconds.
  end_time: chex.Numeric
  audio_label: str  # E.g. "Human:Speech"

  @classmethod
  def parse(cls, example: tf.train.SequenceExample) -> List["AudioBox"]:
    """Builds one AudioBox per sound stored in the example.

    Returns an empty list when there are no start timestamps and raises
    ValueError when the sound feature lists differ in length.
    """
    starts = get_features(example, int, "sounds/start_timestamp")
    ends = get_features(example, int, "sounds/end_timestamp")
    labels = get_features(example, str, "sounds/label")
    num_sounds = len(starts)
    if num_sounds == 0:
      return []
    if any(len(column) != num_sounds for column in (starts, ends, labels)):
      raise ValueError("Invalid number of audio features.")
    sounds = []
    for start, end, label in zip(starts, ends, labels):
      # Timestamps are scaled by 1e-6 to give the times stored on the box.
      sounds.append(
          cls(start_time=start[0] / 1e6,
              end_time=end[0] / 1e6,
              audio_label=label[0].strip()))
    return sounds


@chex.dataclass
class MultipleChoiceQuestion:
  """A question with answer options and the index of the correct option."""

  text: str
  options: List[str]
  answer_id: int

  @classmethod
  def parse(cls,
            example: tf.train.SequenceExample
            ) -> List["MultipleChoiceQuestion"]:
    """Collects the questions of type "LANGUAGE" that have answer options.

    Returns an empty list when questions, options or answer ids are absent,
    and raises ValueError when the question feature lists differ in length.
    """
    questions = get_features(example, str, "questions/question")
    qtypes = get_features(example, str, "questions/type")
    options = get_features(example, str, "questions/multi_choice/options")
    answer_ids = get_features(example, int, "questions/multi_choice/answer_id")
    if not (questions and options and answer_ids):
      return []
    sizes = {len(questions), len(options), len(answer_ids), len(qtypes)}
    if len(sizes) != 1:
      raise ValueError(
          f"Invalid example with #q={len(questions)}, "
          f"#o={len(options)}, #a={len(answer_ids)}, #t={len(qtypes)}.")
    parsed = []
    for type_values, question_values, choices, answer_values in zip(
        qtypes, questions, options, answer_ids):
      (qtype,) = type_values
      if qtype != "LANGUAGE":
        continue
      (question,) = question_values
      if not choices:
        # Questions without any option cannot be multiple choice.
        continue
      (answer_id,) = answer_values
      parsed.append(cls(text=question, options=choices, answer_id=answer_id))
    return parsed


@chex.dataclass
class GroundedObjectQuestion:
  """A question answered by pointing at one or more box tracks."""

  text: str
  box_track_ids: List[int]

  @classmethod
  def parse(cls,
            example: tf.train.SequenceExample,
            available_box_tracks: Set[int],
            ) -> List["GroundedObjectQuestion"]:
    """Collects the questions of type "BOX" whose answers are known tracks.

    Answer ids not contained in `available_box_tracks` are dropped, and a
    question left without any answer id is skipped. Returns an empty list
    when questions or answer ids are absent, and raises ValueError when the
    question feature lists differ in length.
    """
    questions = get_features(example, str, "questions/question")
    qtypes = get_features(example, str, "questions/type")
    answer_ids = get_features(example, int, "questions/multi_answer/answer_ids")
    if not (questions and answer_ids):
      return []
    if len({len(questions), len(answer_ids), len(qtypes)}) != 1:
      raise ValueError(
          f"Invalid example with #q={len(questions)}, "
          f"#a={len(answer_ids)}, #t={len(qtypes)}.")
    parsed = []
    for type_values, question_values, answers in zip(
        qtypes, questions, answer_ids):
      (qtype,) = type_values
      if qtype != "BOX":
        continue
      (question,) = question_values
      if not answers:
        continue
      known_ids = []
      for track_id in answers:
        if track_id in available_box_tracks:
          known_ids.append(track_id)
      if known_ids:
        parsed.append(cls(text=question, box_track_ids=known_ids))
    return parsed


@chex.dataclass
class VideoAction:
  """One annotated action segment of a video.

  Every field holds the value sequence read for one step of the matching
  feature list by `get_features`, not an unwrapped scalar, so callers index
  into it (e.g. `action.label[0]`). The annotations reflect that.
  """
  track_id: Sequence[int]
  start_frame: Sequence[int]
  end_frame: Sequence[int]
  start_time: Sequence[int]  # Values of "actions/start_timestamp".
  end_time: Sequence[int]    # Values of "actions/end_timestamp".
  label: Sequence[str]

  @classmethod
  def parse(
      cls,
      example: tf.train.SequenceExample,
  ) -> List["VideoAction"]:
    """Parses input data for the temporal action localization task.

    Args:
      example: Initial tf.train.SequenceExample from which data is extracted.

    Returns:
      List of VideoAction objects, one per action. If the action feature
      lists differ in length, the result stops at the shortest of them.
    """
    track_ids = get_features(example, int, "actions/track_id")
    start_frames = get_features(example, int, "actions/start_frame")
    start_timestamps = get_features(example, int, "actions/start_timestamp")
    end_frames = get_features(example, int, "actions/end_frame")
    end_timestamps = get_features(example, int, "actions/end_timestamp")
    labels = get_features(example, str, "actions/label")
    return [
        cls(
            track_id=track_id,
            start_frame=start_frame,
            end_frame=end_frame,
            start_time=start_time,
            end_time=end_time,
            label=label,
        )
        for track_id, start_frame, end_frame, start_time, end_time, label
        in zip(track_ids, start_frames, end_frames,
               start_timestamps, end_timestamps, labels)
    ]


@chex.dataclass
class ParsedExample:
  """Parsed Perception Test example."""
  metadata: ExampleMetadata
  video_frames: Optional[chex.Array]  # [num_frames, h, w, c]
  video_features: Optional[chex.Array]
  video_actions: Optional[Sequence[VideoAction]]
  audio_wav: Optional[chex.Array]     # [num_samples]
  point_tracks: List[PointTrack]
  box_tracks: List[BoxTrack]
  audio_boxes: List[AudioBox]
  grounded_object_questions: List[GroundedObjectQuestion]
  multiple_choice_questions: List[MultipleChoiceQuestion]

  @classmethod
  def parse(cls, example: tf.train.SequenceExample) -> "ParsedExample":
    """Parses a tf.train.SequenceExample into all of its typed parts.

    If a box or point track references a frame index beyond the frame count
    in the metadata, `metadata.original_video_frames` is raised to cover it;
    `video_frames` itself is not extended.
    """
    metadata = ExampleMetadata.parse(example)
    audio_wav = load_audio(example)
    video_frames = load_video(example)
    point_tracks = PointTrack.parse(example)
    box_tracks = BoxTrack.parse(example)
    audio_boxes = AudioBox.parse(example)
    # Sometimes box/point tracks refer to frames outside the video.
    max_frame = metadata.original_video_frames - 1
    for track in box_tracks + point_tracks:
      # Skip tracks without frames, and take the maximum rather than the last
      # entry so the result does not depend on the frames being sorted.
      if len(track.frames):
        max_frame = max(max_frame, int(np.max(track.frames)))
    if max_frame >= metadata.original_video_frames:
      logging.info("Video had %d frames, but annotation referenced frame %d.",
                   metadata.original_video_frames, max_frame + 1)
      metadata.original_video_frames = max_frame + 1
    grounded_object_questions = GroundedObjectQuestion.parse(
        example,
        available_box_tracks={b.track_id for b in box_tracks})
    multiple_choice_questions = MultipleChoiceQuestion.parse(example)
    video_actions = VideoAction.parse(example)
    # NOTE(review): the other action features use the "actions/" prefix;
    # confirm that "action/features" is the intended key.
    video_features = get_features(example, float, "action/features")
    return cls(metadata=metadata,
               video_frames=video_frames,
               video_features=video_features,
               audio_wav=audio_wav,
               point_tracks=point_tracks,
               box_tracks=box_tracks,
               audio_boxes=audio_boxes,
               grounded_object_questions=grounded_object_questions,
               multiple_choice_questions=multiple_choice_questions,
               video_actions=video_actions)

In [5]:
#@title Data processing functions

def load_audio(
    example: tf.train.SequenceExample
) -> np.ndarray:
  """Returns the waveform as float32, zero-padded at the front.

  When the audio start timestamp is positive, the waveform is preceded by
  `round(sample_rate * timestamp * 1e-6)` zero samples.
  """
  [timestamp_values] = get_features(
      example, int, "WAVEFORM/feature/timestamp")
  start_timestamp = timestamp_values[0]
  num_padding = 0
  if start_timestamp > 0:
    rate_feature = example.context.feature["WAVEFORM/feature/sample_rate"]
    [sample_rate] = rate_feature.float_list.value
    num_padding = round(sample_rate * start_timestamp * 1e-6)
  [samples] = get_features(example, float, "WAVEFORM/feature/floats")
  # Start from silence and copy the stored samples in after the padding.
  waveform = np.zeros(num_padding + len(samples), dtype=np.float32)
  waveform[num_padding:] = samples
  return waveform


def load_video(
    example: tf.train.SequenceExample
) -> np.ndarray:
  """Decodes the encoded frames of an example into one uint8 RGB array.

  Args:
    example: Sequence example holding encoded images under "image/encoded".

  Returns:
    Array of shape [num_frames, height, width, 3]; the height and width are
    taken from the first frame. An example without frames yields an empty
    array of shape [0, 0, 0, 3].

  Raises:
    ValueError: If a later frame does not have the size of the first frame.
  """
  # Import the submodule explicitly so this function does not depend on some
  # other library having loaded `PIL.Image` already.
  from PIL import Image

  frame_features = get_features(example, bytes, "image/encoded")
  num_frames = len(frame_features)
  if not num_frames:
    return np.empty((0, 0, 0, 3), dtype=np.uint8)

  output_frames = None
  for t, frame_feature in enumerate(frame_features):
    with io.BytesIO(frame_feature[0]) as f:
      # Convert to RGB so grayscale, palette or RGBA images also produce
      # exactly three channels.
      frame = np.asarray(Image.open(f).convert("RGB"), dtype=np.uint8)
    if output_frames is None:
      output_frames = np.empty((num_frames,) + frame.shape, dtype=np.uint8)
    output_frames[t] = frame
  return output_frames

     

In [6]:
#@title Drawing and display utilities

def display_video(frames, fps=30):
  """Writes the frames to a temporary mp4 file and displays it inline.

  Args:
    frames: Frames in the format (num_frames, height, width, channels).
    fps: Frame rate of the written video.
  """
  video_path = 'tmp_video_display.mp4'
  imageio.mimwrite(video_path, frames, fps=fps)
  player = mvp.ipython_display(video_path)
  display(player)


def display_frame(frame):
  """Shows a single frame with `cv2_imshow`.

  The last axis is reversed first (RGB to BGR, the channel order cv2 uses).
  """
  channels_reversed = frame[:, :, ::-1]
  cv2_imshow(channels_reversed)


def get_colors(num_colors: int) -> List[Tuple[int, int, int]]:
  """Generates a shuffled palette for visualizing objects and points.

  Hues are spread evenly around the colour wheel; lightness and saturation
  are jittered with NumPy's global random generator, so the exact colours
  vary between calls unless that generator is seeded. The shuffle uses a
  private generator with a fixed seed, so the ordering is reproducible and
  the state of the global `random` module is left untouched.

  Args:
    num_colors: Number of colours to generate.

  Returns:
    A list of exactly `num_colors` (r, g, b) tuples with integer components
    in [0, 255]; an empty list when `num_colors` is not positive.
  """
  if num_colors <= 0:
    return []
  colors = []
  # linspace yields exactly `num_colors` hues, whereas arange with a float
  # step can produce one value too many because of rounding.
  for hue in np.linspace(0., 1., num_colors, endpoint=False):
    lightness = (50 + np.random.rand() * 10) / 100.
    saturation = (90 + np.random.rand() * 10) / 100.
    red, green, blue = colorsys.hls_to_rgb(hue, lightness, saturation)
    colors.append((int(red * 255), int(green * 255), int(blue * 255)))
  random.Random(0).shuffle(colors)
  return colors

COLORS = get_colors(num_colors=100)


def try_different_dataset_notice(current, missing, suggestion):
  """Prints a red notice that the current example lacks some kind of data.

  Args:
    current: Name of the dataset the example was loaded from.
    missing: Description of the data that is absent.
    suggestion: Name of the dataset to try instead.
  """
  problem = (f"\n*** The current example from the `{current}` dataset does not "
             f"contain any {missing} data.\n\n")
  advice = (f"*** Try loading the `{suggestion}` dataset to "
            f"visualise {missing} data.")
  msg = problem + advice
  # \x1b[31m switches the terminal colour to red, \x1b[0m resets it.
  print(f"\x1b[31m{msg}\x1b[0m")


def paint_box(video: np.ndarray,
              track: BoxTrack,
              color: Tuple[int, int, int] = (255, 0, 0)):
  """Draws the boxes of one track onto the frames of a video, in place.

  Args:
    video: Array of frames, [num_frames, height, width, channels].
    track: Box track whose boxes are stored as fractions of the frame size.
    color: Colour used for the rectangle and for the text.

  Returns:
    The same `video` array, with the annotated frames written back.
  """
  num_frames, height, width, _ = video.shape
  for box, frame_idx, human in zip(
      track.boxes, track.frames, track.human_annotated):
    # Annotations can reference frames outside the video; skip those instead
    # of failing or wrapping around to the end of the video.
    if not 0 <= frame_idx < num_frames:
      continue
    # A trailing asterisk marks boxes flagged as human annotated.
    if human:
      label = f"{track.label}*"
    else:
      label = track.label
    name = f'{track.track_id} : {label}'
    frame = np.array(video[frame_idx])
    y1 = int(round(box[0] * height))
    x1 = int(round(box[1] * width))
    y2 = int(round(box[2] * height))
    x2 = int(round(box[3] * width))
    frame = cv2.rectangle(frame, (x1, y1), (x2, y2), color=color, thickness=2)
    frame = cv2.putText(frame, name, (x1, y1 + 20), cv2.FONT_HERSHEY_SIMPLEX,
                        0.75, color, 2)
    video[frame_idx] = frame
  return video


def paint_boxes(video, tracks: List[BoxTrack]):
  """Draws every box track onto the video, one palette colour per track.

  Colours are reused cyclically when there are more tracks than entries in
  `COLORS`. Returns the video returned by the last `paint_box` call, or the
  input video when `tracks` is empty.
  """
  for i, track in enumerate(tracks):
    video = paint_box(video, track, COLORS[i % len(COLORS)])
  return video


def paint_point(
    video: np.ndarray,
    track: PointTrack,
    color: Tuple[int, int, int] = (255, 0, 0),
):
  """Draws the points of one track onto the frames of a video, in place.

  Args:
    video: Array of frames, [num_frames, height, width, channels].
    track: Point track whose (y, x) points are fractions of the frame size.
    color: Fill colour of the circle.

  Returns:
    The same `video` array, with the annotated frames written back.
  """
  num_frames, height, width, _ = video.shape
  for p, frame_idx, _human in zip(
      track.points, track.frames, track.human_annotated):
    # Annotations can reference frames outside the video; skip those instead
    # of failing or wrapping around to the end of the video.
    if not 0 <= frame_idx < num_frames:
      continue
    frame = video[frame_idx]
    x = int(round(p[1] * width))
    y = int(round(p[0] * height))
    # thickness=-1 draws a filled circle.
    frame = cv2.circle(frame, (x, y), radius=10, color=color, thickness=-1)
    video[frame_idx] = frame
  return video


def paint_points(video, tracks: List[PointTrack]):
  """Draws every point track onto the video, one palette colour per track.

  Colours are reused cyclically when there are more tracks than entries in
  `COLORS`. Returns the video returned by the last `paint_point` call, or
  the input video when `tracks` is empty.
  """
  for i, track in enumerate(tracks):
    video = paint_point(video, track, COLORS[i % len(COLORS)])
  return video


In [7]:

#@title Parse an example to ParsedExample
# Decode the loaded `sequence_example` into a ParsedExample; the cells below
# read their data from `example`.
example = ParsedExample.parse(sequence_example)

In [None]:
# When enabled, play the decoded frames exactly as stored in `example`.
show_original_video = True  #@param {type: "boolean"}
if show_original_video:
  display_video(example.video_frames)

Box Tracks

In [None]:
#@markdown Draw annotated bounding boxes on video

# Paint on a copy so the frames stored in `example` stay unmodified.
tmp_vid = example.video_frames.copy()
show_all_tracks = True  #@param {type: "boolean"}
show_track = 0  #@param {type: "integer"}

num_box_tracks = len(example.box_tracks)
if show_all_tracks:
  _ = paint_boxes(tmp_vid, example.box_tracks)
elif -num_box_tracks <= show_track < num_box_tracks:
  # The modulo keeps the colour lookup valid for any track index.
  _ = paint_box(tmp_vid, example.box_tracks[show_track],
                COLORS[show_track % len(COLORS)])
else:
  print(f"show_track={show_track} is out of range: "
        f"this example has {num_box_tracks} box tracks.")
display_video(tmp_vid)

Point Tracks

In [None]:
#@markdown Show video with points overlaid.

# Paint on a copy so the frames stored in `example` stay unmodified.
tmp_vid = example.video_frames.copy()
show_all_tracks = True  #@param {type: "boolean"}
show_track = 0  #@param {type: "integer"}

num_point_tracks = len(example.point_tracks)
if not num_point_tracks:
  try_different_dataset_notice(dataset, "point tracking", "points_oss")
elif show_all_tracks:
  _ = paint_points(tmp_vid, example.point_tracks)
  display_video(tmp_vid)
elif -num_point_tracks <= show_track < num_point_tracks:
  # The modulo keeps the colour lookup valid for any track index.
  _ = paint_point(tmp_vid, example.point_tracks[show_track],
                  COLORS[show_track % len(COLORS)])
  display_video(tmp_vid)
else:
  print(f"show_track={show_track} is out of range: "
        f"this example has {num_point_tracks} point tracks.")


Actions

In [None]:
#@markdown List video actions
human_readable_text = True # @param {type: "boolean"}

action_labels = []
action_start_times = []
action_end_times = []
print(f"Track id\tstart_frame\tend_frame\tstart_time\t\tend_time\tLabel")
if example.video_actions:
  for va in example.video_actions:
    # Each VideoAction field is a value sequence; its first element is the
    # value. The lists are always filled so the timeline cell can plot the
    # actions regardless of how they are printed here.
    action_labels.append(str(va.label[0]))
    action_start_times.append(va.start_time[0])
    action_end_times.append(va.end_time[0])
    if human_readable_text:
      print(f"{va.track_id[0]}\t\t{va.start_frame[0]}\t\t{va.end_frame[0]}"
            f"\t\t{va.start_time[0]:8}\t\t{va.end_time[0]:8}\t{va.label[0]}")
    else:
      print(va)
else:
  try_different_dataset_notice(dataset, "video actions", "base_oss")

# One-dimensional arrays even for a single action, so `len()` works on them.
# NOTE(review): the timestamps are kept as stored, while sound events are
# converted with a 1e-6 factor when parsed; confirm both use the same unit
# before comparing them on one time axis.
action_start_times = np.array(action_start_times)
action_end_times = np.array(action_end_times)

Sounds

In [None]:
#@title Audio
#markdown Extract the audio for the video.
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio
from scipy.io.wavfile import write
sample_rate = int(example.metadata.original_audio_sample_rate)

wav = (example.audio_wav * 2**15).astype(int)
write('test.wav', sample_rate, wav)
Audio(wav, rate=sample_rate)

In [None]:
#@title Sounds events
#@markdown Print the sound events for a sample video.

# Collect label, start and end of every sound event for the timeline plot.
sound_events = example.audio_boxes or []
audio_labels = [str(event.audio_label) for event in sound_events]
audio_start_times = np.array([event.start_time for event in sound_events])
audio_end_times = np.array([event.end_time for event in sound_events])

print(f"start\tend\tlabel")
for audio in sound_events:
  print(f"{audio.start_time:02.4f}\t{audio.end_time:02.4f}"
        f"\t{audio.audio_label}")

Visualise timeline of events

In [None]:
#@markdown Plot a timeline with the audio, sound events, action events and some frames.
plt.figure(figsize=(14, 15))

# Plot WAV
plt.subplot(4,1,1)
plt.title("Audio")
librosa.display.waveshow(example.audio_wav, sr=sample_rate)

# Strip of frames
plt.subplot(4,1,2)
plt.title("Video Frames")
f_size = example.video_frames[0].shape
# Target size (width, height) at a quarter of the frame resolution.
small = tuple(reversed((np.array(f_size[:2]) / 4).astype(int)))
# Count the decoded frames: the frame count in the metadata can be larger
# when annotations reference frames beyond the end of the video.
num_frames = len(example.video_frames)
# A step of at least one keeps `range` valid for videos with fewer than four
# frames.
frame_step = max(1, num_frames // 4)
strip = np.concatenate(
    [cv2.resize(example.video_frames[i], small)
     for i in range(0, num_frames, frame_step)],
    axis=1)
plt.imshow(strip)

# Plot audio events
plt.subplot(4,1,3)
plt.title("Audio Events")
plt.barh(range(len(audio_start_times)),
         audio_end_times-audio_start_times,
         left=audio_start_times)
plt.yticks(range(len(audio_start_times)), audio_labels)

# Plot video events
plt.subplot(4,1,4)
plt.title("Action Events")
# Make sure the action times are one-dimensional so `len()` is defined even
# when they arrive as zero-dimensional arrays.
action_starts = np.atleast_1d(action_start_times)
action_ends = np.atleast_1d(action_end_times)
plt.barh(range(len(action_starts)),
         action_ends-action_starts,
         left=action_starts)
plt.yticks(range(len(action_starts)), action_labels)

plt.show()
     

Multiple choice VQA

In [None]:
#@markdown Load and print an example from the `multiple choice questions` dataset. 
human_readable = True # @param {type: "boolean"}
if not example.multiple_choice_questions:
  try_different_dataset_notice(dataset,
                               "multiple choice visual question and answer",
                               "base_oss")
else:
  for mcq in example.multiple_choice_questions:
    if not human_readable:
      # Raw form: print the dataclass itself.
      print(mcq)
      continue
    print(mcq.text)
    for i, o in enumerate(mcq.options):
      # Mark the option whose index equals the stored answer id.
      answer = " <---- ANSWER" if i == mcq.answer_id else ""
      print(f"  {o}{answer}")
    print("")