[documentation] video API documentation and wrapper #2778

Merged
merged 21 commits into from Oct 9, 2020
34 changes: 34 additions & 0 deletions docs/source/io.rst
@@ -17,6 +17,40 @@ Video
.. autofunction:: write_video


Fine-grained video API
----------------------

In addition to the :func:`read_video` function, we provide a high-performance
lower-level API for more fine-grained control over the decoding process,
all while fully supporting torchscript.

.. autoclass:: Video
:members: next, get_metadata, set_current_stream, seek


Example of usage:

.. code:: python

    import torchvision
    video_path = "path to a test video"
    # The constructor allocates memory and a threaded decoder
    # instance per video. At the moment it takes two arguments:
    # the path to the video file, and the desired stream.
    reader = torchvision.io.Video(video_path, "video")

    # Information about the video can be retrieved using the
    # `get_metadata()` method. It returns a dictionary for every stream, with
    # duration and other relevant metadata (often frame rate).
    reader_md = reader.get_metadata()

    # metadata is structured as a dict of dicts with the following structure
    # {"stream_type": {"attribute": [attribute per stream]}}
    #
    # the following prints the list of frame rates for every video stream present
    print(reader_md["video"]["fps"])
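
Building on the example above, a minimal sketch of frame-by-frame decoding
(assuming ``video_path`` points to a real file containing a video stream;
``next()`` returns an empty tensor once the stream is exhausted):

.. code:: python

    reader = torchvision.io.Video(video_path, "video")

    # seek to the 2 second mark of the current stream, then
    # decode frames until none are left
    reader.seek(2.0)
    frames = []
    t, pts = reader.next()
    while t.numel() > 0:
        frames.append(t)
        t, pts = reader.next()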


Image
-----

12 changes: 6 additions & 6 deletions test/test_video.py
@@ -10,7 +10,7 @@

import torch
import torchvision
from torchvision.io import _HAS_VIDEO_OPT
from torchvision.io import _HAS_VIDEO_OPT, Video

try:
import av
@@ -289,7 +289,7 @@ def test_read_video_tensor(self):
        tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
        tv_result = tv_result.permute(0, 3, 1, 2)
        # pass 2: decode all frames using new api
        reader = torch.classes.torchvision.Video(full_path, "video")
        reader = Video(full_path, "video")
        frames = []
        t, _ = reader.next()
        while t.numel() > 0:
@@ -310,7 +310,7 @@ def test_read_video_tensor(self):
        # s = min(r)
        # e = max(r)

        # reader = torch.classes.torchvision.Video(full_path, "video")
        # reader = Video(full_path, "video")
        # results = _template_read_video(reader, s, e)
        # tv_video, tv_audio, info = torchvision.io.read_video(
        #     full_path, start_pts=s, end_pts=e, pts_unit="sec"
@@ -329,7 +329,7 @@ def test_read_video_tensor(self):
        #     full_path, pts_unit="sec"
        # )
        # # pass 2: decode all frames using new api
        # reader = torch.classes.torchvision.Video(full_path, "video")
        # reader = Video(full_path, "video")
        # pts = []
        # t, p = reader.next()
        # while t.numel() > 0:
@@ -353,7 +353,7 @@ def test_metadata(self):
        torchvision.set_video_backend("pyav")
        for test_video, config in test_videos.items():
            full_path = os.path.join(VIDEO_DIR, test_video)
            reader = torch.classes.torchvision.Video(full_path, "video")
            reader = Video(full_path, "video")
            reader_md = reader.get_metadata()
            self.assertAlmostEqual(
                config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
@@ -372,7 +372,7 @@ def test_video_reading_fn(self):

        ref_result = _decode_frames_by_av_module(full_path)

        reader = torch.classes.torchvision.Video(full_path, "video")
        reader = Video(full_path, "video")
        newapi_result = _template_read_video(reader)

        # First we check if the frames are approximately the same
101 changes: 94 additions & 7 deletions torchvision/io/__init__.py
@@ -1,3 +1,5 @@
import torch

from ._video_opt import (
    Timebase,
    VideoMetaData,
@@ -20,10 +22,94 @@
    encode_jpeg,
    write_jpeg,
    encode_png,
    write_png
    write_png,
)


if _HAS_VIDEO_OPT:

    class Video:
Comment on lines +29 to +31
Member

For a follow-up PR: given that we are wrapping our C++ class in a python class, we don't need to have this as is, and we can instead implement everything outside the guard, but have a guard at instantiation time.

Something like

if _HAS_VIDEO_OPT:
    def _has_video_opt():
        return True
else:
    def _has_video_opt():
        return False

class Video:
    def __init__(self, path, stream):
        # check if video_opt is available
        if not _has_video_opt():
            raise RuntimeError(...)
        ...

Here, we create a function `_has_video_opt` to make the class work with torchscript (which doesn't support globals).

Contributor Author

added to the list of issues to address

"""
Fine-grained video-reading API.
Supports frame-by-frame reading of various streams from a single video
container.

Args:

path (string): Path to the video file in supported format

stream (string, optional): descriptor of the required stream. Defaults to "video:0"
Currently available options include :mod:`['video', 'audio', 'cc', 'sub']`
bjuncek marked this conversation as resolved.
Show resolved Hide resolved

Example:
The following examples creates :mod:`Video` object, seeks into 2s
point, and returns a single frame::
import torchvision
video_path = "path_to_a_test_video"

reader = torchvision.io.Video(video_path, "video")
reader.seek(2.0)
frame, timestamp = reader.next()
"""

        def __init__(self, path, stream="video"):
            self._c = torch.classes.torchvision.Video(path, stream)

        def next(self):
            """Decodes and returns the next frame of the current stream.

            Returns:
                ([torch.Tensor, float]): list containing the decoded frame and the corresponding timestamp.
            """
            return self._c.next()

        def seek(self, time_s: float):
            """Seek within the current stream.

            Args:
                time_s (float): seek time in seconds

            .. note::
                The current implementation performs a so-called precise seek:
                following a seek, the next call to :mod:`next()` returns the
                frame with the exact timestamp, if it exists, or the first
                frame with a timestamp larger than ``time_s``.
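
            Example (a sketch; assumes the video is at least two seconds long)::

                reader.seek(2.0)
                # first frame with a timestamp of at least 2.0 seconds
                frame, pts = reader.next()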
"""
self._c.seek(time_s)

        def get_metadata(self):
            """Returns video metadata.

            Returns:
                (dict): dictionary containing the duration and the frame rate for every stream
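
            Example (a sketch, mirroring the metadata structure described in the docs)::

                md = reader.get_metadata()
                # list of frame rates, one entry per video stream in the container
                print(md["video"]["fps"])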
"""
return self._c.get_metadata()

        def set_current_stream(self, stream: str):
            """Set current stream.
            Explicitly define the stream we are operating on.

            Args:
                stream (string): descriptor of the required stream. Defaults to "video:0".
                    Currently available stream types include :mod:`['video', 'audio', 'cc', 'sub']`.
                    Each descriptor consists of two parts: the stream type (e.g. 'video') and
                    a unique stream id (determined by the video encoding).
                    In this way, if the video container contains multiple
                    streams of the same type, users can access the one they want.
                    If only the stream type is passed, the decoder auto-detects the first
                    stream of that type and returns it.

            Returns:
                (bool): True on success, False otherwise
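
            Example (a sketch; assumes the container holds an audio stream)::

                reader.set_current_stream("audio:0")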
"""
return self._c.set_current_stream(stream)


else:
    Video = None


__all__ = [
"write_video",
"read_video",
@@ -39,10 +125,11 @@
"_read_video_meta_data",
"VideoMetaData",
"Timebase",
'read_image',
'decode_image',
'encode_jpeg',
'write_jpeg',
'encode_png',
'write_png',
"read_image",
"decode_image",
"encode_jpeg",
"write_jpeg",
"encode_png",
"write_png",
"Video",
]