[documentation] video API documentation and wrapper #2778

Merged
merged 21 commits into from Oct 9, 2020
34 changes: 34 additions & 0 deletions docs/source/io.rst
@@ -17,6 +17,40 @@ Video
.. autofunction:: write_video


Fine-grained video API
----------------------

In addition to the :func:`read_video` function, we provide a high-performance
lower-level API for more fine-grained control over the decoding process,
all while fully supporting torchscript.

.. autoclass:: Video
:members: next, get_metadata, set_current_stream, seek


Example of usage:

.. code:: python

    import torchvision
    video_path = "path to a test video"
    # The constructor allocates memory and a threaded decoder
    # instance per video. At the moment it takes two arguments:
    # the path to the video file, and the desired stream.
    reader = torchvision.io.Video(video_path, "video")

    # Information about the video can be retrieved using the
    # `get_metadata()` method. It returns a dictionary for every stream, with
    # duration and other relevant metadata (often frame rate).
    reader_md = reader.get_metadata()

    # metadata is structured as a dict of dicts with the following structure
    # {"stream_type": {"attribute": [attribute per stream]}}
    #
    # the following prints the list of frame rates for every video stream present
    print(reader_md["video"]["fps"])
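
Building on the example above, a minimal sketch of frame-by-frame decoding
(assuming ``video_path`` points to a real file containing a video stream;
``next()`` returns an empty tensor once the stream is exhausted):

.. code:: python

    reader = torchvision.io.Video(video_path, "video")

    # seek to the 2 second mark of the current stream, then
    # decode frames until none are left
    reader.seek(2.0)
    frames = []
    t, pts = reader.next()
    while t.numel() > 0:
        frames.append(t)
        t, pts = reader.next()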


Image
-----

12 changes: 6 additions & 6 deletions test/test_video.py
@@ -10,7 +10,7 @@

import torch
import torchvision
from torchvision.io import _HAS_VIDEO_OPT
from torchvision.io import _HAS_VIDEO_OPT, Video

try:
import av
@@ -289,7 +289,7 @@ def test_read_video_tensor(self):
        tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
        tv_result = tv_result.permute(0, 3, 1, 2)
        # pass 2: decode all frames using new api
        reader = torch.classes.torchvision.Video(full_path, "video")
        reader = Video(full_path, "video")
        frames = []
        t, _ = reader.next()
        while t.numel() > 0:
@@ -310,7 +310,7 @@ def test_read_video_tensor(self):
        # s = min(r)
        # e = max(r)

        # reader = torch.classes.torchvision.Video(full_path, "video")
        # reader = Video(full_path, "video")
        # results = _template_read_video(reader, s, e)
        # tv_video, tv_audio, info = torchvision.io.read_video(
        #     full_path, start_pts=s, end_pts=e, pts_unit="sec"
@@ -329,7 +329,7 @@ def test_read_video_tensor(self):
        #     full_path, pts_unit="sec"
        # )
        # # pass 2: decode all frames using new api
        # reader = torch.classes.torchvision.Video(full_path, "video")
        # reader = Video(full_path, "video")
        # pts = []
        # t, p = reader.next()
        # while t.numel() > 0:
@@ -353,7 +353,7 @@ def test_metadata(self):
        torchvision.set_video_backend("pyav")
        for test_video, config in test_videos.items():
            full_path = os.path.join(VIDEO_DIR, test_video)
            reader = torch.classes.torchvision.Video(full_path, "video")
            reader = Video(full_path, "video")
            reader_md = reader.get_metadata()
            self.assertAlmostEqual(
                config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
@@ -372,7 +372,7 @@ def test_video_reading_fn(self):

        ref_result = _decode_frames_by_av_module(full_path)

        reader = torch.classes.torchvision.Video(full_path, "video")
        reader = Video(full_path, "video")
        newapi_result = _template_read_video(reader)

        # First we check if the frames are approximately the same
101 changes: 94 additions & 7 deletions torchvision/io/__init__.py
@@ -1,3 +1,5 @@
import torch

from ._video_opt import (
    Timebase,
    VideoMetaData,
@@ -20,10 +22,94 @@
    encode_jpeg,
    write_jpeg,
    encode_png,
    write_png
    write_png,
)


if _HAS_VIDEO_OPT:

    class Video:
Comment on lines +29 to +31
Member

For a follow-up PR: given that we are wrapping our C++ class in a python class, we don't need to have this as is, and we can instead implement everything outside the guard, but have a guard at instantiation time.

Something like

if _HAS_VIDEO_OPT:
    def _has_video_opt():
        return True
else:
    def _has_video_opt():
        return False

class Video:
    def __init__(self, path, stream):
        # check if video_opt is available
        if not _has_video_opt():
            raise RuntimeError(...)
        ...

Here, we create a function `_has_video_opt` to make the class work with torchscript (which doesn't support globals).

Contributor Author

added to the list of issues to address

"""
Fine-grained video-reading API.
Supports frame-by-frame reading of various streams from a single video
container.

Args:

path (string): Path to the video file in supported format

stream (string, optional): descriptor of the required stream. Defaults to "video:0"
Currently available options include :mod:`['video', 'audio', 'cc', 'sub']`
bjuncek marked this conversation as resolved.
Show resolved Hide resolved

Example:
The following examples creates :mod:`Video` object, seeks into 2s
point, and returns a single frame::
import torchvision
video_path = "path_to_a_test_video"

reader = torchvision.io.Video(video_path, "video")
reader.seek(2.0)
frame, timestamp = reader.next()
"""

        def __init__(self, path, stream="video"):
            self._c = torch.classes.torchvision.Video(path, stream)

        def next(self):
            """Decodes and returns the next frame of the current stream.

            Returns:
                ([torch.Tensor, float]): list containing the decoded frame and the corresponding timestamp.
            """
            return self._c.next()

        def seek(self, time_s: float):
            """Seek within the current stream.

            Args:
                time_s (float): seek time in seconds

            .. note::
                The current implementation performs a so-called precise seek:
                following a seek, the next call to :mod:`next()` returns the
                frame with the exact timestamp, if it exists, or the first
                frame with a timestamp larger than ``time_s``.
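
            Example (a sketch; assumes the video is at least two seconds long)::

                reader.seek(2.0)
                # first frame with a timestamp of at least 2.0 seconds
                frame, pts = reader.next()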
"""
self._c.seek(time_s)

        def get_metadata(self):
            """Returns video metadata.

            Returns:
                (dict): dictionary containing the duration and the frame rate for every stream
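
            Example (a sketch, mirroring the metadata structure described in the docs)::

                md = reader.get_metadata()
                # list of frame rates, one entry per video stream in the container
                print(md["video"]["fps"])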
"""
return self._c.get_metadata()

        def set_current_stream(self, stream: str):
            """Set current stream.
            Explicitly define the stream we are operating on.

            Args:
                stream (string): descriptor of the required stream. Defaults to "video:0".
                    Currently available stream types include :mod:`['video', 'audio', 'cc', 'sub']`.
                    Each descriptor consists of two parts: the stream type (e.g. 'video') and
                    a unique stream id (determined by the video encoding).
                    In this way, if the video container contains multiple
                    streams of the same type, users can access the one they want.
                    If only the stream type is passed, the decoder auto-detects the first
                    stream of that type and returns it.

            Returns:
                (bool): True on success, False otherwise
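
            Example (a sketch; assumes the container holds an audio stream)::

                reader.set_current_stream("audio:0")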
"""
return self._c.set_current_stream(stream)


else:
    Video = None


__all__ = [
"write_video",
"read_video",
@@ -39,10 +125,11 @@
"_read_video_meta_data",
"VideoMetaData",
"Timebase",
'read_image',
'decode_image',
'encode_jpeg',
'write_jpeg',
'encode_png',
'write_png',
"read_image",
"decode_image",
"encode_jpeg",
"write_jpeg",
"encode_png",
"write_png",
"Video",
]