pytorch · fmassa · Jul 31, 2019 · Jul 31, 2019
diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
@@ -200,3 +200,11 @@ USPS
 .. autoclass:: USPS
   :members: __getitem__
   :special-members:
+
+
+Kinetics-400
+~~~~~~~~~~~~
+
+.. autoclass:: Kinetics400
+  :members: __getitem__
+  :special-members:
diff --git a/references/video_classification/train.py b/references/video_classification/train.py
@@ -139,7 +139,7 @@ def main(args):
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache "
                   "on a single-gpu first, as it will be faster")
-        dataset = torchvision.datasets.KineticsVideo(
+        dataset = torchvision.datasets.Kinetics400(
             traindir,
             frames_per_clip=args.clip_len,
             step_between_clips=1,
@@ -171,7 +171,7 @@ def main(args):
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache "
                   "on a single-gpu first, as it will be faster")
-        dataset_test = torchvision.datasets.KineticsVideo(
+        dataset_test = torchvision.datasets.Kinetics400(
             valdir,
             frames_per_clip=args.clip_len,
             step_between_clips=1,

diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py
@@ -19,7 +19,7 @@
 from .sbd import SBDataset
 from .vision import VisionDataset
 from .usps import USPS
-from .kinetics import KineticsVideo
+from .kinetics import Kinetics400
 from .hmdb51 import HMDB51
 from .ucf101 import UCF101
 
@@ -31,4 +31,4 @@
            'Omniglot', 'SBU', 'Flickr8k', 'Flickr30k',
            'VOCSegmentation', 'VOCDetection', 'Cityscapes', 'ImageNet',
            'Caltech101', 'Caltech256', 'CelebA', 'SBDataset', 'VisionDataset',
-           'USPS', 'KineticsVideo', 'HMDB51', 'UCF101')
+           'USPS', 'Kinetics400', 'HMDB51', 'UCF101')
diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py
@@ -4,9 +4,40 @@
 from .vision import VisionDataset
 
 
-class KineticsVideo(VisionDataset):
+class Kinetics400(VisionDataset):
+    """
+    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
+    dataset.
+
+    Kinetics-400 is an action recognition video dataset.
+    This dataset considers every video as a collection of video clips of fixed size, specified
+    by ``frames_per_clip``, where the step in frames between each clip is given by
+    ``step_between_clips``.
+
+    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
+    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
+    elements will come from video 1, and the next three elements from video 2.
+    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
+    frames in a video might be present.
+
+    Internally, it uses a VideoClips object to handle clip creation.
+
+    Args:
+        root (string): Root directory of the Kinetics-400 Dataset.
+        frames_per_clip (int): number of frames in a clip
+        step_between_clips (int): number of frames between each clip
+        transform (callable, optional): A function/transform that takes in a TxHxWxC video
+            and returns a transformed version.
+
+    Returns:
+        video (Tensor[T, H, W, C]): the `T` video frames
+        audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
+            and `L` is the number of points
+        label (int): class of the video clip
+    """
+
     def __init__(self, root, frames_per_clip, step_between_clips=1, transform=None):
-        super(KineticsVideo, self).__init__(root)
+        super(Kinetics400, self).__init__(root)
         extensions = ('avi',)
 
         classes = list(sorted(list_dir(root)))