diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index b1696e10411..ddc566bd272 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -200,3 +200,11 @@ USPS .. autoclass:: USPS :members: __getitem__ :special-members: + + +Kinetics-400 +~~~~~~~~~~~~ + +.. autoclass:: Kinetics400 + :members: __getitem__ + :special-members: diff --git a/references/video_classification/train.py b/references/video_classification/train.py index a45357d2c43..0f04475eade 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -139,7 +139,7 @@ def main(args): if args.distributed: print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster") - dataset = torchvision.datasets.KineticsVideo( + dataset = torchvision.datasets.Kinetics400( traindir, frames_per_clip=args.clip_len, step_between_clips=1, @@ -171,7 +171,7 @@ def main(args): if args.distributed: print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster") - dataset_test = torchvision.datasets.KineticsVideo( + dataset_test = torchvision.datasets.Kinetics400( valdir, frames_per_clip=args.clip_len, step_between_clips=1, diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 98625192328..db5b572a469 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -19,7 +19,7 @@ from .sbd import SBDataset from .vision import VisionDataset from .usps import USPS -from .kinetics import KineticsVideo +from .kinetics import Kinetics400 from .hmdb51 import HMDB51 from .ucf101 import UCF101 @@ -31,4 +31,4 @@ 'Omniglot', 'SBU', 'Flickr8k', 'Flickr30k', 'VOCSegmentation', 'VOCDetection', 'Cityscapes', 'ImageNet', 'Caltech101', 'Caltech256', 'CelebA', 'SBDataset', 'VisionDataset', - 'USPS', 'KineticsVideo', 'HMDB51', 'UCF101') + 'USPS', 'Kinetics400', 'HMDB51', 'UCF101') diff --git a/torchvision/datasets/kinetics.py 
b/torchvision/datasets/kinetics.py index f7d3fbe89d7..90717250c22 100644 --- a/torchvision/datasets/kinetics.py +++ b/torchvision/datasets/kinetics.py @@ -4,9 +4,40 @@ from .vision import VisionDataset -class KineticsVideo(VisionDataset): +class Kinetics400(VisionDataset): + """ + `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_ + dataset. + + Kinetics-400 is an action recognition video dataset. + This dataset considers every video as a collection of video clips of fixed size, specified + by ``frames_per_clip``, where the step in frames between each clip is given by + ``step_between_clips``. + + To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5`` + and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two + elements will come from video 1, and the next three elements from video 2. + Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all + frames in a video might be present. + + Internally, it uses a VideoClips object to handle clip creation. + + Args: + root (string): Root directory of the Kinetics-400 Dataset. + frames_per_clip (int): number of frames in a clip + step_between_clips (int): number of frames between each clip + transform (callable, optional): A function/transform that takes in a TxHxWxC video + and returns a transformed version. + + Returns: + video (Tensor[T, H, W, C]): the `T` video frames + audio (Tensor[K, L]): the audio frames, where `K` is the number of channels + and `L` is the number of points + label (int): class of the video clip + """ + def __init__(self, root, frames_per_clip, step_between_clips=1, transform=None): - super(KineticsVideo, self).__init__(root) + super(Kinetics400, self).__init__(root) extensions = ('avi',) classes = list(sorted(list_dir(root)))