18 changes: 17 additions & 1 deletion docs/source/datasets.rst
@@ -203,8 +203,24 @@ USPS


Kinetics-400
~~~~~
~~~~~~~~~~~~

.. autoclass:: Kinetics400
:members: __getitem__
:special-members:


HMDB51
~~~~~~~

.. autoclass:: HMDB51
:members: __getitem__
:special-members:


UCF101
~~~~~~~

.. autoclass:: UCF101
:members: __getitem__
:special-members:
43 changes: 42 additions & 1 deletion torchvision/datasets/hmdb51.py
@@ -8,6 +8,40 @@


class HMDB51(VisionDataset):
"""
HMDB51 <http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/>`_
dataset.

HMDB51 is an action recognition video dataset.
This dataset considers every video as a collection of video clips of fixed size, specified
by ``frames_per_clip``, where the step in frames between each clip is given by
``step_between_clips``.

To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
elements will come from video 1, and the next three elements from video 2.
Note that we drop clips that do not have exactly ``frames_per_clip`` frames, so not every
frame in a video is necessarily part of a clip.

Internally, it uses a VideoClips object to handle clip creation.

Args:
root (string): Root directory of the HMDB51 Dataset.
annotation_path (str): path to the folder containing the split files.
frames_per_clip (int): number of frames in a clip.
step_between_clips (int, optional): number of frames between each clip.
fold (int, optional): which fold to use. Should be between 1 and 3.
train (bool, optional): if ``True``, creates a dataset from the train split,
otherwise from the test split.
transform (callable, optional): A function/transform that takes in a TxHxWxC video
and returns a transformed version.

Returns:
video (Tensor[T, H, W, C]): the `T` video frames
audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
and `L` is the number of points
label (int): class of the video clip
"""

data_url = "http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar"
splits = {
@@ -16,8 +50,11 @@ class HMDB51(VisionDataset):
}

def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
fold=1, train=True):
fold=1, train=True, transform=None):
super(HMDB51, self).__init__(root)
if not 1 <= fold <= 3:
raise ValueError("fold should be between 1 and 3, got {}".format(fold))

extensions = ('avi',)
self.fold = fold
self.train = train
@@ -30,6 +67,7 @@ def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
indices = self._select_fold(video_list, annotation_path, fold, train)
self.video_clips = video_clips.subset(indices)
self.transform = transform

def _select_fold(self, video_list, annotation_path, fold, train):
target_tag = 1 if train else 2
@@ -53,4 +91,7 @@ def __getitem__(self, idx):
video, audio, info, video_idx = self.video_clips.get_clip(idx)
label = self.samples[video_idx][1]

if self.transform is not None:
video = self.transform(video)

return video, audio, label
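
A minimal usage sketch of `HMDB51` with the new `transform` argument. The paths below are hypothetical, the data is assumed to be already downloaded and extracted, and the `to_cthw` helper is illustrative rather than part of torchvision:

```python
from torchvision.datasets import HMDB51


def to_cthw(video):
    # The dataset yields clips as T x H x W x C uint8 tensors; reorder to
    # C x T x H x W and rescale to [0, 1], the layout most video models expect.
    return video.permute(3, 0, 1, 2).float() / 255.0


dataset = HMDB51(
    root="data/hmdb51/videos",             # hypothetical path to the extracted videos
    annotation_path="data/hmdb51/splits",  # hypothetical path to the split files
    frames_per_clip=5,
    step_between_clips=5,
    fold=1,
    train=True,
    transform=to_cthw,
)

# Each item is a (video, audio, label) triple. With frames_per_clip=5 and
# step_between_clips=5, a 10-frame video contributes 2 clips and a 15-frame
# video contributes 3, matching the example in the docstring.
video, audio, label = dataset[0]
print(video.shape, label)
```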
42 changes: 41 additions & 1 deletion torchvision/datasets/ucf101.py
@@ -8,10 +8,46 @@


class UCF101(VisionDataset):
"""
UCF101 <https://www.crcv.ucf.edu/data/UCF101.php>`_ dataset.

UCF101 is an action recognition video dataset.
This dataset considers every video as a collection of video clips of fixed size, specified
by ``frames_per_clip``, where the step in frames between each clip is given by
``step_between_clips``.

To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
elements will come from video 1, and the next three elements from video 2.
Note that we drop clips that do not have exactly ``frames_per_clip`` frames, so not every
frame in a video is necessarily part of a clip.

Internally, it uses a VideoClips object to handle clip creation.

Args:
root (string): Root directory of the UCF101 Dataset.
annotation_path (str): path to the folder containing the split files.
frames_per_clip (int): number of frames in a clip.
step_between_clips (int, optional): number of frames between each clip.
fold (int, optional): which fold to use. Should be between 1 and 3.
train (bool, optional): if ``True``, creates a dataset from the train split,
otherwise from the test split.
transform (callable, optional): A function/transform that takes in a TxHxWxC video
and returns a transformed version.

Returns:
video (Tensor[T, H, W, C]): the `T` video frames
audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
and `L` is the number of points
label (int): class of the video clip
"""

def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
fold=1, train=True):
fold=1, train=True, transform=None):
super(UCF101, self).__init__(root)
if not 1 <= fold <= 3:
raise ValueError("fold should be between 1 and 3, got {}".format(fold))

extensions = ('avi',)
self.fold = fold
self.train = train
@@ -24,6 +60,7 @@ def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
indices = self._select_fold(video_list, annotation_path, fold, train)
self.video_clips = video_clips.subset(indices)
self.transform = transform

def _select_fold(self, video_list, annotation_path, fold, train):
name = "train" if train else "test"
@@ -46,4 +83,7 @@ def __getitem__(self, idx):
video, audio, info, video_idx = self.video_clips.get_clip(idx)
label = self.samples[video_idx][1]

if self.transform is not None:
video = self.transform(video)

return video, audio, label
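
And a comparable sketch for `UCF101`, wiring the transform into a `DataLoader`. The paths, clip lengths, and the collate function are illustrative assumptions; the audio tensors are simply dropped here so that the fixed-size video clips can be stacked into a batch:

```python
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import UCF101


def normalize_clip(video):
    # Convert the T x H x W x C uint8 clip to a float C x T x H x W tensor in [0, 1].
    return video.permute(3, 0, 1, 2).float() / 255.0


train_set = UCF101(
    root="data/ucf101/videos",             # hypothetical path to the extracted videos
    annotation_path="data/ucf101/splits",  # hypothetical path to the split files
    frames_per_clip=16,
    step_between_clips=8,
    fold=1,
    train=True,
    transform=normalize_clip,
)


def collate_video_label(batch):
    # Keep only the video clips and their labels; the audio tensors are
    # discarded since this sketch targets a video-only model.
    videos = torch.stack([item[0] for item in batch])
    labels = torch.tensor([item[2] for item in batch])
    return videos, labels


loader = DataLoader(train_set, batch_size=4, shuffle=True,
                    collate_fn=collate_video_label)
videos, labels = next(iter(loader))
print(videos.shape, labels.shape)
```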