diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 204bfda5ba5..c387e2e7158 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4 Run the training on a single node with 8 GPUs: ```bash -torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp +torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp ``` **Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution. @@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir= ```bash -python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset +python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset ``` + + +### Additional Kinetics versions + +Since the original release, additional versions of the Kinetics dataset have become available (Kinetics 600). +Our training scripts also support these versions of the dataset: set the `--kinetics-version` parameter to `"600"`. + +**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models. 
diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index c12d00a022b..ef774052257 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -1,6 +1,6 @@ import torch from torchvision.transforms import transforms -from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW +from transforms import ConvertBCHWtoCBHW class VideoClassificationPresetTrain: @@ -14,7 +14,6 @@ def __init__( hflip_prob=0.5, ): trans = [ - ConvertBHWCtoBCHW(), transforms.ConvertImageDtype(torch.float32), transforms.Resize(resize_size), ] @@ -31,7 +30,6 @@ class VideoClassificationPresetEval: def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)): self.transforms = transforms.Compose( [ - ConvertBHWCtoBCHW(), transforms.ConvertImageDtype(torch.float32), transforms.Resize(resize_size), transforms.Normalize(mean=mean, std=std), diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 26c856da878..c7ac9e8c133 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -130,8 +130,8 @@ def main(args): # Data loading code print("Loading data") - traindir = os.path.join(args.data_path, args.train_dir) - valdir = os.path.join(args.data_path, args.val_dir) + traindir = os.path.join(args.data_path, "train") + valdir = os.path.join(args.data_path, "val") print("Loading training data") st = time.time() @@ -145,9 +145,11 @@ def main(args): else: if args.distributed: print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") - dataset = torchvision.datasets.Kinetics400( - traindir, + dataset = torchvision.datasets.Kinetics( + args.data_path, frames_per_clip=args.clip_len, + num_classes=args.kinetics_version, + split="train", step_between_clips=1, transform=transform_train, frame_rate=15, @@ -179,9 +181,11 @@ def 
main(args): else: if args.distributed: print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") - dataset_test = torchvision.datasets.Kinetics400( - valdir, + dataset_test = torchvision.datasets.Kinetics( + args.data_path, frames_per_clip=args.clip_len, + num_classes=args.kinetics_version, + split="val", step_between_clips=1, transform=transform_test, frame_rate=15, @@ -312,8 +316,9 @@ def parse_args(): parser = argparse.ArgumentParser(description="PyTorch Video Classification Training") parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path") - parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir") - parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir") + parser.add_argument( + "--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version" + ) parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip") diff --git a/references/video_classification/transforms.py b/references/video_classification/transforms.py index a0ce691bae7..2a7cc2a4a66 100644 --- a/references/video_classification/transforms.py +++ b/references/video_classification/transforms.py @@ -2,13 +2,6 @@ import torch.nn as nn -class ConvertBHWCtoBCHW(nn.Module): - """Convert tensor from (B, H, W, C) to (B, C, H, W)""" - - def forward(self, vid: torch.Tensor) -> torch.Tensor: - return vid.permute(0, 3, 1, 2) - - class ConvertBCHWtoCBHW(nn.Module): """Convert tensor from (B, C, H, W) to (C, B, H, W)""" diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py index 651dbdc158f..937cee495e0 100644 --- a/torchvision/datasets/kinetics.py +++ 
b/torchvision/datasets/kinetics.py @@ -308,6 +308,7 @@ def __init__( warnings.warn( "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14." "Please use Kinetics(..., num_classes='400') instead." + "Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format." ) if any(value is not None for value in (num_classes, split, download, num_download_workers)): raise RuntimeError( diff --git a/torchvision/io/video.py b/torchvision/io/video.py index d026e754546..1c758661164 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -153,14 +153,13 @@ def _read_from_stream( gc.collect() if pts_unit == "sec": + # TODO: we should change all of this from ground up to simply take + # sec and convert to MS in C++ start_offset = int(math.floor(start_offset * (1 / stream.time_base))) if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) else: - warnings.warn( - "The pts_unit 'pts' gives wrong results and will be removed in a " - + "follow-up version. Please use pts_unit 'sec'." - ) + warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.") frames = {} should_buffer = True @@ -176,9 +175,9 @@ def _read_from_stream( # can't use regex directly because of some weird characters sometimes... pos = extradata.find(b"DivX") d = extradata[pos:] - o = re.search(br"DivX(\d+)Build(\d+)(\w)", d) + o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d) if o is None: - o = re.search(br"DivX(\d+)b(\d+)(\w)", d) + o = re.search(rb"DivX(\d+)b(\d+)(\w)", d) if o is not None: should_buffer = o.group(3) == b"p" seek_offset = start_offset