diff --git a/docs/source/models.rst b/docs/source/models.rst index dda7adf6aaa..e1a141092dc 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -4,8 +4,8 @@ torchvision.models The models subpackage contains definitions of models for addressing different tasks, including: image classification, pixelwise semantic -segmentation, object detection, instance segmentation and person -keypoint detection. +segmentation, object detection, instance segmentation, person +keypoint detection and video classification. Classification @@ -395,3 +395,51 @@ Keypoint R-CNN .. autofunction:: torchvision.models.detection.keypointrcnn_resnet50_fpn + +Video classification +==================== + +We provide models for action recognition pre-trained on Kinetics-400. +They have all been trained with the scripts provided in ``references/video_classification``. + +All pre-trained models expect input images normalized in the same way, +i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W), +where H and W are expected to be 112, and T is the number of video frames in a clip. +The images have to be loaded into a range of [0, 1] and then normalized +using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``. + + +.. note:: + The normalization parameters are different from the image classification ones, and correspond + to the mean and std from Kinetics-400. + +.. note:: + For now, normalization code can be found in ``references/video_classification/transforms.py``, + see the ``Normalize`` function there. Note that it differs from standard normalization for + images because it assumes the video is 4d. 
+ +Kinetics 1-crop accuracies for clip length 16 (16x112x112) + +================================ ============= ============= +Network Clip acc@1 Clip acc@5 +================================ ============= ============= +ResNet 3D 18 52.75 75.45 +ResNet MC 18 53.90 76.29 +ResNet (2+1)D 57.50 78.81 +================================ ============= ============= + + +ResNet 3D +---------- + +.. autofunction:: torchvision.models.video.r3d_18 + +ResNet Mixed Convolution +------------------------ + +.. autofunction:: torchvision.models.video.mc3_18 + +ResNet (2+1)D +------------- + +.. autofunction:: torchvision.models.video.r2plus1d_18 diff --git a/torchvision/models/resnet.py b/torchvision/models/resnet.py index d49729e4d4e..29f9c001151 100644 --- a/torchvision/models/resnet.py +++ b/torchvision/models/resnet.py @@ -221,7 +221,7 @@ def _resnet(arch, block, layers, pretrained, progress, **kwargs): def resnet18(pretrained=False, progress=True, **kwargs): r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_ + `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -233,7 +233,7 @@ def resnet18(pretrained=False, progress=True, **kwargs): def resnet34(pretrained=False, progress=True, **kwargs): r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_ + `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -245,7 +245,7 @@ def resnet34(pretrained=False, progress=True, **kwargs): def resnet50(pretrained=False, progress=True, **kwargs): r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_ + `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -257,7 +257,7 @@ def resnet50(pretrained=False, progress=True, **kwargs): def resnet101(pretrained=False, progress=True, **kwargs): r"""ResNet-101 model 
from - `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_ + `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -269,7 +269,7 @@ def resnet101(pretrained=False, progress=True, **kwargs): def resnet152(pretrained=False, progress=True, **kwargs): r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_ + `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet diff --git a/torchvision/models/vgg.py b/torchvision/models/vgg.py index 03feb23e79d..dba534f651d 100644 --- a/torchvision/models/vgg.py +++ b/torchvision/models/vgg.py @@ -97,7 +97,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, progress, **kwargs): def vgg11(pretrained=False, progress=True, **kwargs): r"""VGG 11-layer model (configuration "A") from - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -108,7 +108,7 @@ def vgg11(pretrained=False, progress=True, **kwargs): def vgg11_bn(pretrained=False, progress=True, **kwargs): r"""VGG 11-layer model (configuration "A") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -119,7 +119,7 @@ def vgg11_bn(pretrained=False, progress=True, **kwargs): def vgg13(pretrained=False, progress=True, **kwargs): r"""VGG 13-layer model (configuration "B") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -130,7 +130,7 @@ def vgg13(pretrained=False, progress=True, **kwargs): 
def vgg13_bn(pretrained=False, progress=True, **kwargs): r"""VGG 13-layer model (configuration "B") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -141,7 +141,7 @@ def vgg13_bn(pretrained=False, progress=True, **kwargs): def vgg16(pretrained=False, progress=True, **kwargs): r"""VGG 16-layer model (configuration "D") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -152,7 +152,7 @@ def vgg16(pretrained=False, progress=True, **kwargs): def vgg16_bn(pretrained=False, progress=True, **kwargs): r"""VGG 16-layer model (configuration "D") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -163,7 +163,7 @@ def vgg16_bn(pretrained=False, progress=True, **kwargs): def vgg19(pretrained=False, progress=True, **kwargs): r"""VGG 19-layer model (configuration "E") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet @@ -174,7 +174,7 @@ def vgg19(pretrained=False, progress=True, **kwargs): def vgg19_bn(pretrained=False, progress=True, **kwargs): r"""VGG 19-layer model (configuration 'E') with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_ + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ Args: pretrained (bool): If True, returns a model pre-trained on ImageNet