From 6351698a9aa41b7bab32a9cd6f488210ccdc1299 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 14:23:43 +0100 Subject: [PATCH 1/7] First PR for model doc revamp --- .gitignore | 1 + docs/requirements.txt | 1 + docs/source/conf.py | 59 +++ docs/source/models.rst | 814 +--------------------------------- docs/source/models/resnet.rst | 28 ++ docs/source/models/vgg.rst | 30 ++ torchvision/models/resnet.py | 95 +++- torchvision/models/vgg.py | 160 +++++-- 8 files changed, 325 insertions(+), 863 deletions(-) create mode 100644 docs/source/models/resnet.rst create mode 100644 docs/source/models/vgg.rst diff --git a/.gitignore b/.gitignore index d3ba0e7a8f9..f16b54061e0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ docs/build docs/source/auto_examples/ docs/source/gen_modules/ docs/source/generated/ +docs/source/models/generated/ # pytorch-sphinx-theme gets installed here docs/src diff --git a/docs/requirements.txt b/docs/requirements.txt index d7a05e5e499..91b877a6233 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,6 +3,7 @@ numpy sphinx-copybutton>=0.3.1 sphinx-gallery>=0.9.0 sphinx==3.5.4 +tabulate # This pin is only needed for sphinx<4.0.2. See https://github.com/pytorch/vision/issues/5673 for details Jinja2<3.1.* -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/docs/source/conf.py b/docs/source/conf.py index d09a33c3064..8a1428c1908 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,9 +21,12 @@ # sys.path.insert(0, os.path.abspath('.')) import os +import textwrap import pytorch_sphinx_theme import torchvision +import torchvision.models as M +from tabulate import tabulate # -- General configuration ------------------------------------------------ @@ -292,5 +295,61 @@ def inject_minigalleries(app, what, name, obj, options, lines): lines.append("\n") +def inject_weight_metadata(app, what, name, obj, options, lines): + + if obj.__name__.endswith("_Weights"): + lines[:] = ["The model builder above accepts the following values as the ``weights`` parameter:"] + lines.append("") + for field in obj: + lines += [f"**{str(field)}**:", ""] + + table = [] + for k, v in field.meta.items(): + if k != "categories": + table.append((str(k), str(v))) + table = tabulate(table, tablefmt="rst") + lines += [".. table::", ""] + lines += textwrap.indent(table, " " * 4).split("\n") + lines.append("") + + +def generate_table(): + + # TODO: this is ugly af and incorrect. We'll need an automatic way to + # retrieve weight enums for each section, or manually list them. + weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] + weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] + + def get_weight_link(w): + return f":class:`{w} <{type(w).__name__}>`" + + column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") + content = [ + ( + get_weight_link(w), + w.meta["acc@1"], + w.meta["acc@5"], + f"{w.meta['num_params']/1e6:.1f}M", + f"`link <{w.meta['recipe']}>`__", + ) + for w in weights + ] + table = tabulate(content, headers=column_names, tablefmt="rst") + + from pathlib import Path + + generated_dir = Path("generated") + generated_dir.mkdir(exist_ok=True) + with open(generated_dir / "classification_table.rst", "w+") as table_file: + table_file.write(".. 
table::\n") + table_file.write(" :widths: 100 10 10 20 10\n\n") + table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") + + +generate_table() + + def setup(app): + app.connect("autodoc-process-docstring", inject_minigalleries) + app.connect("autodoc-process-docstring", inject_weight_metadata) diff --git a/docs/source/models.rst b/docs/source/models.rst index f84d9c7fd1a..eb3c059170e 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -22,819 +22,27 @@ keypoint detection, video classification, and optical flow. Classification ============== -The models subpackage contains definitions for the following model -architectures for image classification: - -- `AlexNet`_ -- `VGG`_ -- `ResNet`_ -- `SqueezeNet`_ -- `DenseNet`_ -- `Inception`_ v3 -- `GoogLeNet`_ -- `ShuffleNet`_ v2 -- `MobileNetV2`_ -- `MobileNetV3`_ -- `ResNeXt`_ -- `Wide ResNet`_ -- `MNASNet`_ -- `EfficientNet`_ v1 & v2 -- `RegNet`_ -- `VisionTransformer`_ -- `ConvNeXt`_ - -You can construct a model with random weights by calling its constructor: - -.. code:: python - - import torchvision.models as models - resnet18 = models.resnet18() - alexnet = models.alexnet() - vgg16 = models.vgg16() - squeezenet = models.squeezenet1_0() - densenet = models.densenet161() - inception = models.inception_v3() - googlenet = models.googlenet() - shufflenet = models.shufflenet_v2_x1_0() - mobilenet_v2 = models.mobilenet_v2() - mobilenet_v3_large = models.mobilenet_v3_large() - mobilenet_v3_small = models.mobilenet_v3_small() - resnext50_32x4d = models.resnext50_32x4d() - wide_resnet50_2 = models.wide_resnet50_2() - mnasnet = models.mnasnet1_0() - efficientnet_b0 = models.efficientnet_b0() - efficientnet_b1 = models.efficientnet_b1() - efficientnet_b2 = models.efficientnet_b2() - efficientnet_b3 = models.efficientnet_b3() - efficientnet_b4 = models.efficientnet_b4() - efficientnet_b5 = models.efficientnet_b5() - efficientnet_b6 = models.efficientnet_b6() - efficientnet_b7 = models.efficientnet_b7() - efficientnet_v2_s = models.efficientnet_v2_s() - efficientnet_v2_m = models.efficientnet_v2_m() - efficientnet_v2_l = models.efficientnet_v2_l() - regnet_y_400mf = models.regnet_y_400mf() - regnet_y_800mf = models.regnet_y_800mf() - regnet_y_1_6gf = models.regnet_y_1_6gf() - regnet_y_3_2gf = models.regnet_y_3_2gf() - regnet_y_8gf = models.regnet_y_8gf() - regnet_y_16gf = models.regnet_y_16gf() - regnet_y_32gf = models.regnet_y_32gf() - regnet_y_128gf = models.regnet_y_128gf() - regnet_x_400mf = models.regnet_x_400mf() - regnet_x_800mf = models.regnet_x_800mf() - regnet_x_1_6gf = models.regnet_x_1_6gf() - regnet_x_3_2gf = models.regnet_x_3_2gf() - regnet_x_8gf = models.regnet_x_8gf() - regnet_x_16gf = models.regnet_x_16gf() - regnet_x_32gf = models.regnet_x_32gf() - vit_b_16 = models.vit_b_16() - vit_b_32 = models.vit_b_32() - vit_l_16 = models.vit_l_16() - vit_l_32 = models.vit_l_32() - vit_h_14 = models.vit_h_14() - convnext_tiny = models.convnext_tiny() - convnext_small = models.convnext_small() - convnext_base = models.convnext_base() - convnext_large = models.convnext_large() - -We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`. - -Instancing a pre-trained model will download its weights to a cache directory. -This directory can be set using the `TORCH_HOME` environment variable. See -:func:`torch.hub.load_state_dict_from_url` for details. - -Some models use modules which have different training and evaluation -behavior, such as batch normalization. 
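A minimal sketch of the caching and eval-mode behaviour described above, assuming the ``ResNet18_Weights`` enum introduced elsewhere in this PR and a hypothetical cache path:

.. code:: python

    import os

    # Optional: redirect the weight cache (hypothetical path).
    os.environ["TORCH_HOME"] = "/tmp/torchvision_cache"

    import torchvision.models as models

    # The first call downloads the checkpoint into $TORCH_HOME/hub/checkpoints.
    resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    resnet18.eval()  # inference behaviour for batch norm / dropout layers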
To switch between these modes, use -``model.train()`` or ``model.eval()`` as appropriate. See -:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details. - -All pre-trained models expect input images normalized in the same way, -i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), -where H and W are expected to be at least 224. -The images have to be loaded in to a range of [0, 1] and then normalized -using ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. -You can use the following transform to normalize:: - - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - -An example of such normalization can be found in the imagenet example -`here `_ - -The process for obtaining the values of `mean` and `std` is roughly equivalent -to:: - - import torch - from torchvision import datasets, transforms as T - - transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.PILToTensor(), T.ConvertImageDtype(torch.float)]) - dataset = datasets.ImageNet(".", split="train", transform=transform) - - means = [] - stds = [] - for img in subset(dataset): - means.append(torch.mean(img)) - stds.append(torch.std(img)) - - mean = torch.mean(torch.tensor(means)) - std = torch.mean(torch.tensor(stds)) - -Unfortunately, the concrete `subset` that was used is lost. For more -information see `this discussion `_ -or `these experiments `_. - -The sizes of the EfficientNet models depend on the variant. For the exact input sizes -`check here `_ - -ImageNet 1-crop error rates - -================================ ============= ============= -Model Acc@1 Acc@5 -================================ ============= ============= -AlexNet 56.522 79.066 -VGG-11 69.020 88.628 -VGG-13 69.928 89.246 -VGG-16 71.592 90.382 -VGG-19 72.376 90.876 -VGG-11 with batch normalization 70.370 89.810 -VGG-13 with batch normalization 71.586 90.374 -VGG-16 with batch normalization 73.360 91.516 -VGG-19 with batch normalization 74.218 91.842 -ResNet-18 69.758 89.078 -ResNet-34 73.314 91.420 -ResNet-50 76.130 92.862 -ResNet-101 77.374 93.546 -ResNet-152 78.312 94.046 -SqueezeNet 1.0 58.092 80.420 -SqueezeNet 1.1 58.178 80.624 -Densenet-121 74.434 91.972 -Densenet-169 75.600 92.806 -Densenet-201 76.896 93.370 -Densenet-161 77.138 93.560 -Inception v3 77.294 93.450 -GoogleNet 69.778 89.530 -ShuffleNet V2 x1.0 69.362 88.316 -ShuffleNet V2 x0.5 60.552 81.746 -MobileNet V2 71.878 90.286 -MobileNet V3 Large 74.042 91.340 -MobileNet V3 Small 67.668 87.402 -ResNeXt-50-32x4d 77.618 93.698 -ResNeXt-101-32x8d 79.312 94.526 -Wide ResNet-50-2 78.468 94.086 -Wide ResNet-101-2 78.848 94.284 -MNASNet 1.0 73.456 91.510 -MNASNet 0.5 67.734 87.490 -EfficientNet-B0 77.692 93.532 -EfficientNet-B1 78.642 94.186 -EfficientNet-B2 80.608 95.310 -EfficientNet-B3 82.008 96.054 -EfficientNet-B4 83.384 96.594 -EfficientNet-B5 83.444 96.628 -EfficientNet-B6 84.008 96.916 -EfficientNet-B7 84.122 96.908 -EfficientNetV2-s 84.228 96.878 -EfficientNetV2-m 85.112 97.156 -EfficientNetV2-l 85.810 97.792 -regnet_x_400mf 72.834 90.950 -regnet_x_800mf 75.212 92.348 -regnet_x_1_6gf 77.040 93.440 -regnet_x_3_2gf 78.364 93.992 -regnet_x_8gf 79.344 94.686 -regnet_x_16gf 80.058 94.944 -regnet_x_32gf 80.622 95.248 -regnet_y_400mf 74.046 91.716 -regnet_y_800mf 76.420 93.136 -regnet_y_1_6gf 77.950 93.966 -regnet_y_3_2gf 78.948 94.576 -regnet_y_8gf 80.032 95.048 -regnet_y_16gf 80.424 95.240 -regnet_y_32gf 80.878 95.340 -vit_b_16 81.072 95.318 -vit_b_32 75.912 92.466 -vit_l_16 79.662 94.638 -vit_l_32 76.972 93.070 
-vit_h_14 88.552 98.694 -convnext_tiny 82.520 96.146 -convnext_small 83.616 96.650 -convnext_base 84.062 96.870 -convnext_large 84.414 96.976 -================================ ============= ============= - - -.. _AlexNet: https://arxiv.org/abs/1404.5997 -.. _VGG: https://arxiv.org/abs/1409.1556 -.. _ResNet: https://arxiv.org/abs/1512.03385 -.. _SqueezeNet: https://arxiv.org/abs/1602.07360 -.. _DenseNet: https://arxiv.org/abs/1608.06993 -.. _Inception: https://arxiv.org/abs/1512.00567 -.. _GoogLeNet: https://arxiv.org/abs/1409.4842 -.. _ShuffleNet: https://arxiv.org/abs/1807.11164 -.. _MobileNetV2: https://arxiv.org/abs/1801.04381 -.. _MobileNetV3: https://arxiv.org/abs/1905.02244 -.. _ResNeXt: https://arxiv.org/abs/1611.05431 -.. _MNASNet: https://arxiv.org/abs/1807.11626 -.. _EfficientNet: https://arxiv.org/abs/1905.11946 -.. _RegNet: https://arxiv.org/abs/2003.13678 -.. _VisionTransformer: https://arxiv.org/abs/2010.11929 -.. _ConvNeXt: https://arxiv.org/abs/2201.03545 - .. currentmodule:: torchvision.models -Alexnet -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - alexnet - -VGG ---- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - vgg11 - vgg11_bn - vgg13 - vgg13_bn - vgg16 - vgg16_bn - vgg19 - vgg19_bn - - -ResNet ------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - resnet18 - resnet34 - resnet50 - resnet101 - resnet152 - -SqueezeNet ----------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - squeezenet1_0 - squeezenet1_1 - -DenseNet ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - densenet121 - densenet169 - densenet161 - densenet201 - -Inception v3 ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - inception_v3 - -GoogLeNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - googlenet - -ShuffleNet v2 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - shufflenet_v2_x0_5 - shufflenet_v2_x1_0 - shufflenet_v2_x1_5 - shufflenet_v2_x2_0 - -MobileNet v2 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mobilenet_v2 - -MobileNet v3 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mobilenet_v3_large - mobilenet_v3_small - -ResNext -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - resnext50_32x4d - resnext101_32x8d - -Wide ResNet ------------ - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - wide_resnet50_2 - wide_resnet101_2 - -MNASNet --------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mnasnet0_5 - mnasnet0_75 - mnasnet1_0 - mnasnet1_3 - -EfficientNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - efficientnet_b0 - efficientnet_b1 - efficientnet_b2 - efficientnet_b3 - efficientnet_b4 - efficientnet_b5 - efficientnet_b6 - efficientnet_b7 - efficientnet_v2_s - efficientnet_v2_m - efficientnet_v2_l - -RegNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - regnet_y_400mf - regnet_y_800mf - regnet_y_1_6gf - regnet_y_3_2gf - regnet_y_8gf - regnet_y_16gf - regnet_y_32gf - regnet_y_128gf - regnet_x_400mf - regnet_x_800mf - regnet_x_1_6gf - regnet_x_3_2gf - regnet_x_8gf - regnet_x_16gf - regnet_x_32gf - -VisionTransformer ------------------ - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - vit_b_16 - vit_b_32 - vit_l_16 - vit_l_32 - vit_h_14 - -ConvNeXt --------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - convnext_tiny - convnext_small - convnext_base - convnext_large - -Quantized Models ----------------- - -The following architectures provide support for INT8 quantized models. You can get -a model with random weights by calling its constructor: - -.. code:: python - - import torchvision.models as models - googlenet = models.quantization.googlenet() - inception_v3 = models.quantization.inception_v3() - mobilenet_v2 = models.quantization.mobilenet_v2() - mobilenet_v3_large = models.quantization.mobilenet_v3_large() - resnet18 = models.quantization.resnet18() - resnet50 = models.quantization.resnet50() - resnext101_32x8d = models.quantization.resnext101_32x8d() - shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5() - shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0() - -Obtaining a pre-trained quantized model can be done with a few lines of code: - -.. code:: python - - import torchvision.models as models - model = models.quantization.mobilenet_v2(weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1, quantize=True) - model.eval() - # run the model with quantized inputs and weights - out = model(torch.rand(1, 3, 224, 224)) - -We provide pre-trained quantized weights for the following models: - -================================ ============= ============= -Model Acc@1 Acc@5 -================================ ============= ============= -MobileNet V2 71.658 90.150 -MobileNet V3 Large 73.004 90.858 -ShuffleNet V2 x1.0 68.360 87.582 -ShuffleNet V2 x0.5 57.972 79.780 -ResNet 18 69.494 88.882 -ResNet 50 75.920 92.814 -ResNext 101 32x8d 78.986 94.480 -Inception V3 77.176 93.354 -GoogleNet 69.826 89.404 -================================ ============= ============= - - -Semantic Segmentation -===================== - -The models subpackage contains definitions for the following model -architectures for semantic segmentation: +The following classification models are available, with or without pre-trained +weights: -- `FCN ResNet50, ResNet101 `_ -- `DeepLabV3 ResNet50, ResNet101, MobileNetV3-Large `_ -- `LR-ASPP MobileNetV3-Large `_ +.. toctree:: + :maxdepth: 1 -As with image classification models, all pre-trained models expect input images normalized in the same way. -The images have to be loaded in to a range of ``[0, 1]`` and then normalized using -``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. -They have been trained on images resized such that their minimum size is 520. + models/resnet + models/vgg -For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`. -The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are -present in the Pascal VOC dataset. You can see more information on how the subset has been selected in -``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following, -in order: +Table of all available classificaiton weights +--------------------------------------------- - .. code-block:: python +Accuracies are reported on ImageNet - ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', - 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', - 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] +.. 
include:: generated/classification_table.rst -The accuracies of the pre-trained models evaluated on COCO val2017 are as follows - -================================ ============= ==================== -Network mean IoU global pixelwise acc -================================ ============= ==================== -FCN ResNet50 60.5 91.4 -FCN ResNet101 63.7 91.9 -DeepLabV3 ResNet50 66.4 92.4 -DeepLabV3 ResNet101 67.4 92.4 -DeepLabV3 MobileNetV3-Large 60.3 91.2 -LR-ASPP MobileNetV3-Large 57.9 91.2 -================================ ============= ==================== - - -Fully Convolutional Networks ----------------------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.fcn_resnet50 - torchvision.models.segmentation.fcn_resnet101 - - -DeepLabV3 ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.deeplabv3_resnet50 - torchvision.models.segmentation.deeplabv3_resnet101 - torchvision.models.segmentation.deeplabv3_mobilenet_v3_large - - -LR-ASPP -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.lraspp_mobilenet_v3_large - -.. _object_det_inst_seg_pers_keypoint_det: Object Detection, Instance Segmentation and Person Keypoint Detection ===================================================================== -The models subpackage contains definitions for the following model -architectures for detection: - -- `Faster R-CNN `_ -- `FCOS `_ -- `Mask R-CNN `_ -- `RetinaNet `_ -- `SSD `_ -- `SSDlite `_ - -The pre-trained models for detection, instance segmentation and -keypoint detection are initialized with the classification models -in torchvision. - -The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``. -The models internally resize the images but the behaviour varies depending -on the model. Check the constructor of the models for more information. The -output format of such models is illustrated in :ref:`instance_seg_output`. - - -For object detection and instance segmentation, the pre-trained -models return the predictions of the following classes: - - .. code-block:: python - - COCO_INSTANCE_CATEGORY_NAMES = [ - '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', - 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', - 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', - 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', - 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', - 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', - 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', - 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', - 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', - 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', - 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', - 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' - ] - - -Here are the summary of the accuracies for the models trained on -the instances set of COCO train2017 and evaluated on COCO val2017. 
- -====================================== ======= ======== =========== -Network box AP mask AP keypoint AP -====================================== ======= ======== =========== -Faster R-CNN ResNet-50 FPN 37.0 - - -Faster R-CNN MobileNetV3-Large FPN 32.8 - - -Faster R-CNN MobileNetV3-Large 320 FPN 22.8 - - -FCOS ResNet-50 FPN 39.2 - - -RetinaNet ResNet-50 FPN 36.4 - - -SSD300 VGG16 25.1 - - -SSDlite320 MobileNetV3-Large 21.3 - - -Mask R-CNN ResNet-50 FPN 37.9 34.6 - -====================================== ======= ======== =========== - -For person keypoint detection, the accuracies for the pre-trained -models are as follows - -================================ ======= ======== =========== -Network box AP mask AP keypoint AP -================================ ======= ======== =========== -Keypoint R-CNN ResNet-50 FPN 54.6 - 65.0 -================================ ======= ======== =========== - -For person keypoint detection, the pre-trained model return the -keypoints in the following order: - - .. code-block:: python - - COCO_PERSON_KEYPOINT_NAMES = [ - 'nose', - 'left_eye', - 'right_eye', - 'left_ear', - 'right_ear', - 'left_shoulder', - 'right_shoulder', - 'left_elbow', - 'right_elbow', - 'left_wrist', - 'right_wrist', - 'left_hip', - 'right_hip', - 'left_knee', - 'right_knee', - 'left_ankle', - 'right_ankle' - ] - -Runtime characteristics ------------------------ - -The implementations of the models for object detection, instance segmentation -and keypoint detection are efficient. - -In the following table, we use 8 GPUs to report the results. During training, -we use a batch size of 2 per GPU for all models except SSD which uses 4 -and SSDlite which uses 24. During testing a batch size of 1 is used. - -For test time, we report the time for the model evaluation and postprocessing -(including mask pasting in image), but not the time for computing the -precision-recall. - -====================================== =================== ================== =========== -Network train time (s / it) test time (s / it) memory (GB) -====================================== =================== ================== =========== -Faster R-CNN ResNet-50 FPN 0.2288 0.0590 5.2 -Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 -Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 -FCOS ResNet-50 FPN 0.1450 0.0539 3.3 -RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 -SSD300 VGG16 0.2093 0.0744 1.5 -SSDlite320 MobileNetV3-Large 0.1773 0.0906 1.5 -Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 -Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 -====================================== =================== ================== =========== - - -Faster R-CNN ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.fasterrcnn_resnet50_fpn - torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn - torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn - -FCOS ----- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.fcos_resnet50_fpn - - -RetinaNet ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.retinanet_resnet50_fpn - - -SSD ---- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.ssd300_vgg16 - - -SSDlite -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.ssdlite320_mobilenet_v3_large - - -Mask R-CNN ----------- - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.maskrcnn_resnet50_fpn - - -Keypoint R-CNN --------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.keypointrcnn_resnet50_fpn - - -Video classification -==================== - -We provide models for action recognition pre-trained on Kinetics-400. -They have all been trained with the scripts provided in ``references/video_classification``. - -All pre-trained models expect input images normalized in the same way, -i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W), -where H and W are expected to be 112, and T is a number of video frames in a clip. -The images have to be loaded in to a range of [0, 1] and then normalized -using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``. - - -.. note:: - The normalization parameters are different from the image classification ones, and correspond - to the mean and std from Kinetics-400. - -.. note:: - For now, normalization code can be found in ``references/video_classification/transforms.py``, - see the ``Normalize`` function there. Note that it differs from standard normalization for - images because it assumes the video is 4d. - -Kinetics 1-crop accuracies for clip length 16 (16x112x112) - -================================ ============= ============= -Network Clip acc@1 Clip acc@5 -================================ ============= ============= -ResNet 3D 18 52.75 75.45 -ResNet MC 18 53.90 76.29 -ResNet (2+1)D 57.50 78.81 -================================ ============= ============= - - -ResNet 3D ----------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.r3d_18 - -ResNet Mixed Convolution ------------------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.mc3_18 - -ResNet (2+1)D -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.r2plus1d_18 - -Optical flow -============ - -Raft ----- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.optical_flow.raft_large - torchvision.models.optical_flow.raft_small +TODO: Something similar to classification models: list of models + table of weights diff --git a/docs/source/models/resnet.rst b/docs/source/models/resnet.rst new file mode 100644 index 00000000000..8ab79fe885b --- /dev/null +++ b/docs/source/models/resnet.rst @@ -0,0 +1,28 @@ +ResNet +====== + +.. currentmodule:: torchvision.models + +The ResNet model is based on the `Deep Residual Learning for Image Recognition +`_ paper. + + +Model builders +-------------- + +The following model builders can be used to instanciate a ResNet model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.resnet.ResNet`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnet18 + resnet34 + resnet50 + resnet101 + resnet152 diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst new file mode 100644 index 00000000000..068bd330c8b --- /dev/null +++ b/docs/source/models/vgg.rst @@ -0,0 +1,30 @@ +VGG +=== + +.. currentmodule:: torchvision.models + +The VGG model is based on the `Very Deep Convolutional Networks for Large-Scale +Image Recognition `_ paper. 
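A minimal usage sketch for the builders listed below, assuming the ``VGG16_Weights`` enum referenced in the accompanying ``vgg.py`` changes:

.. code:: python

    from torchvision.models import vgg16, VGG16_Weights

    # Random initialization, no download.
    model = vgg16(weights=None)

    # ImageNet pre-trained weights, downloaded on first use.
    model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
    model.eval()  # disable dropout for inference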
+ + +Model builders +-------------- + +The following model builders can be used to instanciate a VGG model, with or +without pre-trained weights. All the model buidlers internally rely on the +``torchvision.models.vgg.VGG`` base class. Please refer to the `source code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vgg11 + vgg11_bn + vgg13 + vgg13_bn + vgg16 + vgg16_bn + vgg19 + vgg19_bn diff --git a/torchvision/models/resnet.py b/torchvision/models/resnet.py index 8f44e553296..3d1a831becf 100644 --- a/torchvision/models/resnet.py +++ b/torchvision/models/resnet.py @@ -556,12 +556,23 @@ class Wide_ResNet101_2_Weights(WeightsEnum): @handle_legacy_interface(weights=("pretrained", ResNet18_Weights.IMAGENET1K_V1)) def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-18 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet18_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet18_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet18_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet18_Weights + :members: """ weights = ResNet18_Weights.verify(weights) @@ -570,12 +581,23 @@ def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet34_Weights.IMAGENET1K_V1)) def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-34 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet34_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet34_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet34_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet34_Weights + :members: """ weights = ResNet34_Weights.verify(weights) @@ -584,12 +606,23 @@ def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1)) def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-50 from `Deep Residual Learning for Image Recognition `__. 
Args: - weights (ResNet50_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet50_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet50_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet50_Weights + :members: """ weights = ResNet50_Weights.verify(weights) @@ -598,12 +631,23 @@ def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1)) def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-101 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet101_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet101_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet101_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet101_Weights + :members: """ weights = ResNet101_Weights.verify(weights) @@ -612,12 +656,23 @@ def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = T @handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1)) def resnet152(*, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-152 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet152_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet152_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet152_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. 
autoclass:: torchvision.models.ResNet152_Weights + :members: """ weights = ResNet152_Weights.verify(weights) diff --git a/torchvision/models/vgg.py b/torchvision/models/vgg.py index c245eef6482..45f2dae5808 100644 --- a/torchvision/models/vgg.py +++ b/torchvision/models/vgg.py @@ -252,13 +252,23 @@ class VGG19_BN_Weights(WeightsEnum): @handle_legacy_interface(weights=("pretrained", VGG11_Weights.IMAGENET1K_V1)) def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 11-layer model (configuration "A") from - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-11 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG11_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG11_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG11_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG11_Weights + :members: """ weights = VGG11_Weights.verify(weights) @@ -267,13 +277,23 @@ def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG11_BN_Weights.IMAGENET1K_V1)) def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 11-layer model (configuration "A") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-11-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG11_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG11_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG11_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG11_BN_Weights + :members: """ weights = VGG11_BN_Weights.verify(weights) @@ -282,13 +302,23 @@ def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG13_Weights.IMAGENET1K_V1)) def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-13 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. 
Args: - weights (VGG13_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG13_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG13_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG13_Weights + :members: """ weights = VGG13_Weights.verify(weights) @@ -297,13 +327,23 @@ def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG13_BN_Weights.IMAGENET1K_V1)) def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-13-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG13_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG13_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG13_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG13_BN_Weights + :members: """ weights = VGG13_BN_Weights.verify(weights) @@ -312,13 +352,23 @@ def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG16_Weights.IMAGENET1K_V1)) def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-16 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG16_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG16_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG16_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. 
autoclass:: torchvision.models.VGG16_Weights + :members: """ weights = VGG16_Weights.verify(weights) @@ -327,13 +377,23 @@ def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG16_BN_Weights.IMAGENET1K_V1)) def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-16-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG16_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG16_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG16_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG16_BN_Weights + :members: """ weights = VGG16_BN_Weights.verify(weights) @@ -342,13 +402,23 @@ def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG19_Weights.IMAGENET1K_V1)) def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration "E") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-19 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG19_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG19_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG19_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG19_Weights + :members: """ weights = VGG19_Weights.verify(weights) @@ -357,13 +427,23 @@ def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG19_BN_Weights.IMAGENET1K_V1)) def vgg19_bn(*, weights: Optional[VGG19_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration 'E') with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-19_BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. 
Args: - weights (VGG19_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG19_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG19_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG19_BN_Weights + :members: """ weights = VGG19_BN_Weights.verify(weights) From 1058c4526022dfc2b12656d53689b50ca61a61a5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 14:47:18 +0100 Subject: [PATCH 2/7] Deactivating fail on warning, temporarily --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..11be1d45fce 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source From 41969bd13c2ba9cacfda4c35b322567614637962 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 15:37:19 +0100 Subject: [PATCH 3/7] Remove commnet --- docs/source/conf.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 8a1428c1908..013596dcd11 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,6 +22,7 @@ import os import textwrap +from pathlib import Path import pytorch_sphinx_theme import torchvision @@ -313,10 +314,8 @@ def inject_weight_metadata(app, what, name, obj, options, lines): lines.append("") -def generate_table(): +def generate_classification_table(): - # TODO: this is ugly af and incorrect. We'll need an automatic way to - # retrieve weight enums for each section, or manually list them. 
weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] @@ -336,8 +335,6 @@ def get_weight_link(w): ] table = tabulate(content, headers=column_names, tablefmt="rst") - from pathlib import Path - generated_dir = Path("generated") generated_dir.mkdir(exist_ok=True) with open(generated_dir / "classification_table.rst", "w+") as table_file: @@ -346,7 +343,7 @@ def get_weight_link(w): table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") -generate_table() +generate_classification_table() def setup(app): From ad2899a2f99e397438b8883b7e9df2d516c8e529 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 15:53:06 +0100 Subject: [PATCH 4/7] Minor changes --- docs/source/conf.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 013596dcd11..df9f4486a8b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -306,8 +306,11 @@ def inject_weight_metadata(app, what, name, obj, options, lines): table = [] for k, v in field.meta.items(): - if k != "categories": - table.append((str(k), str(v))) + if k == "categories": + continue + elif k == "recipe": + v = f"`link <{v}>`__" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. table::", ""] lines += textwrap.indent(table, " " * 4).split("\n") @@ -317,15 +320,12 @@ def inject_weight_metadata(app, what, name, obj, options, lines): def generate_classification_table(): weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] - weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] - - def get_weight_link(w): - return f":class:`{w} <{type(w).__name__}>`" + weights = [w for weight_enum in weight_enums for w in weight_enum] column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") content = [ ( - get_weight_link(w), + f":class:`{w} <{type(w).__name__}>`", w.meta["acc@1"], w.meta["acc@5"], f"{w.meta['num_params']/1e6:.1f}M", From ca4b03ebfa8498222ae71ca50aa4d6ee8e5e9bb3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 11:28:46 +0100 Subject: [PATCH 5/7] Typos --- docs/source/conf.py | 2 +- docs/source/models.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index df9f4486a8b..d0cb718f4fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -319,7 +319,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): def generate_classification_table(): - weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] + weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("_Weights")] weights = [w for weight_enum in weight_enums for w in weight_enum] column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") diff --git a/docs/source/models.rst b/docs/source/models.rst index eb3c059170e..38f55ba574e 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -34,7 +34,7 @@ weights: models/vgg -Table of all available classificaiton weights +Table of all available classification weights --------------------------------------------- Accuracies are reported on ImageNet From 3763a886554c62dcddcf0d88e047f8f1a4651d0b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 11:30:35 +0100 Subject: [PATCH 6/7] Added TODO in Makefile --- docs/Makefile | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/docs/Makefile b/docs/Makefile index 11be1d45fce..c0282d23230 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,6 +6,8 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. +# TODO: Once the models doc revamp is done, set back the -W option to raise +# errors on warnings. See https://github.com/pytorch/vision/pull/5821#discussion_r850500693 SPHINXOPTS = -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision From edfcbe0346c3357b5fc7ecba67cf850fdf638a4e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 13:39:13 +0100 Subject: [PATCH 7/7] Keep old models.rst file intact, move new docs into new models_new.rst file --- docs/source/index.rst | 1 + docs/source/models.rst | 814 ++++++++++++++++++++++++++++++++++++- docs/source/models_new.rst | 54 +++ 3 files changed, 858 insertions(+), 11 deletions(-) create mode 100644 docs/source/models_new.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..06737ae4b60 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,6 +38,7 @@ architectures, and common image transformations for computer vision. ops io feature_extraction + models_new .. toctree:: :maxdepth: 1 diff --git a/docs/source/models.rst b/docs/source/models.rst index 38f55ba574e..f84d9c7fd1a 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -22,27 +22,819 @@ keypoint detection, video classification, and optical flow. Classification ============== +The models subpackage contains definitions for the following model +architectures for image classification: + +- `AlexNet`_ +- `VGG`_ +- `ResNet`_ +- `SqueezeNet`_ +- `DenseNet`_ +- `Inception`_ v3 +- `GoogLeNet`_ +- `ShuffleNet`_ v2 +- `MobileNetV2`_ +- `MobileNetV3`_ +- `ResNeXt`_ +- `Wide ResNet`_ +- `MNASNet`_ +- `EfficientNet`_ v1 & v2 +- `RegNet`_ +- `VisionTransformer`_ +- `ConvNeXt`_ + +You can construct a model with random weights by calling its constructor: + +.. 
code:: python + + import torchvision.models as models + resnet18 = models.resnet18() + alexnet = models.alexnet() + vgg16 = models.vgg16() + squeezenet = models.squeezenet1_0() + densenet = models.densenet161() + inception = models.inception_v3() + googlenet = models.googlenet() + shufflenet = models.shufflenet_v2_x1_0() + mobilenet_v2 = models.mobilenet_v2() + mobilenet_v3_large = models.mobilenet_v3_large() + mobilenet_v3_small = models.mobilenet_v3_small() + resnext50_32x4d = models.resnext50_32x4d() + wide_resnet50_2 = models.wide_resnet50_2() + mnasnet = models.mnasnet1_0() + efficientnet_b0 = models.efficientnet_b0() + efficientnet_b1 = models.efficientnet_b1() + efficientnet_b2 = models.efficientnet_b2() + efficientnet_b3 = models.efficientnet_b3() + efficientnet_b4 = models.efficientnet_b4() + efficientnet_b5 = models.efficientnet_b5() + efficientnet_b6 = models.efficientnet_b6() + efficientnet_b7 = models.efficientnet_b7() + efficientnet_v2_s = models.efficientnet_v2_s() + efficientnet_v2_m = models.efficientnet_v2_m() + efficientnet_v2_l = models.efficientnet_v2_l() + regnet_y_400mf = models.regnet_y_400mf() + regnet_y_800mf = models.regnet_y_800mf() + regnet_y_1_6gf = models.regnet_y_1_6gf() + regnet_y_3_2gf = models.regnet_y_3_2gf() + regnet_y_8gf = models.regnet_y_8gf() + regnet_y_16gf = models.regnet_y_16gf() + regnet_y_32gf = models.regnet_y_32gf() + regnet_y_128gf = models.regnet_y_128gf() + regnet_x_400mf = models.regnet_x_400mf() + regnet_x_800mf = models.regnet_x_800mf() + regnet_x_1_6gf = models.regnet_x_1_6gf() + regnet_x_3_2gf = models.regnet_x_3_2gf() + regnet_x_8gf = models.regnet_x_8gf() + regnet_x_16gf = models.regnet_x_16gf() + regnet_x_32gf = models.regnet_x_32gf() + vit_b_16 = models.vit_b_16() + vit_b_32 = models.vit_b_32() + vit_l_16 = models.vit_l_16() + vit_l_32 = models.vit_l_32() + vit_h_14 = models.vit_h_14() + convnext_tiny = models.convnext_tiny() + convnext_small = models.convnext_small() + convnext_base = models.convnext_base() + convnext_large = models.convnext_large() + +We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`. + +Instancing a pre-trained model will download its weights to a cache directory. +This directory can be set using the `TORCH_HOME` environment variable. See +:func:`torch.hub.load_state_dict_from_url` for details. + +Some models use modules which have different training and evaluation +behavior, such as batch normalization. To switch between these modes, use +``model.train()`` or ``model.eval()`` as appropriate. See +:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details. + +All pre-trained models expect input images normalized in the same way, +i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), +where H and W are expected to be at least 224. +The images have to be loaded in to a range of [0, 1] and then normalized +using ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. 
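As a rough end-to-end sketch of the preprocessing described above (assuming the ``ResNet50_Weights`` enum from this PR and a hypothetical input file; the standalone ``normalize`` transform is also given just below):

.. code:: python

    import torch
    from PIL import Image
    from torchvision import transforms
    from torchvision.models import resnet50, ResNet50_Weights

    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.PILToTensor(),
        transforms.ConvertImageDtype(torch.float),  # scales to [0, 1]
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).eval()
    img = Image.open("dog.jpg")           # hypothetical input image
    batch = preprocess(img).unsqueeze(0)  # shape (1, 3, 224, 224)
    with torch.no_grad():
        logits = model(batch)
    class_id = logits.argmax(dim=1).item()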
+You can use the following transform to normalize:: + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + +An example of such normalization can be found in the imagenet example +`here `_ + +The process for obtaining the values of `mean` and `std` is roughly equivalent +to:: + + import torch + from torchvision import datasets, transforms as T + + transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.PILToTensor(), T.ConvertImageDtype(torch.float)]) + dataset = datasets.ImageNet(".", split="train", transform=transform) + + means = [] + stds = [] + for img in subset(dataset): + means.append(torch.mean(img)) + stds.append(torch.std(img)) + + mean = torch.mean(torch.tensor(means)) + std = torch.mean(torch.tensor(stds)) + +Unfortunately, the concrete `subset` that was used is lost. For more +information see `this discussion `_ +or `these experiments `_. + +The sizes of the EfficientNet models depend on the variant. For the exact input sizes +`check here `_ + +ImageNet 1-crop error rates + +================================ ============= ============= +Model Acc@1 Acc@5 +================================ ============= ============= +AlexNet 56.522 79.066 +VGG-11 69.020 88.628 +VGG-13 69.928 89.246 +VGG-16 71.592 90.382 +VGG-19 72.376 90.876 +VGG-11 with batch normalization 70.370 89.810 +VGG-13 with batch normalization 71.586 90.374 +VGG-16 with batch normalization 73.360 91.516 +VGG-19 with batch normalization 74.218 91.842 +ResNet-18 69.758 89.078 +ResNet-34 73.314 91.420 +ResNet-50 76.130 92.862 +ResNet-101 77.374 93.546 +ResNet-152 78.312 94.046 +SqueezeNet 1.0 58.092 80.420 +SqueezeNet 1.1 58.178 80.624 +Densenet-121 74.434 91.972 +Densenet-169 75.600 92.806 +Densenet-201 76.896 93.370 +Densenet-161 77.138 93.560 +Inception v3 77.294 93.450 +GoogleNet 69.778 89.530 +ShuffleNet V2 x1.0 69.362 88.316 +ShuffleNet V2 x0.5 60.552 81.746 +MobileNet V2 71.878 90.286 +MobileNet V3 Large 74.042 91.340 +MobileNet V3 Small 67.668 87.402 +ResNeXt-50-32x4d 77.618 93.698 +ResNeXt-101-32x8d 79.312 94.526 +Wide ResNet-50-2 78.468 94.086 +Wide ResNet-101-2 78.848 94.284 +MNASNet 1.0 73.456 91.510 +MNASNet 0.5 67.734 87.490 +EfficientNet-B0 77.692 93.532 +EfficientNet-B1 78.642 94.186 +EfficientNet-B2 80.608 95.310 +EfficientNet-B3 82.008 96.054 +EfficientNet-B4 83.384 96.594 +EfficientNet-B5 83.444 96.628 +EfficientNet-B6 84.008 96.916 +EfficientNet-B7 84.122 96.908 +EfficientNetV2-s 84.228 96.878 +EfficientNetV2-m 85.112 97.156 +EfficientNetV2-l 85.810 97.792 +regnet_x_400mf 72.834 90.950 +regnet_x_800mf 75.212 92.348 +regnet_x_1_6gf 77.040 93.440 +regnet_x_3_2gf 78.364 93.992 +regnet_x_8gf 79.344 94.686 +regnet_x_16gf 80.058 94.944 +regnet_x_32gf 80.622 95.248 +regnet_y_400mf 74.046 91.716 +regnet_y_800mf 76.420 93.136 +regnet_y_1_6gf 77.950 93.966 +regnet_y_3_2gf 78.948 94.576 +regnet_y_8gf 80.032 95.048 +regnet_y_16gf 80.424 95.240 +regnet_y_32gf 80.878 95.340 +vit_b_16 81.072 95.318 +vit_b_32 75.912 92.466 +vit_l_16 79.662 94.638 +vit_l_32 76.972 93.070 +vit_h_14 88.552 98.694 +convnext_tiny 82.520 96.146 +convnext_small 83.616 96.650 +convnext_base 84.062 96.870 +convnext_large 84.414 96.976 +================================ ============= ============= + + +.. _AlexNet: https://arxiv.org/abs/1404.5997 +.. _VGG: https://arxiv.org/abs/1409.1556 +.. _ResNet: https://arxiv.org/abs/1512.03385 +.. _SqueezeNet: https://arxiv.org/abs/1602.07360 +.. _DenseNet: https://arxiv.org/abs/1608.06993 +.. _Inception: https://arxiv.org/abs/1512.00567 +.. 
_GoogLeNet: https://arxiv.org/abs/1409.4842 +.. _ShuffleNet: https://arxiv.org/abs/1807.11164 +.. _MobileNetV2: https://arxiv.org/abs/1801.04381 +.. _MobileNetV3: https://arxiv.org/abs/1905.02244 +.. _ResNeXt: https://arxiv.org/abs/1611.05431 +.. _MNASNet: https://arxiv.org/abs/1807.11626 +.. _EfficientNet: https://arxiv.org/abs/1905.11946 +.. _RegNet: https://arxiv.org/abs/2003.13678 +.. _VisionTransformer: https://arxiv.org/abs/2010.11929 +.. _ConvNeXt: https://arxiv.org/abs/2201.03545 + .. currentmodule:: torchvision.models -The following classification models are available, with or without pre-trained -weights: +Alexnet +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + alexnet + +VGG +--- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vgg11 + vgg11_bn + vgg13 + vgg13_bn + vgg16 + vgg16_bn + vgg19 + vgg19_bn + + +ResNet +------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnet18 + resnet34 + resnet50 + resnet101 + resnet152 + +SqueezeNet +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + squeezenet1_0 + squeezenet1_1 + +DenseNet +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + densenet121 + densenet169 + densenet161 + densenet201 + +Inception v3 +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + inception_v3 + +GoogLeNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + googlenet + +ShuffleNet v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + shufflenet_v2_x0_5 + shufflenet_v2_x1_0 + shufflenet_v2_x1_5 + shufflenet_v2_x2_0 + +MobileNet v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mobilenet_v2 + +MobileNet v3 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mobilenet_v3_large + mobilenet_v3_small + +ResNext +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnext50_32x4d + resnext101_32x8d + +Wide ResNet +----------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wide_resnet50_2 + wide_resnet101_2 + +MNASNet +-------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mnasnet0_5 + mnasnet0_75 + mnasnet1_0 + mnasnet1_3 + +EfficientNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + efficientnet_b0 + efficientnet_b1 + efficientnet_b2 + efficientnet_b3 + efficientnet_b4 + efficientnet_b5 + efficientnet_b6 + efficientnet_b7 + efficientnet_v2_s + efficientnet_v2_m + efficientnet_v2_l + +RegNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + regnet_y_400mf + regnet_y_800mf + regnet_y_1_6gf + regnet_y_3_2gf + regnet_y_8gf + regnet_y_16gf + regnet_y_32gf + regnet_y_128gf + regnet_x_400mf + regnet_x_800mf + regnet_x_1_6gf + regnet_x_3_2gf + regnet_x_8gf + regnet_x_16gf + regnet_x_32gf + +VisionTransformer +----------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vit_b_16 + vit_b_32 + vit_l_16 + vit_l_32 + vit_h_14 + +ConvNeXt +-------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + convnext_tiny + convnext_small + convnext_base + convnext_large + +Quantized Models +---------------- + +The following architectures provide support for INT8 quantized models. You can get +a model with random weights by calling its constructor: + +.. 
code:: python
+
+    import torchvision.models as models
+    googlenet = models.quantization.googlenet()
+    inception_v3 = models.quantization.inception_v3()
+    mobilenet_v2 = models.quantization.mobilenet_v2()
+    mobilenet_v3_large = models.quantization.mobilenet_v3_large()
+    resnet18 = models.quantization.resnet18()
+    resnet50 = models.quantization.resnet50()
+    resnext101_32x8d = models.quantization.resnext101_32x8d()
+    shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5()
+    shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0()
+
+Obtaining a pre-trained quantized model can be done with a few lines of code:
+
+.. code:: python
+
+    import torch
+    import torchvision.models as models
+    from torchvision.models.quantization import MobileNet_V2_QuantizedWeights
+
+    model = models.quantization.mobilenet_v2(weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1, quantize=True)
+    model.eval()
+    # run the model with quantized inputs and weights
+    out = model(torch.rand(1, 3, 224, 224))
+
+We provide pre-trained quantized weights for the following models:
+
+================================ ============= =============
+Model                            Acc@1         Acc@5
+================================ ============= =============
+MobileNet V2                     71.658        90.150
+MobileNet V3 Large               73.004        90.858
+ShuffleNet V2 x1.0               68.360        87.582
+ShuffleNet V2 x0.5               57.972        79.780
+ResNet 18                        69.494        88.882
+ResNet 50                        75.920        92.814
+ResNext 101 32x8d                78.986        94.480
+Inception V3                     77.176        93.354
+GoogleNet                        69.826        89.404
+================================ ============= =============
+
+
+Semantic Segmentation
+=====================
+
+The models subpackage contains definitions for the following model
+architectures for semantic segmentation:
-.. toctree::
-   :maxdepth: 1
+- `FCN ResNet50, ResNet101 `_
+- `DeepLabV3 ResNet50, ResNet101, MobileNetV3-Large `_
+- `LR-ASPP MobileNetV3-Large `_
-   models/resnet
-   models/vgg
+As with image classification models, all pre-trained models expect input images normalized in the same way.
+The images have to be loaded into a range of ``[0, 1]`` and then normalized using
+``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
+They have been trained on images resized such that their minimum size is 520.
+For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`.
-Table of all available classification weights
----------------------------------------------
+The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are
+present in the Pascal VOC dataset. You can see more information on how the subset has been selected in
+``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following,
+in order:
-Accuracies are reported on ImageNet
+   .. code-block:: python
-..
include:: generated/classification_table.rst + ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', + 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', + 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] +The accuracies of the pre-trained models evaluated on COCO val2017 are as follows + +================================ ============= ==================== +Network mean IoU global pixelwise acc +================================ ============= ==================== +FCN ResNet50 60.5 91.4 +FCN ResNet101 63.7 91.9 +DeepLabV3 ResNet50 66.4 92.4 +DeepLabV3 ResNet101 67.4 92.4 +DeepLabV3 MobileNetV3-Large 60.3 91.2 +LR-ASPP MobileNetV3-Large 57.9 91.2 +================================ ============= ==================== + + +Fully Convolutional Networks +---------------------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.fcn_resnet50 + torchvision.models.segmentation.fcn_resnet101 + + +DeepLabV3 +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.deeplabv3_resnet50 + torchvision.models.segmentation.deeplabv3_resnet101 + torchvision.models.segmentation.deeplabv3_mobilenet_v3_large + + +LR-ASPP +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.lraspp_mobilenet_v3_large + +.. _object_det_inst_seg_pers_keypoint_det: Object Detection, Instance Segmentation and Person Keypoint Detection ===================================================================== -TODO: Something similar to classification models: list of models + table of weights +The models subpackage contains definitions for the following model +architectures for detection: + +- `Faster R-CNN `_ +- `FCOS `_ +- `Mask R-CNN `_ +- `RetinaNet `_ +- `SSD `_ +- `SSDlite `_ + +The pre-trained models for detection, instance segmentation and +keypoint detection are initialized with the classification models +in torchvision. + +The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``. +The models internally resize the images but the behaviour varies depending +on the model. Check the constructor of the models for more information. The +output format of such models is illustrated in :ref:`instance_seg_output`. + + +For object detection and instance segmentation, the pre-trained +models return the predictions of the following classes: + + .. 
code-block:: python
+
+      COCO_INSTANCE_CATEGORY_NAMES = [
+          '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+          'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
+          'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+          'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
+          'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+          'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+          'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+          'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
+          'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
+          'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
+          'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
+          'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+      ]
+
+
+Here is a summary of the accuracies for the models trained on
+the instances set of COCO train2017 and evaluated on COCO val2017.
+
+====================================== ======= ======== ===========
+Network                                box AP  mask AP  keypoint AP
+====================================== ======= ======== ===========
+Faster R-CNN ResNet-50 FPN             37.0    -        -
+Faster R-CNN MobileNetV3-Large FPN     32.8    -        -
+Faster R-CNN MobileNetV3-Large 320 FPN 22.8    -        -
+FCOS ResNet-50 FPN                     39.2    -        -
+RetinaNet ResNet-50 FPN                36.4    -        -
+SSD300 VGG16                           25.1    -        -
+SSDlite320 MobileNetV3-Large           21.3    -        -
+Mask R-CNN ResNet-50 FPN               37.9    34.6     -
+====================================== ======= ======== ===========
+
+For person keypoint detection, the accuracies for the pre-trained
+models are as follows.
+
+================================ ======= ======== ===========
+Network                          box AP  mask AP  keypoint AP
+================================ ======= ======== ===========
+Keypoint R-CNN ResNet-50 FPN     54.6    -        65.0
+================================ ======= ======== ===========
+
+For person keypoint detection, the pre-trained model returns the
+keypoints in the following order:
+
+   .. code-block:: python
+
+      COCO_PERSON_KEYPOINT_NAMES = [
+          'nose',
+          'left_eye',
+          'right_eye',
+          'left_ear',
+          'right_ear',
+          'left_shoulder',
+          'right_shoulder',
+          'left_elbow',
+          'right_elbow',
+          'left_wrist',
+          'right_wrist',
+          'left_hip',
+          'right_hip',
+          'left_knee',
+          'right_knee',
+          'left_ankle',
+          'right_ankle'
+      ]
+
+Runtime characteristics
+-----------------------
+
+The implementations of the models for object detection, instance segmentation
+and keypoint detection are efficient; the table below reports their training
+and inference speed as well as their memory footprint.
+
+In the following table, we use 8 GPUs to report the results. During training,
+we use a batch size of 2 per GPU for all models except SSD which uses 4
+and SSDlite which uses 24. During testing, a batch size of 1 is used.
+
+For test time, we report the time for the model evaluation and postprocessing
+(including mask pasting in image), but not the time for computing the
+precision-recall.
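As a rough illustration (this is a hypothetical sketch, not the actual benchmarking script used to produce the numbers below), the per-iteration test time of a detection model can be measured along these lines:

.. code:: python

    import time

    import torch
    import torchvision

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    model.eval()

    # Batch size 1, values in [0, 1]; move the model and input to a GPU to get
    # numbers comparable to the table below.
    images = [torch.rand(3, 800, 800)]

    with torch.no_grad():
        model(images)  # warm-up iteration
        start = time.time()
        for _ in range(10):
            model(images)
    print(f"test time: {(time.time() - start) / 10:.4f} s / it")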
+ +====================================== =================== ================== =========== +Network train time (s / it) test time (s / it) memory (GB) +====================================== =================== ================== =========== +Faster R-CNN ResNet-50 FPN 0.2288 0.0590 5.2 +Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 +Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 +FCOS ResNet-50 FPN 0.1450 0.0539 3.3 +RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 +SSD300 VGG16 0.2093 0.0744 1.5 +SSDlite320 MobileNetV3-Large 0.1773 0.0906 1.5 +Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 +Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 +====================================== =================== ================== =========== + + +Faster R-CNN +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.fasterrcnn_resnet50_fpn + torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn + torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn + +FCOS +---- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.fcos_resnet50_fpn + + +RetinaNet +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.retinanet_resnet50_fpn + + +SSD +--- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.ssd300_vgg16 + + +SSDlite +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.ssdlite320_mobilenet_v3_large + + +Mask R-CNN +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.maskrcnn_resnet50_fpn + + +Keypoint R-CNN +-------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.keypointrcnn_resnet50_fpn + + +Video classification +==================== + +We provide models for action recognition pre-trained on Kinetics-400. +They have all been trained with the scripts provided in ``references/video_classification``. + +All pre-trained models expect input images normalized in the same way, +i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W), +where H and W are expected to be 112, and T is a number of video frames in a clip. +The images have to be loaded in to a range of [0, 1] and then normalized +using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``. + + +.. note:: + The normalization parameters are different from the image classification ones, and correspond + to the mean and std from Kinetics-400. + +.. note:: + For now, normalization code can be found in ``references/video_classification/transforms.py``, + see the ``Normalize`` function there. Note that it differs from standard normalization for + images because it assumes the video is 4d. + +Kinetics 1-crop accuracies for clip length 16 (16x112x112) + +================================ ============= ============= +Network Clip acc@1 Clip acc@5 +================================ ============= ============= +ResNet 3D 18 52.75 75.45 +ResNet MC 18 53.90 76.29 +ResNet (2+1)D 57.50 78.81 +================================ ============= ============= + + +ResNet 3D +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.video.r3d_18 + +ResNet Mixed Convolution +------------------------ + +.. 
autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.video.mc3_18
+
+ResNet (2+1)D
+-------------
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.video.r2plus1d_18
+
+Optical flow
+============
+
+Raft
+----
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.optical_flow.raft_large
+   torchvision.models.optical_flow.raft_small
diff --git a/docs/source/models_new.rst b/docs/source/models_new.rst
new file mode 100644
index 00000000000..c644320d9cb
--- /dev/null
+++ b/docs/source/models_new.rst
@@ -0,0 +1,54 @@
+.. _models_new:
+
+Models and pre-trained weights - New
+####################################
+
+.. note::
+
+   These are the new models docs, documenting the new multi-weight API.
+   TODO: Once all is done, remove the "- New" part in the title above, and
+   rename this file as models.rst
+
+
+The ``torchvision.models`` subpackage contains definitions of models for addressing
+different tasks, including: image classification, pixelwise semantic
+segmentation, object detection, instance segmentation, person
+keypoint detection, video classification, and optical flow.
+
+.. note ::
+   Backward compatibility is guaranteed for loading a serialized
+   ``state_dict`` into a model created using an old PyTorch version.
+   In contrast, loading entire saved models or serialized
+   ``ScriptModules`` (serialized using older versions of PyTorch)
+   may not preserve the historic behaviour. Refer to the following
+   `documentation
+   `_
+
+
+Classification
+==============
+
+.. currentmodule:: torchvision.models
+
+The following classification models are available, with or without pre-trained
+weights:
+
+.. toctree::
+   :maxdepth: 1
+
+   models/resnet
+   models/vgg
+
+
+Table of all available classification weights
+---------------------------------------------
+
+Accuracies are reported on ImageNet
+
+.. include:: generated/classification_table.rst
+
+
+Object Detection, Instance Segmentation and Person Keypoint Detection
+=====================================================================
+
+TODO: Something similar to classification models: list of models + table of weights