From 6499cc90413f7854e1cccf6252d48816834b3300 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Wed, 18 May 2022 17:47:56 +0100
Subject: [PATCH 1/3] Remove models.rst

---
 docs/Makefile              |    4 +-
 docs/source/conf.py        |    1 -
 docs/source/index.rst      |    1 -
 docs/source/models.rst     | 1067 ++++++++++++------------------------
 docs/source/models_new.rst |  515 -----------------
 5 files changed, 360 insertions(+), 1228 deletions(-)
 delete mode 100644 docs/source/models_new.rst

diff --git a/docs/Makefile b/docs/Makefile
index c0282d23230..389a07a604e 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -6,9 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),)
 endif
 
 # You can set these variables from the command line.
-# TODO: Once the models doc revamp is done, set back the -W option to raise
-# errors on warnings. See https://github.com/pytorch/vision/pull/5821#discussion_r850500693
-SPHINXOPTS    = -j auto $(EXAMPLES_PATTERN_OPTS)
+SPHINXOPTS    = -W -j auto $(EXAMPLES_PATTERN_OPTS)
 SPHINXBUILD   = sphinx-build
 SPHINXPROJ    = torchvision
 SOURCEDIR     = source
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3e1b5c95a7b..014eb3c3ae9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -347,7 +347,6 @@ def inject_weight_metadata(app, what, name, obj, options, lines):
             metrics = meta.pop("_metrics")
             for dataset, dataset_metrics in metrics.items():
                 for metric_name, metric_value in dataset_metrics.items():
-                    metric_name = metric_name.replace("_", "-")
                     table.append((f"{metric_name} (on {dataset})", str(metric_value)))
 
             for k, v in meta.items():
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 06737ae4b60..79dbebdd047 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,7 +38,6 @@ architectures, and common image transformations for computer vision.
    ops
    io
    feature_extraction
-   models_new
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/models.rst b/docs/source/models.rst
index 91e0c4fa8cb..a972c7bed30 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -1,865 +1,516 @@
 .. _models:
 
-Models and pre-trained weights
-##############################
-
+Models and pre-trained weights - New
+####################################
 
 The ``torchvision.models`` subpackage contains definitions of models for addressing
 different tasks, including: image classification, pixelwise semantic
 segmentation, object detection, instance segmentation, person
 keypoint detection, video classification, and optical flow.
 
-.. note ::
-    Backward compatibility is guaranteed for loading a serialized 
-    ``state_dict`` to the model created using old PyTorch version. 
-    On the contrary, loading entire saved models or serialized 
-    ``ScriptModules`` (seralized using older versions of PyTorch) 
-    may not preserve the historic behaviour. Refer to the following 
-    `documentation 
-    <https://pytorch.org/docs/stable/notes/serialization.html#id6>`_   
+General information on pre-trained weights
+==========================================
 
+TorchVision offers pre-trained weights for every provided architecture, using
+the PyTorch :mod:`torch.hub`. Instancing a pre-trained model will download its
+weights to a cache directory. This directory can be set using the `TORCH_HOME`
+environment variable. See :func:`torch.hub.load_state_dict_from_url` for details.
 
-Classification
-==============
+.. note::
 
-The models subpackage contains definitions for the following model
-architectures for image classification:
-
--  `AlexNet`_
--  `VGG`_
--  `ResNet`_
--  `SqueezeNet`_
--  `DenseNet`_
--  `Inception`_ v3
--  `GoogLeNet`_
--  `ShuffleNet`_ v2
--  `MobileNetV2`_
--  `MobileNetV3`_
--  `ResNeXt`_
--  `Wide ResNet`_
--  `MNASNet`_
--  `EfficientNet`_ v1 & v2
--  `RegNet`_
--  `VisionTransformer`_
--  `ConvNeXt`_
--  `SwinTransformer`_
-
-You can construct a model with random weights by calling its constructor:
+    The pre-trained models provided in this library may have their own licenses or
+    terms and conditions derived from the dataset used for training. It is your
+    responsibility to determine whether you have permission to use the models for
+    your use case.
 
-.. code:: python
+.. note ::
+    Backward compatibility is guaranteed for loading a serialized
+    ``state_dict`` to the model created using old PyTorch version.
+    On the contrary, loading entire saved models or serialized
+    ``ScriptModules`` (serialized using older versions of PyTorch)
+    may not preserve the historic behaviour. Refer to the following
+    `documentation
+    <https://pytorch.org/docs/stable/notes/serialization.html#id6>`_
 
-    import torchvision.models as models
-    resnet18 = models.resnet18()
-    alexnet = models.alexnet()
-    vgg16 = models.vgg16()
-    squeezenet = models.squeezenet1_0()
-    densenet = models.densenet161()
-    inception = models.inception_v3()
-    googlenet = models.googlenet()
-    shufflenet = models.shufflenet_v2_x1_0()
-    mobilenet_v2 = models.mobilenet_v2()
-    mobilenet_v3_large = models.mobilenet_v3_large()
-    mobilenet_v3_small = models.mobilenet_v3_small()
-    resnext50_32x4d = models.resnext50_32x4d()
-    resnext101_32x8d = models.resnext101_32x8d()
-    resnext101_64x4d = models.resnext101_64x4d()
-    wide_resnet50_2 = models.wide_resnet50_2()
-    mnasnet = models.mnasnet1_0()
-    efficientnet_b0 = models.efficientnet_b0()
-    efficientnet_b1 = models.efficientnet_b1()
-    efficientnet_b2 = models.efficientnet_b2()
-    efficientnet_b3 = models.efficientnet_b3()
-    efficientnet_b4 = models.efficientnet_b4()
-    efficientnet_b5 = models.efficientnet_b5()
-    efficientnet_b6 = models.efficientnet_b6()
-    efficientnet_b7 = models.efficientnet_b7()
-    efficientnet_v2_s = models.efficientnet_v2_s()
-    efficientnet_v2_m = models.efficientnet_v2_m()
-    efficientnet_v2_l = models.efficientnet_v2_l()
-    regnet_y_400mf = models.regnet_y_400mf()
-    regnet_y_800mf = models.regnet_y_800mf()
-    regnet_y_1_6gf = models.regnet_y_1_6gf()
-    regnet_y_3_2gf = models.regnet_y_3_2gf()
-    regnet_y_8gf = models.regnet_y_8gf()
-    regnet_y_16gf = models.regnet_y_16gf()
-    regnet_y_32gf = models.regnet_y_32gf()
-    regnet_y_128gf = models.regnet_y_128gf()
-    regnet_x_400mf = models.regnet_x_400mf()
-    regnet_x_800mf = models.regnet_x_800mf()
-    regnet_x_1_6gf = models.regnet_x_1_6gf()
-    regnet_x_3_2gf = models.regnet_x_3_2gf()
-    regnet_x_8gf = models.regnet_x_8gf()
-    regnet_x_16gf = models.regnet_x_16gf()
-    regnet_x_32gf = models.regnet_x_32gf()
-    vit_b_16 = models.vit_b_16()
-    vit_b_32 = models.vit_b_32()
-    vit_l_16 = models.vit_l_16()
-    vit_l_32 = models.vit_l_32()
-    vit_h_14 = models.vit_h_14()
-    convnext_tiny = models.convnext_tiny()
-    convnext_small = models.convnext_small()
-    convnext_base = models.convnext_base()
-    convnext_large = models.convnext_large()
-    swin_t = models.swin_t()
-
-We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
-
-Instancing a pre-trained model will download its weights to a cache directory.
-This directory can be set using the `TORCH_HOME` environment variable. See
-:func:`torch.hub.load_state_dict_from_url` for details.
 
-Some models use modules which have different training and evaluation
-behavior, such as batch normalization. To switch between these modes, use
-``model.train()`` or ``model.eval()`` as appropriate. See
-:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details.
+Initializing pre-trained models
+-------------------------------
 
-All pre-trained models expect input images normalized in the same way,
-i.e. mini-batches of 3-channel RGB images of shape (3 x H x W),
-where H and W are expected to be at least 224.
-The images have to be loaded in to a range of [0, 1] and then normalized
-using ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
-You can use the following transform to normalize::
+As of v0.13, TorchVision offers a new `Multi-weight support API
+<https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/>`_
+for loading different weights to the existing model builder methods:
 
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
+.. code:: python
 
-An example of such normalization can be found in the imagenet example
-`here <https://github.com/pytorch/examples/blob/42e5b996718797e45c46a25c55b031e6768f8440/imagenet/main.py#L89-L101>`_
+    from torchvision.models import resnet50, ResNet50_Weights
 
-The process for obtaining the values of `mean` and `std` is roughly equivalent
-to::
+    # Old weights with accuracy 76.130%
+    resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
 
-    import torch
-    from torchvision import datasets, transforms as T
-
-    transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.PILToTensor(), T.ConvertImageDtype(torch.float)])
-    dataset = datasets.ImageNet(".", split="train", transform=transform)
-
-    means = []
-    stds = []
-    for img in subset(dataset):
-        means.append(torch.mean(img))
-        stds.append(torch.std(img))
-
-    mean = torch.mean(torch.tensor(means))
-    std = torch.mean(torch.tensor(stds))
-
-Unfortunately, the concrete `subset` that was used is lost. For more
-information see `this discussion <https://github.com/pytorch/vision/issues/1439>`_
-or `these experiments <https://github.com/pytorch/vision/pull/1965>`_.
-
-The sizes of the EfficientNet models depend on the variant. For the exact input sizes
-`check here <https://github.com/pytorch/vision/blob/d2bfd639e46e1c5dc3c177f889dc7750c8d137c7/references/classification/train.py#L92-L93>`_
-
-ImageNet 1-crop error rates
-
-================================  =============   =============
-Model                             Acc@1           Acc@5
-================================  =============   =============
-AlexNet                           56.522          79.066
-VGG-11                            69.020          88.628
-VGG-13                            69.928          89.246
-VGG-16                            71.592          90.382
-VGG-19                            72.376          90.876
-VGG-11 with batch normalization   70.370          89.810
-VGG-13 with batch normalization   71.586          90.374
-VGG-16 with batch normalization   73.360          91.516
-VGG-19 with batch normalization   74.218          91.842
-ResNet-18                         69.758          89.078
-ResNet-34                         73.314          91.420
-ResNet-50                         76.130          92.862
-ResNet-101                        77.374          93.546
-ResNet-152                        78.312          94.046
-SqueezeNet 1.0                    58.092          80.420
-SqueezeNet 1.1                    58.178          80.624
-Densenet-121                      74.434          91.972
-Densenet-169                      75.600          92.806
-Densenet-201                      76.896          93.370
-Densenet-161                      77.138          93.560
-Inception v3                      77.294          93.450
-GoogleNet                         69.778          89.530
-ShuffleNet V2 x0.5                60.552          81.746
-ShuffleNet V2 x1.0                69.362          88.316
-ShuffleNet V2 x1.5                72.996          91.086
-ShuffleNet V2 x2.0                76.230          93.006
-MobileNet V2                      71.878          90.286
-MobileNet V3 Large                74.042          91.340
-MobileNet V3 Small                67.668          87.402
-ResNeXt-50-32x4d                  77.618          93.698
-ResNeXt-101-32x8d                 79.312          94.526
-ResNeXt-101-64x4d                 83.246          96.454
-Wide ResNet-50-2                  78.468          94.086
-Wide ResNet-101-2                 78.848          94.284
-MNASNet 1.0                       73.456          91.510
-MNASNet 0.5                       67.734          87.490
-EfficientNet-B0                   77.692          93.532
-EfficientNet-B1                   78.642          94.186
-EfficientNet-B2                   80.608          95.310
-EfficientNet-B3                   82.008          96.054
-EfficientNet-B4                   83.384          96.594
-EfficientNet-B5                   83.444          96.628
-EfficientNet-B6                   84.008          96.916
-EfficientNet-B7                   84.122          96.908
-EfficientNetV2-s                  84.228          96.878
-EfficientNetV2-m                  85.112          97.156
-EfficientNetV2-l                  85.810          97.792
-regnet_x_400mf                    72.834          90.950
-regnet_x_800mf                    75.212          92.348
-regnet_x_1_6gf                    77.040          93.440
-regnet_x_3_2gf                    78.364          93.992
-regnet_x_8gf                      79.344          94.686 
-regnet_x_16gf                     80.058          94.944
-regnet_x_32gf                     80.622          95.248
-regnet_y_400mf                    74.046          91.716
-regnet_y_800mf                    76.420          93.136
-regnet_y_1_6gf                    77.950          93.966
-regnet_y_3_2gf                    78.948          94.576
-regnet_y_8gf                      80.032          95.048
-regnet_y_16gf                     80.424          95.240
-regnet_y_32gf                     80.878          95.340
-vit_b_16                          81.072          95.318
-vit_b_32                          75.912          92.466
-vit_l_16                          79.662          94.638
-vit_l_32                          76.972          93.070
-vit_h_14                          88.552          98.694 
-convnext_tiny                     82.520          96.146
-convnext_small                    83.616          96.650
-convnext_base                     84.062          96.870
-convnext_large                    84.414          96.976
-swin_t                            81.358          95.526
-================================  =============   =============
-
-
-.. _AlexNet: https://arxiv.org/abs/1404.5997
-.. _VGG: https://arxiv.org/abs/1409.1556
-.. _ResNet: https://arxiv.org/abs/1512.03385
-.. _SqueezeNet: https://arxiv.org/abs/1602.07360
-.. _DenseNet: https://arxiv.org/abs/1608.06993
-.. _Inception: https://arxiv.org/abs/1512.00567
-.. _GoogLeNet: https://arxiv.org/abs/1409.4842
-.. _ShuffleNet: https://arxiv.org/abs/1807.11164
-.. _MobileNetV2: https://arxiv.org/abs/1801.04381
-.. _MobileNetV3: https://arxiv.org/abs/1905.02244
-.. _ResNeXt: https://arxiv.org/abs/1611.05431
-.. _MNASNet: https://arxiv.org/abs/1807.11626
-.. _EfficientNet: https://arxiv.org/abs/1905.11946
-.. _RegNet: https://arxiv.org/abs/2003.13678
-.. _VisionTransformer: https://arxiv.org/abs/2010.11929
-.. _ConvNeXt: https://arxiv.org/abs/2201.03545
-.. _SwinTransformer: https://arxiv.org/abs/2103.14030
+    # New weights with accuracy 80.858%
+    resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
 
-.. currentmodule:: torchvision.models
+    # Best available weights (currently alias for IMAGENET1K_V2)
+    # Note that these weights may change across versions
+    resnet50(weights=ResNet50_Weights.DEFAULT)
 
-Alexnet
--------
+    # Strings are also supported
+    resnet50(weights="IMAGENET1K_V2")
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    # No weights - random initialization
+    resnet50(weights=None)
 
-    alexnet
 
-VGG
----
+Migrating to the new API is very straightforward. The following method calls between the 2 APIs are all equivalent:
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+.. code:: python
 
-    vgg11
-    vgg11_bn
-    vgg13
-    vgg13_bn
-    vgg16
-    vgg16_bn
-    vgg19
-    vgg19_bn
+    from torchvision.models import resnet50, ResNet50_Weights
 
+    # Using pretrained weights:
+    resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
+    resnet50(weights="IMAGENET1K_V1")
+    resnet50(pretrained=True)  # deprecated
+    resnet50(True)  # deprecated
 
-ResNet
-------
+    # Using no weights:
+    resnet50(weights=None)
+    resnet50()
+    resnet50(pretrained=False)  # deprecated
+    resnet50(False)  # deprecated
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Note that the ``pretrained`` parameter is now deprecated, using it will emit warnings and will be removed on v0.15.
 
-    resnet18
-    resnet34
-    resnet50
-    resnet101
-    resnet152
+Using the pre-trained models
+----------------------------
 
-SqueezeNet
-----------
+Before using the pre-trained models, one must preprocess the image
+(resize with right resolution/interpolation, apply inference transforms,
+rescale the values etc). There is no standard way to do this as it depends on
+how a given model was trained. It can vary across model families, variants or
+even weight versions. Using the correct preprocessing method is critical and
+failing to do so may lead to decreased accuracy or incorrect outputs.
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+All the necessary information for the inference transforms of each pre-trained
+model is provided on its weights documentation. To simplify inference, TorchVision
+bundles the necessary preprocessing transforms into each model weight. These are
+accessible via the ``weight.transforms`` attribute:
 
-    squeezenet1_0
-    squeezenet1_1
+.. code:: python
 
-DenseNet
----------
+    # Initialize the Weight Transforms
+    weights = ResNet50_Weights.DEFAULT
+    preprocess = weights.transforms()
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    # Apply it to the input image
+    img_transformed = preprocess(img)
 
-    densenet121
-    densenet169
-    densenet161
-    densenet201
 
-Inception v3
-------------
+Some models use modules which have different training and evaluation
+behavior, such as batch normalization. To switch between these modes, use
+``model.train()`` or ``model.eval()`` as appropriate. See
+:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details.
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+.. code:: python
 
-    inception_v3
+    # Initialize model
+    weights = ResNet50_Weights.DEFAULT
+    model = resnet50(weights=weights)
 
-GoogLeNet
-------------
+    # Set model to eval mode
+    model.eval()
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
 
-    googlenet
+Classification
+==============
 
-ShuffleNet v2
--------------
+.. currentmodule:: torchvision.models
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+The following classification models are available, with or without pre-trained
+weights:
+
+.. toctree::
+   :maxdepth: 1
+
+   models/alexnet
+   models/convnext
+   models/densenet
+   models/efficientnet
+   models/efficientnetv2
+   models/googlenet
+   models/inception
+   models/mnasnet
+   models/mobilenetv2
+   models/mobilenetv3
+   models/regnet
+   models/resnet
+   models/resnext
+   models/shufflenetv2
+   models/squeezenet
+   models/swin_transformer
+   models/vgg
+   models/vision_transformer
+   models/wide_resnet
+
+|
+
+Here is an example of how to use the pre-trained image classification models:
 
-    shufflenet_v2_x0_5
-    shufflenet_v2_x1_0
-    shufflenet_v2_x1_5
-    shufflenet_v2_x2_0
+.. code:: python
 
-MobileNet v2
--------------
+    from torchvision.io import read_image
+    from torchvision.models import resnet50, ResNet50_Weights
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
 
-    mobilenet_v2
+    # Step 1: Initialize model with the best available weights
+    weights = ResNet50_Weights.DEFAULT
+    model = resnet50(weights=weights)
+    model.eval()
 
-MobileNet v3
--------------
+    # Step 2: Initialize the inference transforms
+    preprocess = weights.transforms()
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    # Step 3: Apply inference preprocessing transforms
+    batch = preprocess(img).unsqueeze(0)
 
-    mobilenet_v3_large
-    mobilenet_v3_small
-
-ResNext
--------
-
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
-
-    resnext50_32x4d
-    resnext101_32x8d
-    resnext101_64x4d
+    # Step 4: Use the model and print the predicted category
+    prediction = model(batch).squeeze(0).softmax(0)
+    class_id = prediction.argmax().item()
+    score = prediction[class_id].item()
+    category_name = weights.meta["categories"][class_id]
+    print(f"{category_name}: {100 * score:.1f}%")
 
-Wide ResNet
------------
+The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Table of all available classification weights
+---------------------------------------------
 
-    wide_resnet50_2
-    wide_resnet101_2
+Accuracies are reported on ImageNet-1K using single crops:
 
-MNASNet
---------
+.. include:: generated/classification_table.rst
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Quantized models
+----------------
 
-    mnasnet0_5
-    mnasnet0_75
-    mnasnet1_0
-    mnasnet1_3
+.. currentmodule:: torchvision.models.quantization
 
-EfficientNet
-------------
+The following architectures provide support for INT8 quantized models, with or without
+pre-trained weights:
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+.. toctree::
+   :maxdepth: 1
 
-    efficientnet_b0
-    efficientnet_b1
-    efficientnet_b2
-    efficientnet_b3
-    efficientnet_b4
-    efficientnet_b5
-    efficientnet_b6
-    efficientnet_b7
-    efficientnet_v2_s
-    efficientnet_v2_m
-    efficientnet_v2_l
+   models/googlenet_quant
+   models/inception_quant
+   models/mobilenetv2_quant
+   models/mobilenetv3_quant
+   models/resnet_quant
+   models/resnext_quant
+   models/shufflenetv2_quant
 
-RegNet
-------------
-
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
-
-    regnet_y_400mf
-    regnet_y_800mf
-    regnet_y_1_6gf
-    regnet_y_3_2gf
-    regnet_y_8gf
-    regnet_y_16gf
-    regnet_y_32gf
-    regnet_y_128gf
-    regnet_x_400mf
-    regnet_x_800mf
-    regnet_x_1_6gf
-    regnet_x_3_2gf
-    regnet_x_8gf
-    regnet_x_16gf
-    regnet_x_32gf
-
-VisionTransformer
------------------
-
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
-
-    vit_b_16
-    vit_b_32
-    vit_l_16
-    vit_l_32
-    vit_h_14
-
-ConvNeXt
---------
-
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
-
-    convnext_tiny
-    convnext_small
-    convnext_base
-    convnext_large
-
-SwinTransformer
----------------
-
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
-
-    swin_t
-
-Quantized Models
-----------------
+|
 
-The following architectures provide support for INT8 quantized models. You can get
-a model with random weights by calling its constructor:
+Here is an example of how to use the pre-trained quantized image classification models:
 
 .. code:: python
 
-    import torchvision.models as models
-    googlenet = models.quantization.googlenet()
-    inception_v3 = models.quantization.inception_v3()
-    mobilenet_v2 = models.quantization.mobilenet_v2()
-    mobilenet_v3_large = models.quantization.mobilenet_v3_large()
-    resnet18 = models.quantization.resnet18()
-    resnet50 = models.quantization.resnet50()
-    resnext101_32x8d = models.quantization.resnext101_32x8d()
-    resnext101_64x4d = models.quantization.resnext101_64x4d()
-    shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5()
-    shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0()
-    shufflenet_v2_x1_5 = models.quantization.shufflenet_v2_x1_5()
-    shufflenet_v2_x2_0 = models.quantization.shufflenet_v2_x2_0()
-
-Obtaining a pre-trained quantized model can be done with a few lines of code:
+    from torchvision.io import read_image
+    from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights
 
-.. code:: python
+    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
 
-    import torchvision.models as models
-    model = models.quantization.mobilenet_v2(weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1, quantize=True)
+    # Step 1: Initialize model with the best available weights
+    weights = ResNet50_QuantizedWeights.DEFAULT
+    model = resnet50(weights=weights, quantize=True)
     model.eval()
-    # run the model with quantized inputs and weights
-    out = model(torch.rand(1, 3, 224, 224))
-
-We provide pre-trained quantized weights for the following models:
-
-================================  =============  =============
-Model                             Acc@1          Acc@5
-================================  =============  =============
-MobileNet V2                      71.658         90.150
-MobileNet V3 Large                73.004         90.858
-ShuffleNet V2 x0.5                57.972         79.780
-ShuffleNet V2 x1.0                68.360         87.582
-ShuffleNet V2 x1.5                72.052         90.700
-ShuffleNet V2 x2.0                75.354         92.488
-ResNet 18                         69.494         88.882
-ResNet 50                         75.920         92.814
-ResNext 101 32x8d                 78.986         94.480
-ResNext 101 64x4d                 82.898         96.326
-Inception V3                      77.176         93.354
-GoogleNet                         69.826         89.404
-================================  =============  =============
 
+    # Step 2: Initialize the inference transforms
+    preprocess = weights.transforms()
 
-Semantic Segmentation
-=====================
+    # Step 3: Apply inference preprocessing transforms
+    batch = preprocess(img).unsqueeze(0)
 
-The models subpackage contains definitions for the following model
-architectures for semantic segmentation:
+    # Step 4: Use the model and print the predicted category
+    prediction = model(batch).squeeze(0).softmax(0)
+    class_id = prediction.argmax().item()
+    score = prediction[class_id].item()
+    category_name = weights.meta["categories"][class_id]
+    print(f"{category_name}: {100 * score}%")
 
-- `FCN ResNet50, ResNet101 <https://arxiv.org/abs/1411.4038>`_
-- `DeepLabV3 ResNet50, ResNet101, MobileNetV3-Large <https://arxiv.org/abs/1706.05587>`_
-- `LR-ASPP MobileNetV3-Large <https://arxiv.org/abs/1905.02244>`_
+The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
 
-As with image classification models, all pre-trained models expect input images normalized in the same way.
-The images have to be loaded in to a range of ``[0, 1]`` and then normalized using
-``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
-They have been trained on images resized such that their minimum size is 520.
 
-For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`.
+Table of all available quantized classification weights
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are
-present in the Pascal VOC dataset. You can see more information on how the subset has been selected in
-``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following,
-in order:
+Accuracies are reported on ImageNet-1K using single crops:
 
-  .. code-block:: python
+.. include:: generated/classification_quant_table.rst
 
-      ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
-       'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
-       'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
+Semantic Segmentation
+=====================
 
-The accuracies of the pre-trained models evaluated on COCO val2017 are as follows
+.. currentmodule:: torchvision.models.segmentation
 
-================================  =============  ====================
-Network                           mean IoU       global pixelwise acc
-================================  =============  ====================
-FCN ResNet50                      60.5           91.4
-FCN ResNet101                     63.7           91.9
-DeepLabV3 ResNet50                66.4           92.4
-DeepLabV3 ResNet101               67.4           92.4
-DeepLabV3 MobileNetV3-Large       60.3           91.2
-LR-ASPP MobileNetV3-Large         57.9           91.2
-================================  =============  ====================
+The following semantic segmentation models are available, with or without
+pre-trained weights:
 
+.. toctree::
+   :maxdepth: 1
 
-Fully Convolutional Networks
-----------------------------
+   models/deeplabv3
+   models/fcn
+   models/lraspp
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+|
 
-    torchvision.models.segmentation.fcn_resnet50
-    torchvision.models.segmentation.fcn_resnet101
+Here is an example of how to use the pre-trained semantic segmentation models:
 
+.. code:: python
 
-DeepLabV3
----------
+    from torchvision.io.image import read_image
+    from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
+    from torchvision.transforms.functional import to_pil_image
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    img = read_image("gallery/assets/dog1.jpg")
 
-    torchvision.models.segmentation.deeplabv3_resnet50
-    torchvision.models.segmentation.deeplabv3_resnet101
-    torchvision.models.segmentation.deeplabv3_mobilenet_v3_large
+    # Step 1: Initialize model with the best available weights
+    weights = FCN_ResNet50_Weights.DEFAULT
+    model = fcn_resnet50(weights=weights)
+    model.eval()
+
+    # Step 2: Initialize the inference transforms
+    preprocess = weights.transforms()
+
+    # Step 3: Apply inference preprocessing transforms
+    batch = preprocess(img).unsqueeze(0)
+
+    # Step 4: Use the model and visualize the prediction
+    prediction = model(batch)["out"]
+    normalized_masks = prediction.softmax(dim=1)
+    class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
+    mask = normalized_masks[0, class_to_idx["dog"]]
+    to_pil_image(mask).show()
+
+The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
+The output format of the models is illustrated in :ref:`semantic_seg_output`.
 
 
-LR-ASPP
--------
+Table of all available semantic segmentation weights
+----------------------------------------------------
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+All models are evaluated a subset of COCO val2017, on the 20 categories that are present in the Pascal VOC dataset:
+
+.. include:: generated/segmentation_table.rst
 
-    torchvision.models.segmentation.lraspp_mobilenet_v3_large
 
 .. _object_det_inst_seg_pers_keypoint_det:
 
 Object Detection, Instance Segmentation and Person Keypoint Detection
 =====================================================================
 
-The models subpackage contains definitions for the following model
-architectures for detection:
-
-- `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_
-- `FCOS <https://arxiv.org/abs/1904.01355>`_
-- `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_
-- `RetinaNet <https://arxiv.org/abs/1708.02002>`_
-- `SSD <https://arxiv.org/abs/1512.02325>`_
-- `SSDlite <https://arxiv.org/abs/1801.04381>`_
-
 The pre-trained models for detection, instance segmentation and
 keypoint detection are initialized with the classification models
-in torchvision.
-
-The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``.
-The models internally resize the images but the behaviour varies depending
-on the model. Check the constructor of the models for more information. The
-output format of such models is illustrated in :ref:`instance_seg_output`.
-
-
-For object detection and instance segmentation, the pre-trained
-models return the predictions of the following classes:
-
-  .. code-block:: python
-
-      COCO_INSTANCE_CATEGORY_NAMES = [
-          '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
-          'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
-          'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
-          'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
-          'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
-          'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
-          'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
-          'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
-          'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
-          'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
-          'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
-          'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
-      ]
-
-
-Here are the summary of the accuracies for the models trained on
-the instances set of COCO train2017 and evaluated on COCO val2017.
-
-======================================  =======  ========  ===========
-Network                                 box AP   mask AP   keypoint AP
-======================================  =======  ========  ===========
-Faster R-CNN ResNet-50 FPN              37.0     -         -
-Faster R-CNN MobileNetV3-Large FPN      32.8     -         -
-Faster R-CNN MobileNetV3-Large 320 FPN  22.8     -         -
-FCOS ResNet-50 FPN                      39.2     -         -
-RetinaNet ResNet-50 FPN                 36.4     -         -
-SSD300 VGG16                            25.1     -         -
-SSDlite320 MobileNetV3-Large            21.3     -         -
-Mask R-CNN ResNet-50 FPN                37.9     34.6      -
-======================================  =======  ========  ===========
-
-For person keypoint detection, the accuracies for the pre-trained
-models are as follows
-
-================================  =======  ========  ===========
-Network                           box AP   mask AP   keypoint AP
-================================  =======  ========  ===========
-Keypoint R-CNN ResNet-50 FPN      54.6     -         65.0
-================================  =======  ========  ===========
-
-For person keypoint detection, the pre-trained model return the
-keypoints in the following order:
-
-  .. code-block:: python
+in torchvision. The models expect a list of ``Tensor[C, H, W]``.
+Check the constructor of the models for more information.
+
+Object Detection
+----------------
+
+.. currentmodule:: torchvision.models.detection
+
+The following object detection models are available, with or without pre-trained
+weights:
+
+.. toctree::
+   :maxdepth: 1
+
+   models/faster_rcnn
+   models/fcos
+   models/retinanet
+   models/ssd
+   models/ssdlite
+
+|
+
+Here is an example of how to use the pre-trained object detection models:
+
+.. code:: python
 
-    COCO_PERSON_KEYPOINT_NAMES = [
-        'nose',
-        'left_eye',
-        'right_eye',
-        'left_ear',
-        'right_ear',
-        'left_shoulder',
-        'right_shoulder',
-        'left_elbow',
-        'right_elbow',
-        'left_wrist',
-        'right_wrist',
-        'left_hip',
-        'right_hip',
-        'left_knee',
-        'right_knee',
-        'left_ankle',
-        'right_ankle'
-    ]
 
-Runtime characteristics
------------------------
+    from torchvision.io.image import read_image
+    from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
+    from torchvision.utils import draw_bounding_boxes
+    from torchvision.transforms.functional import to_pil_image
 
-The implementations of the models for object detection, instance segmentation
-and keypoint detection are efficient.
+    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
 
-In the following table, we use 8 GPUs to report the results. During training,
-we use a batch size of 2 per GPU for all models except SSD which uses 4
-and SSDlite which uses 24. During testing a batch size  of 1 is used.
+    # Step 1: Initialize model with the best available weights
+    weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
+    model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
+    model.eval()
 
-For test time, we report the time for the model evaluation and postprocessing
-(including mask pasting in image), but not the time for computing the
-precision-recall.
+    # Step 2: Initialize the inference transforms
+    preprocess = weights.transforms()
 
-======================================  ===================  ==================  ===========
-Network                                 train time (s / it)  test time (s / it)  memory (GB)
-======================================  ===================  ==================  ===========
-Faster R-CNN ResNet-50 FPN              0.2288               0.0590              5.2
-Faster R-CNN MobileNetV3-Large FPN      0.1020               0.0415              1.0
-Faster R-CNN MobileNetV3-Large 320 FPN  0.0978               0.0376              0.6
-FCOS ResNet-50 FPN                      0.1450               0.0539              3.3
-RetinaNet ResNet-50 FPN                 0.2514               0.0939              4.1
-SSD300 VGG16                            0.2093               0.0744              1.5
-SSDlite320 MobileNetV3-Large            0.1773               0.0906              1.5
-Mask R-CNN ResNet-50 FPN                0.2728               0.0903              5.4
-Keypoint R-CNN ResNet-50 FPN            0.3789               0.1242              6.8
-======================================  ===================  ==================  ===========
+    # Step 3: Apply inference preprocessing transforms
+    batch = [preprocess(img)]
 
+    # Step 4: Use the model and visualize the prediction
+    prediction = model(batch)[0]
+    labels = [weights.meta["categories"][i] for i in prediction["labels"]]
+    box = draw_bounding_boxes(img, boxes=prediction["boxes"],
+                              labels=labels,
+                              colors="red",
+                              width=4, font_size=30)
+    im = to_pil_image(box.detach())
+    im.show()
 
-Faster R-CNN
-------------
+The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
+For details on how to plot the bounding boxes of the models, you may refer to :ref:`instance_seg_output`.
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Table of all available Object detection weights
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-    torchvision.models.detection.fasterrcnn_resnet50_fpn
-    torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn
-    torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn
+Box MAPs are reported on COCO val2017:
 
-FCOS
-----
+.. include:: generated/detection_table.rst
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
 
-    torchvision.models.detection.fcos_resnet50_fpn
+Instance Segmentation
+---------------------
 
+.. currentmodule:: torchvision.models.detection
 
-RetinaNet
----------
+The following instance segmentation models are available, with or without pre-trained
+weights:
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+.. toctree::
+   :maxdepth: 1
 
-    torchvision.models.detection.retinanet_resnet50_fpn
+   models/mask_rcnn
 
+|
 
-SSD
----
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+For details on how to plot the masks of the models, you may refer to :ref:`instance_seg_output`.
 
-    torchvision.models.detection.ssd300_vgg16
+Table of all available Instance segmentation weights
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+Box and Mask MAPs are reported on COCO val2017:
 
-SSDlite
--------
+.. include:: generated/instance_segmentation_table.rst
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Keypoint Detection
+------------------
 
-    torchvision.models.detection.ssdlite320_mobilenet_v3_large
+.. currentmodule:: torchvision.models.detection
 
+The following person keypoint detection models are available, with or without
+pre-trained weights:
 
-Mask R-CNN
-----------
+.. toctree::
+   :maxdepth: 1
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+   models/keypoint_rcnn
 
-    torchvision.models.detection.maskrcnn_resnet50_fpn
+|
 
+The classes of the pre-trained model outputs can be found at ``weights.meta["keypoint_names"]``.
+For details on how to plot the bounding boxes of the models, you may refer to :ref:`keypoint_output`.
 
-Keypoint R-CNN
---------------
+Table of all available Keypoint detection weights
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+Box and Keypoint MAPs are reported on COCO val2017:
 
-    torchvision.models.detection.keypointrcnn_resnet50_fpn
+.. include:: generated/detection_keypoint_table.rst
 
 
-Video classification
+Video Classification
 ====================
 
-We provide models for action recognition pre-trained on Kinetics-400.
-They have all been trained with the scripts provided in ``references/video_classification``.
+.. currentmodule:: torchvision.models.video
 
-All pre-trained models expect input images normalized in the same way,
-i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W),
-where H and W are expected to be 112, and T is a number of video frames in a clip.
-The images have to be loaded in to a range of [0, 1] and then normalized
-using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``.
+The following video classification models are available, with or without
+pre-trained weights:
 
+.. toctree::
+   :maxdepth: 1
 
-.. note::
-  The normalization parameters are different from the image classification ones, and correspond
-  to the mean and std from Kinetics-400.
+   models/video_resnet
 
-.. note::
-  For now, normalization code can be found in ``references/video_classification/transforms.py``,
-  see the ``Normalize`` function there. Note that it differs from standard normalization for
-  images because it assumes the video is 4d.
+|
 
-Kinetics 1-crop accuracies for clip length 16 (16x112x112)
+Here is an example of how to use the pre-trained video classification models:
 
-================================  =============   =============
-Network                           Clip acc@1      Clip acc@5
-================================  =============   =============
-ResNet 3D 18                      52.75           75.45
-ResNet MC 18                      53.90           76.29
-ResNet (2+1)D                     57.50           78.81
-================================  =============   =============
+.. code:: python
 
 
-ResNet 3D
-----------
+    from torchvision.io.video import read_video
+    from torchvision.models.video import r3d_18, R3D_18_Weights
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    vid, _, _ = read_video("test/assets/videos/v_SoccerJuggling_g23_c01.avi")
+    vid = vid[:32]  # optionally shorten duration
 
-    torchvision.models.video.r3d_18
+    # Step 1: Initialize model with the best available weights
+    weights = R3D_18_Weights.DEFAULT
+    model = r3d_18(weights=weights)
+    model.eval()
 
-ResNet Mixed Convolution
-------------------------
+    # Step 2: Initialize the inference transforms
+    preprocess = weights.transforms()
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    # Step 3: Apply inference preprocessing transforms
+    batch = preprocess(vid).unsqueeze(0)
 
-    torchvision.models.video.mc3_18
+    # Step 4: Use the model and print the predicted category
+    prediction = model(batch).squeeze(0).softmax(0)
+    label = prediction.argmax().item()
+    score = prediction[label].item()
+    category_name = weights.meta["categories"][label]
+    print(f"{category_name}: {100 * score}%")
 
-ResNet (2+1)D
--------------
+The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
 
-    torchvision.models.video.r2plus1d_18
+Table of all available video classification weights
+---------------------------------------------------
 
-Optical flow
+Accuracies are reported on Kinetics-400 using single crops for clip length 16:
+
+.. include:: generated/video_table.rst
+
+Optical Flow
 ============
 
-Raft
-----
+.. currentmodule:: torchvision.models.optical_flow
+
+The following Optical Flow models are available, with or without pre-trained
+
+.. toctree::
+   :maxdepth: 1
+
+   models/raft
+
+Using models from Hub
+=====================
+
+Most pre-trained models can be accessed directly via PyTorch Hub without having TorchVision installed:
+
+.. code:: python
+
+    import torch
+
+    # Option 1: passing weights param as string
+    model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")
 
-.. autosummary::
-    :toctree: generated/
-    :template: function.rst
+    # Option 2: passing weights param as enum
+    weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")
+    model = torch.hub.load("pytorch/vision", "resnet50", weights=weights)
 
-    torchvision.models.optical_flow.raft_large
-    torchvision.models.optical_flow.raft_small
+The only exception to the above are the detection models included on
+:mod:`torchvision.models.detection`. These models require TorchVision
+to be installed because they depend on custom C++ operators.
diff --git a/docs/source/models_new.rst b/docs/source/models_new.rst
deleted file mode 100644
index 8d6306139fe..00000000000
--- a/docs/source/models_new.rst
+++ /dev/null
@@ -1,515 +0,0 @@
-.. _models_new:
-
-Models and pre-trained weights - New
-####################################
-
-The ``torchvision.models`` subpackage contains definitions of models for addressing
-different tasks, including: image classification, pixelwise semantic
-segmentation, object detection, instance segmentation, person
-keypoint detection, video classification, and optical flow.
-
-General information on pre-trained weights
-==========================================
-
-TorchVision offers pre-trained weights for every provided architecture, using
-the PyTorch :mod:`torch.hub`. Instancing a pre-trained model will download its
-weights to a cache directory. This directory can be set using the `TORCH_HOME`
-environment variable. See :func:`torch.hub.load_state_dict_from_url` for details.
-
-.. note::
-
-    The pre-trained models provided in this library may have their own licenses or
-    terms and conditions derived from the dataset used for training. It is your
-    responsibility to determine whether you have permission to use the models for
-    your use case.
-
-.. note ::
-    Backward compatibility is guaranteed for loading a serialized
-    ``state_dict`` to the model created using old PyTorch version.
-    On the contrary, loading entire saved models or serialized
-    ``ScriptModules`` (serialized using older versions of PyTorch)
-    may not preserve the historic behaviour. Refer to the following
-    `documentation
-    <https://pytorch.org/docs/stable/notes/serialization.html#id6>`_
-
-
-Initializing pre-trained models
--------------------------------
-
-As of v0.13, TorchVision offers a new `Multi-weight support API
-<https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/>`_
-for loading different weights to the existing model builder methods:
-
-.. code:: python
-
-    from torchvision.models import resnet50, ResNet50_Weights
-
-    # Old weights with accuracy 76.130%
-    resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
-
-    # New weights with accuracy 80.858%
-    resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
-
-    # Best available weights (currently alias for IMAGENET1K_V2)
-    # Note that these weights may change across versions
-    resnet50(weights=ResNet50_Weights.DEFAULT)
-
-    # Strings are also supported
-    resnet50(weights="IMAGENET1K_V2")
-
-    # No weights - random initialization
-    resnet50(weights=None)
-
-
-Migrating to the new API is very straightforward. The following method calls between the 2 APIs are all equivalent:
-
-.. code:: python
-
-    from torchvision.models import resnet50, ResNet50_Weights
-
-    # Using pretrained weights:
-    resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
-    resnet50(weights="IMAGENET1K_V1")
-    resnet50(pretrained=True)  # deprecated
-    resnet50(True)  # deprecated
-
-    # Using no weights:
-    resnet50(weights=None)
-    resnet50()
-    resnet50(pretrained=False)  # deprecated
-    resnet50(False)  # deprecated
-
-Note that the ``pretrained`` parameter is now deprecated, using it will emit warnings and will be removed on v0.15.
-
-Using the pre-trained models
-----------------------------
-
-Before using the pre-trained models, one must preprocess the image
-(resize with right resolution/interpolation, apply inference transforms,
-rescale the values etc). There is no standard way to do this as it depends on
-how a given model was trained. It can vary across model families, variants or
-even weight versions. Using the correct preprocessing method is critical and
-failing to do so may lead to decreased accuracy or incorrect outputs.
-
-All the necessary information for the inference transforms of each pre-trained
-model is provided on its weights documentation. To simplify inference, TorchVision
-bundles the necessary preprocessing transforms into each model weight. These are
-accessible via the ``weight.transforms`` attribute:
-
-.. code:: python
-
-    # Initialize the Weight Transforms
-    weights = ResNet50_Weights.DEFAULT
-    preprocess = weights.transforms()
-
-    # Apply it to the input image
-    img_transformed = preprocess(img)
-
-
-Some models use modules which have different training and evaluation
-behavior, such as batch normalization. To switch between these modes, use
-``model.train()`` or ``model.eval()`` as appropriate. See
-:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details.
-
-.. code:: python
-
-    # Initialize model
-    weights = ResNet50_Weights.DEFAULT
-    model = resnet50(weights=weights)
-
-    # Set model to eval mode
-    model.eval()
-
-
-Classification
-==============
-
-.. currentmodule:: torchvision.models
-
-The following classification models are available, with or without pre-trained
-weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/alexnet
-   models/convnext
-   models/densenet
-   models/efficientnet
-   models/efficientnetv2
-   models/googlenet
-   models/inception
-   models/mnasnet
-   models/mobilenetv2
-   models/mobilenetv3
-   models/regnet
-   models/resnet
-   models/resnext
-   models/shufflenetv2
-   models/squeezenet
-   models/swin_transformer
-   models/vgg
-   models/vision_transformer
-   models/wide_resnet
-
-|
-
-Here is an example of how to use the pre-trained image classification models:
-
-.. code:: python
-
-    from torchvision.io import read_image
-    from torchvision.models import resnet50, ResNet50_Weights
-
-    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
-
-    # Step 1: Initialize model with the best available weights
-    weights = ResNet50_Weights.DEFAULT
-    model = resnet50(weights=weights)
-    model.eval()
-
-    # Step 2: Initialize the inference transforms
-    preprocess = weights.transforms()
-
-    # Step 3: Apply inference preprocessing transforms
-    batch = preprocess(img).unsqueeze(0)
-
-    # Step 4: Use the model and print the predicted category
-    prediction = model(batch).squeeze(0).softmax(0)
-    class_id = prediction.argmax().item()
-    score = prediction[class_id].item()
-    category_name = weights.meta["categories"][class_id]
-    print(f"{category_name}: {100 * score:.1f}%")
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
-
-Table of all available classification weights
----------------------------------------------
-
-Accuracies are reported on ImageNet-1K using single crops:
-
-.. include:: generated/classification_table.rst
-
-Quantized models
-----------------
-
-.. currentmodule:: torchvision.models.quantization
-
-The following architectures provide support for INT8 quantized models, with or without
-pre-trained weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/googlenet_quant
-   models/inception_quant
-   models/mobilenetv2_quant
-   models/mobilenetv3_quant
-   models/resnet_quant
-   models/resnext_quant
-   models/shufflenetv2_quant
-
-|
-
-Here is an example of how to use the pre-trained quantized image classification models:
-
-.. code:: python
-
-    from torchvision.io import read_image
-    from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights
-
-    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
-
-    # Step 1: Initialize model with the best available weights
-    weights = ResNet50_QuantizedWeights.DEFAULT
-    model = resnet50(weights=weights, quantize=True)
-    model.eval()
-
-    # Step 2: Initialize the inference transforms
-    preprocess = weights.transforms()
-
-    # Step 3: Apply inference preprocessing transforms
-    batch = preprocess(img).unsqueeze(0)
-
-    # Step 4: Use the model and print the predicted category
-    prediction = model(batch).squeeze(0).softmax(0)
-    class_id = prediction.argmax().item()
-    score = prediction[class_id].item()
-    category_name = weights.meta["categories"][class_id]
-    print(f"{category_name}: {100 * score}%")
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
-
-
-Table of all available quantized classification weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Accuracies are reported on ImageNet-1K using single crops:
-
-.. include:: generated/classification_quant_table.rst
-
-Semantic Segmentation
-=====================
-
-.. currentmodule:: torchvision.models.segmentation
-
-The following semantic segmentation models are available, with or without
-pre-trained weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/deeplabv3
-   models/fcn
-   models/lraspp
-
-|
-
-Here is an example of how to use the pre-trained semantic segmentation models:
-
-.. code:: python
-
-    from torchvision.io.image import read_image
-    from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
-    from torchvision.transforms.functional import to_pil_image
-
-    img = read_image("gallery/assets/dog1.jpg")
-
-    # Step 1: Initialize model with the best available weights
-    weights = FCN_ResNet50_Weights.DEFAULT
-    model = fcn_resnet50(weights=weights)
-    model.eval()
-
-    # Step 2: Initialize the inference transforms
-    preprocess = weights.transforms()
-
-    # Step 3: Apply inference preprocessing transforms
-    batch = preprocess(img).unsqueeze(0)
-
-    # Step 4: Use the model and visualize the prediction
-    prediction = model(batch)["out"]
-    normalized_masks = prediction.softmax(dim=1)
-    class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
-    mask = normalized_masks[0, class_to_idx["dog"]]
-    to_pil_image(mask).show()
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
-The output format of the models is illustrated in :ref:`semantic_seg_output`.
-
-
-Table of all available semantic segmentation weights
-----------------------------------------------------
-
-All models are evaluated a subset of COCO val2017, on the 20 categories that are present in the Pascal VOC dataset:
-
-.. include:: generated/segmentation_table.rst
-
-
-
-Object Detection, Instance Segmentation and Person Keypoint Detection
-=====================================================================
-
-The pre-trained models for detection, instance segmentation and
-keypoint detection are initialized with the classification models
-in torchvision. The models expect a list of ``Tensor[C, H, W]``.
-Check the constructor of the models for more information.
-
-Object Detection
-----------------
-
-.. currentmodule:: torchvision.models.detection
-
-The following object detection models are available, with or without pre-trained
-weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/faster_rcnn
-   models/fcos
-   models/retinanet
-   models/ssd
-   models/ssdlite
-
-|
-
-Here is an example of how to use the pre-trained object detection models:
-
-.. code:: python
-
-
-    from torchvision.io.image import read_image
-    from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
-    from torchvision.utils import draw_bounding_boxes
-    from torchvision.transforms.functional import to_pil_image
-
-    img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
-
-    # Step 1: Initialize model with the best available weights
-    weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
-    model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
-    model.eval()
-
-    # Step 2: Initialize the inference transforms
-    preprocess = weights.transforms()
-
-    # Step 3: Apply inference preprocessing transforms
-    batch = [preprocess(img)]
-
-    # Step 4: Use the model and visualize the prediction
-    prediction = model(batch)[0]
-    labels = [weights.meta["categories"][i] for i in prediction["labels"]]
-    box = draw_bounding_boxes(img, boxes=prediction["boxes"],
-                              labels=labels,
-                              colors="red",
-                              width=4, font_size=30)
-    im = to_pil_image(box.detach())
-    im.show()
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
-For details on how to plot the bounding boxes of the models, you may refer to :ref:`instance_seg_output`.
-
-Table of all available Object detection weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Box MAPs are reported on COCO val2017:
-
-.. include:: generated/detection_table.rst
-
-
-Instance Segmentation
----------------------
-
-.. currentmodule:: torchvision.models.detection
-
-The following instance segmentation models are available, with or without pre-trained
-weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/mask_rcnn
-
-|
-
-
-For details on how to plot the masks of the models, you may refer to :ref:`instance_seg_output`.
-
-Table of all available Instance segmentation weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Box and Mask MAPs are reported on COCO val2017:
-
-.. include:: generated/instance_segmentation_table.rst
-
-Keypoint Detection
-------------------
-
-.. currentmodule:: torchvision.models.detection
-
-The following person keypoint detection models are available, with or without
-pre-trained weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/keypoint_rcnn
-
-|
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["keypoint_names"]``.
-For details on how to plot the bounding boxes of the models, you may refer to :ref:`keypoint_output`.
-
-Table of all available Keypoint detection weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Box and Keypoint MAPs are reported on COCO val2017:
-
-.. include:: generated/detection_keypoint_table.rst
-
-
-Video Classification
-====================
-
-.. currentmodule:: torchvision.models.video
-
-The following video classification models are available, with or without
-pre-trained weights:
-
-.. toctree::
-   :maxdepth: 1
-
-   models/video_resnet
-
-|
-
-Here is an example of how to use the pre-trained video classification models:
-
-.. code:: python
-
-
-    from torchvision.io.video import read_video
-    from torchvision.models.video import r3d_18, R3D_18_Weights
-
-    vid, _, _ = read_video("test/assets/videos/v_SoccerJuggling_g23_c01.avi")
-    vid = vid[:32]  # optionally shorten duration
-
-    # Step 1: Initialize model with the best available weights
-    weights = R3D_18_Weights.DEFAULT
-    model = r3d_18(weights=weights)
-    model.eval()
-
-    # Step 2: Initialize the inference transforms
-    preprocess = weights.transforms()
-
-    # Step 3: Apply inference preprocessing transforms
-    batch = preprocess(vid).unsqueeze(0)
-
-    # Step 4: Use the model and print the predicted category
-    prediction = model(batch).squeeze(0).softmax(0)
-    label = prediction.argmax().item()
-    score = prediction[label].item()
-    category_name = weights.meta["categories"][label]
-    print(f"{category_name}: {100 * score}%")
-
-The classes of the pre-trained model outputs can be found at ``weights.meta["categories"]``.
-
-
-Table of all available video classification weights
----------------------------------------------------
-
-Accuracies are reported on Kinetics-400 using single crops for clip length 16:
-
-.. include:: generated/video_table.rst
-
-Optical Flow
-============
-
-.. currentmodule:: torchvision.models.optical_flow
-
-The following Optical Flow models are available, with or without pre-trained
-
-.. toctree::
-   :maxdepth: 1
-
-   models/raft
-
-Using models from Hub
-=====================
-
-Most pre-trained models can be accessed directly via PyTorch Hub without having TorchVision installed:
-
-.. code:: python
-
-    import torch
-
-    # Option 1: passing weights param as string
-    model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")
-
-    # Option 2: passing weights param as enum
-    weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")
-    model = torch.hub.load("pytorch/vision", "resnet50", weights=weights)
-
-The only exception to the above are the detection models included on
-:mod:`torchvision.models.detection`. These models require TorchVision
-to be installed because they depend on custom C++ operators.

From 9a306c1bf0c4771c1db613ee21a815f40e6392f2 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Wed, 18 May 2022 17:50:53 +0100
Subject: [PATCH 2/3] Remove '- New'

---
 docs/source/models.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index a972c7bed30..12b2e174763 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -1,7 +1,7 @@
 .. _models:
 
-Models and pre-trained weights - New
-####################################
+Models and pre-trained weights
+##############################
 
 The ``torchvision.models`` subpackage contains definitions of models for addressing
 different tasks, including: image classification, pixelwise semantic

From db7e206b452814c7132666ea74c551f8c4570f30 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Wed, 18 May 2022 17:51:45 +0100
Subject: [PATCH 3/3] Put back torchhub section where it originally was

---
 docs/source/models.rst | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index 12b2e174763..ea3c57bb62b 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -120,6 +120,25 @@ behavior, such as batch normalization. To switch between these modes, use
     # Set model to eval mode
     model.eval()
 
+Using models from Hub
+---------------------
+
+Most pre-trained models can be accessed directly via PyTorch Hub without having TorchVision installed:
+
+.. code:: python
+
+    import torch
+
+    # Option 1: passing weights param as string
+    model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")
+
+    # Option 2: passing weights param as enum
+    weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")
+    model = torch.hub.load("pytorch/vision", "resnet50", weights=weights)
+
+The only exception to the above are the detection models included on
+:mod:`torchvision.models.detection`. These models require TorchVision
+to be installed because they depend on custom C++ operators.
 
 Classification
 ==============
@@ -494,23 +513,3 @@ The following Optical Flow models are available, with or without pre-trained
    :maxdepth: 1
 
    models/raft
-
-Using models from Hub
-=====================
-
-Most pre-trained models can be accessed directly via PyTorch Hub without having TorchVision installed:
-
-.. code:: python
-
-    import torch
-
-    # Option 1: passing weights param as string
-    model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")
-
-    # Option 2: passing weights param as enum
-    weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")
-    model = torch.hub.load("pytorch/vision", "resnet50", weights=weights)
-
-The only exception to the above are the detection models included on
-:mod:`torchvision.models.detection`. These models require TorchVision
-to be installed because they depend on custom C++ operators.