From 6351698a9aa41b7bab32a9cd6f488210ccdc1299 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 14:23:43 +0100 Subject: [PATCH 1/7] First PR for model doc revamp --- .gitignore | 1 + docs/requirements.txt | 1 + docs/source/conf.py | 59 +++ docs/source/models.rst | 814 +--------------------------------- docs/source/models/resnet.rst | 28 ++ docs/source/models/vgg.rst | 30 ++ torchvision/models/resnet.py | 95 +++- torchvision/models/vgg.py | 160 +++++-- 8 files changed, 325 insertions(+), 863 deletions(-) create mode 100644 docs/source/models/resnet.rst create mode 100644 docs/source/models/vgg.rst diff --git a/.gitignore b/.gitignore index d3ba0e7a8f9..f16b54061e0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ docs/build docs/source/auto_examples/ docs/source/gen_modules/ docs/source/generated/ +docs/source/models/generated/ # pytorch-sphinx-theme gets installed here docs/src diff --git a/docs/requirements.txt b/docs/requirements.txt index d7a05e5e499..91b877a6233 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,6 +3,7 @@ numpy sphinx-copybutton>=0.3.1 sphinx-gallery>=0.9.0 sphinx==3.5.4 +tabulate # This pin is only needed for sphinx<4.0.2. See https://github.com/pytorch/vision/issues/5673 for details Jinja2<3.1.* -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/docs/source/conf.py b/docs/source/conf.py index d09a33c3064..8a1428c1908 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,9 +21,12 @@ # sys.path.insert(0, os.path.abspath('.')) import os +import textwrap import pytorch_sphinx_theme import torchvision +import torchvision.models as M +from tabulate import tabulate # -- General configuration ------------------------------------------------ @@ -292,5 +295,61 @@ def inject_minigalleries(app, what, name, obj, options, lines): lines.append("\n") +def inject_weight_metadata(app, what, name, obj, options, lines): + + if obj.__name__.endswith("_Weights"): + lines[:] = ["The model builder above accepts the following values as the ``weights`` parameter:"] + lines.append("") + for field in obj: + lines += [f"**{str(field)}**:", ""] + + table = [] + for k, v in field.meta.items(): + if k != "categories": + table.append((str(k), str(v))) + table = tabulate(table, tablefmt="rst") + lines += [".. table::", ""] + lines += textwrap.indent(table, " " * 4).split("\n") + lines.append("") + + +def generate_table(): + + # TODO: this is ugly af and incorrect. We'll need an automatic way to + # retrieve weight enums for each section, or manually list them. + weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] + weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] + + def get_weight_link(w): + return f":class:`{w} <{type(w).__name__}>`" + + column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") + content = [ + ( + get_weight_link(w), + w.meta["acc@1"], + w.meta["acc@5"], + f"{w.meta['num_params']/1e6:.1f}M", + f"`link <{w.meta['recipe']}>`__", + ) + for w in weights + ] + table = tabulate(content, headers=column_names, tablefmt="rst") + + from pathlib import Path + + generated_dir = Path("generated") + generated_dir.mkdir(exist_ok=True) + with open(generated_dir / "classification_table.rst", "w+") as table_file: + table_file.write(".. 
table::\n") + table_file.write(" :widths: 100 10 10 20 10\n\n") + table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") + + +generate_table() + + def setup(app): + app.connect("autodoc-process-docstring", inject_minigalleries) + app.connect("autodoc-process-docstring", inject_weight_metadata) diff --git a/docs/source/models.rst b/docs/source/models.rst index f84d9c7fd1a..eb3c059170e 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -22,819 +22,27 @@ keypoint detection, video classification, and optical flow. Classification ============== -The models subpackage contains definitions for the following model -architectures for image classification: - -- `AlexNet`_ -- `VGG`_ -- `ResNet`_ -- `SqueezeNet`_ -- `DenseNet`_ -- `Inception`_ v3 -- `GoogLeNet`_ -- `ShuffleNet`_ v2 -- `MobileNetV2`_ -- `MobileNetV3`_ -- `ResNeXt`_ -- `Wide ResNet`_ -- `MNASNet`_ -- `EfficientNet`_ v1 & v2 -- `RegNet`_ -- `VisionTransformer`_ -- `ConvNeXt`_ - -You can construct a model with random weights by calling its constructor: - -.. code:: python - - import torchvision.models as models - resnet18 = models.resnet18() - alexnet = models.alexnet() - vgg16 = models.vgg16() - squeezenet = models.squeezenet1_0() - densenet = models.densenet161() - inception = models.inception_v3() - googlenet = models.googlenet() - shufflenet = models.shufflenet_v2_x1_0() - mobilenet_v2 = models.mobilenet_v2() - mobilenet_v3_large = models.mobilenet_v3_large() - mobilenet_v3_small = models.mobilenet_v3_small() - resnext50_32x4d = models.resnext50_32x4d() - wide_resnet50_2 = models.wide_resnet50_2() - mnasnet = models.mnasnet1_0() - efficientnet_b0 = models.efficientnet_b0() - efficientnet_b1 = models.efficientnet_b1() - efficientnet_b2 = models.efficientnet_b2() - efficientnet_b3 = models.efficientnet_b3() - efficientnet_b4 = models.efficientnet_b4() - efficientnet_b5 = models.efficientnet_b5() - efficientnet_b6 = models.efficientnet_b6() - efficientnet_b7 = models.efficientnet_b7() - efficientnet_v2_s = models.efficientnet_v2_s() - efficientnet_v2_m = models.efficientnet_v2_m() - efficientnet_v2_l = models.efficientnet_v2_l() - regnet_y_400mf = models.regnet_y_400mf() - regnet_y_800mf = models.regnet_y_800mf() - regnet_y_1_6gf = models.regnet_y_1_6gf() - regnet_y_3_2gf = models.regnet_y_3_2gf() - regnet_y_8gf = models.regnet_y_8gf() - regnet_y_16gf = models.regnet_y_16gf() - regnet_y_32gf = models.regnet_y_32gf() - regnet_y_128gf = models.regnet_y_128gf() - regnet_x_400mf = models.regnet_x_400mf() - regnet_x_800mf = models.regnet_x_800mf() - regnet_x_1_6gf = models.regnet_x_1_6gf() - regnet_x_3_2gf = models.regnet_x_3_2gf() - regnet_x_8gf = models.regnet_x_8gf() - regnet_x_16gf = models.regnet_x_16gf() - regnet_x_32gf = models.regnet_x_32gf() - vit_b_16 = models.vit_b_16() - vit_b_32 = models.vit_b_32() - vit_l_16 = models.vit_l_16() - vit_l_32 = models.vit_l_32() - vit_h_14 = models.vit_h_14() - convnext_tiny = models.convnext_tiny() - convnext_small = models.convnext_small() - convnext_base = models.convnext_base() - convnext_large = models.convnext_large() - -We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`. - -Instancing a pre-trained model will download its weights to a cache directory. -This directory can be set using the `TORCH_HOME` environment variable. See -:func:`torch.hub.load_state_dict_from_url` for details. - -Some models use modules which have different training and evaluation -behavior, such as batch normalization. 
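A minimal sketch of the caching and eval-mode behaviour described above, assuming the ``ResNet18_Weights`` enum introduced elsewhere in this PR and a hypothetical cache path:

.. code:: python

    import os

    # Optional: redirect the weight cache (hypothetical path).
    os.environ["TORCH_HOME"] = "/tmp/torchvision_cache"

    import torchvision.models as models

    # The first call downloads the checkpoint into $TORCH_HOME/hub/checkpoints.
    resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    resnet18.eval()  # inference behaviour for batch norm / dropout layers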
To switch between these modes, use -``model.train()`` or ``model.eval()`` as appropriate. See -:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details. - -All pre-trained models expect input images normalized in the same way, -i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), -where H and W are expected to be at least 224. -The images have to be loaded in to a range of [0, 1] and then normalized -using ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. -You can use the following transform to normalize:: - - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - -An example of such normalization can be found in the imagenet example -`here `_ - -The process for obtaining the values of `mean` and `std` is roughly equivalent -to:: - - import torch - from torchvision import datasets, transforms as T - - transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.PILToTensor(), T.ConvertImageDtype(torch.float)]) - dataset = datasets.ImageNet(".", split="train", transform=transform) - - means = [] - stds = [] - for img in subset(dataset): - means.append(torch.mean(img)) - stds.append(torch.std(img)) - - mean = torch.mean(torch.tensor(means)) - std = torch.mean(torch.tensor(stds)) - -Unfortunately, the concrete `subset` that was used is lost. For more -information see `this discussion `_ -or `these experiments `_. - -The sizes of the EfficientNet models depend on the variant. For the exact input sizes -`check here `_ - -ImageNet 1-crop error rates - -================================ ============= ============= -Model Acc@1 Acc@5 -================================ ============= ============= -AlexNet 56.522 79.066 -VGG-11 69.020 88.628 -VGG-13 69.928 89.246 -VGG-16 71.592 90.382 -VGG-19 72.376 90.876 -VGG-11 with batch normalization 70.370 89.810 -VGG-13 with batch normalization 71.586 90.374 -VGG-16 with batch normalization 73.360 91.516 -VGG-19 with batch normalization 74.218 91.842 -ResNet-18 69.758 89.078 -ResNet-34 73.314 91.420 -ResNet-50 76.130 92.862 -ResNet-101 77.374 93.546 -ResNet-152 78.312 94.046 -SqueezeNet 1.0 58.092 80.420 -SqueezeNet 1.1 58.178 80.624 -Densenet-121 74.434 91.972 -Densenet-169 75.600 92.806 -Densenet-201 76.896 93.370 -Densenet-161 77.138 93.560 -Inception v3 77.294 93.450 -GoogleNet 69.778 89.530 -ShuffleNet V2 x1.0 69.362 88.316 -ShuffleNet V2 x0.5 60.552 81.746 -MobileNet V2 71.878 90.286 -MobileNet V3 Large 74.042 91.340 -MobileNet V3 Small 67.668 87.402 -ResNeXt-50-32x4d 77.618 93.698 -ResNeXt-101-32x8d 79.312 94.526 -Wide ResNet-50-2 78.468 94.086 -Wide ResNet-101-2 78.848 94.284 -MNASNet 1.0 73.456 91.510 -MNASNet 0.5 67.734 87.490 -EfficientNet-B0 77.692 93.532 -EfficientNet-B1 78.642 94.186 -EfficientNet-B2 80.608 95.310 -EfficientNet-B3 82.008 96.054 -EfficientNet-B4 83.384 96.594 -EfficientNet-B5 83.444 96.628 -EfficientNet-B6 84.008 96.916 -EfficientNet-B7 84.122 96.908 -EfficientNetV2-s 84.228 96.878 -EfficientNetV2-m 85.112 97.156 -EfficientNetV2-l 85.810 97.792 -regnet_x_400mf 72.834 90.950 -regnet_x_800mf 75.212 92.348 -regnet_x_1_6gf 77.040 93.440 -regnet_x_3_2gf 78.364 93.992 -regnet_x_8gf 79.344 94.686 -regnet_x_16gf 80.058 94.944 -regnet_x_32gf 80.622 95.248 -regnet_y_400mf 74.046 91.716 -regnet_y_800mf 76.420 93.136 -regnet_y_1_6gf 77.950 93.966 -regnet_y_3_2gf 78.948 94.576 -regnet_y_8gf 80.032 95.048 -regnet_y_16gf 80.424 95.240 -regnet_y_32gf 80.878 95.340 -vit_b_16 81.072 95.318 -vit_b_32 75.912 92.466 -vit_l_16 79.662 94.638 -vit_l_32 76.972 93.070 
-vit_h_14 88.552 98.694 -convnext_tiny 82.520 96.146 -convnext_small 83.616 96.650 -convnext_base 84.062 96.870 -convnext_large 84.414 96.976 -================================ ============= ============= - - -.. _AlexNet: https://arxiv.org/abs/1404.5997 -.. _VGG: https://arxiv.org/abs/1409.1556 -.. _ResNet: https://arxiv.org/abs/1512.03385 -.. _SqueezeNet: https://arxiv.org/abs/1602.07360 -.. _DenseNet: https://arxiv.org/abs/1608.06993 -.. _Inception: https://arxiv.org/abs/1512.00567 -.. _GoogLeNet: https://arxiv.org/abs/1409.4842 -.. _ShuffleNet: https://arxiv.org/abs/1807.11164 -.. _MobileNetV2: https://arxiv.org/abs/1801.04381 -.. _MobileNetV3: https://arxiv.org/abs/1905.02244 -.. _ResNeXt: https://arxiv.org/abs/1611.05431 -.. _MNASNet: https://arxiv.org/abs/1807.11626 -.. _EfficientNet: https://arxiv.org/abs/1905.11946 -.. _RegNet: https://arxiv.org/abs/2003.13678 -.. _VisionTransformer: https://arxiv.org/abs/2010.11929 -.. _ConvNeXt: https://arxiv.org/abs/2201.03545 - .. currentmodule:: torchvision.models -Alexnet -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - alexnet - -VGG ---- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - vgg11 - vgg11_bn - vgg13 - vgg13_bn - vgg16 - vgg16_bn - vgg19 - vgg19_bn - - -ResNet ------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - resnet18 - resnet34 - resnet50 - resnet101 - resnet152 - -SqueezeNet ----------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - squeezenet1_0 - squeezenet1_1 - -DenseNet ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - densenet121 - densenet169 - densenet161 - densenet201 - -Inception v3 ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - inception_v3 - -GoogLeNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - googlenet - -ShuffleNet v2 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - shufflenet_v2_x0_5 - shufflenet_v2_x1_0 - shufflenet_v2_x1_5 - shufflenet_v2_x2_0 - -MobileNet v2 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mobilenet_v2 - -MobileNet v3 -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mobilenet_v3_large - mobilenet_v3_small - -ResNext -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - resnext50_32x4d - resnext101_32x8d - -Wide ResNet ------------ - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - wide_resnet50_2 - wide_resnet101_2 - -MNASNet --------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - mnasnet0_5 - mnasnet0_75 - mnasnet1_0 - mnasnet1_3 - -EfficientNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - efficientnet_b0 - efficientnet_b1 - efficientnet_b2 - efficientnet_b3 - efficientnet_b4 - efficientnet_b5 - efficientnet_b6 - efficientnet_b7 - efficientnet_v2_s - efficientnet_v2_m - efficientnet_v2_l - -RegNet ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - regnet_y_400mf - regnet_y_800mf - regnet_y_1_6gf - regnet_y_3_2gf - regnet_y_8gf - regnet_y_16gf - regnet_y_32gf - regnet_y_128gf - regnet_x_400mf - regnet_x_800mf - regnet_x_1_6gf - regnet_x_3_2gf - regnet_x_8gf - regnet_x_16gf - regnet_x_32gf - -VisionTransformer ------------------ - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - vit_b_16 - vit_b_32 - vit_l_16 - vit_l_32 - vit_h_14 - -ConvNeXt --------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - convnext_tiny - convnext_small - convnext_base - convnext_large - -Quantized Models ----------------- - -The following architectures provide support for INT8 quantized models. You can get -a model with random weights by calling its constructor: - -.. code:: python - - import torchvision.models as models - googlenet = models.quantization.googlenet() - inception_v3 = models.quantization.inception_v3() - mobilenet_v2 = models.quantization.mobilenet_v2() - mobilenet_v3_large = models.quantization.mobilenet_v3_large() - resnet18 = models.quantization.resnet18() - resnet50 = models.quantization.resnet50() - resnext101_32x8d = models.quantization.resnext101_32x8d() - shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5() - shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0() - -Obtaining a pre-trained quantized model can be done with a few lines of code: - -.. code:: python - - import torchvision.models as models - model = models.quantization.mobilenet_v2(weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1, quantize=True) - model.eval() - # run the model with quantized inputs and weights - out = model(torch.rand(1, 3, 224, 224)) - -We provide pre-trained quantized weights for the following models: - -================================ ============= ============= -Model Acc@1 Acc@5 -================================ ============= ============= -MobileNet V2 71.658 90.150 -MobileNet V3 Large 73.004 90.858 -ShuffleNet V2 x1.0 68.360 87.582 -ShuffleNet V2 x0.5 57.972 79.780 -ResNet 18 69.494 88.882 -ResNet 50 75.920 92.814 -ResNext 101 32x8d 78.986 94.480 -Inception V3 77.176 93.354 -GoogleNet 69.826 89.404 -================================ ============= ============= - - -Semantic Segmentation -===================== - -The models subpackage contains definitions for the following model -architectures for semantic segmentation: +The following classification models are available, with or without pre-trained +weights: -- `FCN ResNet50, ResNet101 `_ -- `DeepLabV3 ResNet50, ResNet101, MobileNetV3-Large `_ -- `LR-ASPP MobileNetV3-Large `_ +.. toctree:: + :maxdepth: 1 -As with image classification models, all pre-trained models expect input images normalized in the same way. -The images have to be loaded in to a range of ``[0, 1]`` and then normalized using -``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. -They have been trained on images resized such that their minimum size is 520. + models/resnet + models/vgg -For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`. -The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are -present in the Pascal VOC dataset. You can see more information on how the subset has been selected in -``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following, -in order: +Table of all available classificaiton weights +--------------------------------------------- - .. code-block:: python +Accuracies are reported on ImageNet - ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', - 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', - 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] +.. 
include:: generated/classification_table.rst -The accuracies of the pre-trained models evaluated on COCO val2017 are as follows - -================================ ============= ==================== -Network mean IoU global pixelwise acc -================================ ============= ==================== -FCN ResNet50 60.5 91.4 -FCN ResNet101 63.7 91.9 -DeepLabV3 ResNet50 66.4 92.4 -DeepLabV3 ResNet101 67.4 92.4 -DeepLabV3 MobileNetV3-Large 60.3 91.2 -LR-ASPP MobileNetV3-Large 57.9 91.2 -================================ ============= ==================== - - -Fully Convolutional Networks ----------------------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.fcn_resnet50 - torchvision.models.segmentation.fcn_resnet101 - - -DeepLabV3 ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.deeplabv3_resnet50 - torchvision.models.segmentation.deeplabv3_resnet101 - torchvision.models.segmentation.deeplabv3_mobilenet_v3_large - - -LR-ASPP -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.segmentation.lraspp_mobilenet_v3_large - -.. _object_det_inst_seg_pers_keypoint_det: Object Detection, Instance Segmentation and Person Keypoint Detection ===================================================================== -The models subpackage contains definitions for the following model -architectures for detection: - -- `Faster R-CNN `_ -- `FCOS `_ -- `Mask R-CNN `_ -- `RetinaNet `_ -- `SSD `_ -- `SSDlite `_ - -The pre-trained models for detection, instance segmentation and -keypoint detection are initialized with the classification models -in torchvision. - -The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``. -The models internally resize the images but the behaviour varies depending -on the model. Check the constructor of the models for more information. The -output format of such models is illustrated in :ref:`instance_seg_output`. - - -For object detection and instance segmentation, the pre-trained -models return the predictions of the following classes: - - .. code-block:: python - - COCO_INSTANCE_CATEGORY_NAMES = [ - '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', - 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', - 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', - 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', - 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', - 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', - 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', - 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', - 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', - 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', - 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', - 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' - ] - - -Here are the summary of the accuracies for the models trained on -the instances set of COCO train2017 and evaluated on COCO val2017. 
- -====================================== ======= ======== =========== -Network box AP mask AP keypoint AP -====================================== ======= ======== =========== -Faster R-CNN ResNet-50 FPN 37.0 - - -Faster R-CNN MobileNetV3-Large FPN 32.8 - - -Faster R-CNN MobileNetV3-Large 320 FPN 22.8 - - -FCOS ResNet-50 FPN 39.2 - - -RetinaNet ResNet-50 FPN 36.4 - - -SSD300 VGG16 25.1 - - -SSDlite320 MobileNetV3-Large 21.3 - - -Mask R-CNN ResNet-50 FPN 37.9 34.6 - -====================================== ======= ======== =========== - -For person keypoint detection, the accuracies for the pre-trained -models are as follows - -================================ ======= ======== =========== -Network box AP mask AP keypoint AP -================================ ======= ======== =========== -Keypoint R-CNN ResNet-50 FPN 54.6 - 65.0 -================================ ======= ======== =========== - -For person keypoint detection, the pre-trained model return the -keypoints in the following order: - - .. code-block:: python - - COCO_PERSON_KEYPOINT_NAMES = [ - 'nose', - 'left_eye', - 'right_eye', - 'left_ear', - 'right_ear', - 'left_shoulder', - 'right_shoulder', - 'left_elbow', - 'right_elbow', - 'left_wrist', - 'right_wrist', - 'left_hip', - 'right_hip', - 'left_knee', - 'right_knee', - 'left_ankle', - 'right_ankle' - ] - -Runtime characteristics ------------------------ - -The implementations of the models for object detection, instance segmentation -and keypoint detection are efficient. - -In the following table, we use 8 GPUs to report the results. During training, -we use a batch size of 2 per GPU for all models except SSD which uses 4 -and SSDlite which uses 24. During testing a batch size of 1 is used. - -For test time, we report the time for the model evaluation and postprocessing -(including mask pasting in image), but not the time for computing the -precision-recall. - -====================================== =================== ================== =========== -Network train time (s / it) test time (s / it) memory (GB) -====================================== =================== ================== =========== -Faster R-CNN ResNet-50 FPN 0.2288 0.0590 5.2 -Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 -Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 -FCOS ResNet-50 FPN 0.1450 0.0539 3.3 -RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 -SSD300 VGG16 0.2093 0.0744 1.5 -SSDlite320 MobileNetV3-Large 0.1773 0.0906 1.5 -Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 -Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 -====================================== =================== ================== =========== - - -Faster R-CNN ------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.fasterrcnn_resnet50_fpn - torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn - torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn - -FCOS ----- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.fcos_resnet50_fpn - - -RetinaNet ---------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.retinanet_resnet50_fpn - - -SSD ---- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.ssd300_vgg16 - - -SSDlite -------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.ssdlite320_mobilenet_v3_large - - -Mask R-CNN ----------- - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.maskrcnn_resnet50_fpn - - -Keypoint R-CNN --------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.detection.keypointrcnn_resnet50_fpn - - -Video classification -==================== - -We provide models for action recognition pre-trained on Kinetics-400. -They have all been trained with the scripts provided in ``references/video_classification``. - -All pre-trained models expect input images normalized in the same way, -i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W), -where H and W are expected to be 112, and T is a number of video frames in a clip. -The images have to be loaded in to a range of [0, 1] and then normalized -using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``. - - -.. note:: - The normalization parameters are different from the image classification ones, and correspond - to the mean and std from Kinetics-400. - -.. note:: - For now, normalization code can be found in ``references/video_classification/transforms.py``, - see the ``Normalize`` function there. Note that it differs from standard normalization for - images because it assumes the video is 4d. - -Kinetics 1-crop accuracies for clip length 16 (16x112x112) - -================================ ============= ============= -Network Clip acc@1 Clip acc@5 -================================ ============= ============= -ResNet 3D 18 52.75 75.45 -ResNet MC 18 53.90 76.29 -ResNet (2+1)D 57.50 78.81 -================================ ============= ============= - - -ResNet 3D ----------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.r3d_18 - -ResNet Mixed Convolution ------------------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.mc3_18 - -ResNet (2+1)D -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.video.r2plus1d_18 - -Optical flow -============ - -Raft ----- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - torchvision.models.optical_flow.raft_large - torchvision.models.optical_flow.raft_small +TODO: Something similar to classification models: list of models + table of weights diff --git a/docs/source/models/resnet.rst b/docs/source/models/resnet.rst new file mode 100644 index 00000000000..8ab79fe885b --- /dev/null +++ b/docs/source/models/resnet.rst @@ -0,0 +1,28 @@ +ResNet +====== + +.. currentmodule:: torchvision.models + +The ResNet model is based on the `Deep Residual Learning for Image Recognition +`_ paper. + + +Model builders +-------------- + +The following model builders can be used to instanciate a ResNet model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.resnet.ResNet`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnet18 + resnet34 + resnet50 + resnet101 + resnet152 diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst new file mode 100644 index 00000000000..068bd330c8b --- /dev/null +++ b/docs/source/models/vgg.rst @@ -0,0 +1,30 @@ +VGG +=== + +.. currentmodule:: torchvision.models + +The VGG model is based on the `Very Deep Convolutional Networks for Large-Scale +Image Recognition `_ paper. 
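A minimal usage sketch for the builders listed below, assuming the ``VGG16_Weights`` enum referenced in the accompanying ``vgg.py`` changes:

.. code:: python

    from torchvision.models import vgg16, VGG16_Weights

    # Random initialization, no download.
    model = vgg16(weights=None)

    # ImageNet pre-trained weights, downloaded on first use.
    model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
    model.eval()  # disable dropout for inference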
+ + +Model builders +-------------- + +The following model builders can be used to instanciate a VGG model, with or +without pre-trained weights. All the model buidlers internally rely on the +``torchvision.models.vgg.VGG`` base class. Please refer to the `source code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vgg11 + vgg11_bn + vgg13 + vgg13_bn + vgg16 + vgg16_bn + vgg19 + vgg19_bn diff --git a/torchvision/models/resnet.py b/torchvision/models/resnet.py index 8f44e553296..3d1a831becf 100644 --- a/torchvision/models/resnet.py +++ b/torchvision/models/resnet.py @@ -556,12 +556,23 @@ class Wide_ResNet101_2_Weights(WeightsEnum): @handle_legacy_interface(weights=("pretrained", ResNet18_Weights.IMAGENET1K_V1)) def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-18 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet18_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet18_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet18_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet18_Weights + :members: """ weights = ResNet18_Weights.verify(weights) @@ -570,12 +581,23 @@ def resnet18(*, weights: Optional[ResNet18_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet34_Weights.IMAGENET1K_V1)) def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-34 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet34_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet34_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet34_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet34_Weights + :members: """ weights = ResNet34_Weights.verify(weights) @@ -584,12 +606,23 @@ def resnet34(*, weights: Optional[ResNet34_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet50_Weights.IMAGENET1K_V1)) def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-50 from `Deep Residual Learning for Image Recognition `__. 
Args: - weights (ResNet50_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet50_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet50_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet50_Weights + :members: """ weights = ResNet50_Weights.verify(weights) @@ -598,12 +631,23 @@ def resnet50(*, weights: Optional[ResNet50_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", ResNet101_Weights.IMAGENET1K_V1)) def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-101 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet101_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet101_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet101_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.ResNet101_Weights + :members: """ weights = ResNet101_Weights.verify(weights) @@ -612,12 +656,23 @@ def resnet101(*, weights: Optional[ResNet101_Weights] = None, progress: bool = T @handle_legacy_interface(weights=("pretrained", ResNet152_Weights.IMAGENET1K_V1)) def resnet152(*, weights: Optional[ResNet152_Weights] = None, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_. + """ResNet-152 from `Deep Residual Learning for Image Recognition `__. Args: - weights (ResNet152_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.ResNet152_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.ResNet152_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.resnet.ResNet`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. 
autoclass:: torchvision.models.ResNet152_Weights + :members: """ weights = ResNet152_Weights.verify(weights) diff --git a/torchvision/models/vgg.py b/torchvision/models/vgg.py index c245eef6482..45f2dae5808 100644 --- a/torchvision/models/vgg.py +++ b/torchvision/models/vgg.py @@ -252,13 +252,23 @@ class VGG19_BN_Weights(WeightsEnum): @handle_legacy_interface(weights=("pretrained", VGG11_Weights.IMAGENET1K_V1)) def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 11-layer model (configuration "A") from - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-11 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG11_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG11_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG11_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG11_Weights + :members: """ weights = VGG11_Weights.verify(weights) @@ -267,13 +277,23 @@ def vgg11(*, weights: Optional[VGG11_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG11_BN_Weights.IMAGENET1K_V1)) def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 11-layer model (configuration "A") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-11-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG11_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG11_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG11_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG11_BN_Weights + :members: """ weights = VGG11_BN_Weights.verify(weights) @@ -282,13 +302,23 @@ def vgg11_bn(*, weights: Optional[VGG11_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG13_Weights.IMAGENET1K_V1)) def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-13 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. 
Args: - weights (VGG13_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG13_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG13_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG13_Weights + :members: """ weights = VGG13_Weights.verify(weights) @@ -297,13 +327,23 @@ def vgg13(*, weights: Optional[VGG13_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG13_BN_Weights.IMAGENET1K_V1)) def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-13-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG13_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG13_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG13_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG13_BN_Weights + :members: """ weights = VGG13_BN_Weights.verify(weights) @@ -312,13 +352,23 @@ def vgg13_bn(*, weights: Optional[VGG13_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG16_Weights.IMAGENET1K_V1)) def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-16 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG16_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG16_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG16_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. 
autoclass:: torchvision.models.VGG16_Weights + :members: """ weights = VGG16_Weights.verify(weights) @@ -327,13 +377,23 @@ def vgg16(*, weights: Optional[VGG16_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG16_BN_Weights.IMAGENET1K_V1)) def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-16-BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG16_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG16_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG16_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG16_BN_Weights + :members: """ weights = VGG16_BN_Weights.verify(weights) @@ -342,13 +402,23 @@ def vgg16_bn(*, weights: Optional[VGG16_BN_Weights] = None, progress: bool = Tru @handle_legacy_interface(weights=("pretrained", VGG19_Weights.IMAGENET1K_V1)) def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration "E") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-19 from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. Args: - weights (VGG19_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG19_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG19_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG19_Weights + :members: """ weights = VGG19_Weights.verify(weights) @@ -357,13 +427,23 @@ def vgg19(*, weights: Optional[VGG19_Weights] = None, progress: bool = True, **k @handle_legacy_interface(weights=("pretrained", VGG19_BN_Weights.IMAGENET1K_V1)) def vgg19_bn(*, weights: Optional[VGG19_BN_Weights] = None, progress: bool = True, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration 'E') with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - The required minimum input size of the model is 32x32. + """VGG-19_BN from `Very Deep Convolutional Networks for Large-Scale Image Recognition `__. 
Args: - weights (VGG19_BN_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr + weights (:class:`~torchvision.models.VGG19_BN_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.VGG19_BN_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.vgg.VGG`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. autoclass:: torchvision.models.VGG19_BN_Weights + :members: """ weights = VGG19_BN_Weights.verify(weights) From 1058c4526022dfc2b12656d53689b50ca61a61a5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 14:47:18 +0100 Subject: [PATCH 2/7] Deactivating fail on warning, temporarily --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..11be1d45fce 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source From 41969bd13c2ba9cacfda4c35b322567614637962 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 15:37:19 +0100 Subject: [PATCH 3/7] Remove commnet --- docs/source/conf.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 8a1428c1908..013596dcd11 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,6 +22,7 @@ import os import textwrap +from pathlib import Path import pytorch_sphinx_theme import torchvision @@ -313,10 +314,8 @@ def inject_weight_metadata(app, what, name, obj, options, lines): lines.append("") -def generate_table(): +def generate_classification_table(): - # TODO: this is ugly af and incorrect. We'll need an automatic way to - # retrieve weight enums for each section, or manually list them. 
weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] @@ -336,8 +335,6 @@ def get_weight_link(w): ] table = tabulate(content, headers=column_names, tablefmt="rst") - from pathlib import Path - generated_dir = Path("generated") generated_dir.mkdir(exist_ok=True) with open(generated_dir / "classification_table.rst", "w+") as table_file: @@ -346,7 +343,7 @@ def get_weight_link(w): table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") -generate_table() +generate_classification_table() def setup(app): From ad2899a2f99e397438b8883b7e9df2d516c8e529 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Apr 2022 15:53:06 +0100 Subject: [PATCH 4/7] Minor changes --- docs/source/conf.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 013596dcd11..df9f4486a8b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -306,8 +306,11 @@ def inject_weight_metadata(app, what, name, obj, options, lines): table = [] for k, v in field.meta.items(): - if k != "categories": - table.append((str(k), str(v))) + if k == "categories": + continue + elif k == "recipe": + v = f"`link <{v}>`__" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. table::", ""] lines += textwrap.indent(table, " " * 4).split("\n") @@ -317,15 +320,12 @@ def inject_weight_metadata(app, what, name, obj, options, lines): def generate_classification_table(): weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] - weights = [w for weight_enum in weight_enums for w in weight_enum if "acc@1" in w.meta] - - def get_weight_link(w): - return f":class:`{w} <{type(w).__name__}>`" + weights = [w for weight_enum in weight_enums for w in weight_enum] column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") content = [ ( - get_weight_link(w), + f":class:`{w} <{type(w).__name__}>`", w.meta["acc@1"], w.meta["acc@5"], f"{w.meta['num_params']/1e6:.1f}M", From ca4b03ebfa8498222ae71ca50aa4d6ee8e5e9bb3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 11:28:46 +0100 Subject: [PATCH 5/7] Typos --- docs/source/conf.py | 2 +- docs/source/models.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index df9f4486a8b..d0cb718f4fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -319,7 +319,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): def generate_classification_table(): - weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("Weights")] + weight_enums = [getattr(M, name) for name in dir(M) if name.endswith("_Weights")] weights = [w for weight_enum in weight_enums for w in weight_enum] column_names = ("**Weight**", "**Acc@1**", "**Acc@5**", "**Params**", "**Recipe**") diff --git a/docs/source/models.rst b/docs/source/models.rst index eb3c059170e..38f55ba574e 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -34,7 +34,7 @@ weights: models/vgg -Table of all available classificaiton weights +Table of all available classification weights --------------------------------------------- Accuracies are reported on ImageNet From 3763a886554c62dcddcf0d88e047f8f1a4651d0b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 11:30:35 +0100 Subject: [PATCH 6/7] Added TODO in Makefile --- docs/Makefile | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/docs/Makefile b/docs/Makefile index 11be1d45fce..c0282d23230 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,6 +6,8 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. +# TODO: Once the models doc revamp is done, set back the -W option to raise +# errors on warnings. See https://github.com/pytorch/vision/pull/5821#discussion_r850500693 SPHINXOPTS = -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision From edfcbe0346c3357b5fc7ecba67cf850fdf638a4e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 19 Apr 2022 13:39:13 +0100 Subject: [PATCH 7/7] Keep old models.rst file intact, move new docs into new models_new.rst file --- docs/source/index.rst | 1 + docs/source/models.rst | 814 ++++++++++++++++++++++++++++++++++++- docs/source/models_new.rst | 54 +++ 3 files changed, 858 insertions(+), 11 deletions(-) create mode 100644 docs/source/models_new.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..06737ae4b60 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,6 +38,7 @@ architectures, and common image transformations for computer vision. ops io feature_extraction + models_new .. toctree:: :maxdepth: 1 diff --git a/docs/source/models.rst b/docs/source/models.rst index 38f55ba574e..f84d9c7fd1a 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -22,27 +22,819 @@ keypoint detection, video classification, and optical flow. Classification ============== +The models subpackage contains definitions for the following model +architectures for image classification: + +- `AlexNet`_ +- `VGG`_ +- `ResNet`_ +- `SqueezeNet`_ +- `DenseNet`_ +- `Inception`_ v3 +- `GoogLeNet`_ +- `ShuffleNet`_ v2 +- `MobileNetV2`_ +- `MobileNetV3`_ +- `ResNeXt`_ +- `Wide ResNet`_ +- `MNASNet`_ +- `EfficientNet`_ v1 & v2 +- `RegNet`_ +- `VisionTransformer`_ +- `ConvNeXt`_ + +You can construct a model with random weights by calling its constructor: + +.. 
code:: python + + import torchvision.models as models + resnet18 = models.resnet18() + alexnet = models.alexnet() + vgg16 = models.vgg16() + squeezenet = models.squeezenet1_0() + densenet = models.densenet161() + inception = models.inception_v3() + googlenet = models.googlenet() + shufflenet = models.shufflenet_v2_x1_0() + mobilenet_v2 = models.mobilenet_v2() + mobilenet_v3_large = models.mobilenet_v3_large() + mobilenet_v3_small = models.mobilenet_v3_small() + resnext50_32x4d = models.resnext50_32x4d() + wide_resnet50_2 = models.wide_resnet50_2() + mnasnet = models.mnasnet1_0() + efficientnet_b0 = models.efficientnet_b0() + efficientnet_b1 = models.efficientnet_b1() + efficientnet_b2 = models.efficientnet_b2() + efficientnet_b3 = models.efficientnet_b3() + efficientnet_b4 = models.efficientnet_b4() + efficientnet_b5 = models.efficientnet_b5() + efficientnet_b6 = models.efficientnet_b6() + efficientnet_b7 = models.efficientnet_b7() + efficientnet_v2_s = models.efficientnet_v2_s() + efficientnet_v2_m = models.efficientnet_v2_m() + efficientnet_v2_l = models.efficientnet_v2_l() + regnet_y_400mf = models.regnet_y_400mf() + regnet_y_800mf = models.regnet_y_800mf() + regnet_y_1_6gf = models.regnet_y_1_6gf() + regnet_y_3_2gf = models.regnet_y_3_2gf() + regnet_y_8gf = models.regnet_y_8gf() + regnet_y_16gf = models.regnet_y_16gf() + regnet_y_32gf = models.regnet_y_32gf() + regnet_y_128gf = models.regnet_y_128gf() + regnet_x_400mf = models.regnet_x_400mf() + regnet_x_800mf = models.regnet_x_800mf() + regnet_x_1_6gf = models.regnet_x_1_6gf() + regnet_x_3_2gf = models.regnet_x_3_2gf() + regnet_x_8gf = models.regnet_x_8gf() + regnet_x_16gf = models.regnet_x_16gf() + regnet_x_32gf = models.regnet_x_32gf() + vit_b_16 = models.vit_b_16() + vit_b_32 = models.vit_b_32() + vit_l_16 = models.vit_l_16() + vit_l_32 = models.vit_l_32() + vit_h_14 = models.vit_h_14() + convnext_tiny = models.convnext_tiny() + convnext_small = models.convnext_small() + convnext_base = models.convnext_base() + convnext_large = models.convnext_large() + +We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`. + +Instancing a pre-trained model will download its weights to a cache directory. +This directory can be set using the `TORCH_HOME` environment variable. See +:func:`torch.hub.load_state_dict_from_url` for details. + +Some models use modules which have different training and evaluation +behavior, such as batch normalization. To switch between these modes, use +``model.train()`` or ``model.eval()`` as appropriate. See +:meth:`~torch.nn.Module.train` or :meth:`~torch.nn.Module.eval` for details. + +All pre-trained models expect input images normalized in the same way, +i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), +where H and W are expected to be at least 224. +The images have to be loaded in to a range of [0, 1] and then normalized +using ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``. 
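As a rough end-to-end sketch of the preprocessing described above (assuming the ``ResNet50_Weights`` enum from this PR and a hypothetical input file; the standalone ``normalize`` transform is also given just below):

.. code:: python

    import torch
    from PIL import Image
    from torchvision import transforms
    from torchvision.models import resnet50, ResNet50_Weights

    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.PILToTensor(),
        transforms.ConvertImageDtype(torch.float),  # scales to [0, 1]
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).eval()
    img = Image.open("dog.jpg")           # hypothetical input image
    batch = preprocess(img).unsqueeze(0)  # shape (1, 3, 224, 224)
    with torch.no_grad():
        logits = model(batch)
    class_id = logits.argmax(dim=1).item()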
+You can use the following transform to normalize:: + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + +An example of such normalization can be found in the imagenet example +`here `_ + +The process for obtaining the values of `mean` and `std` is roughly equivalent +to:: + + import torch + from torchvision import datasets, transforms as T + + transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.PILToTensor(), T.ConvertImageDtype(torch.float)]) + dataset = datasets.ImageNet(".", split="train", transform=transform) + + means = [] + stds = [] + for img in subset(dataset): + means.append(torch.mean(img)) + stds.append(torch.std(img)) + + mean = torch.mean(torch.tensor(means)) + std = torch.mean(torch.tensor(stds)) + +Unfortunately, the concrete `subset` that was used is lost. For more +information see `this discussion `_ +or `these experiments `_. + +The sizes of the EfficientNet models depend on the variant. For the exact input sizes +`check here `_ + +ImageNet 1-crop error rates + +================================ ============= ============= +Model Acc@1 Acc@5 +================================ ============= ============= +AlexNet 56.522 79.066 +VGG-11 69.020 88.628 +VGG-13 69.928 89.246 +VGG-16 71.592 90.382 +VGG-19 72.376 90.876 +VGG-11 with batch normalization 70.370 89.810 +VGG-13 with batch normalization 71.586 90.374 +VGG-16 with batch normalization 73.360 91.516 +VGG-19 with batch normalization 74.218 91.842 +ResNet-18 69.758 89.078 +ResNet-34 73.314 91.420 +ResNet-50 76.130 92.862 +ResNet-101 77.374 93.546 +ResNet-152 78.312 94.046 +SqueezeNet 1.0 58.092 80.420 +SqueezeNet 1.1 58.178 80.624 +Densenet-121 74.434 91.972 +Densenet-169 75.600 92.806 +Densenet-201 76.896 93.370 +Densenet-161 77.138 93.560 +Inception v3 77.294 93.450 +GoogleNet 69.778 89.530 +ShuffleNet V2 x1.0 69.362 88.316 +ShuffleNet V2 x0.5 60.552 81.746 +MobileNet V2 71.878 90.286 +MobileNet V3 Large 74.042 91.340 +MobileNet V3 Small 67.668 87.402 +ResNeXt-50-32x4d 77.618 93.698 +ResNeXt-101-32x8d 79.312 94.526 +Wide ResNet-50-2 78.468 94.086 +Wide ResNet-101-2 78.848 94.284 +MNASNet 1.0 73.456 91.510 +MNASNet 0.5 67.734 87.490 +EfficientNet-B0 77.692 93.532 +EfficientNet-B1 78.642 94.186 +EfficientNet-B2 80.608 95.310 +EfficientNet-B3 82.008 96.054 +EfficientNet-B4 83.384 96.594 +EfficientNet-B5 83.444 96.628 +EfficientNet-B6 84.008 96.916 +EfficientNet-B7 84.122 96.908 +EfficientNetV2-s 84.228 96.878 +EfficientNetV2-m 85.112 97.156 +EfficientNetV2-l 85.810 97.792 +regnet_x_400mf 72.834 90.950 +regnet_x_800mf 75.212 92.348 +regnet_x_1_6gf 77.040 93.440 +regnet_x_3_2gf 78.364 93.992 +regnet_x_8gf 79.344 94.686 +regnet_x_16gf 80.058 94.944 +regnet_x_32gf 80.622 95.248 +regnet_y_400mf 74.046 91.716 +regnet_y_800mf 76.420 93.136 +regnet_y_1_6gf 77.950 93.966 +regnet_y_3_2gf 78.948 94.576 +regnet_y_8gf 80.032 95.048 +regnet_y_16gf 80.424 95.240 +regnet_y_32gf 80.878 95.340 +vit_b_16 81.072 95.318 +vit_b_32 75.912 92.466 +vit_l_16 79.662 94.638 +vit_l_32 76.972 93.070 +vit_h_14 88.552 98.694 +convnext_tiny 82.520 96.146 +convnext_small 83.616 96.650 +convnext_base 84.062 96.870 +convnext_large 84.414 96.976 +================================ ============= ============= + + +.. _AlexNet: https://arxiv.org/abs/1404.5997 +.. _VGG: https://arxiv.org/abs/1409.1556 +.. _ResNet: https://arxiv.org/abs/1512.03385 +.. _SqueezeNet: https://arxiv.org/abs/1602.07360 +.. _DenseNet: https://arxiv.org/abs/1608.06993 +.. _Inception: https://arxiv.org/abs/1512.00567 +.. 
_GoogLeNet: https://arxiv.org/abs/1409.4842 +.. _ShuffleNet: https://arxiv.org/abs/1807.11164 +.. _MobileNetV2: https://arxiv.org/abs/1801.04381 +.. _MobileNetV3: https://arxiv.org/abs/1905.02244 +.. _ResNeXt: https://arxiv.org/abs/1611.05431 +.. _MNASNet: https://arxiv.org/abs/1807.11626 +.. _EfficientNet: https://arxiv.org/abs/1905.11946 +.. _RegNet: https://arxiv.org/abs/2003.13678 +.. _VisionTransformer: https://arxiv.org/abs/2010.11929 +.. _ConvNeXt: https://arxiv.org/abs/2201.03545 + .. currentmodule:: torchvision.models -The following classification models are available, with or without pre-trained -weights: +Alexnet +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + alexnet + +VGG +--- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vgg11 + vgg11_bn + vgg13 + vgg13_bn + vgg16 + vgg16_bn + vgg19 + vgg19_bn + + +ResNet +------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnet18 + resnet34 + resnet50 + resnet101 + resnet152 + +SqueezeNet +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + squeezenet1_0 + squeezenet1_1 + +DenseNet +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + densenet121 + densenet169 + densenet161 + densenet201 + +Inception v3 +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + inception_v3 + +GoogLeNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + googlenet + +ShuffleNet v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + shufflenet_v2_x0_5 + shufflenet_v2_x1_0 + shufflenet_v2_x1_5 + shufflenet_v2_x2_0 + +MobileNet v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mobilenet_v2 + +MobileNet v3 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mobilenet_v3_large + mobilenet_v3_small + +ResNext +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + resnext50_32x4d + resnext101_32x8d + +Wide ResNet +----------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wide_resnet50_2 + wide_resnet101_2 + +MNASNet +-------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + mnasnet0_5 + mnasnet0_75 + mnasnet1_0 + mnasnet1_3 + +EfficientNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + efficientnet_b0 + efficientnet_b1 + efficientnet_b2 + efficientnet_b3 + efficientnet_b4 + efficientnet_b5 + efficientnet_b6 + efficientnet_b7 + efficientnet_v2_s + efficientnet_v2_m + efficientnet_v2_l + +RegNet +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + regnet_y_400mf + regnet_y_800mf + regnet_y_1_6gf + regnet_y_3_2gf + regnet_y_8gf + regnet_y_16gf + regnet_y_32gf + regnet_y_128gf + regnet_x_400mf + regnet_x_800mf + regnet_x_1_6gf + regnet_x_3_2gf + regnet_x_8gf + regnet_x_16gf + regnet_x_32gf + +VisionTransformer +----------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + vit_b_16 + vit_b_32 + vit_l_16 + vit_l_32 + vit_h_14 + +ConvNeXt +-------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + convnext_tiny + convnext_small + convnext_base + convnext_large + +Quantized Models +---------------- + +The following architectures provide support for INT8 quantized models. You can get +a model with random weights by calling its constructor: + +.. 
code:: python
+
+    import torchvision.models as models
+    googlenet = models.quantization.googlenet()
+    inception_v3 = models.quantization.inception_v3()
+    mobilenet_v2 = models.quantization.mobilenet_v2()
+    mobilenet_v3_large = models.quantization.mobilenet_v3_large()
+    resnet18 = models.quantization.resnet18()
+    resnet50 = models.quantization.resnet50()
+    resnext101_32x8d = models.quantization.resnext101_32x8d()
+    shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5()
+    shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0()
+
+Obtaining a pre-trained quantized model can be done with a few lines of code:
+
+.. code:: python
+
+    import torch
+    import torchvision.models as models
+    from torchvision.models.quantization import MobileNet_V2_QuantizedWeights
+
+    model = models.quantization.mobilenet_v2(weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1, quantize=True)
+    model.eval()
+    # run the model with quantized inputs and weights
+    out = model(torch.rand(1, 3, 224, 224))
+
+We provide pre-trained quantized weights for the following models:
+
+================================ ============= =============
+Model                            Acc@1         Acc@5
+================================ ============= =============
+MobileNet V2                     71.658        90.150
+MobileNet V3 Large               73.004        90.858
+ShuffleNet V2 x1.0               68.360        87.582
+ShuffleNet V2 x0.5               57.972        79.780
+ResNet 18                        69.494        88.882
+ResNet 50                        75.920        92.814
+ResNext 101 32x8d                78.986        94.480
+Inception V3                     77.176        93.354
+GoogleNet                        69.826        89.404
+================================ ============= =============
+
+
+Semantic Segmentation
+=====================
+
+The models subpackage contains definitions for the following model
+architectures for semantic segmentation:
-.. toctree::
-   :maxdepth: 1
+- `FCN ResNet50, ResNet101 `_
+- `DeepLabV3 ResNet50, ResNet101, MobileNetV3-Large `_
+- `LR-ASPP MobileNetV3-Large `_
-   models/resnet
-   models/vgg
+As with image classification models, all pre-trained models expect input images normalized in the same way.
+The images have to be loaded into a range of ``[0, 1]`` and then normalized using
+``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
+They have been trained on images resized such that their minimum size is 520.
+For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`.
-Table of all available classification weights
----------------------------------------------
+The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are
+present in the Pascal VOC dataset. You can see more information on how the subset has been selected in
+``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following,
+in order:
-Accuracies are reported on ImageNet
+   .. code-block:: python
-..
include:: generated/classification_table.rst + ['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', + 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', + 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] +The accuracies of the pre-trained models evaluated on COCO val2017 are as follows + +================================ ============= ==================== +Network mean IoU global pixelwise acc +================================ ============= ==================== +FCN ResNet50 60.5 91.4 +FCN ResNet101 63.7 91.9 +DeepLabV3 ResNet50 66.4 92.4 +DeepLabV3 ResNet101 67.4 92.4 +DeepLabV3 MobileNetV3-Large 60.3 91.2 +LR-ASPP MobileNetV3-Large 57.9 91.2 +================================ ============= ==================== + + +Fully Convolutional Networks +---------------------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.fcn_resnet50 + torchvision.models.segmentation.fcn_resnet101 + + +DeepLabV3 +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.deeplabv3_resnet50 + torchvision.models.segmentation.deeplabv3_resnet101 + torchvision.models.segmentation.deeplabv3_mobilenet_v3_large + + +LR-ASPP +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.segmentation.lraspp_mobilenet_v3_large + +.. _object_det_inst_seg_pers_keypoint_det: Object Detection, Instance Segmentation and Person Keypoint Detection ===================================================================== -TODO: Something similar to classification models: list of models + table of weights +The models subpackage contains definitions for the following model +architectures for detection: + +- `Faster R-CNN `_ +- `FCOS `_ +- `Mask R-CNN `_ +- `RetinaNet `_ +- `SSD `_ +- `SSDlite `_ + +The pre-trained models for detection, instance segmentation and +keypoint detection are initialized with the classification models +in torchvision. + +The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``. +The models internally resize the images but the behaviour varies depending +on the model. Check the constructor of the models for more information. The +output format of such models is illustrated in :ref:`instance_seg_output`. + + +For object detection and instance segmentation, the pre-trained +models return the predictions of the following classes: + + .. 
code-block:: python
+
+      COCO_INSTANCE_CATEGORY_NAMES = [
+          '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+          'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
+          'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+          'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
+          'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+          'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+          'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+          'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
+          'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
+          'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
+          'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
+          'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+      ]
+
+
+Here is a summary of the accuracies for the models trained on
+the instances set of COCO train2017 and evaluated on COCO val2017.
+
+====================================== ======= ======== ===========
+Network                                box AP  mask AP  keypoint AP
+====================================== ======= ======== ===========
+Faster R-CNN ResNet-50 FPN             37.0    -        -
+Faster R-CNN MobileNetV3-Large FPN     32.8    -        -
+Faster R-CNN MobileNetV3-Large 320 FPN 22.8    -        -
+FCOS ResNet-50 FPN                     39.2    -        -
+RetinaNet ResNet-50 FPN                36.4    -        -
+SSD300 VGG16                           25.1    -        -
+SSDlite320 MobileNetV3-Large           21.3    -        -
+Mask R-CNN ResNet-50 FPN               37.9    34.6     -
+====================================== ======= ======== ===========
+
+For person keypoint detection, the accuracies for the pre-trained
+models are as follows.
+
+================================ ======= ======== ===========
+Network                          box AP  mask AP  keypoint AP
+================================ ======= ======== ===========
+Keypoint R-CNN ResNet-50 FPN     54.6    -        65.0
+================================ ======= ======== ===========
+
+For person keypoint detection, the pre-trained model returns the
+keypoints in the following order:
+
+   .. code-block:: python
+
+      COCO_PERSON_KEYPOINT_NAMES = [
+          'nose',
+          'left_eye',
+          'right_eye',
+          'left_ear',
+          'right_ear',
+          'left_shoulder',
+          'right_shoulder',
+          'left_elbow',
+          'right_elbow',
+          'left_wrist',
+          'right_wrist',
+          'left_hip',
+          'right_hip',
+          'left_knee',
+          'right_knee',
+          'left_ankle',
+          'right_ankle'
+      ]
+
+Runtime characteristics
+-----------------------
+
+The implementations of the models for object detection, instance segmentation
+and keypoint detection are efficient; the table below reports their training
+and inference speed as well as their memory footprint.
+
+In the following table, we use 8 GPUs to report the results. During training,
+we use a batch size of 2 per GPU for all models except SSD which uses 4
+and SSDlite which uses 24. During testing, a batch size of 1 is used.
+
+For test time, we report the time for the model evaluation and postprocessing
+(including mask pasting in image), but not the time for computing the
+precision-recall.
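As a rough illustration (this is a hypothetical sketch, not the actual benchmarking script used to produce the numbers below), the per-iteration test time of a detection model can be measured along these lines:

.. code:: python

    import time

    import torch
    import torchvision

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    model.eval()

    # Batch size 1, values in [0, 1]; move the model and input to a GPU to get
    # numbers comparable to the table below.
    images = [torch.rand(3, 800, 800)]

    with torch.no_grad():
        model(images)  # warm-up iteration
        start = time.time()
        for _ in range(10):
            model(images)
    print(f"test time: {(time.time() - start) / 10:.4f} s / it")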
+ +====================================== =================== ================== =========== +Network train time (s / it) test time (s / it) memory (GB) +====================================== =================== ================== =========== +Faster R-CNN ResNet-50 FPN 0.2288 0.0590 5.2 +Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 1.0 +Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 +FCOS ResNet-50 FPN 0.1450 0.0539 3.3 +RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 +SSD300 VGG16 0.2093 0.0744 1.5 +SSDlite320 MobileNetV3-Large 0.1773 0.0906 1.5 +Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 +Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 +====================================== =================== ================== =========== + + +Faster R-CNN +------------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.fasterrcnn_resnet50_fpn + torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn + torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn + +FCOS +---- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.fcos_resnet50_fpn + + +RetinaNet +--------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.retinanet_resnet50_fpn + + +SSD +--- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.ssd300_vgg16 + + +SSDlite +------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.ssdlite320_mobilenet_v3_large + + +Mask R-CNN +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.maskrcnn_resnet50_fpn + + +Keypoint R-CNN +-------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.detection.keypointrcnn_resnet50_fpn + + +Video classification +==================== + +We provide models for action recognition pre-trained on Kinetics-400. +They have all been trained with the scripts provided in ``references/video_classification``. + +All pre-trained models expect input images normalized in the same way, +i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W), +where H and W are expected to be 112, and T is a number of video frames in a clip. +The images have to be loaded in to a range of [0, 1] and then normalized +using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``. + + +.. note:: + The normalization parameters are different from the image classification ones, and correspond + to the mean and std from Kinetics-400. + +.. note:: + For now, normalization code can be found in ``references/video_classification/transforms.py``, + see the ``Normalize`` function there. Note that it differs from standard normalization for + images because it assumes the video is 4d. + +Kinetics 1-crop accuracies for clip length 16 (16x112x112) + +================================ ============= ============= +Network Clip acc@1 Clip acc@5 +================================ ============= ============= +ResNet 3D 18 52.75 75.45 +ResNet MC 18 53.90 76.29 +ResNet (2+1)D 57.50 78.81 +================================ ============= ============= + + +ResNet 3D +---------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + torchvision.models.video.r3d_18 + +ResNet Mixed Convolution +------------------------ + +.. 
autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.video.mc3_18
+
+ResNet (2+1)D
+-------------
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.video.r2plus1d_18
+
+Optical flow
+============
+
+Raft
+----
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   torchvision.models.optical_flow.raft_large
+   torchvision.models.optical_flow.raft_small
diff --git a/docs/source/models_new.rst b/docs/source/models_new.rst
new file mode 100644
index 00000000000..c644320d9cb
--- /dev/null
+++ b/docs/source/models_new.rst
@@ -0,0 +1,54 @@
+.. _models_new:
+
+Models and pre-trained weights - New
+####################################
+
+.. note::
+
+   These are the new models docs, documenting the new multi-weight API.
+   TODO: Once all is done, remove the "- New" part in the title above, and
+   rename this file as models.rst
+
+
+The ``torchvision.models`` subpackage contains definitions of models for addressing
+different tasks, including: image classification, pixelwise semantic
+segmentation, object detection, instance segmentation, person
+keypoint detection, video classification, and optical flow.
+
+.. note ::
+   Backward compatibility is guaranteed for loading a serialized
+   ``state_dict`` into a model created using an old PyTorch version.
+   In contrast, loading entire saved models or serialized
+   ``ScriptModules`` (serialized using older versions of PyTorch)
+   may not preserve the historic behaviour. Refer to the following
+   `documentation
+   `_
+
+
+Classification
+==============
+
+.. currentmodule:: torchvision.models
+
+The following classification models are available, with or without pre-trained
+weights:
+
+.. toctree::
+   :maxdepth: 1
+
+   models/resnet
+   models/vgg
+
+
+Table of all available classification weights
+---------------------------------------------
+
+Accuracies are reported on ImageNet
+
+.. include:: generated/classification_table.rst
+
+
+Object Detection, Instance Segmentation and Person Keypoint Detection
+=====================================================================
+
+TODO: Something similar to classification models: list of models + table of weights