diff --git a/test/test_extended_models.py b/test/test_extended_models.py
index a07f741c9f7..be45a53b17f 100644
--- a/test/test_extended_models.py
+++ b/test/test_extended_models.py
@@ -79,12 +79,13 @@ def test_naming_conventions(model_fn):
 )
 @run_if_test_with_extended
 def test_schema_meta_validation(model_fn):
-    classification_fields = ["size", "categories", "acc@1", "acc@5", "min_size"]
+    # TODO: add list of permitted fields
+    classification_fields = ["categories", "acc@1", "acc@5"]
     defaults = {
-        "all": ["task", "architecture", "recipe", "num_params"],
+        "all": ["recipe", "num_params", "min_size"],
         "models": classification_fields,
         "detection": ["categories", "map"],
-        "quantization": classification_fields + ["backend", "quantization", "unquantized"],
+        "quantization": classification_fields + ["backend", "unquantized"],
         "segmentation": ["categories", "mIoU", "acc"],
         "video": classification_fields,
         "optical_flow": [],
diff --git a/torchvision/models/alexnet.py b/torchvision/models/alexnet.py
index 0128b82b08a..327530d6b2b 100644
--- a/torchvision/models/alexnet.py
+++ b/torchvision/models/alexnet.py
@@ -57,10 +57,7 @@ class AlexNet_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/alexnet-owt-7be5be79.pth",
         transforms=partial(ImageClassification, crop_size=224),
         meta={
-            "task": "image_classification",
-            "architecture": "AlexNet",
             "num_params": 61100840,
-            "size": (224, 224),
             "min_size": (63, 63),
             "categories": _IMAGENET_CATEGORIES,
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
diff --git a/torchvision/models/convnext.py b/torchvision/models/convnext.py
index 8720c590ede..4b64157634e 100644
--- a/torchvision/models/convnext.py
+++ b/torchvision/models/convnext.py
@@ -204,9 +204,6 @@ def _convnext(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "ConvNeXt",
-    "size": (224, 224),
     "min_size": (32, 32),
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#convnext",
diff --git a/torchvision/models/densenet.py b/torchvision/models/densenet.py
index 7bc53ad2679..38e726d1825 100644
--- a/torchvision/models/densenet.py
+++ b/torchvision/models/densenet.py
@@ -266,9 +266,6 @@ def _densenet(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "DenseNet",
-    "size": (224, 224),
     "min_size": (29, 29),
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/pull/116",
diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index ce794b0ed76..08274e7bc75 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -370,9 +370,8 @@ def forward(self, x):
 
 
 _COMMON_META = {
-    "task": "image_object_detection",
-    "architecture": "FasterRCNN",
     "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/detection/fcos.py b/torchvision/models/detection/fcos.py
index 3bb9a35f517..438566d3ecd 100644
--- a/torchvision/models/detection/fcos.py
+++ b/torchvision/models/detection/fcos.py
@@ -651,10 +651,9 @@ class FCOS_ResNet50_FPN_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/fcos_resnet50_fpn_coco-99b0c9b7.pth",
         transforms=ObjectDetection,
         meta={
-            "task": "image_object_detection",
-            "architecture": "FCOS",
             "num_params": 32269600,
             "categories": _COCO_CATEGORIES,
+            "min_size": (1, 1),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#fcos-resnet-50-fpn",
             "map": 39.2,
         },
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index 469d97b3f2f..b864c2854ef 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -308,10 +308,9 @@ def forward(self, x):
 
 
 _COMMON_META = {
-    "task": "image_object_detection",
-    "architecture": "KeypointRCNN",
     "categories": _COCO_PERSON_CATEGORIES,
     "keypoint_names": _COCO_PERSON_KEYPOINT_NAMES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index ae6b0aa81f0..44254913a22 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -351,9 +351,8 @@ def __init__(self, in_channels, dim_reduced, num_classes):
 
 
 _COMMON_META = {
-    "task": "image_object_detection",
-    "architecture": "MaskRCNN",
     "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 247e9cdc0d4..ed40702eed8 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -674,9 +674,8 @@ def forward(self, images, targets=None):
 
 
 _COMMON_META = {
-    "task": "image_object_detection",
-    "architecture": "RetinaNet",
     "categories": _COCO_CATEGORIES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py
index d4e26e1517b..7d9800f15ea 100644
--- a/torchvision/models/detection/ssd.py
+++ b/torchvision/models/detection/ssd.py
@@ -30,11 +30,9 @@ class SSD300_VGG16_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/ssd300_vgg16_coco-b556d3b4.pth",
         transforms=ObjectDetection,
         meta={
-            "task": "image_object_detection",
-            "architecture": "SSD",
             "num_params": 35641826,
-            "size": (300, 300),
             "categories": _COCO_CATEGORIES,
+            "min_size": (1, 1),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssd300-vgg16",
             "map": 25.1,
         },
diff --git a/torchvision/models/detection/ssdlite.py b/torchvision/models/detection/ssdlite.py
index d55991f48d5..73afc1eaca6 100644
--- a/torchvision/models/detection/ssdlite.py
+++ b/torchvision/models/detection/ssdlite.py
@@ -189,11 +189,9 @@ class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth",
         transforms=ObjectDetection,
         meta={
-            "task": "image_object_detection",
-            "architecture": "SSDLite",
             "num_params": 3440060,
-            "size": (320, 320),
             "categories": _COCO_CATEGORIES,
+            "min_size": (1, 1),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssdlite320-mobilenetv3-large",
             "map": 21.3,
         },
diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py
index a5519c60824..ef7e0c323a4 100644
--- a/torchvision/models/efficientnet.py
+++ b/torchvision/models/efficientnet.py
@@ -3,7 +3,7 @@
 import warnings
 from dataclasses import dataclass
 from functools import partial
-from typing import Any, Callable, Optional, List, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, Optional, List, Sequence, Tuple, Union
 
 import torch
 from torch import nn, Tensor
@@ -429,8 +429,7 @@ def _efficientnet_conf(
     return inverted_residual_setting, last_channel
 
 
-_COMMON_META = {
-    "task": "image_classification",
+_COMMON_META: Dict[str, Any] = {
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#efficientnet",
 }
@@ -438,14 +437,12 @@ def _efficientnet_conf(
 
 _COMMON_META_V1 = {
     **_COMMON_META,
-    "architecture": "EfficientNet",
     "min_size": (1, 1),
 }
 
 
 _COMMON_META_V2 = {
     **_COMMON_META,
-    "architecture": "EfficientNetV2",
     "min_size": (33, 33),
 }
 
@@ -459,7 +456,6 @@ class EfficientNet_B0_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 5288548,
-            "size": (224, 224),
             "acc@1": 77.692,
             "acc@5": 93.532,
         },
@@ -476,7 +472,6 @@ class EfficientNet_B1_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 7794184,
-            "size": (240, 240),
             "acc@1": 78.642,
             "acc@5": 94.186,
         },
@@ -490,7 +485,6 @@ class EfficientNet_B1_Weights(WeightsEnum):
             **_COMMON_META_V1,
             "num_params": 7794184,
             "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-lr-wd-crop-tuning",
-            "size": (240, 240),
             "acc@1": 79.838,
             "acc@5": 94.934,
         },
@@ -507,7 +501,6 @@ class EfficientNet_B2_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 9109994,
-            "size": (288, 288),
             "acc@1": 80.608,
             "acc@5": 95.310,
         },
@@ -524,7 +517,6 @@ class EfficientNet_B3_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 12233232,
-            "size": (300, 300),
             "acc@1": 82.008,
             "acc@5": 96.054,
         },
@@ -541,7 +533,6 @@ class EfficientNet_B4_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 19341616,
-            "size": (380, 380),
             "acc@1": 83.384,
             "acc@5": 96.594,
         },
@@ -558,7 +549,6 @@ class EfficientNet_B5_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 30389784,
-            "size": (456, 456),
             "acc@1": 83.444,
             "acc@5": 96.628,
         },
@@ -575,7 +565,6 @@ class EfficientNet_B6_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 43040704,
-            "size": (528, 528),
             "acc@1": 84.008,
             "acc@5": 96.916,
         },
@@ -592,7 +581,6 @@ class EfficientNet_B7_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V1,
             "num_params": 66347960,
-            "size": (600, 600),
             "acc@1": 84.122,
             "acc@5": 96.908,
         },
@@ -612,7 +600,6 @@ class EfficientNet_V2_S_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V2,
             "num_params": 21458488,
-            "size": (384, 384),
             "acc@1": 84.228,
             "acc@5": 96.878,
         },
@@ -632,7 +619,6 @@ class EfficientNet_V2_M_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V2,
             "num_params": 54139356,
-            "size": (480, 480),
             "acc@1": 85.112,
             "acc@5": 97.156,
         },
@@ -654,7 +640,6 @@ class EfficientNet_V2_L_Weights(WeightsEnum):
         meta={
             **_COMMON_META_V2,
             "num_params": 118515272,
-            "size": (480, 480),
             "acc@1": 85.808,
             "acc@5": 97.788,
         },
diff --git a/torchvision/models/googlenet.py b/torchvision/models/googlenet.py
index a84d394d8d6..e5f420198e0 100644
--- a/torchvision/models/googlenet.py
+++ b/torchvision/models/googlenet.py
@@ -280,10 +280,7 @@ class GoogLeNet_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/googlenet-1378be20.pth",
         transforms=partial(ImageClassification, crop_size=224),
         meta={
-            "task": "image_classification",
-            "architecture": "GoogLeNet",
             "num_params": 6624904,
-            "size": (224, 224),
             "min_size": (15, 15),
             "categories": _IMAGENET_CATEGORIES,
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#googlenet",
diff --git a/torchvision/models/inception.py b/torchvision/models/inception.py
index a1997eb8dbd..f7e006233af 100644
--- a/torchvision/models/inception.py
+++ b/torchvision/models/inception.py
@@ -412,10 +412,7 @@ class Inception_V3_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth",
         transforms=partial(ImageClassification, crop_size=299, resize_size=342),
         meta={
-            "task": "image_classification",
-            "architecture": "InceptionV3",
             "num_params": 27161264,
-            "size": (299, 299),
(299, 299), "min_size": (75, 75), "categories": _IMAGENET_CATEGORIES, "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#inception-v3", diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py index 7517d3c9772..27967b50608 100644 --- a/torchvision/models/mnasnet.py +++ b/torchvision/models/mnasnet.py @@ -212,9 +212,6 @@ def _load_from_state_dict( _COMMON_META = { - "task": "image_classification", - "architecture": "MNASNet", - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, "recipe": "https://github.com/1e100/mnasnet_trainer", diff --git a/torchvision/models/mobilenetv2.py b/torchvision/models/mobilenetv2.py index cf1580e3b75..af8b51049fd 100644 --- a/torchvision/models/mobilenetv2.py +++ b/torchvision/models/mobilenetv2.py @@ -195,10 +195,7 @@ def forward(self, x: Tensor) -> Tensor: _COMMON_META = { - "task": "image_classification", - "architecture": "MobileNetV2", "num_params": 3504872, - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, } diff --git a/torchvision/models/mobilenetv3.py b/torchvision/models/mobilenetv3.py index 8c0eb4c3223..128ff580641 100644 --- a/torchvision/models/mobilenetv3.py +++ b/torchvision/models/mobilenetv3.py @@ -304,9 +304,6 @@ def _mobilenet_v3( _COMMON_META = { - "task": "image_classification", - "architecture": "MobileNetV3", - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, } diff --git a/torchvision/models/optical_flow/raft.py b/torchvision/models/optical_flow/raft.py index 3a3d0334593..346641ae83d 100644 --- a/torchvision/models/optical_flow/raft.py +++ b/torchvision/models/optical_flow/raft.py @@ -512,8 +512,7 @@ def forward(self, image1, image2, num_flow_updates: int = 12): _COMMON_META = { - "task": "optical_flow", - "architecture": "RAFT", + "min_size": (128, 128), } diff --git a/torchvision/models/quantization/googlenet.py b/torchvision/models/quantization/googlenet.py index 37a420a0a10..196bedf1b29 100644 --- a/torchvision/models/quantization/googlenet.py +++ b/torchvision/models/quantization/googlenet.py @@ -111,14 +111,10 @@ class GoogLeNet_QuantizedWeights(WeightsEnum): url="https://download.pytorch.org/models/quantized/googlenet_fbgemm-c00238cf.pth", transforms=partial(ImageClassification, crop_size=224), meta={ - "task": "image_classification", - "architecture": "GoogLeNet", "num_params": 6624904, - "size": (224, 224), "min_size": (15, 15), "categories": _IMAGENET_CATEGORIES, "backend": "fbgemm", - "quantization": "Post Training Quantization", "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models", "unquantized": GoogLeNet_Weights.IMAGENET1K_V1, "acc@1": 69.826, diff --git a/torchvision/models/quantization/inception.py b/torchvision/models/quantization/inception.py index 288be0068ca..d0d19b36a7e 100644 --- a/torchvision/models/quantization/inception.py +++ b/torchvision/models/quantization/inception.py @@ -177,14 +177,10 @@ class Inception_V3_QuantizedWeights(WeightsEnum): url="https://download.pytorch.org/models/quantized/inception_v3_google_fbgemm-71447a44.pth", transforms=partial(ImageClassification, crop_size=299, resize_size=342), meta={ - "task": "image_classification", - "architecture": "InceptionV3", "num_params": 27161264, - "size": (299, 299), "min_size": (75, 75), "categories": _IMAGENET_CATEGORIES, "backend": "fbgemm", - "quantization": "Post Training Quantization", "recipe": 
"https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models", "unquantized": Inception_V3_Weights.IMAGENET1K_V1, "acc@1": 77.176, diff --git a/torchvision/models/quantization/mobilenetv2.py b/torchvision/models/quantization/mobilenetv2.py index 8c78ebcfd3b..d550b92d30a 100644 --- a/torchvision/models/quantization/mobilenetv2.py +++ b/torchvision/models/quantization/mobilenetv2.py @@ -69,14 +69,10 @@ class MobileNet_V2_QuantizedWeights(WeightsEnum): url="https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth", transforms=partial(ImageClassification, crop_size=224), meta={ - "task": "image_classification", - "architecture": "MobileNetV2", "num_params": 3504872, - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, "backend": "qnnpack", - "quantization": "Quantization Aware Training", "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#qat-mobilenetv2", "unquantized": MobileNet_V2_Weights.IMAGENET1K_V1, "acc@1": 71.658, diff --git a/torchvision/models/quantization/mobilenetv3.py b/torchvision/models/quantization/mobilenetv3.py index afee0b1d954..be061ca6364 100644 --- a/torchvision/models/quantization/mobilenetv3.py +++ b/torchvision/models/quantization/mobilenetv3.py @@ -159,14 +159,10 @@ class MobileNet_V3_Large_QuantizedWeights(WeightsEnum): url="https://download.pytorch.org/models/quantized/mobilenet_v3_large_qnnpack-5bcacf28.pth", transforms=partial(ImageClassification, crop_size=224), meta={ - "task": "image_classification", - "architecture": "MobileNetV3", "num_params": 5483032, - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, "backend": "qnnpack", - "quantization": "Quantization Aware Training", "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#qat-mobilenetv3", "unquantized": MobileNet_V3_Large_Weights.IMAGENET1K_V1, "acc@1": 73.004, diff --git a/torchvision/models/quantization/resnet.py b/torchvision/models/quantization/resnet.py index 4534678af7f..b01f92ae547 100644 --- a/torchvision/models/quantization/resnet.py +++ b/torchvision/models/quantization/resnet.py @@ -147,12 +147,9 @@ def _resnet( _COMMON_META = { - "task": "image_classification", - "size": (224, 224), "min_size": (1, 1), "categories": _IMAGENET_CATEGORIES, "backend": "fbgemm", - "quantization": "Post Training Quantization", "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models", } @@ -163,7 +160,6 @@ class ResNet18_QuantizedWeights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 11689512, "unquantized": ResNet18_Weights.IMAGENET1K_V1, "acc@1": 69.494, @@ -179,7 +175,6 @@ class ResNet50_QuantizedWeights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 25557032, "unquantized": ResNet50_Weights.IMAGENET1K_V1, "acc@1": 75.920, @@ -191,7 +186,6 @@ class ResNet50_QuantizedWeights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 25557032, "unquantized": ResNet50_Weights.IMAGENET1K_V2, "acc@1": 80.282, @@ -207,7 +201,6 @@ class ResNeXt101_32X8D_QuantizedWeights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNeXt", "num_params": 88791336, "unquantized": 
             "acc@1": 78.986,
@@ -219,7 +212,6 @@ class ResNeXt101_32X8D_QuantizedWeights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224, resize_size=232),
         meta={
             **_COMMON_META,
-            "architecture": "ResNeXt",
             "num_params": 88791336,
             "unquantized": ResNeXt101_32X8D_Weights.IMAGENET1K_V2,
             "acc@1": 82.574,
diff --git a/torchvision/models/quantization/shufflenetv2.py b/torchvision/models/quantization/shufflenetv2.py
index 5e679da7399..f795fcad5b8 100644
--- a/torchvision/models/quantization/shufflenetv2.py
+++ b/torchvision/models/quantization/shufflenetv2.py
@@ -102,13 +102,9 @@ def _shufflenetv2(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "ShuffleNetV2",
-    "size": (224, 224),
     "min_size": (1, 1),
     "categories": _IMAGENET_CATEGORIES,
     "backend": "fbgemm",
-    "quantization": "Post Training Quantization",
     "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
 }
 
diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py
index 17c54203a8f..f878bdd5754 100644
--- a/torchvision/models/regnet.py
+++ b/torchvision/models/regnet.py
@@ -1,7 +1,7 @@
 import math
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
 from torch import nn, Tensor
@@ -402,17 +402,13 @@ def _regnet(
     return model
 
 
-_COMMON_META = {
-    "task": "image_classification",
-    "architecture": "RegNet",
-    "size": (224, 224),
+_COMMON_META: Dict[str, Any] = {
     "min_size": (1, 1),
     "categories": _IMAGENET_CATEGORIES,
 }
 
 _COMMON_SWAG_META = {
     **_COMMON_META,
-    "size": (384, 384),
     "recipe": "https://github.com/facebookresearch/SWAG",
     "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
 }
diff --git a/torchvision/models/resnet.py b/torchvision/models/resnet.py
index c0c102710be..25990e0d4d4 100644
--- a/torchvision/models/resnet.py
+++ b/torchvision/models/resnet.py
@@ -302,8 +302,6 @@ def _resnet(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "size": (224, 224),
     "min_size": (1, 1),
     "categories": _IMAGENET_CATEGORIES,
 }
@@ -315,7 +313,6 @@ class ResNet18_Weights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224),
         meta={
             **_COMMON_META,
-            "architecture": "ResNet",
             "num_params": 11689512,
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
             "acc@1": 69.758,
@@ -331,7 +328,6 @@ class ResNet34_Weights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224),
         meta={
             **_COMMON_META,
-            "architecture": "ResNet",
             "num_params": 21797672,
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
             "acc@1": 73.314,
@@ -347,7 +343,6 @@ class ResNet50_Weights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224),
         meta={
             **_COMMON_META,
-            "architecture": "ResNet",
             "num_params": 25557032,
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet",
             "acc@1": 76.130,
@@ -359,7 +354,6 @@ class ResNet50_Weights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224, resize_size=232),
         meta={
             **_COMMON_META,
-            "architecture": "ResNet",
             "num_params": 25557032,
             "recipe": "https://github.com/pytorch/vision/issues/3995#issuecomment-1013906621",
             "acc@1": 80.858,
@@ -375,7 +369,6 @@ class ResNet101_Weights(WeightsEnum):
         transforms=partial(ImageClassification, crop_size=224),
         meta={
             **_COMMON_META,
-            "architecture": "ResNet",
"ResNet", "num_params": 44549160, "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet", "acc@1": 77.374, @@ -387,7 +380,6 @@ class ResNet101_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 44549160, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe", "acc@1": 81.886, @@ -403,7 +395,6 @@ class ResNet152_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 60192808, "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnet", "acc@1": 78.312, @@ -415,7 +406,6 @@ class ResNet152_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "ResNet", "num_params": 60192808, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe", "acc@1": 82.284, @@ -431,7 +421,6 @@ class ResNeXt50_32X4D_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNeXt", "num_params": 25028904, "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext", "acc@1": 77.618, @@ -443,7 +432,6 @@ class ResNeXt50_32X4D_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "ResNeXt", "num_params": 25028904, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe", "acc@1": 81.198, @@ -459,7 +447,6 @@ class ResNeXt101_32X8D_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "ResNeXt", "num_params": 88791336, "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#resnext", "acc@1": 79.312, @@ -471,7 +458,6 @@ class ResNeXt101_32X8D_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "ResNeXt", "num_params": 88791336, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres", "acc@1": 82.834, @@ -487,7 +473,6 @@ class Wide_ResNet50_2_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "WideResNet", "num_params": 68883240, "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439", "acc@1": 78.468, @@ -499,7 +484,6 @@ class Wide_ResNet50_2_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "WideResNet", "num_params": 68883240, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-fixres", "acc@1": 81.602, @@ -515,7 +499,6 @@ class Wide_ResNet101_2_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224), meta={ **_COMMON_META, - "architecture": "WideResNet", "num_params": 126886696, "recipe": "https://github.com/pytorch/vision/pull/912#issue-445437439", "acc@1": 78.848, @@ -527,7 +510,6 @@ class Wide_ResNet101_2_Weights(WeightsEnum): transforms=partial(ImageClassification, crop_size=224, resize_size=232), meta={ **_COMMON_META, - "architecture": "WideResNet", "num_params": 126886696, "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe", "acc@1": 82.510, diff --git a/torchvision/models/segmentation/deeplabv3.py b/torchvision/models/segmentation/deeplabv3.py index 
--- a/torchvision/models/segmentation/deeplabv3.py
+++ b/torchvision/models/segmentation/deeplabv3.py
@@ -129,9 +129,8 @@ def _deeplabv3_resnet(
 
 
 _COMMON_META = {
-    "task": "image_semantic_segmentation",
-    "architecture": "DeepLabV3",
     "categories": _VOC_CATEGORIES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/segmentation/fcn.py b/torchvision/models/segmentation/fcn.py
index 678872915eb..09a62c97089 100644
--- a/torchvision/models/segmentation/fcn.py
+++ b/torchvision/models/segmentation/fcn.py
@@ -48,9 +48,8 @@ def __init__(self, in_channels: int, channels: int) -> None:
 
 
 _COMMON_META = {
-    "task": "image_semantic_segmentation",
-    "architecture": "FCN",
     "categories": _VOC_CATEGORIES,
+    "min_size": (1, 1),
 }
 
 
diff --git a/torchvision/models/segmentation/lraspp.py b/torchvision/models/segmentation/lraspp.py
index eb66fe93af2..48107f13466 100644
--- a/torchvision/models/segmentation/lraspp.py
+++ b/torchvision/models/segmentation/lraspp.py
@@ -98,10 +98,9 @@ class LRASPP_MobileNet_V3_Large_Weights(WeightsEnum):
         url="https://download.pytorch.org/models/lraspp_mobilenet_v3_large-d234d4ea.pth",
         transforms=partial(SemanticSegmentation, resize_size=520),
         meta={
-            "task": "image_semantic_segmentation",
-            "architecture": "LRASPP",
             "num_params": 3221538,
             "categories": _VOC_CATEGORIES,
+            "min_size": (1, 1),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/segmentation#lraspp_mobilenet_v3_large",
             "mIoU": 57.9,
             "acc": 91.2,
diff --git a/torchvision/models/shufflenetv2.py b/torchvision/models/shufflenetv2.py
index 4bde891d6b0..c4f6f2466b0 100644
--- a/torchvision/models/shufflenetv2.py
+++ b/torchvision/models/shufflenetv2.py
@@ -184,9 +184,6 @@ def _shufflenetv2(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "ShuffleNetV2",
-    "size": (224, 224),
     "min_size": (1, 1),
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/barrh/Shufflenet-v2-Pytorch/tree/v0.1.0",
diff --git a/torchvision/models/squeezenet.py b/torchvision/models/squeezenet.py
index 42618f02b9d..43fe8a516e3 100644
--- a/torchvision/models/squeezenet.py
+++ b/torchvision/models/squeezenet.py
@@ -115,9 +115,6 @@ def _squeezenet(
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "SqueezeNet",
-    "size": (224, 224),
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/pull/49#issuecomment-277560717",
 }
diff --git a/torchvision/models/vgg.py b/torchvision/models/vgg.py
index e75c8ad2c6f..89e08b331f6 100644
--- a/torchvision/models/vgg.py
+++ b/torchvision/models/vgg.py
@@ -107,9 +107,6 @@ def _vgg(cfg: str, batch_norm: bool, weights: Optional[WeightsEnum], progress: b
 
 
 _COMMON_META = {
-    "task": "image_classification",
-    "architecture": "VGG",
-    "size": (224, 224),
     "min_size": (32, 32),
     "categories": _IMAGENET_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
diff --git a/torchvision/models/video/resnet.py b/torchvision/models/video/resnet.py
index b0fcd39fecc..dab837571bd 100644
--- a/torchvision/models/video/resnet.py
+++ b/torchvision/models/video/resnet.py
@@ -309,8 +309,6 @@ def _video_resnet(
 
 
 _COMMON_META = {
-    "task": "video_classification",
-    "size": (112, 112),
     "min_size": (1, 1),
     "categories": _KINETICS400_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification",
@@ -323,7 +321,6 @@ class R3D_18_Weights(WeightsEnum):
         transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
         meta={
             **_COMMON_META,
-            "architecture": "R3D",
             "num_params": 33371472,
             "acc@1": 52.75,
             "acc@5": 75.45,
@@ -338,7 +335,6 @@ class MC3_18_Weights(WeightsEnum):
         transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
         meta={
             **_COMMON_META,
-            "architecture": "MC3",
             "num_params": 11695440,
             "acc@1": 53.90,
             "acc@5": 76.29,
@@ -353,7 +349,6 @@ class R2Plus1D_18_Weights(WeightsEnum):
         transforms=partial(VideoClassification, crop_size=(112, 112), resize_size=(128, 171)),
         meta={
             **_COMMON_META,
-            "architecture": "R(2+1)D",
             "num_params": 31505325,
             "acc@1": 57.50,
             "acc@5": 78.81,
diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
index ffc6265eafc..6d881080d04 100644
--- a/torchvision/models/vision_transformer.py
+++ b/torchvision/models/vision_transformer.py
@@ -1,7 +1,7 @@
 import math
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Callable, List, NamedTuple, Optional, Sequence, Dict
+from typing import Any, Callable, List, NamedTuple, Optional, Dict
 
 import torch
 import torch.nn as nn
@@ -288,18 +288,8 @@ def _vision_transformer(
 ) -> VisionTransformer:
     if weights is not None:
         _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
-        if isinstance(weights.meta["size"], int):
-            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"])
-        elif isinstance(weights.meta["size"], Sequence):
-            if len(weights.meta["size"]) != 2 or weights.meta["size"][0] != weights.meta["size"][1]:
-                raise ValueError(
-                    f'size: {weights.meta["size"]} is not valid! Currently we only support a 2-dimensional square and width = height'
-                )
-            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"][0])
-        else:
-            raise ValueError(
-                f'weights.meta["size"]: {weights.meta["size"]} is not valid, the type should be either an int or a Sequence[int]'
-            )
+        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
+        _ovewrite_named_param(kwargs, "image_size", weights.meta["min_size"][0])
 
     image_size = kwargs.pop("image_size", 224)
     model = VisionTransformer(
@@ -319,12 +309,10 @@ def _vision_transformer(
 
 
 _COMMON_META: Dict[str, Any] = {
-    "task": "image_classification",
-    "architecture": "ViT",
     "categories": _IMAGENET_CATEGORIES,
 }
 
-_COMMON_SWAG_META: Dict[str, Any] = {
+_COMMON_SWAG_META = {
     **_COMMON_META,
     "recipe": "https://github.com/facebookresearch/SWAG",
     "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
@@ -338,7 +326,6 @@ class ViT_B_16_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "num_params": 86567656,
-            "size": (224, 224),
             "min_size": (224, 224),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
             "acc@1": 81.072,
@@ -356,7 +343,6 @@ class ViT_B_16_Weights(WeightsEnum):
         meta={
             **_COMMON_SWAG_META,
             "num_params": 86859496,
-            "size": (384, 384),
             "min_size": (384, 384),
             "acc@1": 85.304,
             "acc@5": 97.650,
@@ -374,7 +360,6 @@ class ViT_B_16_Weights(WeightsEnum):
             **_COMMON_SWAG_META,
             "recipe": "https://github.com/pytorch/vision/pull/5793",
             "num_params": 86567656,
-            "size": (224, 224),
             "min_size": (224, 224),
             "acc@1": 81.886,
             "acc@5": 96.180,
@@ -390,7 +375,6 @@ class ViT_B_32_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "num_params": 88224232,
-            "size": (224, 224),
             "min_size": (224, 224),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
             "acc@1": 75.912,
@@ -407,7 +391,6 @@ class ViT_L_16_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "num_params": 304326632,
-            "size": (224, 224),
"size": (224, 224), "min_size": (224, 224), "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16", "acc@1": 79.662, @@ -425,7 +408,6 @@ class ViT_L_16_Weights(WeightsEnum): meta={ **_COMMON_SWAG_META, "num_params": 305174504, - "size": (512, 512), "min_size": (512, 512), "acc@1": 88.064, "acc@5": 98.512, @@ -443,7 +425,6 @@ class ViT_L_16_Weights(WeightsEnum): **_COMMON_SWAG_META, "recipe": "https://github.com/pytorch/vision/pull/5793", "num_params": 304326632, - "size": (224, 224), "min_size": (224, 224), "acc@1": 85.146, "acc@5": 97.422, @@ -459,7 +440,6 @@ class ViT_L_32_Weights(WeightsEnum): meta={ **_COMMON_META, "num_params": 306535400, - "size": (224, 224), "min_size": (224, 224), "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32", "acc@1": 76.972, @@ -481,7 +461,6 @@ class ViT_H_14_Weights(WeightsEnum): meta={ **_COMMON_SWAG_META, "num_params": 633470440, - "size": (518, 518), "min_size": (518, 518), "acc@1": 88.552, "acc@5": 98.694, @@ -499,7 +478,6 @@ class ViT_H_14_Weights(WeightsEnum): **_COMMON_SWAG_META, "recipe": "https://github.com/pytorch/vision/pull/5793", "num_params": 632045800, - "size": (224, 224), "min_size": (224, 224), "acc@1": 85.708, "acc@5": 97.730,