diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 9df5428ecab..d38c870e26d 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -99,7 +99,41 @@ class FasterRCNN(GeneralizedRCNN):
 
     Example::
 
-        >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
+        >>> import torchvision
+        >>> from torchvision.models.detection import FasterRCNN
+        >>> from torchvision.models.detection.rpn import AnchorGenerator
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
+        >>> # FasterRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be [0]. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> # put the pieces together inside a FasterRCNN model
+        >>> model = FasterRCNN(backbone,
+        >>>                    num_classes=2,
+        >>>                    rpn_anchor_generator=anchor_generator,
+        >>>                    box_roi_pool=roi_pooler)
         >>> model.eval()
         >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
         >>> predictions = model(x)
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index 9a950e3cc34..6cac815f918 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -101,7 +101,46 @@ class KeypointRCNN(FasterRCNN):
 
     Example::
 
-        >>> model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=True)
+        >>> import torchvision
+        >>> from torchvision.models.detection import KeypointRCNN
+        >>> from torchvision.models.detection.rpn import AnchorGenerator
+        >>>
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
+        >>> # KeypointRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be [0]. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> keypoint_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+        >>>                                                          output_size=14,
+        >>>                                                          sampling_ratio=2)
+        >>> # put the pieces together inside a KeypointRCNN model
+        >>> model = KeypointRCNN(backbone,
+        >>>                      num_classes=2,
+        >>>                      rpn_anchor_generator=anchor_generator,
+        >>>                      box_roi_pool=roi_pooler,
+        >>>                      keypoint_roi_pool=keypoint_roi_pooler)
         >>> model.eval()
         >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
         >>> predictions = model(x)
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index 9f4d62b82df..4f9f95aa868 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -104,7 +104,46 @@ class MaskRCNN(FasterRCNN):
 
     Example::
 
-        >>> model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
+        >>> import torchvision
+        >>> from torchvision.models.detection import MaskRCNN
+        >>> from torchvision.models.detection.rpn import AnchorGenerator
+        >>>
+        >>> # load a pre-trained model for classification and return
+        >>> # only the features
+        >>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
+        >>> # MaskRCNN needs to know the number of
+        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+        >>> # so we need to add it here
+        >>> backbone.out_channels = 1280
+        >>>
+        >>> # let's make the RPN generate 5 x 3 anchors per spatial
+        >>> # location, with 5 different sizes and 3 different aspect
+        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
+        >>> # map could potentially have different sizes and
+        >>> # aspect ratios
+        >>> anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
+        >>>                                    aspect_ratios=((0.5, 1.0, 2.0),))
+        >>>
+        >>> # let's define what are the feature maps that we will
+        >>> # use to perform the region of interest cropping, as well as
+        >>> # the size of the crop after rescaling.
+        >>> # if your backbone returns a Tensor, featmap_names is expected to
+        >>> # be [0]. More generally, the backbone should return an
+        >>> # OrderedDict[Tensor], and in featmap_names you can choose which
+        >>> # feature maps to use.
+        >>> roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+        >>>                                                 output_size=7,
+        >>>                                                 sampling_ratio=2)
+        >>>
+        >>> mask_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+        >>>                                                      output_size=14,
+        >>>                                                      sampling_ratio=2)
+        >>> # put the pieces together inside a MaskRCNN model
+        >>> model = MaskRCNN(backbone,
+        >>>                  num_classes=2,
+        >>>                  rpn_anchor_generator=anchor_generator,
+        >>>                  box_roi_pool=roi_pooler,
+        >>>                  mask_roi_pool=mask_roi_pooler)
         >>> model.eval()
         >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
         >>> predictions = model(x)
@@ -149,8 +188,10 @@ def __init__(self, backbone, num_classes=None,
             mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
 
         if mask_predictor is None:
-            mask_dim_reduced = 256  # == mask_layers[-1]
-            mask_predictor = MaskRCNNPredictor(out_channels, mask_dim_reduced, num_classes)
+            mask_predictor_in_channels = 256  # == mask_layers[-1]
+            mask_dim_reduced = 256
+            mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
+                                               mask_dim_reduced, num_classes)
 
         super(MaskRCNN, self).__init__(
             backbone, num_classes,
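
Note on the second mask_rcnn.py hunk, the only change here that touches code rather than docstrings: it stops passing the backbone's out_channels as the predictor's input width and instead uses mask_predictor_in_channels, which must equal the mask head's last layer size (mask_layers[-1]). The two values happen to coincide (256) for the default ResNet-50 FPN backbone, but not for a custom backbone such as the 1280-channel mobilenet_v2 used in the new docstring examples. A minimal sketch of the resulting construction, assuming only the MaskRCNNHeads and MaskRCNNPredictor signatures visible in the hunk (the concrete numbers are illustrative):

    from torchvision.models.detection.mask_rcnn import MaskRCNNHeads, MaskRCNNPredictor

    out_channels = 1280                 # e.g. backbone.out_channels for mobilenet_v2
    mask_layers = (256, 256, 256, 256)  # channels of the mask head's conv layers
    mask_dilation = 1
    num_classes = 2                     # illustrative two-class setup, as in the examples

    # the mask head maps backbone features (out_channels) down to mask_layers[-1]
    mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)

    # the predictor consumes the head's output, so its input width must be
    # mask_layers[-1] (256), not the backbone's out_channels (1280)
    mask_predictor_in_channels = mask_layers[-1]
    mask_dim_reduced = 256
    mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                       mask_dim_reduced, num_classes)

Both objects could then be passed to MaskRCNN explicitly; the __init__ shown in the hunk only builds them when the corresponding mask_head and mask_predictor arguments are left as None.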