From 0b3dd469eb0362a997e77d98d0e9c453f4839c24 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Sun, 23 Jan 2022 09:31:51 +0000
Subject: [PATCH] Revert vit_h_14 as it breaks our CI

---
 docs/source/models.rst                     |  2 --
 hubconf.py                                 |  1 -
 .../ModelTester.test_vit_h_14_expect.pkl   | Bin 939 -> 0 bytes
 torchvision/models/vision_transformer.py   | 21 ----------------
 .../prototype/models/vision_transformer.py | 23 ------------------
 5 files changed, 47 deletions(-)
 delete mode 100644 test/expect/ModelTester.test_vit_h_14_expect.pkl

diff --git a/docs/source/models.rst b/docs/source/models.rst
index 4daee5d5534..82eb3170e78 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -89,7 +89,6 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
-    vit_h_14 = models.vit_h_14()
 
 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:
@@ -464,7 +463,6 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
-    vit_h_14
 
 Quantized Models
 ----------------
diff --git a/hubconf.py b/hubconf.py
index 1b3b191efa4..2b2eeb1c166 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -63,5 +63,4 @@
     vit_b_32,
     vit_l_16,
     vit_l_32,
-    vit_h_14,
 )
diff --git a/test/expect/ModelTester.test_vit_h_14_expect.pkl b/test/expect/ModelTester.test_vit_h_14_expect.pkl
deleted file mode 100644
index 1f846beb6a0bccf8b545f5a67b74482015cc878b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 939
zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW
zr
zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L
z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w
z{zUOK5(jATumGUYDqB%_@&wQ~AdEY-_!+F>p;eYzR1Ay-Hz#ul>i!MRpZGie3qz3t@Vp
VG!WW#-;;RB*&}^R}M

diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
--- a/torchvision/models/vision_transformer.py
+++ b/torchvision/models/vision_transformer.py
     )
 
 
-def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
-    """
-    Constructs a vit_h_14 architecture from
-    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
-
-    NOTE: Pretrained weights are not available for this model.
-    """
-    return _vision_transformer(
-        arch="vit_h_14",
-        patch_size=14,
-        num_layers=32,
-        num_heads=16,
-        hidden_dim=1280,
-        mlp_dim=5120,
-        pretrained=pretrained,
-        progress=progress,
-        **kwargs,
-    )
-
-
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py
index 72330fd1191..3f256842429 100644
--- a/torchvision/prototype/models/vision_transformer.py
+++ b/torchvision/prototype/models/vision_transformer.py
@@ -19,12 +19,10 @@
     "ViT_B_32_Weights",
     "ViT_L_16_Weights",
     "ViT_L_32_Weights",
-    "ViT_H_14_Weights",
     "vit_b_16",
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
-    "vit_h_14",
 ]
 
 
@@ -105,11 +103,6 @@ class ViT_L_32_Weights(WeightsEnum):
     default = ImageNet1K_V1
 
 
-class ViT_H_14_Weights(WeightsEnum):
-    # Weights are not available yet.
-    pass
-
-
 def _vision_transformer(
     patch_size: int,
     num_layers: int,
@@ -203,19 +196,3 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
         progress=progress,
         **kwargs,
     )
-
-
-@handle_legacy_interface(weights=("pretrained", None))
-def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
-    weights = ViT_H_14_Weights.verify(weights)
-
-    return _vision_transformer(
-        patch_size=14,
-        num_layers=32,
-        num_heads=16,
-        hidden_dim=1280,
-        mlp_dim=5120,
-        weights=weights,
-        progress=progress,
-        **kwargs,
-    )
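
Until the builder is restored, a ViT-H/14-sized model can still be constructed directly from the public VisionTransformer class using the hyperparameters that appear in the reverted builders. Below is a minimal sketch, assuming the current VisionTransformer constructor signature and the 224-pixel image size used by the other vit_* builders; no pretrained weights exist for this configuration.

import torch
from torchvision.models.vision_transformer import VisionTransformer

# ViT-H/14 hyperparameters taken from the reverted vit_h_14 builders;
# image_size=224 is an assumption matching the other vit_* variants.
vit_h_14 = VisionTransformer(
    image_size=224,
    patch_size=14,
    num_layers=32,
    num_heads=16,
    hidden_dim=1280,
    mlp_dim=5120,
)

# Sanity check: a dummy batch should produce 1000-class logits.
logits = vit_h_14(torch.rand(1, 3, 224, 224))
print(logits.shape)  # expected: torch.Size([1, 1000])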