diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
index 6a45f94a6578a..0b26762b080ab 100644
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -374,8 +374,7 @@ def job_name(self):
         "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
         "sudo apt install tesseract-ocr",
         "pip install pytesseract",
-        # wait until natten is ready for torch 2.0.0
-        # "pip install natten",
+        "pip install natten",
     ],
     tests_to_run=[
         "tests/models/*layoutlmv*",
diff --git a/setup.py b/setup.py
index 943bb196b5d64..c28387a3d454a 100644
--- a/setup.py
+++ b/setup.py
@@ -129,7 +129,7 @@
     "keras-nlp>=0.3.1",
     "librosa",
     "nltk",
-    "natten>=0.14.4",
+    "natten>=0.14.5",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 79f9118ae847f..aa638a6a9f60c 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -35,7 +35,7 @@
     "keras-nlp": "keras-nlp>=0.3.1",
     "librosa": "librosa",
     "nltk": "nltk",
-    "natten": "natten>=0.14.4",
+    "natten": "natten>=0.14.5",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py
index 95191d52b5f6b..efeb68846fce5 100644
--- a/src/transformers/models/dinat/modeling_dinat.py
+++ b/src/transformers/models/dinat/modeling_dinat.py
@@ -347,7 +347,7 @@ def forward(
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py
index 4b34fe730c161..3a93b81e4bc53 100644
--- a/src/transformers/models/nat/modeling_nat.py
+++ b/src/transformers/models/nat/modeling_nat.py
@@ -339,7 +339,7 @@ def forward(
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
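
For reference, a minimal sketch of the updated call these modeling changes rely on: in natten>=0.14.5, natten2dqkrpb takes the kernel size as an explicit argument, i.e. natten2dqkrpb(query, key, rpb, kernel_size, dilation). The tensor sizes below are illustrative assumptions (not taken from the diff), following the (batch, heads, height, width, head_dim) layout Nat/DinatNeighborhoodAttention uses after transpose_for_scores; depending on your NATTEN build, running it may require a CUDA device.

```python
# Sketch only; sizes are assumptions for illustration.
import math

import torch
from natten.functional import natten2dqkrpb

batch, heads, height, width, head_dim = 1, 2, 14, 14, 32
kernel_size, dilation = 7, 1  # modeling_nat.py hard-codes dilation=1; modeling_dinat.py passes self.dilation

device = "cuda" if torch.cuda.is_available() else "cpu"
query = torch.randn(batch, heads, height, width, head_dim, device=device)
key = torch.randn(batch, heads, height, width, head_dim, device=device)
# Relative positional bias table: one (2k-1) x (2k-1) grid per head, as in the models' self.rpb.
rpb = torch.zeros(heads, 2 * kernel_size - 1, 2 * kernel_size - 1, device=device)

query = query / math.sqrt(head_dim)
# Raw neighborhood-attention scores with the relative positional biases added in.
attention_scores = natten2dqkrpb(query, key, rpb, kernel_size, dilation)
print(attention_scores.shape)  # (batch, heads, height, width, kernel_size**2)
```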