
Fix keys name for Transformer #2529

Merged: 8 commits, Jun 5, 2024
Changes from 2 commits
7 changes: 7 additions & 0 deletions speechbrain/lobes/models/transformer/Transformer.py
@@ -15,6 +15,7 @@
from speechbrain.nnet.activations import Swish
from speechbrain.nnet.attention import RelPosEncXL
from speechbrain.nnet.CNN import Conv1d
from speechbrain.utils.checkpoints import map_old_state_dict_weights

from .Branchformer import BranchformerEncoder
from .Conformer import ConformerEncoder
@@ -779,6 +780,12 @@ def forward(

return tgt, self_attn, multihead_attention

def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
"""Load the model from a state_dict and map the old keys to the new keys."""
mapping = {"mutihead_attention": "multihead_attention"}
state_dict = map_old_state_dict_weights(state_dict, mapping)
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)


class TransformerDecoder(nn.Module):
"""This class implements the Transformer decoder.
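For reference, here is a minimal, self-contained sketch of the `_load_from_state_dict` override pattern added above. `ToyLayer`, `remap_keys`, and the checkpoint contents are illustrative stand-ins, not SpeechBrain code; the helper mirrors the substring-based renaming that `map_old_state_dict_weights` (added in `speechbrain/utils/checkpoints.py` below) performs.

```python
import torch
import torch.nn as nn


def remap_keys(state_dict, mapping):
    # Same idea as map_old_state_dict_weights: rename any key that
    # contains an old name as a substring.
    for old, new in mapping.items():
        for key in list(state_dict.keys()):
            if old in key:
                state_dict[key.replace(old, new)] = state_dict.pop(key)
    return state_dict


class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        # Correctly spelled attribute name.
        self.multihead_attention = nn.Linear(4, 4)

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # Remap checkpoints saved under the old, misspelled key before
        # the regular loading logic runs.
        remap_keys(state_dict, {"mutihead_attention": "multihead_attention"})
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)


layer = ToyLayer()
old_ckpt = {
    "mutihead_attention.weight": torch.zeros(4, 4),
    "mutihead_attention.bias": torch.zeros(4),
}
layer.load_state_dict(old_ckpt)  # strict load succeeds despite old key names
```

Because the hook runs before the module's own parameters and children are matched, old checkpoints load transparently with `strict=True` and no user-facing change.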
52 changes: 49 additions & 3 deletions speechbrain/utils/checkpoints.py
@@ -46,6 +46,7 @@

Authors
* Aku Rouhe 2020
* Adel Moumen 2024
"""

import collections
@@ -57,6 +58,7 @@
import shutil
import time
import warnings
from typing import Dict

import torch
import yaml
@@ -75,6 +77,41 @@
CKPT_PREFIX = "CKPT"
METAFNAME = f"{CKPT_PREFIX}.yaml" # Important that this is not .ckpt
PARAMFILE_EXT = ".ckpt" # ...because these files will be
# some keys have been renamed in the new version of the code
KEYS_MAPPING: Dict[str, str] = {
"mutihead_attn": "multihead_attn", # see PR #2489
}


def map_old_state_dict_weights(
state_dict: Dict[str, torch.Tensor], mapping: Dict[str, str]
) -> Dict[str, torch.Tensor]:
"""
Maps the keys in the old state dictionary according to the provided
mapping: a key is renamed whenever it contains one of the old names as a
substring.

Parameters
----------
state_dict : dict
The old state dictionary to be mapped.
mapping : dict
A dictionary specifying the mapping between old and new keys.

Returns
-------
dict
The modified state dictionary with mapped keys.
"""
for checkpoint_name, attribute_name in mapping.items():
for full_checkpoint_name in list(state_dict.keys()):
if checkpoint_name in full_checkpoint_name:
full_attribute_name = full_checkpoint_name.replace(
checkpoint_name, attribute_name
)
state_dict[full_attribute_name] = state_dict.pop(
full_checkpoint_name
)
return state_dict


def torch_recovery(obj, path, end_of_epoch):
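A quick usage sketch of the helper above. The key names are made up for illustration, and the import assumes `KEYS_MAPPING` and `map_old_state_dict_weights` are module-level in `speechbrain.utils.checkpoints`, as this diff shows.

```python
import torch

from speechbrain.utils.checkpoints import KEYS_MAPPING, map_old_state_dict_weights

old_state_dict = {
    "layers.0.mutihead_attn.in_proj_weight": torch.zeros(12, 4),  # old typo
    "layers.0.norm1.weight": torch.ones(4),  # unaffected key
}
new_state_dict = map_old_state_dict_weights(old_state_dict, KEYS_MAPPING)
print(sorted(new_state_dict))
# ['layers.0.multihead_attn.in_proj_weight', 'layers.0.norm1.weight']
```

Note that the mapping matches substrings anywhere in the key, so nested prefixes like `layers.0.` need no special handling.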
@@ -94,10 +131,13 @@ def torch_recovery(obj, path, end_of_epoch):
"""
del end_of_epoch # Unused
device = "cpu"

state_dict = torch.load(path, map_location=device)
state_dict = map_old_state_dict_weights(state_dict, KEYS_MAPPING)
try:
obj.load_state_dict(torch.load(path, map_location=device), strict=True)
obj.load_state_dict(state_dict, strict=True)
except TypeError:
obj.load_state_dict(torch.load(path, map_location=device))
obj.load_state_dict(state_dict)


@main_process_only
@@ -1247,4 +1287,10 @@ def average_checkpoints(
parameter_loader(ckpt.paramfiles[recoverable_name], map_location=device)
for ckpt in checkpoint_list
)
return averager(parameter_iterator)
parameter_iterator = (
map_old_state_dict_weights(state_dict, KEYS_MAPPING)
for state_dict in parameter_iterator
)

avg_ckpt = averager(parameter_iterator)
return avg_ckpt
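To close the loop, a hypothetical end-to-end recovery with `torch_recovery`: `Stub` is a stand-in module whose parameter keys use the corrected spelling, and the temporary file simulates a parameter file saved before the rename.

```python
import tempfile

import torch
import torch.nn as nn

from speechbrain.utils.checkpoints import torch_recovery


class Stub(nn.Module):
    # Stand-in for a module saved before the rename: its parameters are
    # now registered as "multihead_attn.*".
    def __init__(self):
        super().__init__()
        self.multihead_attn = nn.Linear(4, 4)


# Simulate an old checkpoint that still uses the misspelled keys.
old_ckpt = {
    "mutihead_attn.weight": torch.zeros(4, 4),
    "mutihead_attn.bias": torch.zeros(4),
}
with tempfile.NamedTemporaryFile(suffix=".ckpt", delete=False) as f:
    torch.save(old_ckpt, f.name)

model = Stub()
# Keys are remapped via KEYS_MAPPING before load_state_dict runs,
# so the strict load succeeds.
torch_recovery(model, f.name, end_of_epoch=False)
```

The same remapping is applied inside `average_checkpoints`, so averaging a mix of old and new parameter files produces consistently named keys.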