Accumulate Fisher for multiple iterations
quanpn90 committed Jun 2, 2022
1 parent 989eb3c commit 14d5c6f
Showing 26 changed files with 31 additions and 819 deletions.
8 changes: 7 additions & 1 deletion .idea/.gitignore


16 changes: 0 additions & 16 deletions .idea/asrMT.iml

This file was deleted.

22 changes: 0 additions & 22 deletions .idea/deployment.xml

This file was deleted.

3 changes: 0 additions & 3 deletions .idea/dictionaries/pquan.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/encodings.xml

This file was deleted.

29 changes: 0 additions & 29 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/libraries/R_User_Library.xml

This file was deleted.

5 changes: 1 addition & 4 deletions .idea/misc.xml


2 changes: 1 addition & 1 deletion .idea/modules.xml


7 changes: 0 additions & 7 deletions .idea/other.xml

This file was deleted.

9 changes: 0 additions & 9 deletions .idea/sshConfigs.xml

This file was deleted.

1 change: 0 additions & 1 deletion .idea/vcs.xml


21 changes: 0 additions & 21 deletions .idea/webServers.xml

This file was deleted.

44 changes: 1 addition & 43 deletions README.md
@@ -1,44 +1,2 @@
# Introduction
# Transformer networks for Neural Machine Translation

# Requirements and Installation
* A [PyTorch installation](http://pytorch.org/)
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* Python version 3.7+

Currently NMTG requires PyTorch version >= 1.8.0; version 1.10.0 is recommended.
Please follow the instructions here: https://github.com/pytorch/pytorch#installation.


After PyTorch is installed, you can install the requirements with:
```
pip install -r requirements.txt
```

# C++/CUDA module installation

NMTG provides several modules implemented with custom PyTorch/C++/CUDA code to use the GPU more efficiently and reduce overhead, including:
* Self-attention and encoder-decoder attention with CUBLASLT
* Multi-layer perceptrons with CUBLASLT and fused dropout-ReLU/GELU/SiLU, computed in place whenever possible
* Highly optimized layer norm and multi-head attention from Apex (only available on sm80 GPUs, i.e. the NVIDIA A100)
* Fused log-softmax/cross-entropy loss from Apex, which saves memory for large output layers
* Fused in-place dropout-add for residual Transformers

Installation requires CUDA and nvcc matching the PyTorch version. It is possible to install CUDA from conda via:

```
conda install -c nvidia/label/cuda-11.5.2 cuda-toolkit
```

Then navigate to the extension modules and install nmtgminor-cuda via:

```
cd onmt/modules/extension
python setup.py install
```

Without this step, all modules fall back to their PyTorch implementations.
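
For illustration only, a minimal sketch of that fallback pattern (mirroring `onmt/modules/optimized/encdec_attention_func_bias.py`, which appears later in this diff); the helper function is hypothetical:

```
# Sketch of the extension fallback: if the compiled CUDA module is missing,
# the handle is set to None and the pure-PyTorch code path is used instead.
try:
    import encdec_multihead_attn_bias_blaslt
except (ModuleNotFoundError, ImportError):
    encdec_multihead_attn_bias_blaslt = None


def fast_attention_available() -> bool:
    """Hypothetical helper: report whether the compiled extension was found."""
    return encdec_multihead_attn_bias_blaslt is not None
```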

# IWSLT 2022 Speech Translation models


# Interspeech 2022 Multilingual ASR models
1 change: 1 addition & 0 deletions onmt/data/binarizer.py
@@ -348,6 +348,7 @@ def binarize_file(filename, vocab, tokenizer, bos_word=None, eos_word=None,
ext_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang=lang)
if ext_tokenizer.src_lang != lang:
raise RuntimeError("The language %s does not exist in mBART50." % lang)
torch.save(ext_tokenizer, "mbart-large-50.tokenizer.pt")
elif "m2m100" in external_tokenizer.lower():

print("[INFO] Using the external %s tokenizer..." % external_tokenizer)
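As a hedged sketch of how the tokenizer cached by the binarizer.py change above might be reloaded later (the helper name and caching logic are illustrative, not part of the repository):

```
import os

import torch
from transformers import MBart50TokenizerFast


def load_mbart50_tokenizer(lang, cache_path="mbart-large-50.tokenizer.pt"):
    """Reload the serialized tokenizer if present; otherwise download and cache it."""
    if os.path.exists(cache_path):
        tokenizer = torch.load(cache_path)
        tokenizer.src_lang = lang  # reuse the cached object for the requested language
    else:
        tokenizer = MBart50TokenizerFast.from_pretrained(
            "facebook/mbart-large-50", src_lang=lang)
        if tokenizer.src_lang != lang:
            raise RuntimeError("The language %s does not exist in mBART50." % lang)
        torch.save(tokenizer, cache_path)  # same caching step as in binarizer.py
    return tokenizer
```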
@@ -627,6 +627,7 @@ def forward(
pos_proj_weight = F.dropout(self.pos_proj_weight, self.weight_drop, training=self.training) \
if self.pos_proj_weight is not None else None

sub_pos_factor = None
if self.is_factorized:
if self.multiplicative_factorize:
# squeeze possible because only 1
@@ -718,11 +719,15 @@ def forward(

add_factor_in.add_(sub_add_factor_in)
add_factor_out.add_(sub_add_factor_out)
if self.relative: pos_factor.add_(sub_pos_factor)
if self.relative:
pos_factor.add_(sub_pos_factor)

in_proj_weight = in_proj_weight + add_factor_in
out_proj_weight = out_proj_weight + add_factor_out
if self.relative: pos_proj_weight = pos_proj_weight + sub_pos_factor
if self.relative:
if sub_pos_factor is None:
sub_pos_factor = pos_factor
pos_proj_weight = pos_proj_weight + sub_pos_factor

# Forward Pass starts here
if query.ndim == 3:
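The change above initializes `sub_pos_factor` and falls back to `pos_factor` when no sub-factor was accumulated. A minimal illustrative sketch of that pattern (simplified names, not the module's real attributes):

```
def adapt_weight(weight, add_factor=None, mul_factor=None):
    """Illustrative factorized adaptation: scale by a multiplicative factor and
    shift by an additive factor, skipping whichever was not accumulated."""
    if mul_factor is not None:
        weight = weight * mul_factor
    if add_factor is not None:
        weight = weight + add_factor
    return weight


def resolve_pos_factor(pos_factor, sub_pos_factor=None):
    """The commit's fix in miniature: when the relative-position sub-factor was
    never accumulated, reuse the main positional factor instead of an undefined value."""
    return pos_factor if sub_pos_factor is None else sub_pos_factor
```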
2 changes: 2 additions & 0 deletions onmt/models/speech_recognizer/wav2vec2.py
@@ -706,6 +706,8 @@ def forward(self, batch, zero_encoder=False, factorize=False, target_mask=None,
if hasattr(self.decoder, 'dec_pretrained_model') and self.decoder.dec_pretrained_model in ["bart"]:
batch_first_output = True

# print(src_lang, src_atb, tgt_lang, tgt_atb)

# during training mixture is always None
encoder_output = self.encoder(src, batch_first_output=batch_first_output,
lang=src_lang, atb=src_atb,
2 changes: 1 addition & 1 deletion onmt/modules/optimized/encdec_attention_func_bias.py
@@ -18,7 +18,7 @@
encdec_multihead_attn_bias_cuda = None

try:
import encdec_multihead_attn_bias_blaslt
import encdec_multihead_attn_bias_blaslt
except (ModuleNotFoundError, ImportError) as e:
encdec_multihead_attn_bias_blaslt = None

5 changes: 0 additions & 5 deletions onmt/requirement.txt

This file was deleted.

11 changes: 9 additions & 2 deletions onmt/train_utils/mp_trainer.py
@@ -1156,7 +1156,7 @@ def is_factorize_params(p_name):
data_iterator = generate_data_iterator(dataset, self.rank, self.world_size,
seed=self.opt.seed, num_workers=opt.num_workers,
epoch=0, buffer_size=opt.buffer_size, split_even=True,
dataset_ids=train_sets)
dataset_ids=opt.train_sets)

streaming = False
epoch_iterator = data_iterator.next_epoch_itr(not streaming, pin_memory=opt.pin_memory)
@@ -1396,9 +1396,16 @@ def maybe_no_sync():
print("Done...")

if self.rank == 0:
# Accumulate fisher info from previous iteration
if self.fisher_info is not None:
print("[INFO] Accumulating fisher information from a previous iteration...")
for n in precision_matrices:
if n in self.fisher_info:
precision_matrices[n] = self.fisher_info['fisher_diag'][n] + precision_matrices[n]

# normalizing by the number of sentences
# for n in precision_matrices:
# precision_matrices[n].div_(num_accumulated_sents)
# precision_matrices[n].div_(num_d_sents)

means = dict()
for n, p in parameters.items():
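A minimal sketch of the accumulation step introduced here, assuming `self.fisher_info` is a loaded checkpoint dictionary whose `'fisher_diag'` entry maps parameter names to squared-gradient tensors; the standalone functions are illustrative, not the trainer's actual API:

```
def estimate_fisher_diag(model, loss):
    """Diagonal Fisher estimate: squared gradients of the log-likelihood loss."""
    model.zero_grad()
    loss.backward()
    return {name: p.grad.detach() ** 2
            for name, p in model.named_parameters() if p.grad is not None}


def accumulate_fisher(precision_matrices, previous_fisher=None):
    """Add the Fisher diagonal saved from a previous iteration, per parameter."""
    if previous_fisher is not None:
        prev_diag = previous_fisher['fisher_diag']
        for name in precision_matrices:
            if name in prev_diag:
                precision_matrices[name] = prev_diag[name] + precision_matrices[name]
    return precision_matrices
```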
2 changes: 1 addition & 1 deletion options.py
@@ -13,7 +13,7 @@ def make_parser(parser):
parser.add_argument('-multi_dataset', action='store_true',
help='Reading multiple datasets (sharing the same dictionary)')
parser.add_argument('-train_sets', default=[], nargs='+', type=int,
help="IDs.")
help="Use CUDA on the listed devices.")
parser.add_argument('-valid_sets', default=[], nargs='+', type=int,
help="Use CUDA on the listed devices.")
parser.add_argument('-run_validation_before_training', action='store_true',
5 changes: 0 additions & 5 deletions requirement.txt

This file was deleted.

76 changes: 0 additions & 76 deletions test/grad_check.py

This file was deleted.
