Accumulate Fisher for multiple iterations
quanpn90 committed Jun 2, 2022
1 parent 989eb3c commit 14d5c6f
Showing 26 changed files with 31 additions and 819 deletions.
8 changes: 7 additions & 1 deletion .idea/.gitignore


16 changes: 0 additions & 16 deletions .idea/asrMT.iml

This file was deleted.

22 changes: 0 additions & 22 deletions .idea/deployment.xml

This file was deleted.

3 changes: 0 additions & 3 deletions .idea/dictionaries/pquan.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/encodings.xml

This file was deleted.

29 changes: 0 additions & 29 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/libraries/R_User_Library.xml

This file was deleted.

5 changes: 1 addition & 4 deletions .idea/misc.xml


2 changes: 1 addition & 1 deletion .idea/modules.xml


7 changes: 0 additions & 7 deletions .idea/other.xml

This file was deleted.

9 changes: 0 additions & 9 deletions .idea/sshConfigs.xml

This file was deleted.

1 change: 0 additions & 1 deletion .idea/vcs.xml


21 changes: 0 additions & 21 deletions .idea/webServers.xml

This file was deleted.

44 changes: 1 addition & 43 deletions README.md
@@ -1,44 +1,2 @@
# Introduction
# Transformer networks for Neural Machine Translation

# Requirements and Installation
* A [PyTorch installation](http://pytorch.org/)
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* Python version 3.7+

Currently NMTG requires PyTorch version >= 1.8.0; version 1.10.0 is recommended.
Please follow the instructions here: https://github.com/pytorch/pytorch#installation.


After PyTorch is installed, you can install the requirements with:
```
pip install -r requirements.txt
```

# C++/CUDA module installation

NMTG provides several modules implemented with custom PyTorch/C++/CUDA code to use the GPU more efficiently and reduce overhead, including:
* Self-attention and encoder-decoder attention with CUBLASLT
* Multi-layer perceptrons with CUBLASLT and fused dropout-ReLU/GELU/SiLU, computed in place whenever possible
* Highly optimized layer norm and multi-head attention from Apex (only available on sm80 GPUs, i.e. the NVIDIA A100)
* Fused log-softmax/cross-entropy loss from Apex, which saves memory for large output layers
* Fused in-place dropout-add for residual Transformers

Installation requires CUDA and nvcc matching the PyTorch version. It is possible to install CUDA from conda via:

```
conda install -c nvidia/label/cuda-11.5.2 cuda-toolkit
```

Then navigate to the extension modules and install nmtgminor-cuda via:

```
cd onmt/modules/extension
python setup.py install
```

Without this step, all modules fall back to their PyTorch implementations.
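
For illustration only, a minimal sketch of that fallback pattern (mirroring `onmt/modules/optimized/encdec_attention_func_bias.py`, which appears later in this diff); the helper function is hypothetical:

```
# Sketch of the extension fallback: if the compiled CUDA module is missing,
# the handle is set to None and the pure-PyTorch code path is used instead.
try:
    import encdec_multihead_attn_bias_blaslt
except (ModuleNotFoundError, ImportError):
    encdec_multihead_attn_bias_blaslt = None


def fast_attention_available() -> bool:
    """Hypothetical helper: report whether the compiled extension was found."""
    return encdec_multihead_attn_bias_blaslt is not None
```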

# IWSLT 2022 Speech Translation models


# Interspeech 2022 Multilingual ASR models
1 change: 1 addition & 0 deletions onmt/data/binarizer.py
@@ -348,6 +348,7 @@ def binarize_file(filename, vocab, tokenizer, bos_word=None, eos_word=None,
ext_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang=lang)
if ext_tokenizer.src_lang != lang:
raise RuntimeError("The language %s does not exist in mBART50." % lang)
torch.save(ext_tokenizer, "mbart-large-50.tokenizer.pt")
elif "m2m100" in external_tokenizer.lower():

print("[INFO] Using the external %s tokenizer..." % external_tokenizer)
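As a hedged sketch of how the tokenizer cached by the binarizer.py change above might be reloaded later (the helper name and caching logic are illustrative, not part of the repository):

```
import os

import torch
from transformers import MBart50TokenizerFast


def load_mbart50_tokenizer(lang, cache_path="mbart-large-50.tokenizer.pt"):
    """Reload the serialized tokenizer if present; otherwise download and cache it."""
    if os.path.exists(cache_path):
        tokenizer = torch.load(cache_path)
        tokenizer.src_lang = lang  # reuse the cached object for the requested language
    else:
        tokenizer = MBart50TokenizerFast.from_pretrained(
            "facebook/mbart-large-50", src_lang=lang)
        if tokenizer.src_lang != lang:
            raise RuntimeError("The language %s does not exist in mBART50." % lang)
        torch.save(tokenizer, cache_path)  # same caching step as in binarizer.py
    return tokenizer
```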
@@ -627,6 +627,7 @@ def forward(
pos_proj_weight = F.dropout(self.pos_proj_weight, self.weight_drop, training=self.training) \
if self.pos_proj_weight is not None else None

sub_pos_factor = None
if self.is_factorized:
if self.multiplicative_factorize:
# squeeze possible because only 1
@@ -718,11 +719,15 @@ def forward(

add_factor_in.add_(sub_add_factor_in)
add_factor_out.add_(sub_add_factor_out)
if self.relative: pos_factor.add_(sub_pos_factor)
if self.relative:
pos_factor.add_(sub_pos_factor)

in_proj_weight = in_proj_weight + add_factor_in
out_proj_weight = out_proj_weight + add_factor_out
if self.relative: pos_proj_weight = pos_proj_weight + sub_pos_factor
if self.relative:
if sub_pos_factor is None:
sub_pos_factor = pos_factor
pos_proj_weight = pos_proj_weight + sub_pos_factor

# Forward Pass starts here
if query.ndim == 3:
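The change above initializes `sub_pos_factor` and falls back to `pos_factor` when no sub-factor was accumulated. A minimal illustrative sketch of that pattern (simplified names, not the module's real attributes):

```
def adapt_weight(weight, add_factor=None, mul_factor=None):
    """Illustrative factorized adaptation: scale by a multiplicative factor and
    shift by an additive factor, skipping whichever was not accumulated."""
    if mul_factor is not None:
        weight = weight * mul_factor
    if add_factor is not None:
        weight = weight + add_factor
    return weight


def resolve_pos_factor(pos_factor, sub_pos_factor=None):
    """The commit's fix in miniature: when the relative-position sub-factor was
    never accumulated, reuse the main positional factor instead of an undefined value."""
    return pos_factor if sub_pos_factor is None else sub_pos_factor
```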
2 changes: 2 additions & 0 deletions onmt/models/speech_recognizer/wav2vec2.py
@@ -706,6 +706,8 @@ def forward(self, batch, zero_encoder=False, factorize=False, target_mask=None,
if hasattr(self.decoder, 'dec_pretrained_model') and self.decoder.dec_pretrained_model in ["bart"]:
batch_first_output = True

# print(src_lang, src_atb, tgt_lang, tgt_atb)

# during training mixture is always None
encoder_output = self.encoder(src, batch_first_output=batch_first_output,
lang=src_lang, atb=src_atb,
2 changes: 1 addition & 1 deletion onmt/modules/optimized/encdec_attention_func_bias.py
@@ -18,7 +18,7 @@
encdec_multihead_attn_bias_cuda = None

try:
import encdec_multihead_attn_bias_blaslt
import encdec_multihead_attn_bias_blaslt
except (ModuleNotFoundError, ImportError) as e:
encdec_multihead_attn_bias_blaslt = None

5 changes: 0 additions & 5 deletions onmt/requirement.txt

This file was deleted.

11 changes: 9 additions & 2 deletions onmt/train_utils/mp_trainer.py
@@ -1156,7 +1156,7 @@ def is_factorize_params(p_name):
data_iterator = generate_data_iterator(dataset, self.rank, self.world_size,
seed=self.opt.seed, num_workers=opt.num_workers,
epoch=0, buffer_size=opt.buffer_size, split_even=True,
dataset_ids=train_sets)
dataset_ids=opt.train_sets)

streaming = False
epoch_iterator = data_iterator.next_epoch_itr(not streaming, pin_memory=opt.pin_memory)
@@ -1396,9 +1396,16 @@ def maybe_no_sync():
print("Done...")

if self.rank == 0:
# Accumulate fisher info from previous iteration
if self.fisher_info is not None:
print("[INFO] Accumulating fisher information from a previous iteration...")
for n in precision_matrices:
if n in self.fisher_info:
precision_matrices[n] = self.fisher_info['fisher_diag'][n] + precision_matrices[n]

# normalizing by the number of sentences
# for n in precision_matrices:
# precision_matrices[n].div_(num_accumulated_sents)
# precision_matrices[n].div_(num_d_sents)

means = dict()
for n, p in parameters.items():
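A minimal sketch of the accumulation step introduced here, assuming `self.fisher_info` is a loaded checkpoint dictionary whose `'fisher_diag'` entry maps parameter names to squared-gradient tensors; the standalone functions are illustrative, not the trainer's actual API:

```
def estimate_fisher_diag(model, loss):
    """Diagonal Fisher estimate: squared gradients of the log-likelihood loss."""
    model.zero_grad()
    loss.backward()
    return {name: p.grad.detach() ** 2
            for name, p in model.named_parameters() if p.grad is not None}


def accumulate_fisher(precision_matrices, previous_fisher=None):
    """Add the Fisher diagonal saved from a previous iteration, per parameter."""
    if previous_fisher is not None:
        prev_diag = previous_fisher['fisher_diag']
        for name in precision_matrices:
            if name in prev_diag:
                precision_matrices[name] = prev_diag[name] + precision_matrices[name]
    return precision_matrices
```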
2 changes: 1 addition & 1 deletion options.py
@@ -13,7 +13,7 @@ def make_parser(parser):
parser.add_argument('-multi_dataset', action='store_true',
help='Reading multiple datasets (sharing the same dictionary)')
parser.add_argument('-train_sets', default=[], nargs='+', type=int,
help="IDs.")
help="Use CUDA on the listed devices.")
parser.add_argument('-valid_sets', default=[], nargs='+', type=int,
help="Use CUDA on the listed devices.")
parser.add_argument('-run_validation_before_training', action='store_true',
5 changes: 0 additions & 5 deletions requirement.txt

This file was deleted.

76 changes: 0 additions & 76 deletions test/grad_check.py

This file was deleted.
