diff --git a/OpenNMT/.gitignore b/OpenNMT/.gitignore
new file mode 100644
index 0000000000..1ff9f3fe17
--- /dev/null
+++ b/OpenNMT/.gitignore
@@ -0,0 +1,3 @@
+pred.txt
+multi-bleu.perl
+*.pt
diff --git a/OpenNMT/README.md b/OpenNMT/README.md
index 3d578b6c62..96ea7dcfa0 100644
--- a/OpenNMT/README.md
+++ b/OpenNMT/README.md
@@ -8,23 +8,69 @@ an open-source (MIT) neural machine translation system.
 
 ## Quickstart
 
-OpenNMT consists of three commands:
+Use of OpenNMT consists of five steps:
 
-0) Download the data.
+### 0) Download the data.
 
 ```wget https://s3.amazonaws.com/pytorch/examples/opennmt/data/onmt-data.tar && tar -xf onmt-data.tar```
 
-1) Preprocess the data.
+### 1) Preprocess the data.
 
 ```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo```
 
-2) Train the model.
+### 2) Train the model.
 
-```python train.py -data data/demo-train.pt -save_model model -gpus 1```
+```python train.py -data data/demo-train.pt -save_model demo_model -gpus 0```
 
-3) Translate sentences.
+### 3) Translate sentences.
 
-```python translate.py -gpu 1 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose```
+```python translate.py -gpu 0 -model demo_model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose -output demo_pred.txt```
+
+### 4) Evaluate.
+
+```bash
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl
+perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt
+```
+
+## WMT'16 Multimodal Translation: Multi30k (de-en)
+
+Data might not come as clean as the demo data. Here is a second example that uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the Multi30k data from the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html).
+
+### 0) Download the data.
+
+```bash
+mkdir -p data/multi30k
+wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz
+wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz
+wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/multi30k && rm mmt16_task1_test.tgz
+```
+
+### 1) Preprocess the data.
+
+```bash
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl
+sed -i "s/$RealBin\/..\/share\/nonbreaking_prefixes//" tokenizer.perl
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en
+for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done
+python preprocess.py -train_src data/multi30k/train.en.tok -train_tgt data/multi30k/train.de.tok -valid_src data/multi30k/val.en.tok -valid_tgt data/multi30k/val.de.tok -save_data data/multi30k
+```
+
+### 2) Train the model.
+
+```python train.py -data data/multi30k-train.pt -save_model multi30k_model -gpus 0```
+
+### 3) Translate sentences.
+
+```python translate.py -gpu 0 -model multi30k_model_e13_*.pt -src data/multi30k/test.en.tok -tgt data/multi30k/test.de.tok -replace_unk -verbose -output multi30k_pred.txt```
+
+### 4) Evaluate.
+
+```bash
+wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl
+perl multi-bleu.perl data/multi30k/test.de.tok < multi30k_pred.txt
+```
 
 ## Pretrained Models
 
diff --git a/OpenNMT/onmt/Dataset.py b/OpenNMT/onmt/Dataset.py
index 2651ae6458..7f9c54226e 100644
--- a/OpenNMT/onmt/Dataset.py
+++ b/OpenNMT/onmt/Dataset.py
@@ -1,10 +1,13 @@
+import math
+import random
+
 import onmt
 
 from torch.autograd import Variable
 
 
 class Dataset(object):
 
-    def __init__(self, srcData, tgtData, batchSize, cuda):
+    def __init__(self, srcData, tgtData, batchSize, cuda, volatile=False):
         self.src = srcData
         if tgtData:
             self.tgt = tgtData
@@ -14,7 +17,8 @@ def __init__(self, srcData, tgtData, batchSize, cuda):
         self.cuda = cuda
 
         self.batchSize = batchSize
-        self.numBatches = (len(self.src) + batchSize - 1) // batchSize
+        self.numBatches = math.ceil(len(self.src)/batchSize)
+        self.volatile = volatile
 
     def _batchify(self, data, align_right=False):
         max_length = max(x.size(0) for x in data)
@@ -28,7 +32,7 @@ def _batchify(self, data, align_right=False):
         if self.cuda:
             out = out.cuda()
 
-        v = Variable(out)
+        v = Variable(out, volatile=self.volatile)
         return v
 
     def __getitem__(self, index):
@@ -46,3 +50,9 @@ def __getitem__(self, index):
 
     def __len__(self):
         return self.numBatches
+
+
+    def shuffle(self):
+        zipped = list(zip(self.src, self.tgt))
+        random.shuffle(zipped)
+        self.src, self.tgt = [x[0] for x in zipped], [x[1] for x in zipped]
diff --git a/OpenNMT/onmt/Dict.py b/OpenNMT/onmt/Dict.py
index 59a4a45b67..cd60d37984 100644
--- a/OpenNMT/onmt/Dict.py
+++ b/OpenNMT/onmt/Dict.py
@@ -2,10 +2,11 @@
 
 class Dict(object):
-    def __init__(self, data=None):
+    def __init__(self, data=None, lower=False):
         self.idxToLabel = {}
         self.labelToIdx = {}
         self.frequencies = {}
+        self.lower = lower
 
         # Special entries will not be pruned.
         self.special = []
@@ -37,6 +38,7 @@ def writeFile(self, filename):
         file.close()
 
     def lookup(self, key, default=None):
+        key = key.lower() if self.lower else key
         try:
             return self.labelToIdx[key]
         except KeyError:
@@ -60,6 +62,7 @@ def addSpecials(self, labels):
 
     # Add `label` in the dictionary. Use `idx` as its index if given.
     def add(self, label, idx=None):
+        label = label.lower() if self.lower else label
         if idx is not None:
             self.idxToLabel[idx] = label
             self.labelToIdx[label] = idx
@@ -89,6 +92,7 @@ def prune(self, size):
         _, idx = torch.sort(freq, 0, True)
 
         newDict = Dict()
+        newDict.lower = self.lower
 
         # Add special entries in all cases.
        for i in self.special:
diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py
index 7fb48cdf86..7bde7090b1 100644
--- a/OpenNMT/onmt/Models.py
+++ b/OpenNMT/onmt/Models.py
@@ -10,28 +10,26 @@ def __init__(self, opt, dicts):
         self.num_directions = 2 if opt.brnn else 1
         assert opt.rnn_size % self.num_directions == 0
         self.hidden_size = opt.rnn_size // self.num_directions
-        inputSize = opt.word_vec_size
+        input_size = opt.word_vec_size
 
         super(Encoder, self).__init__()
         self.word_lut = nn.Embedding(dicts.size(),
                                      opt.word_vec_size,
                                      padding_idx=onmt.Constants.PAD)
-        self.rnn = nn.LSTM(inputSize, self.hidden_size,
+        self.rnn = nn.LSTM(input_size, self.hidden_size,
                            num_layers=opt.layers,
                            dropout=opt.dropout,
                            bidirectional=opt.brnn)
 
-        # self.rnn.bias_ih_l0.data.div_(2)
-        # self.rnn.bias_hh_l0.data.copy_(self.rnn.bias_ih_l0.data)
-
         if opt.pre_word_vecs_enc is not None:
             pretrained = torch.load(opt.pre_word_vecs_enc)
             self.word_lut.weight.copy_(pretrained)
 
     def forward(self, input, hidden=None):
-        batch_size = input.size(0) # batch first for multi-gpu compatibility
-        emb = self.word_lut(input).transpose(0, 1)
+        emb = self.word_lut(input)
+
         if hidden is None:
+            batch_size = emb.size(1)
             h_size = (self.layers * self.num_directions, batch_size, self.hidden_size)
             h_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False)
             c_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False)
@@ -46,17 +44,16 @@ def __init__(self, num_layers, input_size, rnn_size, dropout):
         super(StackedLSTM, self).__init__()
         self.dropout = nn.Dropout(dropout)
         self.num_layers = num_layers
+        self.layers = nn.ModuleList()
 
         for i in range(num_layers):
-            layer = nn.LSTMCell(input_size, rnn_size)
-            self.add_module('layer_%d' % i, layer)
+            self.layers.append(nn.LSTMCell(input_size, rnn_size))
             input_size = rnn_size
 
     def forward(self, input, hidden):
         h_0, c_0 = hidden
         h_1, c_1 = [], []
-        for i in range(self.num_layers):
-            layer = getattr(self, 'layer_%d' % i)
+        for i, layer in enumerate(self.layers):
             h_1_i, c_1_i = layer(input, (h_0[i], c_0[i]))
             input = h_1_i
             if i != self.num_layers:
@@ -87,9 +84,6 @@ def __init__(self, opt, dicts):
         self.attn = onmt.modules.GlobalAttention(opt.rnn_size)
         self.dropout = nn.Dropout(opt.dropout)
 
-        # self.rnn.bias_ih.data.div_(2)
-        # self.rnn.bias_hh.data.copy_(self.rnn.bias_ih.data)
-
         self.hidden_size = opt.rnn_size
 
         if opt.pre_word_vecs_enc is not None:
@@ -98,39 +92,33 @@
 
     def forward(self, input, hidden, context, init_output):
-        emb = self.word_lut(input).transpose(0, 1)
-
-        batch_size = input.size(0)
-
-        h_size = (batch_size, self.hidden_size)
-        output = Variable(emb.data.new(*h_size).zero_(), requires_grad=False)
+        emb = self.word_lut(input)
 
         # n.b. you can increase performance if you compute W_ih * x for all
         # iterations in parallel, but that's only possible if
         # self.input_feed=False
         outputs = []
         output = init_output
-        for i, emb_t in enumerate(emb.chunk(emb.size(0), dim=0)):
+        for emb_t in emb.split(1):
             emb_t = emb_t.squeeze(0)
             if self.input_feed:
                 emb_t = torch.cat([emb_t, output], 1)
 
-            output, h = self.rnn(emb_t, hidden)
+            output, hidden = self.rnn(emb_t, hidden)
             output, attn = self.attn(output, context.t())
             output = self.dropout(output)
             outputs += [output]
 
         outputs = torch.stack(outputs)
-        return outputs.transpose(0, 1), h, attn
+        return outputs, hidden, attn
 
 
 class NMTModel(nn.Module):
 
-    def __init__(self, encoder, decoder, generator):
+    def __init__(self, encoder, decoder):
         super(NMTModel, self).__init__()
         self.encoder = encoder
         self.decoder = decoder
-        self.generator = generator
         self.generate = False
 
     def set_generate(self, enabled):
@@ -153,7 +141,7 @@ def _fix_enc_hidden(self, h):
 
     def forward(self, input):
         src = input[0]
-        tgt = input[1][:, :-1]  # exclude last target from inputs
+        tgt = input[1][:-1]  # exclude last target from inputs
         enc_hidden, context = self.encoder(src)
         init_output = self.make_init_decoder_output(context)
@@ -161,7 +149,7 @@
                       self._fix_enc_hidden(enc_hidden[1]))
         out, dec_hidden, _attn = self.decoder(tgt, enc_hidden, context, init_output)
-        if self.generate:
+        if hasattr(self, 'generator') and self.generate:
             out = self.generator(out)
 
         return out
diff --git a/OpenNMT/onmt/Optim.py b/OpenNMT/onmt/Optim.py
index 0d0e6e3c72..870bf9867e 100644
--- a/OpenNMT/onmt/Optim.py
+++ b/OpenNMT/onmt/Optim.py
@@ -1,5 +1,7 @@
 import math
 import torch.optim as optim
+import torch.nn as nn
+from torch.nn.utils import clip_grad_norm
 
 
 class Optim(object):
@@ -29,19 +31,9 @@ def __init__(self, params, method, lr, max_grad_norm, lr_decay=1, start_decay_at
 
     def step(self):
         # Compute gradients norm.
-        grad_norm = 0
-        for param in self.params:
-            grad_norm += math.pow(param.grad.data.norm(), 2)
-
-        grad_norm = math.sqrt(grad_norm)
-        shrinkage = self.max_grad_norm / grad_norm
-
-        for param in self.params:
-            if shrinkage < 1:
-                param.grad.data.mul_(shrinkage)
-
+        if self.max_grad_norm:
+            clip_grad_norm(self.params, self.max_grad_norm)
         self.optimizer.step()
-        return grad_norm
 
     # decay learning rate if val perf does not improve or we hit the start_decay_at limit
     def updateLearningRate(self, ppl, epoch):
diff --git a/OpenNMT/onmt/Translator.py b/OpenNMT/onmt/Translator.py
index 9640528c10..80405b9117 100644
--- a/OpenNMT/onmt/Translator.py
+++ b/OpenNMT/onmt/Translator.py
@@ -1,4 +1,5 @@
 import onmt
+import torch.nn as nn
 import torch
 from torch.autograd import Variable
@@ -9,17 +10,34 @@ def __init__(self, opt):
         self.tt = torch.cuda if opt.cuda else torch
 
         checkpoint = torch.load(opt.model)
-        self.model = checkpoint['model']
-        self.model.eval()
+        model_opt = checkpoint['opt']
+        self.src_dict = checkpoint['dicts']['src']
+        self.tgt_dict = checkpoint['dicts']['tgt']
+
+        encoder = onmt.Models.Encoder(model_opt, self.src_dict)
+        decoder = onmt.Models.Decoder(model_opt, self.tgt_dict)
+        model = onmt.Models.NMTModel(encoder, decoder)
+
+        generator = nn.Sequential(
+            nn.Linear(model_opt.rnn_size, self.tgt_dict.size()),
+            nn.LogSoftmax())
+
+        model.load_state_dict(checkpoint['model'])
+        generator.load_state_dict(checkpoint['generator'])
 
         if opt.cuda:
-            self.model.cuda()
+            model.cuda()
+            generator.cuda()
         else:
-            self.model.cpu()
+            model.cpu()
+            generator.cpu()
+
+        model.generator = generator
+
+        self.model = model
+        self.model.eval()
 
-        self.src_dict = checkpoint['dicts']['src']
-        self.tgt_dict = checkpoint['dicts']['tgt']
 
     def buildData(self, srcBatch, goldBatch):
         srcData = [self.src_dict.convertToIdx(b,
@@ -48,32 +66,38 @@ def buildTargetTokens(self, pred, src, attn):
 
     def translateBatch(self, batch):
         srcBatch, tgtBatch = batch
-        batchSize = srcBatch.size(0)
+        batchSize = srcBatch.size(1)
         beamSize = self.opt.beam_size
 
         # (1) run the encoder on the src
-        # have to execute the encoder manually to deal with padding
-        encStates = None
-        context = []
-        for srcBatch_t in srcBatch.chunk(srcBatch.size(1), dim=1):
-            encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates)
-            batchPadIdx = srcBatch_t.data.squeeze(1).eq(onmt.Constants.PAD).nonzero()
-            if batchPadIdx.nelement() > 0:
-                batchPadIdx = batchPadIdx.squeeze(1)
-                encStates[0].data.index_fill_(1, batchPadIdx, 0)
-                encStates[1].data.index_fill_(1, batchPadIdx, 0)
-            context += [context_t]
+        encStates, context = None, None
 
-        encStates = (self.model._fix_enc_hidden(encStates[0]),
-                     self.model._fix_enc_hidden(encStates[1]))
+        if self.model.encoder.num_directions == 2:
+            # bidirectional encoder is negatively impacted by padding
+            # run with batch size 1 for improved translations
+            # This will be resolved when variable length LSTMs are used instead
+            encStates, context = self.model.encoder(srcBatch, hidden=encStates)
+        else:
+            # have to execute the encoder manually to deal with padding
+            context = []
+            for srcBatch_t in srcBatch.split(1):
+                encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates)
+                batchPadIdx = srcBatch_t.data.squeeze(0).eq(onmt.Constants.PAD).nonzero()
+                if batchPadIdx.nelement() > 0:
+                    batchPadIdx = batchPadIdx.squeeze(1)
+                    encStates[0].data.index_fill_(1, batchPadIdx, 0)
+                    encStates[1].data.index_fill_(1, batchPadIdx, 0)
+                context += [context_t]
+            context = torch.cat(context)
 
-        context = torch.cat(context)
         rnnSize = context.size(2)
+
+        encStates = (self.model._fix_enc_hidden(encStates[0]),
+                     self.model._fix_enc_hidden(encStates[1]))
 
         # This mask is applied to the attention model inside the decoder
         # so that the attention ignores source padding
-        padMask = srcBatch.data.eq(onmt.Constants.PAD)
+        padMask = srcBatch.data.eq(onmt.Constants.PAD).t()
         def applyContextMask(m):
             if isinstance(m, onmt.modules.GlobalAttention):
                 m.applyMask(padMask)
@@ -88,8 +112,8 @@ def applyContextMask(m):
 
             initOutput = self.model.make_init_decoder_output(context)
             decOut, decStates, attn = self.model.decoder(
-                tgtBatch[:, :-1], decStates, context, initOutput)
-            for dec_t, tgt_t in zip(decOut.transpose(0, 1), tgtBatch.transpose(0, 1)[1:].data):
+                tgtBatch[:-1], decStates, context, initOutput)
+            for dec_t, tgt_t in zip(decOut, tgtBatch[1:].data):
                gen_t = self.model.generator.forward(dec_t)
                 tgt_t = tgt_t.unsqueeze(1)
                 scores = gen_t.data.gather(1, tgt_t)
@@ -99,15 +123,15 @@ def applyContextMask(m):
 
         # (3) run the decoder to generate sentences, using beam search
 
         # Expand tensors for each beam.
-        context = Variable(context.data.repeat(1, beamSize, 1))
-        decStates = (Variable(encStates[0].data.repeat(1, beamSize, 1)),
-                     Variable(encStates[1].data.repeat(1, beamSize, 1)))
+        context = Variable(context.data.repeat(1, beamSize, 1), volatile=True)
+        decStates = (Variable(encStates[0].data.repeat(1, beamSize, 1), volatile=True),
+                     Variable(encStates[1].data.repeat(1, beamSize, 1), volatile=True))
 
         beam = [onmt.Beam(beamSize, self.opt.cuda) for k in range(batchSize)]
 
         decOut = self.model.make_init_decoder_output(context)
 
-        padMask = srcBatch.data.eq(onmt.Constants.PAD).unsqueeze(0).repeat(beamSize, 1, 1)
+        padMask = srcBatch.data.eq(onmt.Constants.PAD).t().unsqueeze(0).repeat(beamSize, 1, 1)
 
         batchIdx = list(range(batchSize))
         remainingSents = batchSize
@@ -120,9 +144,9 @@
                                 if not b.done]).t().contiguous().view(1, -1)
 
             decOut, decStates, attn = self.model.decoder(
-                Variable(input).transpose(0, 1), decStates, context, decOut)
+                Variable(input, volatile=True), decStates, context, decOut)
             # decOut: 1 x (beam*batch) x numWords
-            decOut = decOut.transpose(0, 1).squeeze(0)
+            decOut = decOut.squeeze(0)
             out = self.model.generator.forward(decOut)
 
             # batch x beam x numWords
@@ -159,7 +183,7 @@ def updateActive(t):
                 newSize = list(t.size())
                 newSize[-2] = newSize[-2] * len(activeIdx) // remainingSents
                 return Variable(view.index_select(1, activeIdx) \
-                                    .view(*newSize))
+                                    .view(*newSize), volatile=True)
 
             decStates = (updateActive(decStates[0]), updateActive(decStates[1]))
             decOut = updateActive(decOut)
@@ -177,7 +201,7 @@ def updateActive(t):
             scores, ks = beam[b].sortBest()
 
             allScores += [scores[:n_best]]
-            valid_attn = srcBatch.transpose(0, 1).data[:, b].ne(onmt.Constants.PAD).nonzero().squeeze(1)
+            valid_attn = srcBatch.data[:, b].ne(onmt.Constants.PAD).nonzero().squeeze(1)
             hyps, attn = zip(*[beam[b].getHyp(k) for k in ks[:n_best]])
             attn = [a.index_select(1, valid_attn) for a in attn]
             allHyp += [hyps]
@@ -189,14 +213,13 @@ def translate(self, srcBatch, goldBatch):
         # (1) convert words to indexes
         dataset = self.buildData(srcBatch, goldBatch)
         batch = dataset[0]
-        batch = [x.transpose(0, 1) for x in batch]
 
         # (2) translate
         pred, predScore, attn, goldScore = self.translateBatch(batch)
 
         # (3) convert indexes to words
         predBatch = []
-        for b in range(batch[0].size(0)):
+        for b in range(batch[0].size(1)):
             predBatch.append(
                 [self.buildTargetTokens(pred[b][n], srcBatch[b], attn[b][n])
                  for n in range(self.opt.n_best)]
diff --git a/OpenNMT/onmt/modules/GlobalAttention.py b/OpenNMT/onmt/modules/GlobalAttention.py
index 7980cb0746..ac2f315641 100644
--- a/OpenNMT/onmt/modules/GlobalAttention.py
+++ b/OpenNMT/onmt/modules/GlobalAttention.py
@@ -24,8 +24,6 @@
 import torch.nn as nn
 import math
 
-_INF = float('inf')
-
 class GlobalAttention(nn.Module):
     def __init__(self, dim):
         super(GlobalAttention, self).__init__()
@@ -48,7 +46,7 @@ def forward(self, input, context):
         # Get attention
         attn = torch.bmm(context, targetT).squeeze(2)  # batch x sourceL
         if self.mask is not None:
-            attn.data.masked_fill_(self.mask, -_INF)
+            attn.data.masked_fill_(self.mask, -float('inf'))
         attn = self.sm(attn)
         attn3 = attn.view(attn.size(0), 1, attn.size(1))  # batch x 1 x sourceL
diff --git a/OpenNMT/preprocess.py b/OpenNMT/preprocess.py
index ea7f57b814..8987e20174 100644
--- a/OpenNMT/preprocess.py
+++ b/OpenNMT/preprocess.py
@@ -40,6 +40,8 @@
 parser.add_argument('-seed', type=int, default=3435,
                     help="Random seed")
 
+parser.add_argument('-lower', action='store_true', help='lowercase data')
+
 parser.add_argument('-report_every', type=int, default=100000,
                     help="Report status every this many sentences")
 
@@ -48,7 +50,7 @@ def makeVocabulary(filename, size):
     vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
-                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD])
+                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=opt.lower)
 
     with open(filename) as f:
         for sent in f.readlines():
diff --git a/OpenNMT/train.py b/OpenNMT/train.py
index 585ab4115b..8aea7bac04 100644
--- a/OpenNMT/train.py
+++ b/OpenNMT/train.py
@@ -17,7 +17,10 @@
                     help="""Model filename (the model will be saved as
                     <save_model>_epochN_PPL.pt where PPL is the validation perplexity""")
-parser.add_argument('-train_from',
+parser.add_argument('-train_from_state_dict', default='', type=str,
+                    help="""If training from a checkpoint then this is the
+                    path to the pretrained model's state_dict.""")
+parser.add_argument('-train_from', default='', type=str,
                     help="""If training from a checkpoint then this is the
                     path to the pretrained model.""")
 
@@ -61,22 +64,34 @@
 parser.add_argument('-learning_rate', type=float, default=1.0,
                     help="""Starting learning rate. If adagrad/adadelta/adam is
                     used, then this is the global learning rate. Recommended
-                    settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.1""")
+                    settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.001""")
 parser.add_argument('-max_grad_norm', type=float, default=5,
                     help="""If the norm of the gradient vector exceeds this,
                     renormalize it to have the norm equal to max_grad_norm""")
 parser.add_argument('-dropout', type=float, default=0.3,
                     help='Dropout probability; applied between LSTM stacks.')
-parser.add_argument('-learning_rate_decay', type=float, default=0.5,
-                    help="""Decay learning rate by this much if (i) perplexity
-                    does not decrease on the validation set or (ii) epoch has
-                    gone past the start_decay_at_limit""")
-parser.add_argument('-start_decay_at', default=8,
-                    help="Start decay after this epoch")
 parser.add_argument('-curriculum', action="store_true",
                     help="""For this many epochs, order the minibatches based
                     on source sequence length. Sometimes setting this to 1 will
                     increase convergence speed.""")
+parser.add_argument('-extra_shuffle', action="store_true",
+                    help="""By default only shuffle mini-batch order; when true,
+                    shuffle and re-assign mini-batches""")
+
+#learning rate
+parser.add_argument('-fix_learning_rate', action='store_false', dest='update_learning_rate',
+                    help="Do not decay learning rate (may be desirable for some optimizers, e.g. Adam)")
+parser.add_argument('-learning_rate_decay', type=float, default=0.5,
+                    help="""If update_learning_rate, decay learning rate by
+                    this much if (i) perplexity does not decrease on the
+                    validation set or (ii) epoch has gone past
+                    start_decay_at""")
+parser.add_argument('-start_decay_at', type=int, default=8,
+                    help="""Start decaying every epoch after and including this
+                    epoch""")
+
+#pretrained word vectors
+
 parser.add_argument('-pre_word_vecs_enc',
                     help="""If a valid path is specified, then this will load
                     pretrained word embeddings on the encoder side.
@@ -88,7 +103,7 @@
 # GPU
 parser.add_argument('-gpus', default=[], nargs='+', type=int,
-                    help="Use CUDA")
+                    help="Use CUDA on the listed devices.")
 parser.add_argument('-log_interval', type=int, default=50,
                     help="Print stats at this interval.")
@@ -96,69 +111,70 @@
 #                     help="Seed for random initialization")
 
 opt = parser.parse_args()
-opt.cuda = len(opt.gpus)
 
 print(opt)
 
-if torch.cuda.is_available() and not opt.cuda:
-    print("WARNING: You have a CUDA device, so you should probably run with -gpus 1")
+if torch.cuda.is_available() and not opt.gpus:
+    print("WARNING: You have a CUDA device, so you should probably run with -gpus 0")
 
-if opt.cuda:
+if opt.gpus:
     cuda.set_device(opt.gpus[0])
 
 def NMTCriterion(vocabSize):
     weight = torch.ones(vocabSize)
     weight[onmt.Constants.PAD] = 0
     crit = nn.NLLLoss(weight, size_average=False)
-    if opt.cuda:
+    if opt.gpus:
         crit.cuda()
     return crit
 
 def memoryEfficientLoss(outputs, targets, generator, crit, eval=False):
     # compute generations one piece at a time
-    loss = 0
-    outputs = Variable(outputs.data, requires_grad=(not eval), volatile=eval).contiguous()
+    num_correct, loss = 0, 0
+    outputs = Variable(outputs.data, requires_grad=(not eval), volatile=eval)
 
     batch_size = outputs.size(1)
     outputs_split = torch.split(outputs, opt.max_generator_batches)
-    targets_split = torch.split(targets.contiguous(), opt.max_generator_batches)
-    for out_t, targ_t in zip(outputs_split, targets_split):
+    targets_split = torch.split(targets, opt.max_generator_batches)
+    for i, (out_t, targ_t) in enumerate(zip(outputs_split, targets_split)):
         out_t = out_t.view(-1, out_t.size(2))
-        pred_t = generator(out_t)
-        loss_t = crit(pred_t, targ_t.view(-1))
+        scores_t = generator(out_t)
+        loss_t = crit(scores_t, targ_t.view(-1))
+        pred_t = scores_t.max(1)[1]
+        num_correct_t = pred_t.data.eq(targ_t.data).masked_select(targ_t.ne(onmt.Constants.PAD).data).sum()
+        num_correct += num_correct_t
         loss += loss_t.data[0]
         if not eval:
             loss_t.div(batch_size).backward()
 
     grad_output = None if outputs.grad is None else outputs.grad.data
-    return loss, grad_output
+    return loss, grad_output, num_correct
 
 def eval(model, criterion, data):
     total_loss = 0
     total_words = 0
+    total_num_correct = 0
 
     model.eval()
     for i in range(len(data)):
-        batch = [x.transpose(0, 1) for x in data[i]] # must be batch first for gather/scatter in DataParallel
-        outputs = model(batch)  # FIXME volatile
-        targets = batch[1][:, 1:]  # exclude <s> from targets
-        loss, _ = memoryEfficientLoss(
+        batch = data[i]
+        outputs = model(batch)
+        targets = batch[1][1:]  # exclude <s> from targets
+        loss, _, num_correct = memoryEfficientLoss(
                 outputs, targets, model.generator, criterion, eval=True)
         total_loss += loss
+        total_num_correct += num_correct
         total_words += targets.data.ne(onmt.Constants.PAD).sum()
 
     model.train()
-    return total_loss / total_words
+    return total_loss / total_words, total_num_correct / total_words
 
 
 def trainModel(model, trainData, validData, dataset, optim):
     print(model)
     model.train()
 
-    if optim.last_ppl is None:
-        for p in model.parameters():
-            p.data.uniform_(-opt.param_init, opt.param_init)
 
     # define criterion of each GPU
     criterion = NMTCriterion(dataset['dicts']['tgt'].size())
 
@@ -166,73 +182,87 @@
     start_time = time.time()
 
     def trainEpoch(epoch):
 
+        if opt.extra_shuffle and epoch > opt.curriculum:
+            trainData.shuffle()
+
         # shuffle mini batch order
         batchOrder = torch.randperm(len(trainData))
 
-        total_loss, report_loss = 0, 0
-        total_words, report_words = 0, 0
+        total_loss, total_words, total_num_correct = 0, 0, 0
+        report_loss, report_tgt_words, report_src_words, report_num_correct = 0, 0, 0, 0
         start = time.time()
         for i in range(len(trainData)):
-            batchIdx = batchOrder[i] if epoch >= opt.curriculum else i
+            batchIdx = batchOrder[i] if epoch > opt.curriculum else i
             batch = trainData[batchIdx]
-            batch = [x.transpose(0, 1) for x in batch] # must be batch first for gather/scatter in DataParallel
 
             model.zero_grad()
             outputs = model(batch)
-            targets = batch[1][:, 1:]  # exclude <s> from targets
-            loss, gradOutput = memoryEfficientLoss(
+            targets = batch[1][1:]  # exclude <s> from targets
+            loss, gradOutput, num_correct = memoryEfficientLoss(
                     outputs, targets, model.generator, criterion)
 
             outputs.backward(gradOutput)
 
             # update the parameters
-            grad_norm = optim.step()
+            optim.step()
 
+            num_words = targets.data.ne(onmt.Constants.PAD).sum()
             report_loss += loss
+            report_num_correct += num_correct
+            report_tgt_words += num_words
+            report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum()
             total_loss += loss
-            num_words = targets.data.ne(onmt.Constants.PAD).sum()
+            total_num_correct += num_correct
             total_words += num_words
-            report_words += num_words
-            if i % opt.log_interval == 0 and i > 0:
-                print("Epoch %2d, %5d/%5d batches; perplexity: %6.2f; %3.0f tokens/s; %6.0f s elapsed" %
-                      (epoch, i, len(trainData),
-                      math.exp(report_loss / report_words),
-                      report_words/(time.time()-start),
+            if i % opt.log_interval == -1 % opt.log_interval:
+                print("Epoch %2d, %5d/%5d; acc: %6.2f; ppl: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" %
+                      (epoch, i+1, len(trainData),
+                      report_num_correct / report_tgt_words * 100,
+                      math.exp(report_loss / report_tgt_words),
+                      report_src_words/(time.time()-start),
+                      report_tgt_words/(time.time()-start),
                       time.time()-start_time))
 
-                report_loss = report_words = 0
+                report_loss = report_tgt_words = report_src_words = report_num_correct = 0
                 start = time.time()
 
-        return total_loss / total_words
+        return total_loss / total_words, total_num_correct / total_words
 
     for epoch in range(opt.start_epoch, opt.epochs + 1):
         print('')
 
        # (1) train for one epoch on the training set
-        train_loss = trainEpoch(epoch)
-        print('Train perplexity: %g' % math.exp(min(train_loss, 100)))
+        train_loss, train_acc = trainEpoch(epoch)
+        train_ppl = math.exp(min(train_loss, 100))
+        print('Train perplexity: %g' % train_ppl)
+        print('Train accuracy: %g' % (train_acc*100))
 
        # (2) evaluate on the validation set
-        valid_loss = eval(model, criterion, validData)
+        valid_loss, valid_acc = eval(model, criterion, validData)
         valid_ppl = math.exp(min(valid_loss, 100))
         print('Validation perplexity: %g' % valid_ppl)
+        print('Validation accuracy: %g' % (valid_acc*100))
 
        # (3) maybe update the learning rate
-        if opt.optim == 'sgd':
+        if opt.update_learning_rate:
             optim.updateLearningRate(valid_loss, epoch)
 
+        model_state_dict = model.module.state_dict() if len(opt.gpus) > 1 else model.state_dict()
+        model_state_dict = {k: v for k, v in model_state_dict.items() if 'generator' not in k}
+        generator_state_dict = model.generator.module.state_dict() if len(opt.gpus) > 1 else model.generator.state_dict()
        # (4) drop a checkpoint
         checkpoint = {
-            'model': model,
+            'model': model_state_dict,
+            'generator': generator_state_dict,
             'dicts': dataset['dicts'],
             'opt': opt,
             'epoch': epoch,
-            'optim': optim,
+            'optimizer': optim.optimizer.state_dict(),
+            'last_ppl': optim.last_ppl,
         }
         torch.save(checkpoint,
-                   '%s_e%d_%.2f.pt' % (opt.save_model, epoch, valid_ppl))
-
+                   '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (opt.save_model, 100*valid_acc, valid_ppl, epoch))
 
 def main():
@@ -240,10 +270,17 @@ def main():
 
     dataset = torch.load(opt.data)
 
+    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
+    if dict_checkpoint:
+        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
+        checkpoint = torch.load(dict_checkpoint)
+        dataset['dicts'] = checkpoint['dicts']
+
     trainData = onmt.Dataset(dataset['train']['src'],
-                             dataset['train']['tgt'], opt.batch_size, opt.cuda)
+                             dataset['train']['tgt'], opt.batch_size, opt.gpus)
     validData = onmt.Dataset(dataset['valid']['src'],
-                             dataset['valid']['tgt'], opt.batch_size, opt.cuda)
+                             dataset['valid']['tgt'], opt.batch_size, opt.gpus,
+                             volatile=True)
 
     dicts = dataset['dicts']
     print(' * vocabulary size. source = %d; target = %d' %
@@ -254,42 +291,57 @@ def main():
 
     print('Building model...')
 
-    if opt.train_from is None:
-        encoder = onmt.Models.Encoder(opt, dicts['src'])
-        decoder = onmt.Models.Decoder(opt, dicts['tgt'])
-        generator = nn.Sequential(
-            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
-            nn.LogSoftmax())
-        if opt.cuda > 1:
-            generator = nn.DataParallel(generator, device_ids=opt.gpus)
-        model = onmt.Models.NMTModel(encoder, decoder, generator)
-        if opt.cuda > 1:
-            model = nn.DataParallel(model, device_ids=opt.gpus)
-        if opt.cuda:
-            model.cuda()
-        else:
-            model.cpu()
-
-        model.generator = generator
+    encoder = onmt.Models.Encoder(opt, dicts['src'])
+    decoder = onmt.Models.Decoder(opt, dicts['tgt'])
+
+    generator = nn.Sequential(
+        nn.Linear(opt.rnn_size, dicts['tgt'].size()),
+        nn.LogSoftmax())
+    model = onmt.Models.NMTModel(encoder, decoder)
+
+    if opt.train_from:
+        print('Loading model from checkpoint at %s' % opt.train_from)
+        chk_model = checkpoint['model']
+        generator_state_dict = chk_model.generator.state_dict()
+        model_state_dict = {k: v for k, v in chk_model.state_dict().items() if 'generator' not in k}
+        model.load_state_dict(model_state_dict)
+        generator.load_state_dict(generator_state_dict)
+        opt.start_epoch = checkpoint['epoch'] + 1
+
+    if opt.train_from_state_dict:
+        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
+        model.load_state_dict(checkpoint['model'])
+        opt.start_epoch = checkpoint['epoch'] + 1
+
+    if len(opt.gpus) >= 1:
+        model.cuda()
+        generator.cuda()
+    else:
+        model.cpu()
+        generator.cpu()
+
+    if len(opt.gpus) > 1:
+        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
+        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
+
+    model.generator = generator
+
+    if not opt.train_from_state_dict and not opt.train_from:
         for p in model.parameters():
             p.data.uniform_(-opt.param_init, opt.param_init)
 
-        optim = onmt.Optim(
-            model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm,
-            lr_decay=opt.learning_rate_decay,
-            start_decay_at=opt.start_decay_at
-        )
-    else:
-        print('Loading from checkpoint at %s' % opt.train_from)
-        checkpoint = torch.load(opt.train_from)
-        model = checkpoint['model']
-        if opt.cuda:
-            model.cuda()
-        else:
-            model.cpu()
-        optim = checkpoint['optim']
-        opt.start_epoch = checkpoint['epoch'] + 1
+    optim = onmt.Optim(
+        model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm,
+        lr_decay=opt.learning_rate_decay,
+        start_decay_at=opt.start_decay_at
+    )
+
+    if opt.train_from:
+        optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())
+
+    if opt.train_from_state_dict:
+        optim.optimizer.load_state_dict(checkpoint['optimizer'])
 
     nParams = sum([p.nelement() for p in model.parameters()])
     print('* number of parameters: %d' % nParams)
diff --git a/OpenNMT/translate.py b/OpenNMT/translate.py
index dd7f1d3f98..4a93bb1bcf 100644
--- a/OpenNMT/translate.py
+++ b/OpenNMT/translate.py
@@ -50,7 +50,8 @@ def reportScore(name, scoreTotal, wordsTotal):
 def main():
     opt = parser.parse_args()
     opt.cuda = opt.gpu > -1
-    torch.cuda.set_device(opt.gpu)
+    if opt.cuda:
+        torch.cuda.set_device(opt.gpu)
 
     translator = onmt.Translator(opt)
@@ -77,7 +78,7 @@ def main():
         predBatch, predScore, goldScore = translator.translate(srcBatch, tgtBatch)
 
         predScoreTotal += sum(score[0] for score in predScore)
-        predWordsTotal += sum(len(x) for x in predBatch)
+        predWordsTotal += sum(len(x[0]) for x in predBatch)
         if tgtF is not None:
             goldScoreTotal += sum(goldScore)
             goldWordsTotal += sum(len(x) for x in tgtBatch)
@@ -87,18 +88,24 @@ def main():
             outF.write(" ".join(predBatch[b][0]) + '\n')
 
             if opt.verbose:
-                print('SENT %d: %s' % (count, " ".join(srcBatch[b])))
+                srcSent = ' '.join(srcBatch[b])
+                if translator.tgt_dict.lower:
+                    srcSent = srcSent.lower()
+                print('SENT %d: %s' % (count, srcSent))
                 print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                 print("PRED SCORE: %.4f" % predScore[b][0])
 
                 if tgtF is not None:
-                    print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b])))
+                    tgtSent = ' '.join(tgtBatch[b])
+                    if translator.tgt_dict.lower:
+                        tgtSent = tgtSent.lower()
+                    print('GOLD %d: %s ' % (count, tgtSent))
                     print("GOLD SCORE: %.4f" % goldScore[b])
 
                 if opt.n_best > 1:
                     print('\nBEST HYP:')
                     for n in range(opt.n_best):
-                        print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][0])))
+                        print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][n])))
                     print('')
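
The `-lower` flag added to preprocess.py works by folding case inside `onmt.Dict` itself: both `add` and `lookup` lowercase their argument, so a vocabulary built with `lower=True` stays consistent however the raw text is cased, and `prune` propagates the flag to the pruned copy. A minimal sketch of the resulting behavior (assuming the usual special tokens from `onmt.Constants`):

```python
import onmt

# Build a case-folding vocabulary, as preprocess.py does with -lower.
d = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
               onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=True)

# All three casings fold to a single entry, so frequencies accumulate...
for w in ['Hello', 'hello', 'HELLO']:
    d.add(w)

# ...and lookup is case-insensitive as well.
assert d.lookup('HeLLo') == d.lookup('hello')
print(d.frequencies[d.lookup('hello')])  # 3
```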
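`memoryEfficientLoss` in train.py is the one non-obvious training trick: it detaches the decoder states, runs the vocabulary-sized generator on `max_generator_batches` time steps at a time so only one chunk's softmax graph is alive at once, accumulates the gradient on the detached tensor, and finally backpropagates through the real graph with a single `outputs.backward(grad_output)`. A self-contained sketch of the same trick on toy shapes (names and sizes are illustrative; this uses the old `Variable` API the code base targets):

```python
import torch
import torch.nn as nn
from torch.autograd import Variable

seq_len, batch, rnn_size, vocab, chunk = 20, 4, 8, 100, 5
outputs = Variable(torch.randn(seq_len, batch, rnn_size), requires_grad=True)
targets = Variable(torch.ones(seq_len, batch).long())
generator = nn.Sequential(nn.Linear(rnn_size, vocab), nn.LogSoftmax())
crit = nn.NLLLoss(size_average=False)

# Detach the decoder output; each chunk's softmax graph is freed right
# after its backward pass, instead of all seq_len steps living at once.
detached = Variable(outputs.data, requires_grad=True)
loss = 0
for out_t, targ_t in zip(torch.split(detached, chunk), torch.split(targets, chunk)):
    loss_t = crit(generator(out_t.view(-1, rnn_size)), targ_t.view(-1))
    loss += loss_t.data[0]
    loss_t.div(batch).backward()

# One backward through the real graph, fed the accumulated gradient.
outputs.backward(detached.grad.data)
```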
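The checkpoint format changes from pickling whole modules to saving plain `state_dict`s, with the generator weights kept out of the model's dict and stored under their own key; that is what lets `Translator.__init__` rebuild the network from `checkpoint['opt']` and load weights into it. A minimal sketch of reloading such a checkpoint for inference, following the same steps (the filename is illustrative):

```python
import torch
import torch.nn as nn
import onmt

checkpoint = torch.load('demo_model_acc_70.00_ppl_15.00_e13.pt')  # hypothetical name
model_opt = checkpoint['opt']
src_dict, tgt_dict = checkpoint['dicts']['src'], checkpoint['dicts']['tgt']

# Rebuild the exact architecture the checkpoint was trained with...
encoder = onmt.Models.Encoder(model_opt, src_dict)
decoder = onmt.Models.Decoder(model_opt, tgt_dict)
model = onmt.Models.NMTModel(encoder, decoder)
generator = nn.Sequential(
    nn.Linear(model_opt.rnn_size, tgt_dict.size()),
    nn.LogSoftmax())

# ...then restore the two state_dicts saved by train.py.
model.load_state_dict(checkpoint['model'])
generator.load_state_dict(checkpoint['generator'])
model.generator = generator
model.eval()
```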
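Finally, the `volatile=True` threaded through `Dataset`, the validation data, and the beam-search Variables is this PyTorch generation's inference switch: a volatile input marks everything downstream as requiring no gradient, so no autograd buffers are retained. A tiny illustration with toy tensors:

```python
import torch
from torch.autograd import Variable

w = Variable(torch.randn(10, 3), requires_grad=True)
x = Variable(torch.randn(5, 10), volatile=True)

y = x.mm(w)             # volatility propagates through the op
print(y.volatile)       # True
print(y.requires_grad)  # False -- no graph is kept, saving memory at inference
```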