From 6832a7d10cdc191f4fa4409c79941fec97ff5c53 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 16:01:53 -0800 Subject: [PATCH 01/44] translate bug fix --- OpenNMT/translate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/OpenNMT/translate.py b/OpenNMT/translate.py index dd7f1d3f98..b6dff86dda 100644 --- a/OpenNMT/translate.py +++ b/OpenNMT/translate.py @@ -50,7 +50,8 @@ def reportScore(name, scoreTotal, wordsTotal): def main(): opt = parser.parse_args() opt.cuda = opt.gpu > -1 - torch.cuda.set_device(opt.gpu) + if opt.cuda: + torch.cuda.set_device(opt.gpu) translator = onmt.Translator(opt) From fe15b4afef78cfc1e79563aa65389877d9e573b9 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 16:02:41 -0800 Subject: [PATCH 02/44] README changes for multi-gpu --- OpenNMT/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 3d578b6c62..7853121e24 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -20,11 +20,12 @@ OpenNMT consists of three commands: 2) Train the model. -```python train.py -data data/demo-train.pt -save_model model -gpus 1``` +```python train.py -data data/demo-train.pt -save_model model -gpus 0``` 3) Translate sentences. -```python translate.py -gpu 1 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose``` +```python translate.py -gpu 0 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose``` +>>>>>>> c19b7d3... README changes for multi-gpu ## Pretrained Models From 316a524b8567a15c57698ee0634965b822340280 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 16:04:40 -0800 Subject: [PATCH 03/44] removing reinit of checkpoint params again --- OpenNMT/train.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 585ab4115b..3825862d37 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -156,9 +156,6 @@ def eval(model, criterion, data): def trainModel(model, trainData, validData, dataset, optim): print(model) model.train() - if optim.last_ppl is None: - for p in model.parameters(): - p.data.uniform_(-opt.param_init, opt.param_init) # define criterion of each GPU criterion = NMTCriterion(dataset['dicts']['tgt'].size()) From 99dec5c9329f90fdfb13bdfe0ad5419dd0f1d1ff Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 16:25:45 -0800 Subject: [PATCH 04/44] using split instead of chunk --- OpenNMT/onmt/Models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 7fb48cdf86..482c170c97 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -110,7 +110,7 @@ def forward(self, input, hidden, context, init_output): # self.input_feed=False outputs = [] output = init_output - for i, emb_t in enumerate(emb.chunk(emb.size(0), dim=0)): + for i, emb_t in enumerate(emb.split(1)): emb_t = emb_t.squeeze(0) if self.input_feed: emb_t = torch.cat([emb_t, output], 1) From 2078b14048a14bcbb3a84be0d43de0e0993d45fc Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 16:29:02 -0800 Subject: [PATCH 05/44] replacing opt.cuda with opt.gpus as needed --- OpenNMT/train.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 3825862d37..c28571f0fe 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -96,21 +96,20 @@ # help="Seed for 
random initialization") opt = parser.parse_args() -opt.cuda = len(opt.gpus) print(opt) -if torch.cuda.is_available() and not opt.cuda: - print("WARNING: You have a CUDA device, so you should probably run with -gpus 1") +if torch.cuda.is_available() and not opt.gpus: + print("WARNING: You have a CUDA device, so you should probably run with -gpus 0") -if opt.cuda: +if opt.gpus: cuda.set_device(opt.gpus[0]) def NMTCriterion(vocabSize): weight = torch.ones(vocabSize) weight[onmt.Constants.PAD] = 0 crit = nn.NLLLoss(weight, size_average=False) - if opt.cuda: + if opt.gpus: crit.cuda() return crit @@ -238,9 +237,9 @@ def main(): dataset = torch.load(opt.data) trainData = onmt.Dataset(dataset['train']['src'], - dataset['train']['tgt'], opt.batch_size, opt.cuda) + dataset['train']['tgt'], opt.batch_size, opt.gpus) validData = onmt.Dataset(dataset['valid']['src'], - dataset['valid']['tgt'], opt.batch_size, opt.cuda) + dataset['valid']['tgt'], opt.batch_size, opt.gpus) dicts = dataset['dicts'] print(' * vocabulary size. source = %d; target = %d' % @@ -257,12 +256,12 @@ def main(): generator = nn.Sequential( nn.Linear(opt.rnn_size, dicts['tgt'].size()), nn.LogSoftmax()) - if opt.cuda > 1: + if len(opt.gpus) > 1: generator = nn.DataParallel(generator, device_ids=opt.gpus) model = onmt.Models.NMTModel(encoder, decoder, generator) - if opt.cuda > 1: + if len(opt.gpus) > 1: model = nn.DataParallel(model, device_ids=opt.gpus) - if opt.cuda: + if opt.gpus: model.cuda() else: model.cpu() @@ -281,7 +280,7 @@ def main(): print('Loading from checkpoint at %s' % opt.train_from) checkpoint = torch.load(opt.train_from) model = checkpoint['model'] - if opt.cuda: + if opt.gpus: model.cuda() else: model.cpu() From d685c1c4faf6fa15dc5287161408e2194296b4cc Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 17:20:22 -0800 Subject: [PATCH 06/44] using ModuleList --- OpenNMT/onmt/Models.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 482c170c97..8d2fd4543b 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -46,17 +46,16 @@ def __init__(self, num_layers, input_size, rnn_size, dropout): super(StackedLSTM, self).__init__() self.dropout = nn.Dropout(dropout) self.num_layers = num_layers + self.layers = nn.ModuleList() for i in range(num_layers): - layer = nn.LSTMCell(input_size, rnn_size) - self.add_module('layer_%d' % i, layer) + self.layers.append(nn.LSTMCell(input_size, rnn_size)) input_size = rnn_size def forward(self, input, hidden): h_0, c_0 = hidden h_1, c_1 = [], [] - for i in range(self.num_layers): - layer = getattr(self, 'layer_%d' % i) + for i, layer in enumerate(self.layers): h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) input = h_1_i if i != self.num_layers: From 4d0c84f5629b9845cabee023d96e2b10fd2e870d Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 23 Feb 2017 21:17:16 -0800 Subject: [PATCH 07/44] default type for start_decay_at --- OpenNMT/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index c28571f0fe..497d7c97b1 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -71,7 +71,7 @@ help="""Decay learning rate by this much if (i) perplexity does not decrease on the validation set or (ii) epoch has gone past the start_decay_at_limit""") -parser.add_argument('-start_decay_at', default=8, +parser.add_argument('-start_decay_at', type=int, default=8, help="Start decay after this epoch") 
parser.add_argument('-curriculum', action="store_true", help="""For this many epochs, order the minibatches based From f059047e7c9554b66880751917db151b3c22c770 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 28 Feb 2017 08:29:54 -0800 Subject: [PATCH 08/44] decoder hidden state fix --- OpenNMT/onmt/Models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 8d2fd4543b..718fbfdfe6 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -114,13 +114,13 @@ def forward(self, input, hidden, context, init_output): if self.input_feed: emb_t = torch.cat([emb_t, output], 1) - output, h = self.rnn(emb_t, hidden) + output, hidden = self.rnn(emb_t, hidden) output, attn = self.attn(output, context.t()) output = self.dropout(output) outputs += [output] outputs = torch.stack(outputs) - return outputs.transpose(0, 1), h, attn + return outputs.transpose(0, 1), hidden, attn class NMTModel(nn.Module): From af94796b89cc18cd9f9ec40740b3a1799d1266c6 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 28 Feb 2017 16:30:41 -0800 Subject: [PATCH 09/44] nn.clip_grad_norm --- OpenNMT/onmt/Optim.py | 16 ++++------------ OpenNMT/train.py | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/OpenNMT/onmt/Optim.py b/OpenNMT/onmt/Optim.py index 0d0e6e3c72..870bf9867e 100644 --- a/OpenNMT/onmt/Optim.py +++ b/OpenNMT/onmt/Optim.py @@ -1,5 +1,7 @@ import math import torch.optim as optim +import torch.nn as nn +from torch.nn.utils import clip_grad_norm class Optim(object): @@ -29,19 +31,9 @@ def __init__(self, params, method, lr, max_grad_norm, lr_decay=1, start_decay_at def step(self): # Compute gradients norm. - grad_norm = 0 - for param in self.params: - grad_norm += math.pow(param.grad.data.norm(), 2) - - grad_norm = math.sqrt(grad_norm) - shrinkage = self.max_grad_norm / grad_norm - - for param in self.params: - if shrinkage < 1: - param.grad.data.mul_(shrinkage) - + if self.max_grad_norm: + clip_grad_norm(self.params, self.max_grad_norm) self.optimizer.step() - return grad_norm # decay learning rate if val perf does not improve or we hit the start_decay_at limit def updateLearningRate(self, ppl, epoch): diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 497d7c97b1..a68d2ea0e0 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -183,7 +183,7 @@ def trainEpoch(epoch): outputs.backward(gradOutput) # update the parameters - grad_norm = optim.step() + optim.step() report_loss += loss total_loss += loss From a2caf6483544c4a3c81f8ec62f8b41fa0cd82da6 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 28 Feb 2017 16:43:23 -0800 Subject: [PATCH 10/44] adding src/tgt tokens/s --- OpenNMT/train.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index a68d2ea0e0..8d389b5c5f 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -166,7 +166,7 @@ def trainEpoch(epoch): batchOrder = torch.randperm(len(trainData)) total_loss, report_loss = 0, 0 - total_words, report_words = 0, 0 + total_words, report_tgt_words, report_src_words = 0, 0, 0 start = time.time() for i in range(len(trainData)): @@ -189,15 +189,17 @@ def trainEpoch(epoch): total_loss += loss num_words = targets.data.ne(onmt.Constants.PAD).sum() total_words += num_words - report_words += num_words + report_tgt_words += num_words + report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum() if i % opt.log_interval == 0 and i > 0: - print("Epoch %2d, %5d/%5d 
batches; perplexity: %6.2f; %3.0f tokens/s; %6.0f s elapsed" % + print("Epoch %2d, %5d/%5d batches; perplexity: %6.2f; %3.0f source tokens/s; %3.0f target tokens/s; %6.0f s elapsed" % (epoch, i, len(trainData), - math.exp(report_loss / report_words), - report_words/(time.time()-start), + math.exp(report_loss / report_tgt_words), + report_src_words/(time.time()-start), + report_tgt_words/(time.time()-start), time.time()-start_time)) - report_loss = report_words = 0 + report_loss = report_tgt_words = report_src_words = 0 start = time.time() return total_loss / total_words From 727863b037c8b1e92d0b56e3e3100ac4f4df6d24 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 28 Feb 2017 16:44:18 -0800 Subject: [PATCH 11/44] index in verbose translate was fixed --- OpenNMT/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/translate.py b/OpenNMT/translate.py index b6dff86dda..f599f47e49 100644 --- a/OpenNMT/translate.py +++ b/OpenNMT/translate.py @@ -99,7 +99,7 @@ def main(): if opt.n_best > 1: print('\nBEST HYP:') for n in range(opt.n_best): - print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][0]))) + print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][n]))) print('') From 639eb45861bd025e06c80158d2af6c8d3a74b211 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 28 Feb 2017 16:47:36 -0800 Subject: [PATCH 12/44] bug in total num predicted words --- OpenNMT/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/translate.py b/OpenNMT/translate.py index f599f47e49..0f9c2016b9 100644 --- a/OpenNMT/translate.py +++ b/OpenNMT/translate.py @@ -78,7 +78,7 @@ def main(): predBatch, predScore, goldScore = translator.translate(srcBatch, tgtBatch) predScoreTotal += sum(score[0] for score in predScore) - predWordsTotal += sum(len(x) for x in predBatch) + predWordsTotal += sum(len(x[0]) for x in predBatch) if tgtF is not None: goldScoreTotal += sum(goldScore) goldWordsTotal += sum(len(x) for x in tgtBatch) From bb9d462bff3ea936590b309e71721e5733407fad Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 13:35:56 -0800 Subject: [PATCH 13/44] Variables in Translator can be volatile --- OpenNMT/onmt/Translator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/OpenNMT/onmt/Translator.py b/OpenNMT/onmt/Translator.py index 9640528c10..3cb68b4584 100644 --- a/OpenNMT/onmt/Translator.py +++ b/OpenNMT/onmt/Translator.py @@ -99,9 +99,9 @@ def applyContextMask(m): # (3) run the decoder to generate sentences, using beam search # Expand tensors for each beam. 
- context = Variable(context.data.repeat(1, beamSize, 1)) - decStates = (Variable(encStates[0].data.repeat(1, beamSize, 1)), - Variable(encStates[1].data.repeat(1, beamSize, 1))) + context = Variable(context.data.repeat(1, beamSize, 1), volatile=True) + decStates = (Variable(encStates[0].data.repeat(1, beamSize, 1), volatile=True), + Variable(encStates[1].data.repeat(1, beamSize, 1), volatile=True)) beam = [onmt.Beam(beamSize, self.opt.cuda) for k in range(batchSize)] @@ -120,7 +120,7 @@ def applyContextMask(m): if not b.done]).t().contiguous().view(1, -1) decOut, decStates, attn = self.model.decoder( - Variable(input).transpose(0, 1), decStates, context, decOut) + Variable(input, volatile=True).transpose(0, 1), decStates, context, decOut) # decOut: 1 x (beam*batch) x numWords decOut = decOut.transpose(0, 1).squeeze(0) out = self.model.generator.forward(decOut) @@ -159,7 +159,7 @@ def updateActive(t): newSize = list(t.size()) newSize[-2] = newSize[-2] * len(activeIdx) // remainingSents return Variable(view.index_select(1, activeIdx) \ - .view(*newSize)) + .view(*newSize), volatile=True) decStates = (updateActive(decStates[0]), updateActive(decStates[1])) decOut = updateActive(decOut) From 2681c91b60ced47df5bf71d6f2155bed7074b88e Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 16:09:38 -0800 Subject: [PATCH 14/44] removing unnecessary def --- OpenNMT/onmt/modules/GlobalAttention.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/OpenNMT/onmt/modules/GlobalAttention.py b/OpenNMT/onmt/modules/GlobalAttention.py index 7980cb0746..ac2f315641 100644 --- a/OpenNMT/onmt/modules/GlobalAttention.py +++ b/OpenNMT/onmt/modules/GlobalAttention.py @@ -24,8 +24,6 @@ import torch.nn as nn import math -_INF = float('inf') - class GlobalAttention(nn.Module): def __init__(self, dim): super(GlobalAttention, self).__init__() @@ -48,7 +46,7 @@ def forward(self, input, context): # Get attention attn = torch.bmm(context, targetT).squeeze(2) # batch x sourceL if self.mask is not None: - attn.data.masked_fill_(self.mask, -_INF) + attn.data.masked_fill_(self.mask, -float('inf')) attn = self.sm(attn) attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x sourceL From 647814726aa3fcd7b92c3078fec7eb8326e43cc4 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 16:10:00 -0800 Subject: [PATCH 15/44] allowing lowercase option --- OpenNMT/onmt/Dict.py | 5 ++++- OpenNMT/preprocess.py | 4 +++- OpenNMT/translate.py | 10 ++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/OpenNMT/onmt/Dict.py b/OpenNMT/onmt/Dict.py index 59a4a45b67..43eaf23d88 100644 --- a/OpenNMT/onmt/Dict.py +++ b/OpenNMT/onmt/Dict.py @@ -2,10 +2,11 @@ class Dict(object): - def __init__(self, data=None): + def __init__(self, data=None, lower=False): self.idxToLabel = {} self.labelToIdx = {} self.frequencies = {} + self.lower = True # Special entries will not be pruned. self.special = [] @@ -37,6 +38,7 @@ def writeFile(self, filename): file.close() def lookup(self, key, default=None): + key = key.lower() if self.lower else key try: return self.labelToIdx[key] except KeyError: @@ -60,6 +62,7 @@ def addSpecials(self, labels): # Add `label` in the dictionary. Use `idx` as its index if given. 
def add(self, label, idx=None): + label = label.lower() if self.lower else label if idx is not None: self.idxToLabel[idx] = label self.labelToIdx[label] = idx diff --git a/OpenNMT/preprocess.py b/OpenNMT/preprocess.py index ea7f57b814..5d46493a26 100644 --- a/OpenNMT/preprocess.py +++ b/OpenNMT/preprocess.py @@ -40,6 +40,8 @@ parser.add_argument('-seed', type=int, default=3435, help="Random seed") +parser.add_argument('-lower', action='store_true', help='lowercase data') + parser.add_argument('-report_every', type=int, default=100000, help="Report status every this many sentences") @@ -48,7 +50,7 @@ def makeVocabulary(filename, size): vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD, - onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD]) + onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=True) with open(filename) as f: for sent in f.readlines(): diff --git a/OpenNMT/translate.py b/OpenNMT/translate.py index 0f9c2016b9..4a93bb1bcf 100644 --- a/OpenNMT/translate.py +++ b/OpenNMT/translate.py @@ -88,12 +88,18 @@ def main(): outF.write(" ".join(predBatch[b][0]) + '\n') if opt.verbose: - print('SENT %d: %s' % (count, " ".join(srcBatch[b]))) + srcSent = ' '.join(srcBatch[b]) + if translator.tgt_dict.lower: + srcSent = srcSent.lower() + print('SENT %d: %s' % (count, srcSent)) print('PRED %d: %s' % (count, " ".join(predBatch[b][0]))) print("PRED SCORE: %.4f" % predScore[b][0]) if tgtF is not None: - print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b]))) + tgtSent = ' '.join(tgtBatch[b]) + if translator.tgt_dict.lower: + tgtSent = tgtSent.lower() + print('GOLD %d: %s ' % (count, tgtSent)) print("GOLD SCORE: %.4f" % goldScore[b]) if opt.n_best > 1: From 70c3d8f4e011eab1c397c4cf60c84282fb902e12 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 16:11:47 -0800 Subject: [PATCH 16/44] pointing out one way to do bleu scores in README --- OpenNMT/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 7853121e24..7290aed043 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -27,6 +27,11 @@ OpenNMT consists of three commands: ```python translate.py -gpu 0 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose``` >>>>>>> c19b7d3... README changes for multi-gpu +4) Evaluate. + +```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` +```perl multi-bleu.perl data/tgt-test.txt < pred.txt``` + ## Pretrained Models The following pretrained models can be downloaded and used with translate.py. 
From 1c7d2ea1397161eed9f71bc44d2bf74135d07b69 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 16:13:08 -0800 Subject: [PATCH 17/44] adding files to ignore --- OpenNMT/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 OpenNMT/.gitignore diff --git a/OpenNMT/.gitignore b/OpenNMT/.gitignore new file mode 100644 index 0000000000..1ff9f3fe17 --- /dev/null +++ b/OpenNMT/.gitignore @@ -0,0 +1,3 @@ +pred.txt +multi-bleu.perl +*.pt From f45c628e509e35731f763b68c04e86bb386dabe1 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 16:56:23 -0800 Subject: [PATCH 18/44] preprocess needs to use lower option --- OpenNMT/onmt/Dict.py | 2 +- OpenNMT/preprocess.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenNMT/onmt/Dict.py b/OpenNMT/onmt/Dict.py index 43eaf23d88..119fcf8933 100644 --- a/OpenNMT/onmt/Dict.py +++ b/OpenNMT/onmt/Dict.py @@ -6,7 +6,7 @@ def __init__(self, data=None, lower=False): self.idxToLabel = {} self.labelToIdx = {} self.frequencies = {} - self.lower = True + self.lower = lower # Special entries will not be pruned. self.special = [] diff --git a/OpenNMT/preprocess.py b/OpenNMT/preprocess.py index 5d46493a26..8987e20174 100644 --- a/OpenNMT/preprocess.py +++ b/OpenNMT/preprocess.py @@ -50,7 +50,7 @@ def makeVocabulary(filename, size): vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD, - onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=True) + onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=opt.lower) with open(filename) as f: for sent in f.readlines(): From 36793a03cf155313537f10db83c0bc5eaf9c1640 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 20:01:19 -0800 Subject: [PATCH 19/44] tips for non-demo mt via flickr30k example --- OpenNMT/README.md | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 7290aed043..8aab00c12f 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -12,26 +12,65 @@ OpenNMT consists of three commands: 0) Download the data. +Demo: + ```wget https://s3.amazonaws.com/pytorch/examples/opennmt/data/onmt-data.tar && tar -xf onmt-data.tar``` +Flickr30k (de-en): + +```mkdir -p data/flickr``` +```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/flickr && rm training.tar.gz``` +```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/flickr && rm validation.tar.gz``` +```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/flickr && rm mmt16_task1_test.tgz``` + 1) Preprocess the data. 
+Demo: + ```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo``` +Flickr30k: + +```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl``` +```sed -i "s/$RealBin\/..\/share\/nonbreaking_prefixes//" tokenizer.perl``` +```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de``` +```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en``` +```for l in en de; do for f in data/flickr/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done``` +```python preprocess.py -train_src data/flickr/train.en.tok -train_tgt data/flickr/train.de.tok -valid_src data/flickr/val.en.tok -valid_tgt data/flickr/val.de.tok -save_data data/flickr``` + 2) Train the model. +Demo: + ```python train.py -data data/demo-train.pt -save_model model -gpus 0``` +Flickr30k: + +```python train.py -data data/flickr-train.pt -save_model flickr_model -gpus 0``` + 3) Translate sentences. -```python translate.py -gpu 0 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose``` ->>>>>>> c19b7d3... README changes for multi-gpu +Demo: + +```python translate.py -gpu 0 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose -output demo-pred.txt``` + +Flickr30k: + +```python translate.py -gpu 0 -model flickr_model_e7_*.pt -src data/flickr/test.en.tok -tgt data/flickr/test.de.tok -replace_unk -verbose -output flickr_pred.txt``` +>>>>>>> c87fc08... tips for non-demo mt via flickr30k example 4) Evaluate. ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` + +Demo: + ```perl multi-bleu.perl data/tgt-test.txt < pred.txt``` +Flickr30k: + +```perl multi-bleu.perl data/flickr/test.de < flickr_pred.txt``` + ## Pretrained Models The following pretrained models can be downloaded and used with translate.py. From a5349bf5f3089913f52a19cb1eb8d8f99661a229 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 20:14:50 -0800 Subject: [PATCH 20/44] cleaning up readme --- OpenNMT/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 8aab00c12f..1891a7dbd4 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -19,8 +19,11 @@ Demo: Flickr30k (de-en): ```mkdir -p data/flickr``` + ```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/flickr && rm training.tar.gz``` + ```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/flickr && rm validation.tar.gz``` + ```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/flickr && rm mmt16_task1_test.tgz``` 1) Preprocess the data. 
@@ -32,10 +35,15 @@ Demo: Flickr30k: ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl``` + ```sed -i "s/$RealBin\/..\/share\/nonbreaking_prefixes//" tokenizer.perl``` + ```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de``` + ```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en``` + ```for l in en de; do for f in data/flickr/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done``` + ```python preprocess.py -train_src data/flickr/train.en.tok -train_tgt data/flickr/train.de.tok -valid_src data/flickr/val.en.tok -valid_tgt data/flickr/val.de.tok -save_data data/flickr``` 2) Train the model. From 7f518d268bccc80a68dad67219208d12061678e7 Mon Sep 17 00:00:00 2001 From: Bryan McCann Date: Wed, 1 Mar 2017 20:33:43 -0800 Subject: [PATCH 21/44] clean up the readme --- OpenNMT/README.md | 62 +++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 1891a7dbd4..ad0d1e0c05 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -8,15 +8,34 @@ an open-source (MIT) neural machine translation system. ## Quickstart -OpenNMT consists of three commands: +Use of OpenNMT consists of four steps: -0) Download the data. - -Demo: +### 0) Download the data. ```wget https://s3.amazonaws.com/pytorch/examples/opennmt/data/onmt-data.tar && tar -xf onmt-data.tar``` -Flickr30k (de-en): +### 1) Preprocess the data. + +```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo``` + +### 2) Train the model. + +```python train.py -data data/demo-train.pt -save_model demo_model -gpus 0``` + +### 3) Translate sentences. + +```python translate.py -gpu 0 -model demo_model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose -output demo_pred.txt``` + +### 4) Evaluate. + +```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` +```perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt``` + +## WMT'16 Multimodal Translation: Flickr30k (de-en) + +Data might not come as clean as the demo data. Here is a second example that uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the Flickr30k data from the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html). + +### 0) Download the data. ```mkdir -p data/flickr``` @@ -26,13 +45,7 @@ Flickr30k (de-en): ```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/flickr && rm mmt16_task1_test.tgz``` -1) Preprocess the data. - -Demo: - -```python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo``` - -Flickr30k: +### 1) Preprocess the data. ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl``` @@ -46,37 +59,18 @@ Flickr30k: ```python preprocess.py -train_src data/flickr/train.en.tok -train_tgt data/flickr/train.de.tok -valid_src data/flickr/val.en.tok -valid_tgt data/flickr/val.de.tok -save_data data/flickr``` -2) Train the model. 
- -Demo: - -```python train.py -data data/demo-train.pt -save_model model -gpus 0``` - -Flickr30k: +### 2) Train the model. ```python train.py -data data/flickr-train.pt -save_model flickr_model -gpus 0``` -3) Translate sentences. - -Demo: - -```python translate.py -gpu 0 -model model_e13_*.pt -src data/src-test.txt -tgt data/tgt-test.txt -replace_unk -verbose -output demo-pred.txt``` - -Flickr30k: +### 3) Translate sentences. ```python translate.py -gpu 0 -model flickr_model_e7_*.pt -src data/flickr/test.en.tok -tgt data/flickr/test.de.tok -replace_unk -verbose -output flickr_pred.txt``` >>>>>>> c87fc08... tips for non-demo mt via flickr30k example -4) Evaluate. +### 4) Evaluate. ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` - -Demo: - -```perl multi-bleu.perl data/tgt-test.txt < pred.txt``` - -Flickr30k: - ```perl multi-bleu.perl data/flickr/test.de < flickr_pred.txt``` ## Pretrained Models From 9af532ca6ec6516d6d35df3b20d4cebefa307eb9 Mon Sep 17 00:00:00 2001 From: Bryan McCann Date: Wed, 1 Mar 2017 20:34:59 -0800 Subject: [PATCH 22/44] spacing in readme --- OpenNMT/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index ad0d1e0c05..92b236d81b 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -29,6 +29,7 @@ Use of OpenNMT consists of four steps: ### 4) Evaluate. ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` + ```perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt``` ## WMT'16 Multimodal Translation: Flickr30k (de-en) @@ -71,6 +72,7 @@ Data might not come as clean as the demo data. Here is a second example that use ### 4) Evaluate. ```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` + ```perl multi-bleu.perl data/flickr/test.de < flickr_pred.txt``` ## Pretrained Models From e48f62073cd131045bd6f82c676eb3709d34abca Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 1 Mar 2017 21:45:54 -0800 Subject: [PATCH 23/44] cudnn decoder --- OpenNMT/onmt/Models.py | 55 +++++++++--------------------------------- 1 file changed, 11 insertions(+), 44 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 718fbfdfe6..3e4f2d19bf 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -10,20 +10,17 @@ def __init__(self, opt, dicts): self.num_directions = 2 if opt.brnn else 1 assert opt.rnn_size % self.num_directions == 0 self.hidden_size = opt.rnn_size // self.num_directions - inputSize = opt.word_vec_size + input_size = opt.word_vec_size super(Encoder, self).__init__() self.word_lut = nn.Embedding(dicts.size(), opt.word_vec_size, padding_idx=onmt.Constants.PAD) - self.rnn = nn.LSTM(inputSize, self.hidden_size, + self.rnn = nn.LSTM(input_size, self.hidden_size, num_layers=opt.layers, dropout=opt.dropout, bidirectional=opt.brnn) - # self.rnn.bias_ih_l0.data.div_(2) - # self.rnn.bias_hh_l0.data.copy_(self.rnn.bias_ih_l0.data) - if opt.pre_word_vecs_enc is not None: pretrained = torch.load(opt.pre_word_vecs_enc) self.word_lut.weight.copy_(pretrained) @@ -41,34 +38,6 @@ def forward(self, input, hidden=None): return hidden_t, outputs -class StackedLSTM(nn.Module): - def __init__(self, num_layers, input_size, rnn_size, dropout): - super(StackedLSTM, self).__init__() - self.dropout = nn.Dropout(dropout) - self.num_layers = num_layers - self.layers = nn.ModuleList() - - for i in range(num_layers): - 
self.layers.append(nn.LSTMCell(input_size, rnn_size)) - input_size = rnn_size - - def forward(self, input, hidden): - h_0, c_0 = hidden - h_1, c_1 = [], [] - for i, layer in enumerate(self.layers): - h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) - input = h_1_i - if i != self.num_layers: - input = self.dropout(input) - h_1 += [h_1_i] - c_1 += [c_1_i] - - h_1 = torch.stack(h_1) - c_1 = torch.stack(c_1) - - return input, (h_1, c_1) - - class Decoder(nn.Module): def __init__(self, opt, dicts): @@ -82,13 +51,12 @@ def __init__(self, opt, dicts): self.word_lut = nn.Embedding(dicts.size(), opt.word_vec_size, padding_idx=onmt.Constants.PAD) - self.rnn = StackedLSTM(opt.layers, input_size, opt.rnn_size, opt.dropout) + self.rnn = nn.LSTM(input_size, opt.rnn_size, + num_layers=opt.layers, + dropout=opt.dropout) self.attn = onmt.modules.GlobalAttention(opt.rnn_size) self.dropout = nn.Dropout(opt.dropout) - # self.rnn.bias_ih.data.div_(2) - # self.rnn.bias_hh.data.copy_(self.rnn.bias_ih.data) - self.hidden_size = opt.rnn_size if opt.pre_word_vecs_enc is not None: @@ -102,7 +70,6 @@ def forward(self, input, hidden, context, init_output): batch_size = input.size(0) h_size = (batch_size, self.hidden_size) - output = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) # n.b. you can increase performance if you compute W_ih * x for all # iterations in parallel, but that's only possible if @@ -110,16 +77,16 @@ def forward(self, input, hidden, context, init_output): outputs = [] output = init_output for i, emb_t in enumerate(emb.split(1)): - emb_t = emb_t.squeeze(0) + emb_t = emb_t if self.input_feed: - emb_t = torch.cat([emb_t, output], 1) + emb_t = torch.cat([emb_t, output], 2) output, hidden = self.rnn(emb_t, hidden) - output, attn = self.attn(output, context.t()) - output = self.dropout(output) + output, attn = self.attn(output.squeeze(0), context.t()) + output = self.dropout(output.unsqueeze(0)) outputs += [output] - outputs = torch.stack(outputs) + outputs = torch.cat(outputs, 0) return outputs.transpose(0, 1), hidden, attn @@ -138,7 +105,7 @@ def set_generate(self, enabled): def make_init_decoder_output(self, context): batch_size = context.size(1) h_size = (batch_size, self.decoder.hidden_size) - return Variable(context.data.new(*h_size).zero_(), requires_grad=False) + return Variable(context.data.new(1, *h_size).zero_(), requires_grad=False) def _fix_enc_hidden(self, h): # the encoder hidden is (layers*directions) x batch x dim From d5cfec3ead80415317f35203be855cf4e524c4d1 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 2 Mar 2017 13:51:25 -0800 Subject: [PATCH 24/44] reverting cudnn decoder to lstmcell --- OpenNMT/onmt/Models.py | 44 +++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 3e4f2d19bf..3c583f3b4d 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -38,6 +38,34 @@ def forward(self, input, hidden=None): return hidden_t, outputs +class StackedLSTM(nn.Module): + def __init__(self, num_layers, input_size, rnn_size, dropout): + super(StackedLSTM, self).__init__() + self.dropout = nn.Dropout(dropout) + self.num_layers = num_layers + self.layers = nn.ModuleList() + + for i in range(num_layers): + self.layers.append(nn.LSTMCell(input_size, rnn_size)) + input_size = rnn_size + + def forward(self, input, hidden): + h_0, c_0 = hidden + h_1, c_1 = [], [] + for i, layer in enumerate(self.layers): + h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) + input = h_1_i 
+ if i != self.num_layers: + input = self.dropout(input) + h_1 += [h_1_i] + c_1 += [c_1_i] + + h_1 = torch.stack(h_1) + c_1 = torch.stack(c_1) + + return input, (h_1, c_1) + + class Decoder(nn.Module): def __init__(self, opt, dicts): @@ -51,9 +79,7 @@ def __init__(self, opt, dicts): self.word_lut = nn.Embedding(dicts.size(), opt.word_vec_size, padding_idx=onmt.Constants.PAD) - self.rnn = nn.LSTM(input_size, opt.rnn_size, - num_layers=opt.layers, - dropout=opt.dropout) + self.rnn = StackedLSTM(opt.layers, input_size, opt.rnn_size, opt.dropout) self.attn = onmt.modules.GlobalAttention(opt.rnn_size) self.dropout = nn.Dropout(opt.dropout) @@ -77,16 +103,16 @@ def forward(self, input, hidden, context, init_output): outputs = [] output = init_output for i, emb_t in enumerate(emb.split(1)): - emb_t = emb_t + emb_t = emb_t.squeeze(0) if self.input_feed: - emb_t = torch.cat([emb_t, output], 2) + emb_t = torch.cat([emb_t, output], 1) output, hidden = self.rnn(emb_t, hidden) - output, attn = self.attn(output.squeeze(0), context.t()) - output = self.dropout(output.unsqueeze(0)) + output, attn = self.attn(output, context.t()) + output = self.dropout(output) outputs += [output] - outputs = torch.cat(outputs, 0) + outputs = torch.stack(outputs) return outputs.transpose(0, 1), hidden, attn @@ -105,7 +131,7 @@ def set_generate(self, enabled): def make_init_decoder_output(self, context): batch_size = context.size(1) h_size = (batch_size, self.decoder.hidden_size) - return Variable(context.data.new(1, *h_size).zero_(), requires_grad=False) + return Variable(context.data.new(*h_size).zero_(), requires_grad=False) def _fix_enc_hidden(self, h): # the encoder hidden is (layers*directions) x batch x dim From a8d66b4a737d842a3436aac9f0510d5e86a426e5 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 3 Mar 2017 10:25:32 -0800 Subject: [PATCH 25/44] new DataParallel allows dim 1; remove unnecessary transposes; add train_ppl to chkpt --- OpenNMT/onmt/Models.py | 15 ++++++++------- OpenNMT/onmt/Translator.py | 23 +++++++++++------------ OpenNMT/train.py | 23 +++++++++++------------ 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 3c583f3b4d..99e82b15a9 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -26,9 +26,10 @@ def __init__(self, opt, dicts): self.word_lut.weight.copy_(pretrained) def forward(self, input, hidden=None): - batch_size = input.size(0) # batch first for multi-gpu compatibility - emb = self.word_lut(input).transpose(0, 1) + emb = self.word_lut(input) + if hidden is None: + batch_size = emb.size(1) h_size = (self.layers * self.num_directions, batch_size, self.hidden_size) h_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) c_0 = Variable(emb.data.new(*h_size).zero_(), requires_grad=False) @@ -91,9 +92,9 @@ def __init__(self, opt, dicts): def forward(self, input, hidden, context, init_output): - emb = self.word_lut(input).transpose(0, 1) + emb = self.word_lut(input) - batch_size = input.size(0) + batch_size = input.size(1) h_size = (batch_size, self.hidden_size) @@ -102,7 +103,7 @@ def forward(self, input, hidden, context, init_output): # self.input_feed=False outputs = [] output = init_output - for i, emb_t in enumerate(emb.split(1)): + for emb_t in emb.split(1): emb_t = emb_t.squeeze(0) if self.input_feed: emb_t = torch.cat([emb_t, output], 1) @@ -113,7 +114,7 @@ def forward(self, input, hidden, context, init_output): outputs += [output] outputs = torch.stack(outputs) - return 
outputs.transpose(0, 1), hidden, attn + return outputs, hidden, attn class NMTModel(nn.Module): @@ -145,7 +146,7 @@ def _fix_enc_hidden(self, h): def forward(self, input): src = input[0] - tgt = input[1][:, :-1] # exclude last target from inputs + tgt = input[1][:-1] # exclude last target from inputs enc_hidden, context = self.encoder(src) init_output = self.make_init_decoder_output(context) diff --git a/OpenNMT/onmt/Translator.py b/OpenNMT/onmt/Translator.py index 3cb68b4584..48dbcb7208 100644 --- a/OpenNMT/onmt/Translator.py +++ b/OpenNMT/onmt/Translator.py @@ -48,7 +48,7 @@ def buildTargetTokens(self, pred, src, attn): def translateBatch(self, batch): srcBatch, tgtBatch = batch - batchSize = srcBatch.size(0) + batchSize = srcBatch.size(1) beamSize = self.opt.beam_size # (1) run the encoder on the src @@ -56,9 +56,9 @@ def translateBatch(self, batch): # have to execute the encoder manually to deal with padding encStates = None context = [] - for srcBatch_t in srcBatch.chunk(srcBatch.size(1), dim=1): + for srcBatch_t in srcBatch.split(1): encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates) - batchPadIdx = srcBatch_t.data.squeeze(1).eq(onmt.Constants.PAD).nonzero() + batchPadIdx = srcBatch_t.data.squeeze(0).eq(onmt.Constants.PAD).nonzero() if batchPadIdx.nelement() > 0: batchPadIdx = batchPadIdx.squeeze(1) encStates[0].data.index_fill_(1, batchPadIdx, 0) @@ -73,7 +73,7 @@ def translateBatch(self, batch): # This mask is applied to the attention model inside the decoder # so that the attention ignores source padding - padMask = srcBatch.data.eq(onmt.Constants.PAD) + padMask = srcBatch.data.eq(onmt.Constants.PAD).t() def applyContextMask(m): if isinstance(m, onmt.modules.GlobalAttention): m.applyMask(padMask) @@ -88,8 +88,8 @@ def applyContextMask(m): initOutput = self.model.make_init_decoder_output(context) decOut, decStates, attn = self.model.decoder( - tgtBatch[:, :-1], decStates, context, initOutput) - for dec_t, tgt_t in zip(decOut.transpose(0, 1), tgtBatch.transpose(0, 1)[1:].data): + tgtBatch[:-1], decStates, context, initOutput) + for dec_t, tgt_t in zip(decOut, tgtBatch[1:].data): gen_t = self.model.generator.forward(dec_t) tgt_t = tgt_t.unsqueeze(1) scores = gen_t.data.gather(1, tgt_t) @@ -107,7 +107,7 @@ def applyContextMask(m): decOut = self.model.make_init_decoder_output(context) - padMask = srcBatch.data.eq(onmt.Constants.PAD).unsqueeze(0).repeat(beamSize, 1, 1) + padMask = srcBatch.data.eq(onmt.Constants.PAD).t().unsqueeze(0).repeat(beamSize, 1, 1) batchIdx = list(range(batchSize)) remainingSents = batchSize @@ -120,9 +120,9 @@ def applyContextMask(m): if not b.done]).t().contiguous().view(1, -1) decOut, decStates, attn = self.model.decoder( - Variable(input, volatile=True).transpose(0, 1), decStates, context, decOut) + Variable(input, volatile=True), decStates, context, decOut) # decOut: 1 x (beam*batch) x numWords - decOut = decOut.transpose(0, 1).squeeze(0) + decOut = decOut.squeeze(0) out = self.model.generator.forward(decOut) # batch x beam x numWords @@ -177,7 +177,7 @@ def updateActive(t): scores, ks = beam[b].sortBest() allScores += [scores[:n_best]] - valid_attn = srcBatch.transpose(0, 1).data[:, b].ne(onmt.Constants.PAD).nonzero().squeeze(1) + valid_attn = srcBatch.data[:, b].ne(onmt.Constants.PAD).nonzero().squeeze(1) hyps, attn = zip(*[beam[b].getHyp(k) for k in ks[:n_best]]) attn = [a.index_select(1, valid_attn) for a in attn] allHyp += [hyps] @@ -189,14 +189,13 @@ def translate(self, srcBatch, goldBatch): # (1) convert words to indexes dataset 
= self.buildData(srcBatch, goldBatch) batch = dataset[0] - batch = [x.transpose(0, 1) for x in batch] # (2) translate pred, predScore, attn, goldScore = self.translateBatch(batch) # (3) convert indexes to words predBatch = [] - for b in range(batch[0].size(0)): + for b in range(batch[0].size(1)): predBatch.append( [self.buildTargetTokens(pred[b][n], srcBatch[b], attn[b][n]) for n in range(self.opt.n_best)] diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 8d389b5c5f..7266111a0f 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -117,11 +117,11 @@ def NMTCriterion(vocabSize): def memoryEfficientLoss(outputs, targets, generator, crit, eval=False): # compute generations one piece at a time loss = 0 - outputs = Variable(outputs.data, requires_grad=(not eval), volatile=eval).contiguous() + outputs = Variable(outputs.data, requires_grad=(not eval), volatile=eval) batch_size = outputs.size(1) outputs_split = torch.split(outputs, opt.max_generator_batches) - targets_split = torch.split(targets.contiguous(), opt.max_generator_batches) + targets_split = torch.split(targets, opt.max_generator_batches) for out_t, targ_t in zip(outputs_split, targets_split): out_t = out_t.view(-1, out_t.size(2)) pred_t = generator(out_t) @@ -140,9 +140,9 @@ def eval(model, criterion, data): model.eval() for i in range(len(data)): - batch = [x.transpose(0, 1) for x in data[i]] # must be batch first for gather/scatter in DataParallel + batch = data[i] outputs = model(batch) # FIXME volatile - targets = batch[1][:, 1:] # exclude from targets + targets = batch[1][1:] # exclude from targets loss, _ = memoryEfficientLoss( outputs, targets, model.generator, criterion, eval=True) total_loss += loss @@ -172,11 +172,10 @@ def trainEpoch(epoch): batchIdx = batchOrder[i] if epoch >= opt.curriculum else i batch = trainData[batchIdx] - batch = [x.transpose(0, 1) for x in batch] # must be batch first for gather/scatter in DataParallel model.zero_grad() outputs = model(batch) - targets = batch[1][:, 1:] # exclude from targets + targets = batch[1][1:] # exclude from targets loss, gradOutput = memoryEfficientLoss( outputs, targets, model.generator, criterion) @@ -209,7 +208,8 @@ def trainEpoch(epoch): # (1) train for one epoch on the training set train_loss = trainEpoch(epoch) - print('Train perplexity: %g' % math.exp(min(train_loss, 100))) + train_ppl = math.exp(min(train_loss, 100)) + print('Train perplexity: %g' % train_ppl) # (2) evaluate on the validation set valid_loss = eval(model, criterion, validData) @@ -229,8 +229,7 @@ def trainEpoch(epoch): 'optim': optim, } torch.save(checkpoint, - '%s_e%d_%.2f.pt' % (opt.save_model, epoch, valid_ppl)) - + '%s_val%.2f_e%d_train%.2f.pt' % (opt.save_model, valid_ppl, epoch, train_ppl)) def main(): @@ -258,11 +257,11 @@ def main(): generator = nn.Sequential( nn.Linear(opt.rnn_size, dicts['tgt'].size()), nn.LogSoftmax()) - if len(opt.gpus) > 1: - generator = nn.DataParallel(generator, device_ids=opt.gpus) +# if len(opt.gpus) > 1: +# generator = nn.DataParallel(generator, device_ids=opt.gpus) model = onmt.Models.NMTModel(encoder, decoder, generator) if len(opt.gpus) > 1: - model = nn.DataParallel(model, device_ids=opt.gpus) + model = nn.DataParallel(model, device_ids=opt.gpus, dim=1) if opt.gpus: model.cuda() else: From 3d91103d984256d6297cd4ae76213b9c9381c443 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 3 Mar 2017 11:07:38 -0800 Subject: [PATCH 26/44] mend --- OpenNMT/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/train.py 
b/OpenNMT/train.py index 7266111a0f..c0d80cf3dc 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -229,7 +229,7 @@ def trainEpoch(epoch): 'optim': optim, } torch.save(checkpoint, - '%s_val%.2f_e%d_train%.2f.pt' % (opt.save_model, valid_ppl, epoch, train_ppl)) + '%s_val%.2f_train%.2f_e%d.pt' % (opt.save_model, valid_ppl, train_ppl, epoch)) def main(): From e4a6730e804e66631278d1ce95f002e1d8d7d6a2 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 3 Mar 2017 15:05:56 -0800 Subject: [PATCH 27/44] allows use of models trained on dataset to be trained on another; doesn't augment vocab --- OpenNMT/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index c0d80cf3dc..c93001b875 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -236,6 +236,9 @@ def main(): print("Loading data from '%s'" % opt.data) dataset = torch.load(opt.data) + if opt.train_from: + checkpoint = torch.load(opt.train_from) + dataset['dicts'] = checkpoint['dicts'] trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'], opt.batch_size, opt.gpus) From a2d8bf736bf2fc7e0d39107c44521890fde6cb6c Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 3 Mar 2017 15:37:44 -0800 Subject: [PATCH 28/44] manual unrolling was broken for brnn; patch until varlen rnn replacement --- OpenNMT/onmt/Translator.py | 34 ++++++++++++++++++++-------------- OpenNMT/train.py | 2 +- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/OpenNMT/onmt/Translator.py b/OpenNMT/onmt/Translator.py index 48dbcb7208..ef586fe93e 100644 --- a/OpenNMT/onmt/Translator.py +++ b/OpenNMT/onmt/Translator.py @@ -53,23 +53,29 @@ def translateBatch(self, batch): # (1) run the encoder on the src - # have to execute the encoder manually to deal with padding - encStates = None - context = [] - for srcBatch_t in srcBatch.split(1): - encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates) - batchPadIdx = srcBatch_t.data.squeeze(0).eq(onmt.Constants.PAD).nonzero() - if batchPadIdx.nelement() > 0: - batchPadIdx = batchPadIdx.squeeze(1) - encStates[0].data.index_fill_(1, batchPadIdx, 0) - encStates[1].data.index_fill_(1, batchPadIdx, 0) - context += [context_t] + encStates, context = None, None - encStates = (self.model._fix_enc_hidden(encStates[0]), - self.model._fix_enc_hidden(encStates[1])) + if self.model.encoder.num_directions == 2: + # bidirectional encoder is negatively impacted by padding + # run with batch size 1 for improved translations + # This will be resolved when variable length LSTMs are used instead + encStates, context = self.model.encoder(srcBatch, hidden=encStates) + else: + # have to execute the encoder manually to deal with padding + context = [] + for srcBatch_t in srcBatch.split(1): + encStates, context_t = self.model.encoder(srcBatch_t, hidden=encStates) + batchPadIdx = srcBatch_t.data.squeeze(0).eq(onmt.Constants.PAD).nonzero() + if batchPadIdx.nelement() > 0: + batchPadIdx = batchPadIdx.squeeze(1) + encStates[0].data.index_fill_(1, batchPadIdx, 0) + encStates[1].data.index_fill_(1, batchPadIdx, 0) + context += [context_t] + context = torch.cat(context) - context = torch.cat(context) rnnSize = context.size(2) + encStates = (self.model._fix_enc_hidden(encStates[0]), + self.model._fix_enc_hidden(encStates[1])) # This mask is applied to the attention model inside the decoder # so that the attention ignores source padding diff --git a/OpenNMT/train.py b/OpenNMT/train.py index c93001b875..2aad4b6747 100644 --- a/OpenNMT/train.py +++ 
b/OpenNMT/train.py @@ -229,7 +229,7 @@ def trainEpoch(epoch): 'optim': optim, } torch.save(checkpoint, - '%s_val%.2f_train%.2f_e%d.pt' % (opt.save_model, valid_ppl, train_ppl, epoch)) + '%s_val_%.2f_train_%.2f_e%d.pt' % (opt.save_model, valid_ppl, train_ppl, epoch)) def main(): From 6dcb113c778abfdd140e67a4f45a8eb9f0cb6652 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Mon, 6 Mar 2017 12:29:57 -0800 Subject: [PATCH 29/44] allowing learning rate update for non-sgd optimizers --- OpenNMT/train.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 2aad4b6747..76299610ee 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -61,22 +61,32 @@ parser.add_argument('-learning_rate', type=float, default=1.0, help="""Starting learning rate. If adagrad/adadelta/adam is used, then this is the global learning rate. Recommended - settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.1""") + settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.001""") parser.add_argument('-max_grad_norm', type=float, default=5, help="""If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm""") parser.add_argument('-dropout', type=float, default=0.3, help='Dropout probability; applied between LSTM stacks.') -parser.add_argument('-learning_rate_decay', type=float, default=0.5, - help="""Decay learning rate by this much if (i) perplexity - does not decrease on the validation set or (ii) epoch has - gone past the start_decay_at_limit""") -parser.add_argument('-start_decay_at', type=int, default=8, - help="Start decay after this epoch") +parser.add_argument('-padding_weight', type=int, default=0, + help='The weight to give padding in the loss') parser.add_argument('-curriculum', action="store_true", help="""For this many epochs, order the minibatches based on source sequence length. Sometimes setting this to 1 will increase convergence speed.""") + +#learning rate +parser.add_argument('-update_learning_rate', action='store_true', + help="Decay learning rate regardless of optimizer") +parser.add_argument('-learning_rate_decay', type=float, default=0.5, + help="""If update_learning_rate, decay learning rate by + this much if (i) perplexity does not decrease on the + validation set or (ii) epoch has gone past + start_decay_at""") +parser.add_argument('-start_decay_at', type=int, default=8, + help="Start decaying every epoch after and including this + epoch") + +#pretrained word vectors parser.add_argument('-pre_word_vecs_enc', help="""If a valid path is specified, then this will load pretrained word embeddings on the encoder side. 
@@ -107,7 +117,7 @@ def NMTCriterion(vocabSize): weight = torch.ones(vocabSize) - weight[onmt.Constants.PAD] = 0 + weight[onmt.Constants.PAD] = opt.padding_weight crit = nn.NLLLoss(weight, size_average=False) if opt.gpus: crit.cuda() @@ -217,7 +227,7 @@ def trainEpoch(epoch): print('Validation perplexity: %g' % valid_ppl) # (3) maybe update the learning rate - if opt.optim == 'sgd': + if opt.update_learning_rate: optim.updateLearningRate(valid_loss, epoch) # (4) drop a checkpoint From 1226bde8be2d55d1d21338d884854ba882aa81cd Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Mon, 6 Mar 2017 12:59:33 -0800 Subject: [PATCH 30/44] adding option to shuffle mini-batches --- OpenNMT/onmt/Dataset.py | 8 ++++++++ OpenNMT/train.py | 16 +++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/OpenNMT/onmt/Dataset.py b/OpenNMT/onmt/Dataset.py index 2651ae6458..38d27c5a75 100644 --- a/OpenNMT/onmt/Dataset.py +++ b/OpenNMT/onmt/Dataset.py @@ -1,3 +1,5 @@ +import random + import onmt from torch.autograd import Variable @@ -46,3 +48,9 @@ def __getitem__(self, index): def __len__(self): return self.numBatches + + + def shuffle(self): + zipped = list(zip(self.src, self.tgt)) + random.shuffle(zipped) + self.src, self.tgt = [x[0] for x in zipped], [x[1] for x in zipped] diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 76299610ee..2fa7ac0fd2 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -67,14 +67,16 @@ renormalize it to have the norm equal to max_grad_norm""") parser.add_argument('-dropout', type=float, default=0.3, help='Dropout probability; applied between LSTM stacks.') -parser.add_argument('-padding_weight', type=int, default=0, - help='The weight to give padding in the loss') parser.add_argument('-curriculum', action="store_true", help="""For this many epochs, order the minibatches based on source sequence length. Sometimes setting this to 1 will increase convergence speed.""") +parser.add_argument('-extra_shuffle', action="store_true", + help="""By default only shuffle mini-batch order; when true, + shuffle and re-assign mini-batches""") #learning rate + parser.add_argument('-update_learning_rate', action='store_true', help="Decay learning rate regardless of optimizer") parser.add_argument('-learning_rate_decay', type=float, default=0.5, @@ -83,10 +85,11 @@ validation set or (ii) epoch has gone past start_decay_at""") parser.add_argument('-start_decay_at', type=int, default=8, - help="Start decaying every epoch after and including this - epoch") + help="""Start decaying every epoch after and including this + epoch""") #pretrained word vectors + parser.add_argument('-pre_word_vecs_enc', help="""If a valid path is specified, then this will load pretrained word embeddings on the encoder side. 
@@ -117,7 +120,7 @@ def NMTCriterion(vocabSize): weight = torch.ones(vocabSize) - weight[onmt.Constants.PAD] = opt.padding_weight + weightonmt.Constants.PAD] = 0 crit = nn.NLLLoss(weight, size_average=False) if opt.gpus: crit.cuda() @@ -172,6 +175,9 @@ def trainModel(model, trainData, validData, dataset, optim): start_time = time.time() def trainEpoch(epoch): + if opt.extra_shuffle and epoch >= opt.curriculum: + trainData.shuffle() + # shuffle mini batch order batchOrder = torch.randperm(len(trainData)) From 8f543a8149aa3b7e6ddcffa81861da697b07491b Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Mon, 6 Mar 2017 15:29:43 -0800 Subject: [PATCH 31/44] adding word level accuracy as a metric --- OpenNMT/train.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 2fa7ac0fd2..0bc39f76cc 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -120,7 +120,7 @@ def NMTCriterion(vocabSize): weight = torch.ones(vocabSize) - weightonmt.Constants.PAD] = 0 + weight[onmt.Constants.PAD] = 0 crit = nn.NLLLoss(weight, size_average=False) if opt.gpus: crit.cuda() @@ -129,40 +129,45 @@ def NMTCriterion(vocabSize): def memoryEfficientLoss(outputs, targets, generator, crit, eval=False): # compute generations one piece at a time - loss = 0 + num_correct, loss = 0, 0 outputs = Variable(outputs.data, requires_grad=(not eval), volatile=eval) batch_size = outputs.size(1) outputs_split = torch.split(outputs, opt.max_generator_batches) targets_split = torch.split(targets, opt.max_generator_batches) - for out_t, targ_t in zip(outputs_split, targets_split): + for i, (out_t, targ_t) in enumerate(zip(outputs_split, targets_split)): out_t = out_t.view(-1, out_t.size(2)) - pred_t = generator(out_t) - loss_t = crit(pred_t, targ_t.view(-1)) + scores_t = generator(out_t) + loss_t = crit(scores_t, targ_t.view(-1)) + pred_t = scores_t.max(1)[1] + num_correct_t = pred_t.data.eq(targ_t.data).masked_select(targ_t.ne(onmt.Constants.PAD).data).sum() + num_correct += num_correct_t loss += loss_t.data[0] if not eval: loss_t.div(batch_size).backward() grad_output = None if outputs.grad is None else outputs.grad.data - return loss, grad_output + return loss, grad_output, num_correct def eval(model, criterion, data): total_loss = 0 total_words = 0 + total_num_correct = 0 model.eval() for i in range(len(data)): batch = data[i] outputs = model(batch) # FIXME volatile targets = batch[1][1:] # exclude from targets - loss, _ = memoryEfficientLoss( + loss, _, num_correct = memoryEfficientLoss( outputs, targets, model.generator, criterion, eval=True) total_loss += loss + total_num_correct += num_correct total_words += targets.data.ne(onmt.Constants.PAD).sum() model.train() - return total_loss / total_words + return total_loss / total_words, total_num_correct / total_words def trainModel(model, trainData, validData, dataset, optim): @@ -183,6 +188,7 @@ def trainEpoch(epoch): total_loss, report_loss = 0, 0 total_words, report_tgt_words, report_src_words = 0, 0, 0 + total_num_correct = 0 start = time.time() for i in range(len(trainData)): @@ -192,7 +198,7 @@ def trainEpoch(epoch): model.zero_grad() outputs = model(batch) targets = batch[1][1:] # exclude from targets - loss, gradOutput = memoryEfficientLoss( + loss, gradOutput, num_correct = memoryEfficientLoss( outputs, targets, model.generator, criterion) outputs.backward(gradOutput) @@ -201,15 +207,17 @@ def trainEpoch(epoch): optim.step() report_loss += loss + total_num_correct += 
num_correct total_loss += loss num_words = targets.data.ne(onmt.Constants.PAD).sum() total_words += num_words report_tgt_words += num_words report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum() if i % opt.log_interval == 0 and i > 0: - print("Epoch %2d, %5d/%5d batches; perplexity: %6.2f; %3.0f source tokens/s; %3.0f target tokens/s; %6.0f s elapsed" % + print("Epoch %2d, %5d/%5d; ppl: %6.2f; acc: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % (epoch, i, len(trainData), math.exp(report_loss / report_tgt_words), + num_correct / num_words * 100, report_src_words/(time.time()-start), report_tgt_words/(time.time()-start), time.time()-start_time)) @@ -217,20 +225,22 @@ def trainEpoch(epoch): report_loss = report_tgt_words = report_src_words = 0 start = time.time() - return total_loss / total_words + return total_loss / total_words, total_num_correct / total_words for epoch in range(opt.start_epoch, opt.epochs + 1): print('') # (1) train for one epoch on the training set - train_loss = trainEpoch(epoch) + train_loss, train_acc = trainEpoch(epoch) train_ppl = math.exp(min(train_loss, 100)) print('Train perplexity: %g' % train_ppl) + print('Train accuracy: %g' % train_acc) # (2) evaluate on the validation set - valid_loss = eval(model, criterion, validData) + valid_loss, valid_acc = eval(model, criterion, validData) valid_ppl = math.exp(min(valid_loss, 100)) print('Validation perplexity: %g' % valid_ppl) + print('Validation accuracy: %g' % valid_acc) # (3) maybe update the learning rate if opt.update_learning_rate: @@ -245,7 +255,7 @@ def trainEpoch(epoch): 'optim': optim, } torch.save(checkpoint, - '%s_val_%.2f_train_%.2f_e%d.pt' % (opt.save_model, valid_ppl, train_ppl, epoch)) + '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (opt.save_model, valid_acc, valid_ppl, epoch)) def main(): From 4678ecda85847f2320e07954bae179042946bad0 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 7 Mar 2017 11:32:53 -0800 Subject: [PATCH 32/44] touch ups and README updates --- OpenNMT/README.md | 23 +++++++++++------------ OpenNMT/train.py | 8 ++++---- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index 92b236d81b..b5f7e632b6 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -32,19 +32,19 @@ Use of OpenNMT consists of four steps: ```perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt``` -## WMT'16 Multimodal Translation: Flickr30k (de-en) +## WMT'16 Multimodal Translation: Multi30k (de-en) -Data might not come as clean as the demo data. Here is a second example that uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the Flickr30k data from the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html). +Data might not come as clean as the demo data. Here is a second example that uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the Multi30k data from the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html). ### 0) Download the data. 
-```mkdir -p data/flickr``` +```mkdir -p data/multi30k``` -```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/flickr && rm training.tar.gz``` +```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz``` -```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/flickr && rm validation.tar.gz``` +```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz``` -```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/flickr && rm mmt16_task1_test.tgz``` +```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/multi30k && rm mmt16_task1_test.tgz``` ### 1) Preprocess the data. @@ -56,24 +56,23 @@ Data might not come as clean as the demo data. Here is a second example that use ```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en``` -```for l in en de; do for f in data/flickr/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done``` +```for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done``` -```python preprocess.py -train_src data/flickr/train.en.tok -train_tgt data/flickr/train.de.tok -valid_src data/flickr/val.en.tok -valid_tgt data/flickr/val.de.tok -save_data data/flickr``` +```python preprocess.py -train_src data/multi30k/train.en.tok -train_tgt data/multi30k/train.de.tok -valid_src data/multi30k/val.en.tok -valid_tgt data/multi30k/val.de.tok -save_data data/multi30k``` ### 2) Train the model. -```python train.py -data data/flickr-train.pt -save_model flickr_model -gpus 0``` +```python train.py -data data/multi30k-train.pt -save_model multi30k_model -gpus 0``` ### 3) Translate sentences. -```python translate.py -gpu 0 -model flickr_model_e7_*.pt -src data/flickr/test.en.tok -tgt data/flickr/test.de.tok -replace_unk -verbose -output flickr_pred.txt``` ->>>>>>> c87fc08... tips for non-demo mt via flickr30k example +```python translate.py -gpu 0 -model multi30k_model_e13_*.pt -src data/multi30k/test.en.tok -tgt data/multi30k/test.de.tok -replace_unk -verbose -output multi30k_pred.txt``` ### 4) Evaluate. 
```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` -```perl multi-bleu.perl data/flickr/test.de < flickr_pred.txt``` +```perl multi-bleu.perl data/multi30k/test.de.tok < multi30k_pred.txt``` ## Pretrained Models diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 0bc39f76cc..b9f26d1600 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -214,10 +214,10 @@ def trainEpoch(epoch): report_tgt_words += num_words report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum() if i % opt.log_interval == 0 and i > 0: - print("Epoch %2d, %5d/%5d; ppl: %6.2f; acc: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % + print("Epoch %2d, %5d/%5d; acc: %6.2f; ppl: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % (epoch, i, len(trainData), - math.exp(report_loss / report_tgt_words), num_correct / num_words * 100, + math.exp(report_loss / report_tgt_words), report_src_words/(time.time()-start), report_tgt_words/(time.time()-start), time.time()-start_time)) @@ -240,7 +240,7 @@ def trainEpoch(epoch): valid_loss, valid_acc = eval(model, criterion, validData) valid_ppl = math.exp(min(valid_loss, 100)) print('Validation perplexity: %g' % valid_ppl) - print('Validation accuracy: %g' % valid_acc) + print('Validation accuracy: %g' % (valid_acc*100)) # (3) maybe update the learning rate if opt.update_learning_rate: @@ -255,7 +255,7 @@ def trainEpoch(epoch): 'optim': optim, } torch.save(checkpoint, - '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (opt.save_model, valid_acc, valid_ppl, epoch)) + '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (opt.save_model, 100*valid_acc, valid_ppl, epoch)) def main(): From 859412cd38f1b90db2c98b71daf48c7a0dbdec09 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 8 Mar 2017 22:58:56 -0800 Subject: [PATCH 33/44] allowing validation data to volatile --- OpenNMT/onmt/Dataset.py | 7 ++++--- OpenNMT/train.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/OpenNMT/onmt/Dataset.py b/OpenNMT/onmt/Dataset.py index 38d27c5a75..7123f50960 100644 --- a/OpenNMT/onmt/Dataset.py +++ b/OpenNMT/onmt/Dataset.py @@ -6,7 +6,7 @@ class Dataset(object): - def __init__(self, srcData, tgtData, batchSize, cuda): + def __init__(self, srcData, tgtData, batchSize, cuda, volatile=False): self.src = srcData if tgtData: self.tgt = tgtData @@ -16,7 +16,8 @@ def __init__(self, srcData, tgtData, batchSize, cuda): self.cuda = cuda self.batchSize = batchSize - self.numBatches = (len(self.src) + batchSize - 1) // batchSize + self.numBatches = len(self.src) // batchSize + self.volatile = volatile def _batchify(self, data, align_right=False): max_length = max(x.size(0) for x in data) @@ -30,7 +31,7 @@ def _batchify(self, data, align_right=False): if self.cuda: out = out.cuda() - v = Variable(out) + v = Variable(out, volatile=self.volatile) return v def __getitem__(self, index): diff --git a/OpenNMT/train.py b/OpenNMT/train.py index b9f26d1600..ac83cb0d6b 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -269,7 +269,8 @@ def main(): trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'], opt.batch_size, opt.gpus) validData = onmt.Dataset(dataset['valid']['src'], - dataset['valid']['tgt'], opt.batch_size, opt.gpus) + dataset['valid']['tgt'], opt.batch_size, opt.gpus, + volatile=True) dicts = dataset['dicts'] print(' * vocabulary size. 
source = %d; target = %d' % From 053aadf81f26997281e204a24755ae9f421ca3d1 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 8 Mar 2017 23:00:46 -0800 Subject: [PATCH 34/44] num_batches was off by one --- OpenNMT/onmt/Dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/OpenNMT/onmt/Dataset.py b/OpenNMT/onmt/Dataset.py index 7123f50960..7f9c54226e 100644 --- a/OpenNMT/onmt/Dataset.py +++ b/OpenNMT/onmt/Dataset.py @@ -1,3 +1,4 @@ +import math import random import onmt @@ -16,7 +17,7 @@ def __init__(self, srcData, tgtData, batchSize, cuda, volatile=False): self.cuda = cuda self.batchSize = batchSize - self.numBatches = len(self.src) // batchSize + self.numBatches = math.ceil(len(self.src)/batchSize) self.volatile = volatile def _batchify(self, data, align_right=False): From 45e13b5bc6c5f5b9c4c4f9cb86c90cc5e4d8a6a7 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 8 Mar 2017 23:01:55 -0800 Subject: [PATCH 35/44] batch printing was off --- OpenNMT/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index ac83cb0d6b..4de27212b8 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -213,7 +213,7 @@ def trainEpoch(epoch): total_words += num_words report_tgt_words += num_words report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum() - if i % opt.log_interval == 0 and i > 0: + if i % opt.log_interval == -1 % opt.log_interval: print("Epoch %2d, %5d/%5d; acc: %6.2f; ppl: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % (epoch, i, len(trainData), num_correct / num_words * 100, From e10644ced1d2e4c3911e88ee0e07357bbd920526 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 8 Mar 2017 23:02:17 -0800 Subject: [PATCH 36/44] curriculum off by one --- OpenNMT/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 4de27212b8..c4d43f2a42 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -180,7 +180,7 @@ def trainModel(model, trainData, validData, dataset, optim): start_time = time.time() def trainEpoch(epoch): - if opt.extra_shuffle and epoch >= opt.curriculum: + if opt.extra_shuffle and epoch > opt.curriculum: trainData.shuffle() # shuffle mini batch order @@ -192,7 +192,7 @@ def trainEpoch(epoch): start = time.time() for i in range(len(trainData)): - batchIdx = batchOrder[i] if epoch >= opt.curriculum else i + batchIdx = batchOrder[i] if epoch > opt.curriculum else i batch = trainData[batchIdx] model.zero_grad() From c4ae24c51ac34dabe9f464ab1fa57f359ae5d45e Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Wed, 8 Mar 2017 23:10:36 -0800 Subject: [PATCH 37/44] accuracy now an average over log_interval batches --- OpenNMT/train.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index c4d43f2a42..2e11409c17 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -186,9 +186,8 @@ def trainEpoch(epoch): # shuffle mini batch order batchOrder = torch.randperm(len(trainData)) - total_loss, report_loss = 0, 0 - total_words, report_tgt_words, report_src_words = 0, 0, 0 - total_num_correct = 0 + total_loss, total_words, total_num_correct = 0 + report_loss, report_tgt_words, report_src_words, report_num_correct = 0 start = time.time() for i in range(len(trainData)): @@ -206,23 +205,24 @@ def trainEpoch(epoch): # update the parameters optim.step() - report_loss += loss - total_num_correct += num_correct - total_loss += loss num_words = 
targets.data.ne(onmt.Constants.PAD).sum() - total_words += num_words + report_loss += loss + report_num_correct += num_correct report_tgt_words += num_words report_src_words += batch[0].data.ne(onmt.Constants.PAD).sum() + total_loss += loss + total_num_correct += num_correct + total_words += num_words if i % opt.log_interval == -1 % opt.log_interval: print("Epoch %2d, %5d/%5d; acc: %6.2f; ppl: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % (epoch, i, len(trainData), - num_correct / num_words * 100, + report_num_correct / report_tgt_words * 100, math.exp(report_loss / report_tgt_words), report_src_words/(time.time()-start), report_tgt_words/(time.time()-start), time.time()-start_time)) - report_loss = report_tgt_words = report_src_words = 0 + report_loss = report_tgt_words = report_src_words = report_num_correct = 0 start = time.time() return total_loss / total_words, total_num_correct / total_words From 0e728beb66e165bd11fb578ea13440702dcda4c1 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 9 Mar 2017 21:37:56 +0000 Subject: [PATCH 38/44] off by one in printing batch number --- OpenNMT/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 2e11409c17..97b2f7343c 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -186,8 +186,8 @@ def trainEpoch(epoch): # shuffle mini batch order batchOrder = torch.randperm(len(trainData)) - total_loss, total_words, total_num_correct = 0 - report_loss, report_tgt_words, report_src_words, report_num_correct = 0 + total_loss, total_words, total_num_correct = 0, 0, 0 + report_loss, report_tgt_words, report_src_words, report_num_correct = 0, 0, 0, 0 start = time.time() for i in range(len(trainData)): @@ -215,7 +215,7 @@ def trainEpoch(epoch): total_words += num_words if i % opt.log_interval == -1 % opt.log_interval: print("Epoch %2d, %5d/%5d; acc: %6.2f; ppl: %6.2f; %3.0f src tok/s; %3.0f tgt tok/s; %6.0f s elapsed" % - (epoch, i, len(trainData), + (epoch, i+1, len(trainData), report_num_correct / report_tgt_words * 100, math.exp(report_loss / report_tgt_words), report_src_words/(time.time()-start), From 065af29a8c59de0262397492909169440b5d90f2 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Thu, 9 Mar 2017 21:40:02 +0000 Subject: [PATCH 39/44] removing unused variables --- OpenNMT/onmt/Models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 99e82b15a9..b1a4821717 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -94,10 +94,6 @@ def __init__(self, opt, dicts): def forward(self, input, hidden, context, init_output): emb = self.word_lut(input) - batch_size = input.size(1) - - h_size = (batch_size, self.hidden_size) - # n.b. 
you can increase performance if you compute W_ih * x for all # iterations in parallel, but that's only possible if # self.input_feed=False From 22e4fb1df282b48cf12c3b7046f363be2e27fd2c Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 10 Mar 2017 00:04:47 +0000 Subject: [PATCH 40/44] saving with state_dict --- OpenNMT/onmt/Models.py | 5 ++-- OpenNMT/train.py | 68 ++++++++++++++++++++++-------------------- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index b1a4821717..9f21528000 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -115,11 +115,10 @@ def forward(self, input, hidden, context, init_output): class NMTModel(nn.Module): - def __init__(self, encoder, decoder, generator): + def __init__(self, encoder, decoder): super(NMTModel, self).__init__() self.encoder = encoder self.decoder = decoder - self.generator = generator self.generate = False def set_generate(self, enabled): @@ -150,7 +149,7 @@ def forward(self, input): self._fix_enc_hidden(enc_hidden[1])) out, dec_hidden, _attn = self.decoder(tgt, enc_hidden, context, init_output) - if self.generate: + if hasattr(self, 'generate') and self.generate: out = self.generator(out) return out diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 97b2f7343c..4bc2abac1c 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -76,9 +76,8 @@ shuffle and re-assign mini-batches""") #learning rate - -parser.add_argument('-update_learning_rate', action='store_true', - help="Decay learning rate regardless of optimizer") +parser.add_argument('-fix_learning_rate', action='store_false', dest='update_learning_rate', + help="Do not decay learning rate (may be desirable for some optimzers (e.g. Adam)") parser.add_argument('-learning_rate_decay', type=float, default=0.5, help="""If update_learning_rate, decay learning rate by this much if (i) perplexity does not decrease on the @@ -158,7 +157,7 @@ def eval(model, criterion, data): model.eval() for i in range(len(data)): batch = data[i] - outputs = model(batch) # FIXME volatile + outputs = model(batch) targets = batch[1][1:] # exclude from targets loss, _, num_correct = memoryEfficientLoss( outputs, targets, model.generator, criterion, eval=True) @@ -246,9 +245,13 @@ def trainEpoch(epoch): if opt.update_learning_rate: optim.updateLearningRate(valid_loss, epoch) + model_state_dict = model.module.state_dict() if len(opt.gpus) > 1 else model.state_dict() + model_state_dict = {k: v for k, v in model_state_dict.items() if 'generator' not in k} + generator_state_dict = model.generator.module.state_dict() if len(opt.gpus) > 1 else model.generator.state_dict() # (4) drop a checkpoint checkpoint = { - 'model': model, + 'model': model_state_dict, + 'generator': generator_state_dict, 'dicts': dataset['dicts'], 'opt': opt, 'epoch': epoch, @@ -262,7 +265,9 @@ def main(): print("Loading data from '%s'" % opt.data) dataset = torch.load(opt.data) + if opt.train_from: + print('Loading dicts from checkpoint at %s' % opt.train_from) checkpoint = torch.load(opt.train_from) dataset['dicts'] = checkpoint['dicts'] @@ -281,24 +286,33 @@ def main(): print('Building model...') - if opt.train_from is None: - encoder = onmt.Models.Encoder(opt, dicts['src']) - decoder = onmt.Models.Decoder(opt, dicts['tgt']) - generator = nn.Sequential( - nn.Linear(opt.rnn_size, dicts['tgt'].size()), - nn.LogSoftmax()) -# if len(opt.gpus) > 1: -# generator = nn.DataParallel(generator, device_ids=opt.gpus) - model = onmt.Models.NMTModel(encoder, decoder, 
generator) - if len(opt.gpus) > 1: - model = nn.DataParallel(model, device_ids=opt.gpus, dim=1) - if opt.gpus: - model.cuda() - else: - model.cpu() - - model.generator = generator + encoder = onmt.Models.Encoder(opt, dicts['src']) + decoder = onmt.Models.Decoder(opt, dicts['tgt']) + + generator = nn.Sequential( + nn.Linear(opt.rnn_size, dicts['tgt'].size()), + nn.LogSoftmax()) + + model = onmt.Models.NMTModel(encoder, decoder) + + if opt.train_from: + print('Loading model from checkpoint at %s' % opt.train_from) + model.load_state_dict(checkpoint['model']) + generator.load_state_dict(checkpoint['generator']) + optim = checkpoint['optim'] + opt.start_epoch = checkpoint['epoch'] + 1 + + if len(opt.gpus) >= 1: + model.cuda() + generator.cuda() + + if len(opt.gpus) > 1: + model = nn.DataParallel(model, device_ids=opt.gpus, dim=1) + generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0) + + model.generator = generator + if not opt.train_from: for p in model.parameters(): p.data.uniform_(-opt.param_init, opt.param_init) @@ -307,16 +321,6 @@ def main(): lr_decay=opt.learning_rate_decay, start_decay_at=opt.start_decay_at ) - else: - print('Loading from checkpoint at %s' % opt.train_from) - checkpoint = torch.load(opt.train_from) - model = checkpoint['model'] - if opt.gpus: - model.cuda() - else: - model.cpu() - optim = checkpoint['optim'] - opt.start_epoch = checkpoint['epoch'] + 1 nParams = sum([p.nelement() for p in model.parameters()]) print('* number of parameters: %d' % nParams) From 2c029712ae4f1d1a78d8f42d70fb2bd4fee92109 Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Fri, 10 Mar 2017 02:02:24 +0000 Subject: [PATCH 41/44] state_dicts for translation and optimizer --- OpenNMT/onmt/Models.py | 2 +- OpenNMT/onmt/Translator.py | 30 ++++++++++++++++++++++++------ OpenNMT/train.py | 23 +++++++++++++++-------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 9f21528000..7bde7090b1 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -149,7 +149,7 @@ def forward(self, input): self._fix_enc_hidden(enc_hidden[1])) out, dec_hidden, _attn = self.decoder(tgt, enc_hidden, context, init_output) - if hasattr(self, 'generate') and self.generate: + if hasattr(self, 'generator') and self.generate: out = self.generator(out) return out diff --git a/OpenNMT/onmt/Translator.py b/OpenNMT/onmt/Translator.py index ef586fe93e..80405b9117 100644 --- a/OpenNMT/onmt/Translator.py +++ b/OpenNMT/onmt/Translator.py @@ -1,4 +1,5 @@ import onmt +import torch.nn as nn import torch from torch.autograd import Variable @@ -9,17 +10,34 @@ def __init__(self, opt): self.tt = torch.cuda if opt.cuda else torch checkpoint = torch.load(opt.model) - self.model = checkpoint['model'] - self.model.eval() + model_opt = checkpoint['opt'] + self.src_dict = checkpoint['dicts']['src'] + self.tgt_dict = checkpoint['dicts']['tgt'] + + encoder = onmt.Models.Encoder(model_opt, self.src_dict) + decoder = onmt.Models.Decoder(model_opt, self.tgt_dict) + model = onmt.Models.NMTModel(encoder, decoder) + + generator = nn.Sequential( + nn.Linear(model_opt.rnn_size, self.tgt_dict.size()), + nn.LogSoftmax()) + + model.load_state_dict(checkpoint['model']) + generator.load_state_dict(checkpoint['generator']) if opt.cuda: - self.model.cuda() + model.cuda() + generator.cuda() else: - self.model.cpu() + model.cpu() + generator.cpu() + + model.generator = generator + + self.model = model + self.model.eval() - self.src_dict = checkpoint['dicts']['src'] - 
self.tgt_dict = checkpoint['dicts']['tgt'] def buildData(self, srcBatch, goldBatch): srcData = [self.src_dict.convertToIdx(b, diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 4bc2abac1c..2cd4e704ab 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -100,7 +100,7 @@ # GPU parser.add_argument('-gpus', default=[], nargs='+', type=int, - help="Use CUDA") + help="Use CUDA on the listed devices.") parser.add_argument('-log_interval', type=int, default=50, help="Print stats at this interval.") @@ -255,7 +255,8 @@ def trainEpoch(epoch): 'dicts': dataset['dicts'], 'opt': opt, 'epoch': epoch, - 'optim': optim, + 'optimizer': optim.optimizer.state_dict(), + 'last_ppl': optim.last_ppl, } torch.save(checkpoint, '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (opt.save_model, 100*valid_acc, valid_ppl, epoch)) @@ -299,12 +300,14 @@ def main(): print('Loading model from checkpoint at %s' % opt.train_from) model.load_state_dict(checkpoint['model']) generator.load_state_dict(checkpoint['generator']) - optim = checkpoint['optim'] opt.start_epoch = checkpoint['epoch'] + 1 if len(opt.gpus) >= 1: model.cuda() generator.cuda() + else: + model.cpu() + generator.cpu() if len(opt.gpus) > 1: model = nn.DataParallel(model, device_ids=opt.gpus, dim=1) @@ -316,11 +319,15 @@ def main(): for p in model.parameters(): p.data.uniform_(-opt.param_init, opt.param_init) - optim = onmt.Optim( - model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm, - lr_decay=opt.learning_rate_decay, - start_decay_at=opt.start_decay_at - ) + optim = onmt.Optim( + model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm, + lr_decay=opt.learning_rate_decay, + start_decay_at=opt.start_decay_at + ) + + if opt.train_from: + optim.last_ppl = checkpoint['last_ppl'] + optim.optimizer.load_state_dict(checkpoint['optimizer']) nParams = sum([p.nelement() for p in model.parameters()]) print('* number of parameters: %d' % nParams) From 6c8b710c469ffb859765afa016146c077ab2e672 Mon Sep 17 00:00:00 2001 From: Bryan McCann Date: Thu, 9 Mar 2017 18:16:00 -0800 Subject: [PATCH 42/44] Grouping bash commands together --- OpenNMT/README.md | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/OpenNMT/README.md b/OpenNMT/README.md index b5f7e632b6..96ea7dcfa0 100644 --- a/OpenNMT/README.md +++ b/OpenNMT/README.md @@ -28,9 +28,10 @@ Use of OpenNMT consists of four steps: ### 4) Evaluate. -```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` - -```perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt``` +```bash +wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl +perl multi-bleu.perl data/tgt-test.txt < demo_pred.txt +``` ## WMT'16 Multimodal Translation: Multi30k (de-en) @@ -38,27 +39,23 @@ Data might not come as clean as the demo data. Here is a second example that use ### 0) Download the data. 
-```mkdir -p data/multi30k``` - -```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz``` - -```wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz``` - -```wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/multi30k && rm mmt16_task1_test.tgz``` +```bash +mkdir -p data/multi30k +wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz +wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz +wget https://staff.fnwi.uva.nl/d.elliott/wmt16/mmt16_task1_test.tgz && tar -xf mmt16_task1_test.tgz -C data/multi30k && rm mmt16_task1_test.tgz +``` ### 1) Preprocess the data. -```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl``` - -```sed -i "s/$RealBin\/..\/share\/nonbreaking_prefixes//" tokenizer.perl``` - -```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de``` - -```wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en``` - -```for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done``` - -```python preprocess.py -train_src data/multi30k/train.en.tok -train_tgt data/multi30k/train.de.tok -valid_src data/multi30k/val.en.tok -valid_tgt data/multi30k/val.de.tok -save_data data/multi30k``` +```bash +wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl +sed -i "s/$RealBin\/..\/share\/nonbreaking_prefixes//" tokenizer.perl +wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de +wget https://github.com/moses-smt/mosesdecoder/blob/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en +for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; perl tokenizer.perl -no-escape -l $l -q < $f > $f.tok; done; done +python preprocess.py -train_src data/multi30k/train.en.tok -train_tgt data/multi30k/train.de.tok -valid_src data/multi30k/val.en.tok -valid_tgt data/multi30k/val.de.tok -save_data data/multi30k +``` ### 2) Train the model. @@ -70,9 +67,10 @@ Data might not come as clean as the demo data. Here is a second example that use ### 4) Evaluate. 
-```wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl``` - -```perl multi-bleu.perl data/multi30k/test.de.tok < multi30k_pred.txt``` +```bash +wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl +perl multi-bleu.perl data/multi30k/test.de.tok < multi30k_pred.txt +``` ## Pretrained Models From c359f4f67126493c9c05ba8d9e9a03e0a2118c7a Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 14 Mar 2017 16:09:03 -0700 Subject: [PATCH 43/44] backwards compatibility for checkpoints --- OpenNMT/train.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/OpenNMT/train.py b/OpenNMT/train.py index 2cd4e704ab..8aea7bac04 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -17,7 +17,10 @@ help="""Model filename (the model will be saved as _epochN_PPL.pt where PPL is the validation perplexity""") -parser.add_argument('-train_from', +parser.add_argument('-train_from_state_dict', default='', type=str, + help="""If training from a checkpoint then this is the + path to the pretrained model's state_dict.""") +parser.add_argument('-train_from', default='', type=str, help="""If training from a checkpoint then this is the path to the pretrained model.""") @@ -267,9 +270,10 @@ def main(): dataset = torch.load(opt.data) - if opt.train_from: - print('Loading dicts from checkpoint at %s' % opt.train_from) - checkpoint = torch.load(opt.train_from) + dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict + if dict_checkpoint: + print('Loading dicts from checkpoint at %s' % dict_checkpoint) + checkpoint = torch.load(dict_checkpoint) dataset['dicts'] = checkpoint['dicts'] trainData = onmt.Dataset(dataset['train']['src'], @@ -298,8 +302,16 @@ def main(): if opt.train_from: print('Loading model from checkpoint at %s' % opt.train_from) + chk_model = checkpoint['model'] + generator_state_dict = chk_model.generator.state_dict() + model_state_dict = {k: v for k, v in chk_model.state_dict().items() if 'generator' not in k} + model.load_state_dict(model_state_dict) + generator.load_state_dict(generator_state_dict) + opt.start_epoch = checkpoint['epoch'] + 1 + + if opt.train_from_state_dict: + print('Loading model from checkpoint at %s' % opt.train_from_state_dict) model.load_state_dict(checkpoint['model']) - generator.load_state_dict(checkpoint['generator']) opt.start_epoch = checkpoint['epoch'] + 1 if len(opt.gpus) >= 1: @@ -315,7 +327,7 @@ def main(): model.generator = generator - if not opt.train_from: + if not opt.train_from_state_dict and not opt.train_from: for p in model.parameters(): p.data.uniform_(-opt.param_init, opt.param_init) @@ -326,7 +338,9 @@ def main(): ) if opt.train_from: - optim.last_ppl = checkpoint['last_ppl'] + optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict()) + + if opt.train_from_state_dict: optim.optimizer.load_state_dict(checkpoint['optimizer']) nParams = sum([p.nelement() for p in model.parameters()]) From 8cebfba36258f0b52654c34233d5b7cb5fd667ad Mon Sep 17 00:00:00 2001 From: Bryan Marcus McCann Date: Tue, 14 Mar 2017 16:12:40 -0700 Subject: [PATCH 44/44] one more lowercase in dict --- OpenNMT/onmt/Dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/OpenNMT/onmt/Dict.py b/OpenNMT/onmt/Dict.py index 119fcf8933..cd60d37984 100644 --- a/OpenNMT/onmt/Dict.py +++ b/OpenNMT/onmt/Dict.py @@ -92,6 +92,7 @@ def prune(self, size): _, idx = torch.sort(freq, 0, True) newDict = Dict() + 
newDict.lower = self.lower # Add special entries in all cases. for i in self.special:
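
Note on the checkpoint format the later patches converge on: patches 40–43 replace pickled module objects with plain `state_dict`s, so a saved checkpoint carries the keys `model`, `generator`, `dicts`, `opt`, `epoch`, `optimizer`, and `last_ppl`, and `Translator` rebuilds the network before loading weights. The snippet below is only a minimal sketch of that loading pattern for use outside `translate.py`, mirroring the `Translator` constructor from patch 41; the checkpoint file name is a placeholder, and it assumes the `onmt` package from this repository is importable.

```python
# Minimal sketch (not part of the patch series): rebuild a model from a
# state_dict-style checkpoint as saved by train.py after patch 40.
import torch
import torch.nn as nn
import onmt

checkpoint = torch.load('demo_model_acc_60.00_ppl_20.00_e13.pt')  # placeholder file name
model_opt = checkpoint['opt']
src_dict = checkpoint['dicts']['src']
tgt_dict = checkpoint['dicts']['tgt']

# Rebuild the modules exactly as Translator does, then load the saved weights.
encoder = onmt.Models.Encoder(model_opt, src_dict)
decoder = onmt.Models.Decoder(model_opt, tgt_dict)
model = onmt.Models.NMTModel(encoder, decoder)
model.load_state_dict(checkpoint['model'])      # parameters only, no pickled classes

generator = nn.Sequential(
    nn.Linear(model_opt.rnn_size, tgt_dict.size()),
    nn.LogSoftmax())
generator.load_state_dict(checkpoint['generator'])

model.generator = generator
model.eval()
```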