From 1095ce2506ea7bb728c3344d8e5329747d317d24 Mon Sep 17 00:00:00 2001
From: Ofir Press
Date: Mon, 13 Mar 2017 14:50:34 +0200
Subject: [PATCH 1/2] Update model.py

updated attribution of weight tying
---
 word_language_model/model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/word_language_model/model.py b/word_language_model/model.py
index a54e84c8b5..4947d93617 100644
--- a/word_language_model/model.py
+++ b/word_language_model/model.py
@@ -20,6 +20,9 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weigh
         self.decoder = nn.Linear(nhid, ntoken)
 
         # Optionally tie weights as in:
+        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
+        # https://arxiv.org/abs/1608.05859
+        # and
         # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
         # https://arxiv.org/abs/1611.01462
         if tie_weights:

From e808ab8e287f7999079fa95c70c666f923462732 Mon Sep 17 00:00:00 2001
From: Ofir Press
Date: Mon, 13 Mar 2017 14:52:43 +0200
Subject: [PATCH 2/2] Update README.md

updated attribution of weight tying
---
 word_language_model/README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/word_language_model/README.md b/word_language_model/README.md
index 30afd1d578..768e579f56 100644
--- a/word_language_model/README.md
+++ b/word_language_model/README.md
@@ -53,6 +53,4 @@ python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tie
 
 These perplexities are equal or better than
 [Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
-and are similar to
-[Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf),
-though Inan et al. have improved perplexities by using a form of recurrent dropout (variational dropout).
+and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016)](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout [(variational dropout)](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks).
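For context on the attribution change above, below is a minimal sketch (not part of the patch, and with the RNN layers and forward pass omitted) of how the `tie_weights` option is typically wired up in `word_language_model/model.py`-style code: the decoder's weight matrix is made to reference the encoder's embedding matrix, which requires the embedding size and hidden size to match (hence `--emsize 1500 --nhid 1500 ... --tied` in the README commands).

```python
import torch.nn as nn


class RNNModel(nn.Module):
    """Condensed sketch of encoder/decoder weight tying, assuming the usual
    Embedding-in / Linear-out layout; not the full model.py."""

    def __init__(self, ntoken, ninp, nhid, tie_weights=False):
        super(RNNModel, self).__init__()
        self.encoder = nn.Embedding(ntoken, ninp)  # input embedding, shape (ntoken, ninp)
        self.decoder = nn.Linear(nhid, ntoken)     # output projection, weight shape (ntoken, nhid)

        # Optionally tie weights as in Press & Wolf 2016 and Inan et al. 2016:
        # the decoder reuses the encoder's embedding matrix, so the embedding
        # size must equal the hidden size (ninp == nhid) for the shapes to match.
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
```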