

Add a maxout linear layer, as described in
https://arxiv.org/pdf/1302.4389v4.pdf

Use .get() for the new arg so that older saved models still load

Includes some comments on accuracy with maxout
AngledLuffa committed Dec 1, 2022
1 parent 5edd724 commit c708ce7
Showing 5 changed files with 84 additions and 7 deletions.
42 changes: 42 additions & 0 deletions stanza/models/common/maxout_linear.py
@@ -0,0 +1,42 @@
"""
A layer which implements maxout from the "Maxout Networks" paper
https://arxiv.org/pdf/1302.4389v4.pdf
Goodfellow, Warde-Farley, Mirza, Courville, Bengio
or a simpler explanation here:
https://stats.stackexchange.com/questions/129698/what-is-maxout-in-neural-network/298705#298705
The implementation here:
for maxout with k pieces mapping in -> out channels, we make a single
linear map of size in -> out*k,
then reshape the last dimension to (..., k, out)
and return the max over the k pieces
"""


import torch
import torch.nn as nn

class MaxoutLinear(nn.Module):
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.maxout_k = maxout_k

        self.linear = nn.Linear(in_channels, out_channels * maxout_k)

    def forward(self, inputs):
        """
        Use the oversized linear as the repeated linear, then take the max
        One large linear map makes the implementation simpler and easier for pytorch to parallelize
        """
        outputs = self.linear(inputs)
        outputs = outputs.view(*outputs.shape[:-1], self.maxout_k, self.out_channels)
        outputs = torch.max(outputs, dim=-2)[0]
        return outputs
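
To illustrate the reshape-and-max described above, here is a small usage sketch (not part of the commit; the batch size and channel sizes are made up for the example):

import torch

from stanza.models.common.maxout_linear import MaxoutLinear

# hypothetical sizes: batch of 4, 128 -> 32 channels, k = 3 maxout pieces
layer = MaxoutLinear(128, 32, 3)
x = torch.randn(4, 128)

# the single internal linear produces 32 * 3 = 96 features per input...
assert layer.linear.weight.shape == (96, 128)

# ...which forward() reshapes to (4, 3, 32) and reduces with a max over the 3 pieces
y = layer(x)
assert y.shape == (4, 32)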

21 changes: 14 additions & 7 deletions stanza/models/constituency/lstm_model.py
@@ -37,6 +37,7 @@
from torch.nn.utils.rnn import pack_padded_sequence

from stanza.models.common.bert_embedding import extract_bert_embeddings
from stanza.models.common.maxout_linear import MaxoutLinear
from stanza.models.common.utils import unsort
from stanza.models.common.vocab import PAD_ID, UNK_ID
from stanza.models.constituency.base_model import BaseModel
@@ -553,7 +554,9 @@ def __init__(self, pretrain, forward_charlm, backward_charlm, bert_model, bert_t

        # matrix for predicting the next transition using word/constituent/transition queues
        # word size + constituency size + transition size
        self.output_layers = self.build_output_layers(self.args['num_output_layers'], len(transitions))
        # TODO: .get() is only necessary until all models rebuilt with this param
        self.maxout_k = self.args.get('maxout_k', 0)
        self.output_layers = self.build_output_layers(self.args['num_output_layers'], len(transitions), self.maxout_k)

    @staticmethod
    def uses_lattn(args):
@@ -597,7 +600,7 @@ def copy_with_new_structure(self, other):
            else:
                self.get_parameter(name).data.copy_(other_parameter.data)

    def build_output_layers(self, num_output_layers, final_layer_size):
    def build_output_layers(self, num_output_layers, final_layer_size, maxout_k):
        """
        Build a ModuleList of Linear transformations for the given num_output_layers
@@ -611,10 +614,14 @@ def build_output_layers(self, num_output_layers, final_layer_size):
        # constituent_stack: hidden_size
        predict_input_size = [self.hidden_size + self.hidden_size * self.num_tree_lstm_layers + self.transition_hidden_size] + [self.hidden_size] * middle_layers
        predict_output_size = [self.hidden_size] * middle_layers + [final_layer_size]
        output_layers = nn.ModuleList([nn.Linear(input_size, output_size)
                                       for input_size, output_size in zip(predict_input_size, predict_output_size)])
        for output_layer, input_size in zip(output_layers, predict_input_size):
            initialize_linear(output_layer, self.args['nonlinearity'], input_size)
        if not maxout_k:
            output_layers = nn.ModuleList([nn.Linear(input_size, output_size)
                                           for input_size, output_size in zip(predict_input_size, predict_output_size)])
            for output_layer, input_size in zip(output_layers, predict_input_size):
                initialize_linear(output_layer, self.args['nonlinearity'], input_size)
        else:
            output_layers = nn.ModuleList([MaxoutLinear(input_size, output_size, maxout_k)
                                           for input_size, output_size in zip(predict_input_size, predict_output_size)])
        return output_layers

    def num_words_known(self, words):
@@ -1035,7 +1042,7 @@ def forward(self, states):
        hx = torch.cat((word_hx, transition_hx, constituent_hx), axis=1)
        for idx, output_layer in enumerate(self.output_layers):
            hx = self.predict_dropout(hx)
            if idx < len(self.output_layers) - 1:
            if not self.maxout_k and idx < len(self.output_layers) - 1:
                hx = self.nonlinearity(hx)
            hx = output_layer(hx)
        return hx
1 change: 1 addition & 0 deletions stanza/models/constituency/trainer.py
@@ -96,6 +96,7 @@ def model_from_params(params, args, foundation_cache=None):
    update_args.pop("num_tree_lstm_layers", None)
    update_args.pop("transition_scheme", None)
    update_args.pop("transition_stack", None)
    update_args.pop("maxout_k", None)
    saved_args.update(update_args)

model_type = params['model_type']
9 changes: 9 additions & 0 deletions stanza/models/constituency_parser.py
@@ -447,6 +447,15 @@ def parse_args(args=None):
    # trading places in terms of accuracy over those ~500 iterations.
    # leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919
    parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. relu is a noticeable improvement over tanh')
    # In one experiment on an Italian dataset, VIT, we got the following:
    # 0.8254 with relu as the nonlinearity (10 trials)
    # 0.8265 with maxout, k = 2 (15 trials)
    # 0.8253 with maxout, k = 3 (5 trials)
    # The speed in terms of trees/second might be slightly slower with maxout:
    # 51.4 it/s on a Titan Xp with maxout 2 vs 51.9 it/s with relu.
    # It might also be worth running some experiments with bigger
    # output layers to see if that makes up for the difference in score.
    parser.add_argument('--maxout_k', default=None, type=int, help="Use maxout layers instead of a nonlinearity for the output layers")

    parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Use/don't use words from the silver dataset")
    parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Use/don't use words from the silver dataset")
18 changes: 18 additions & 0 deletions stanza/tests/constituency/test_lstm_model.py
@@ -417,6 +417,24 @@ def test_lstm_tree_cx_forward(pretrain_file):
    model = build_model(pretrain_file, '--num_tree_lstm_layers', '3', '--constituency_composition', 'tree_lstm_cx')
    run_forward_checks(model)

def test_maxout(pretrain_file):
    """
    Test with and without maxout layers for output
    """
    model = build_model(pretrain_file, '--maxout_k', '0')
    run_forward_checks(model)
    # check the output size & implicitly check the type
    # to check for a particularly silly bug
    assert model.output_layers[-1].weight.shape[0] == len(model.transitions)

    model = build_model(pretrain_file, '--maxout_k', '2')
    run_forward_checks(model)
    assert model.output_layers[-1].linear.weight.shape[0] == len(model.transitions) * 2

    model = build_model(pretrain_file, '--maxout_k', '3')
    run_forward_checks(model)
    assert model.output_layers[-1].linear.weight.shape[0] == len(model.transitions) * 3

def check_structure_test(pretrain_file, args1, args2):
    """
    Test that the "copy" method copies the parameters from one model to another
