

Add a maxout linear layer, as described in
https://arxiv.org/pdf/1302.4389v4.pdf

Use .get() for the new arg so that older saved models still load

Includes some comments on accuracy with maxout
AngledLuffa committed Dec 1, 2022
1 parent 5edd724 commit c708ce7
Showing 5 changed files with 84 additions and 7 deletions.
42 changes: 42 additions & 0 deletions stanza/models/common/maxout_linear.py
@@ -0,0 +1,42 @@
"""
A layer which implements maxout from the "Maxout Networks" paper
https://arxiv.org/pdf/1302.4389v4.pdf
Goodfellow, Warde-Farley, Mirza, Courville, Bengio
or a simpler explanation here:
https://stats.stackexchange.com/questions/129698/what-is-maxout-in-neural-network/298705#298705
The implementation here:
for maxout with k pieces mapping in -> out channels, we make a single
linear map of size in -> out*k,
then reshape the last dimension to (..., k, out)
and return the max over the k pieces
"""


import torch
import torch.nn as nn

class MaxoutLinear(nn.Module):
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.maxout_k = maxout_k

        self.linear = nn.Linear(in_channels, out_channels * maxout_k)

    def forward(self, inputs):
        """
        Use the oversized linear as the repeated linear, then take the max
        One large linear map makes the implementation simpler and easier for pytorch to parallelize
        """
        outputs = self.linear(inputs)
        outputs = outputs.view(*outputs.shape[:-1], self.maxout_k, self.out_channels)
        outputs = torch.max(outputs, dim=-2)[0]
        return outputs
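
To illustrate the reshape-and-max described above, here is a small usage sketch (not part of the commit; the batch size and channel sizes are made up for the example):

import torch

from stanza.models.common.maxout_linear import MaxoutLinear

# hypothetical sizes: batch of 4, 128 -> 32 channels, k = 3 maxout pieces
layer = MaxoutLinear(128, 32, 3)
x = torch.randn(4, 128)

# the single internal linear produces 32 * 3 = 96 features per input...
assert layer.linear.weight.shape == (96, 128)

# ...which forward() reshapes to (4, 3, 32) and reduces with a max over the 3 pieces
y = layer(x)
assert y.shape == (4, 32)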

21 changes: 14 additions & 7 deletions stanza/models/constituency/lstm_model.py
@@ -37,6 +37,7 @@
from torch.nn.utils.rnn import pack_padded_sequence

from stanza.models.common.bert_embedding import extract_bert_embeddings
from stanza.models.common.maxout_linear import MaxoutLinear
from stanza.models.common.utils import unsort
from stanza.models.common.vocab import PAD_ID, UNK_ID
from stanza.models.constituency.base_model import BaseModel
@@ -553,7 +554,9 @@ def __init__(self, pretrain, forward_charlm, backward_charlm, bert_model, bert_t

        # matrix for predicting the next transition using word/constituent/transition queues
        # word size + constituency size + transition size
        self.output_layers = self.build_output_layers(self.args['num_output_layers'], len(transitions))
        # TODO: .get() is only necessary until all models rebuilt with this param
        self.maxout_k = self.args.get('maxout_k', 0)
        self.output_layers = self.build_output_layers(self.args['num_output_layers'], len(transitions), self.maxout_k)

    @staticmethod
    def uses_lattn(args):
@@ -597,7 +600,7 @@ def copy_with_new_structure(self, other):
            else:
                self.get_parameter(name).data.copy_(other_parameter.data)

    def build_output_layers(self, num_output_layers, final_layer_size):
    def build_output_layers(self, num_output_layers, final_layer_size, maxout_k):
        """
        Build a ModuleList of Linear transformations for the given num_output_layers
@@ -611,10 +614,14 @@ def build_output_layers(self, num_output_layers, final_layer_size):
        # constituent_stack: hidden_size
        predict_input_size = [self.hidden_size + self.hidden_size * self.num_tree_lstm_layers + self.transition_hidden_size] + [self.hidden_size] * middle_layers
        predict_output_size = [self.hidden_size] * middle_layers + [final_layer_size]
        output_layers = nn.ModuleList([nn.Linear(input_size, output_size)
                                       for input_size, output_size in zip(predict_input_size, predict_output_size)])
        for output_layer, input_size in zip(output_layers, predict_input_size):
            initialize_linear(output_layer, self.args['nonlinearity'], input_size)
        if not maxout_k:
            output_layers = nn.ModuleList([nn.Linear(input_size, output_size)
                                           for input_size, output_size in zip(predict_input_size, predict_output_size)])
            for output_layer, input_size in zip(output_layers, predict_input_size):
                initialize_linear(output_layer, self.args['nonlinearity'], input_size)
        else:
            output_layers = nn.ModuleList([MaxoutLinear(input_size, output_size, maxout_k)
                                           for input_size, output_size in zip(predict_input_size, predict_output_size)])
        return output_layers

    def num_words_known(self, words):
@@ -1035,7 +1042,7 @@ def forward(self, states):
        hx = torch.cat((word_hx, transition_hx, constituent_hx), axis=1)
        for idx, output_layer in enumerate(self.output_layers):
            hx = self.predict_dropout(hx)
            if idx < len(self.output_layers) - 1:
            if not self.maxout_k and idx < len(self.output_layers) - 1:
                hx = self.nonlinearity(hx)
            hx = output_layer(hx)
        return hx
1 change: 1 addition & 0 deletions stanza/models/constituency/trainer.py
@@ -96,6 +96,7 @@ def model_from_params(params, args, foundation_cache=None):
    update_args.pop("num_tree_lstm_layers", None)
    update_args.pop("transition_scheme", None)
    update_args.pop("transition_stack", None)
    update_args.pop("maxout_k", None)
    saved_args.update(update_args)

model_type = params['model_type']
9 changes: 9 additions & 0 deletions stanza/models/constituency_parser.py
@@ -447,6 +447,15 @@ def parse_args(args=None):
    # trading places in terms of accuracy over those ~500 iterations.
    # leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919
    parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. relu is a noticeable improvement over tanh')
    # In one experiment on an Italian dataset, VIT, we got the following:
    # 0.8254 with relu as the nonlinearity (10 trials)
    # 0.8265 with maxout, k = 2 (15 trials)
    # 0.8253 with maxout, k = 3 (5 trials)
    # The speed in terms of trees/second might be slightly slower with maxout:
    # 51.4 it/s on a Titan Xp with maxout 2 vs 51.9 it/s with relu.
    # It might also be worth running some experiments with bigger
    # output layers to see if that makes up for the difference in score.
    parser.add_argument('--maxout_k', default=None, type=int, help="Use maxout layers instead of a nonlinearity for the output layers")

    parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Use/don't use words from the silver dataset")
    parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Use/don't use words from the silver dataset")
18 changes: 18 additions & 0 deletions stanza/tests/constituency/test_lstm_model.py
@@ -417,6 +417,24 @@ def test_lstm_tree_cx_forward(pretrain_file):
    model = build_model(pretrain_file, '--num_tree_lstm_layers', '3', '--constituency_composition', 'tree_lstm_cx')
    run_forward_checks(model)

def test_maxout(pretrain_file):
    """
    Test with and without maxout layers for output
    """
    model = build_model(pretrain_file, '--maxout_k', '0')
    run_forward_checks(model)
    # check the output size & implicitly check the type
    # to check for a particularly silly bug
    assert model.output_layers[-1].weight.shape[0] == len(model.transitions)

    model = build_model(pretrain_file, '--maxout_k', '2')
    run_forward_checks(model)
    assert model.output_layers[-1].linear.weight.shape[0] == len(model.transitions) * 2

    model = build_model(pretrain_file, '--maxout_k', '3')
    run_forward_checks(model)
    assert model.output_layers[-1].linear.weight.shape[0] == len(model.transitions) * 3

def check_structure_test(pretrain_file, args1, args2):
    """
    Test that the "copy" method copies the parameters from one model to another
