From 17f09b00eed26a8083b26bfcc13fc9908c64788e Mon Sep 17 00:00:00 2001 From: Sam Bowman Date: Tue, 12 May 2015 13:46:50 -0700 Subject: [PATCH] Quick test implementation of 'First Past' lattice. --- AdaDeltaUpdate.m | 2 +- ...teBatchSentenceClassificationCostAndGrad.m | 2 - LatticeBatch.m | 43 ++++++++++++------- RunExperiments/RunSSTExperiments.sh | 16 ++++--- TrainModel.m | 26 ++++++----- config/ALCIR.m | 1 + config/CompositionSetup.m | 2 +- config/Defaults.m | 3 ++ layer-fns/ComputeFirstPast.m | 36 ++++++++++++++++ layer-fns/ComputeFirstPastGradient.m | 24 +++++++++++ layer-fns/SigmoidDeriv.m | 6 ++- minFunc/autoDif/derivativeCheck.m | 2 +- 12 files changed, 121 insertions(+), 42 deletions(-) create mode 100644 layer-fns/ComputeFirstPast.m create mode 100644 layer-fns/ComputeFirstPastGradient.m diff --git a/AdaDeltaUpdate.m b/AdaDeltaUpdate.m index bd2fb210..d396d5bb 100644 --- a/AdaDeltaUpdate.m +++ b/AdaDeltaUpdate.m @@ -7,7 +7,7 @@ if length(embGrad) > 0 % Set up a separate SumSqGrad tracker for the embeddings. modelState.sumSqEmbGrad = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode); - modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode; + modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode); end end diff --git a/ComputeBatchSentenceClassificationCostAndGrad.m b/ComputeBatchSentenceClassificationCostAndGrad.m index 291fd3b7..fdcf8bd5 100644 --- a/ComputeBatchSentenceClassificationCostAndGrad.m +++ b/ComputeBatchSentenceClassificationCostAndGrad.m @@ -5,7 +5,6 @@ % NOTE: This is reasonably well optimized. The time complexity here lies almost entirely within the batch objects in normal cases. -tic B = length(data); % Batch size. @@ -195,6 +194,5 @@ assert(hasInf, 'Infs in computed gradient.'); end -toc end diff --git a/LatticeBatch.m b/LatticeBatch.m index 91956b01..fe03627a 100644 --- a/LatticeBatch.m +++ b/LatticeBatch.m @@ -167,10 +167,16 @@ lb.scores(1:row, :, row) = ComputeSlantLayer(lb.scores(1:row, :, row), hyperParams.latticeSlant); end - % Softmax the scores. - [ merges, localConnectionCosts, probCorrect ] = ... - ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)'); - + if hyperParams.latticeFirstPastThreshold == 0 + % Softmax the scores. + [ merges, localConnectionCosts, probCorrect ] = ... + ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)'); + else + [ merges, probCorrect, localConnectionCosts ] = ComputeFirstPast(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, ... + lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), hyperParams.latticeFirstPastHardMax); + localConnectionCosts(~isfinite(localConnectionCosts)) = 0; + end + lb.connections(3, :, 1:row, row) = merges'; if hyperParams.latticeLocalCurriculum @@ -345,19 +351,24 @@ if row > 1 merges = permute(lb.connections(3, :, 1:row, row), [3, 2, 1, 4]); - % Compute gradients for the scores wrt. the incoming deltas from above and to the right. - [ ~, incomingDeltasToScores ] = ... - ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu); - - % Compute gradients for the scores wrt. 
the independent connection supervision signal. - [ ~, labelDeltasToScores ] = ... - ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ... - lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)'); - - deltasToScores = labelDeltasToScores + incomingDeltasToScores; - + if hyperParams.latticeFirstPastThreshold == 0 + % Compute gradients for the scores wrt. the incoming deltas from above and to the right. + [ ~, incomingDeltasToScores ] = ... + ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu); + + % Compute gradients for the scores wrt. the independent connection supervision signal. + [ ~, labelDeltasToScores ] = ... + ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ... + lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)'); + + deltasToScores = labelDeltasToScores + incomingDeltasToScores; + else + deltasToScores = ComputeFirstPastGradient(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, merges, deltasToMerges, ... + lb.connectionLabels(:, row), hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)', hyperParams.latticeFirstPastHardMax); + end + % Overwrite 0/0 deltas from inactive nodes. - deltasToScores(isnan(deltasToScores)) = 0; + deltasToScores(~isfinite(deltasToScores)) = 0; if hyperParams.latticeSlant > 0 deltasToScores = ComputeSlantLayerGradients(lb.slantInputs(1:row, :, row), ... diff --git a/RunExperiments/RunSSTExperiments.sh b/RunExperiments/RunSSTExperiments.sh index c8700e28..2e462dec 100644 --- a/RunExperiments/RunSSTExperiments.sh +++ b/RunExperiments/RunSSTExperiments.sh @@ -57,17 +57,21 @@ export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td = 2; penult = 75; comp = 6; dropout = [0.9\, 0.9]; collo = 1; dataflag='dg-only-sst'; name='/scr/sbowman/updaterule4d'; TrainModel(''\, 1\, @Sick\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, comp\, dropout(1)\, dropout(2)\, collo\, 1\, conCost);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 % Re-run, then manic tuning +export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 export MATLABCMD="cd quant; lambda = 0.000001; 
dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 3\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 2\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 export MATLABCMD="cd quant; lambda = 0.000001; dim = 75; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 -export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 -export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 -export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 - - - function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs) +export MATLABCMD="cd quant; lambda = 0.0001; dim = 40; ed = 300; td = 2; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 
.3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.001; dim = 50; ed = 300; td = 3; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.00001; dim = 30; ed = 300; td = 4; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, .3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 5; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.00001; dim = 50; ed = 300; td = 5; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 +export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 3; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6 + function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs) diff --git a/TrainModel.m b/TrainModel.m index 71a68bad..7043cf6f 100644 --- a/TrainModel.m +++ b/TrainModel.m @@ -20,20 +20,18 @@ function TrainModel(pretrainingFilename, fold, ConfigFn, varargin) addpath('layer-fns/') % Set up paralellization -if false - if isempty(gcp('nocreate')) - c = parcluster(); - t = tempname(); - mkdir(t); - c.JobStorageLocation = t; - c.NumWorkers = 4; - if exist('parpool') - % >= 2013b - parpool(c, 4); - else - % < 2013b - matlabpool(c, 4); - end +if isempty(gcp('nocreate')) + c = parcluster(); + t = tempname(); + mkdir(t); + c.JobStorageLocation = t; + c.NumWorkers = 4; + if exist('parpool') + % >= 2013b + parpool(c, 4); + else + % < 2013b + matlabpool(c, 4); end end diff --git a/config/ALCIR.m b/config/ALCIR.m index 892321ee..c964e770 100644 --- a/config/ALCIR.m +++ b/config/ALCIR.m @@ -26,6 +26,7 @@ hyperParams.latticeSlant = slant; end +hyperParams.latticeConnectionHiddenDim = 25; hyperParams.parensInSequences = 1; diff --git a/config/CompositionSetup.m b/config/CompositionSetup.m index 41c8985a..4a5f23d5 100644 --- a/config/CompositionSetup.m +++ b/config/CompositionSetup.m @@ -26,7 +26,7 @@ hyperParams.lstm = 0; hyperParams.useTrees = 0; hyperParams.useThirdOrderComposition = 0; - hyperParams.useThirdOrderMerge = 1; + hyperParams.useThirdOrderMerge = 0; elseif composition == 5 hyperParams.useLattices = 1; hyperParams.lstm = 0; diff 
--git a/config/Defaults.m b/config/Defaults.m
index b7a37fb1..7e43e4cc 100644
--- a/config/Defaults.m
+++ b/config/Defaults.m
@@ -41,6 +41,9 @@
 
 hyperParams.latticeSlant = 0;
 
+hyperParams.latticeFirstPastThreshold = 0.45;
+hyperParams.latticeFirstPastHardMax = 1;
+
 % The number of embedding transform layers. topDepth = 1 means an NN layer will be
 % added above the embedding matrix. This is likely to only be useful when
 % learnWords is false, and so the embeddings do not exist in the same space
diff --git a/layer-fns/ComputeFirstPast.m b/layer-fns/ComputeFirstPast.m
new file mode 100644
index 00000000..ccfb9f2e
--- /dev/null
+++ b/layer-fns/ComputeFirstPast.m
@@ -0,0 +1,36 @@
+function [ weights, correctWeight, loss ] = ComputeFirstPast(scores, threshold, labels, multipliers, hardMax)
+
+% Sigmoid to get a bounded range.
+scores = Sigmoid(scores);
+
+% Lower the threshold to the column max if nothing passes it.
+effectiveThreshold = min([repmat(threshold, 1, size(scores, 2)); max(scores, [], 1)]);
+
+% Choose the first entry to pass the threshold.
+[ ~, bestIndex ] = max(scores >= repmat(effectiveThreshold, size(scores, 1), 1), [], 1);
+
+loss = zeros(size(scores, 2), 1, 'like', scores);
+paddedScores = [zeros(1, size(scores, 2)); scores];
+for b = 1:size(scores, 2)
+    loss(b) = max(0, threshold - paddedScores(labels(b) + 1, b));
+
+    if labels(b) > 1
+        competingScores = scores(1:labels(b) - 1, b);
+        loss(b) = loss(b) + ...
+            sum(competingScores(competingScores >= repmat(threshold, labels(b) - 1, 1)) - threshold);
+    end
+end
+
+weights = zeros(size(scores), 'like', scores);
+if hardMax
+    weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = 1;
+else
+    weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = scores(sub2ind(size(scores), bestIndex, 1:size(scores, 2)));
+end
+
+paddedWeights = [zeros(1, size(weights, 2)); weights];
+correctWeight = paddedWeights(sub2ind(size(paddedWeights), labels' + 1, 1:length(labels)));
+
+loss = loss .* multipliers;
+
+end
diff --git a/layer-fns/ComputeFirstPastGradient.m b/layer-fns/ComputeFirstPastGradient.m
new file mode 100644
index 00000000..78d17bdc
--- /dev/null
+++ b/layer-fns/ComputeFirstPastGradient.m
@@ -0,0 +1,24 @@
+function [ deltas ] = ComputeFirstPastGradient(scores, threshold, weights, deltasIn, labels, multipliers, hardMax)
+
+% TODO: Store the sigmoided scores from the forward pass rather than recomputing them.
+scores = Sigmoid(scores);
+
+deltas = zeros(size(deltasIn), 'like', deltasIn);
+
+% Looping for now during development.
+for b = 1:size(scores, 2) + if labels(b) > 0 + deltas(labels(b), b) = deltas(labels(b), b) - (scores(labels(b), b) < threshold); + deltas(1:labels(b) - 1, b) = deltas(1:labels(b) - 1, b) + (scores(1:labels(b) - 1, b) >= repmat(threshold, labels(b) - 1, 1)); + end +end + +deltas = bsxfun(@times, deltas, multipliers'); + +if ~hardMax + deltas = deltas + deltasIn .* (weights > 0); +end + +deltas = deltas .* SigmoidDeriv([], scores); + +end diff --git a/layer-fns/SigmoidDeriv.m b/layer-fns/SigmoidDeriv.m index 53cb1c23..011d89cb 100644 --- a/layer-fns/SigmoidDeriv.m +++ b/layer-fns/SigmoidDeriv.m @@ -2,6 +2,10 @@ function deriv = SigmoidDeriv (in, out) % Compute the gradient of the sigmoid (now actually tanh) nonlinearity -assert(0, 'Not used.') +if isempty(out) + out = Sigmoid(in); +end + +deriv = out .* (1.0 - out); end diff --git a/minFunc/autoDif/derivativeCheck.m b/minFunc/autoDif/derivativeCheck.m index fb950a61..3dec680e 100644 --- a/minFunc/autoDif/derivativeCheck.m +++ b/minFunc/autoDif/derivativeCheck.m @@ -46,7 +46,7 @@ = stack2param(g, varargin{1}); mergeMatrices, mergeMatrix, ... - softmaxMatrix, trainedWordFeatures, connectionMatrix, ... + softmaxMatrix, trainedWordFeatures, connectionMatrix, scoringVector, ... compositionMatrix, classifierExtraMatrix, embeddingTransformMatrix fprintf('User gradient error (diff/abs+abs method, zeroing out tiny gradients):\n');
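As a quick illustration (not part of the patch itself), here is the first-past rule from layer-fns/ComputeFirstPast.m run by hand in MATLAB on a toy 3-candidate, 3-example batch. The scores and batch layout below are invented for the example; only the 0.45 threshold matches the new Defaults.m setting.

% Squash scores with a sigmoid, lower the threshold to the column max when
% nothing clears it, then select the first candidate that passes.
scores = [-2.0,  2.0, -3.0;
           0.1, -1.0, -2.0;
           1.5,  0.2, -1.0];   % rows = merge candidates, columns = batch
threshold = 0.45;              % hyperParams.latticeFirstPastThreshold

s = 1 ./ (1 + exp(-scores));   % Sigmoid, bounding scores to (0, 1)
effectiveThreshold = min([repmat(threshold, 1, size(s, 2)); max(s, [], 1)]);
[ ~, bestIndex ] = max(s >= repmat(effectiveThreshold, size(s, 1), 1), [], 1);
% bestIndex comes out as [2, 1, 3]: example 1 skips its sub-threshold first
% candidate, example 2 stops at its first candidate, and example 3, where
% nothing clears 0.45, falls back to its highest-scoring candidate.

With latticeFirstPastHardMax on (the new default), the chosen candidate's merge weight is set to 1; otherwise it keeps its sigmoid score, which is what lets the incoming deltas flow through the selection in the soft case.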
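The supervision term in layer-fns/ComputeFirstPastGradient.m is the gradient of the hinge-style loss from ComputeFirstPast.m: the gold candidate is pulled up toward the threshold, and any earlier candidate that wrongly clears it is pushed back down. Below is a finite-difference sanity check of just that term; the soft pass-through of deltasIn and the connection-cost multipliers are omitted, and all data is invented.

sig = @(x) 1 ./ (1 + exp(-x));
threshold = 0.45;
label = 2;                 % gold connection index for one example
raw = [0.4; -0.85; 1.2];   % pre-sigmoid scores for that example
s = sig(raw);

% Analytic gradient, matching the loop body in ComputeFirstPastGradient.m.
analytic = zeros(3, 1);
analytic(label) = -(s(label) < threshold);
analytic(1:label - 1) = s(1:label - 1) >= threshold;
analytic = analytic .* s .* (1 - s);   % chain rule through the sigmoid

% Numeric gradient of the corresponding hinge loss from ComputeFirstPast.m.
lossAt = @(r) max(0, threshold - sig(r(label))) + ...
    sum(max(0, sig(r(1:label - 1)) - threshold));
numeric = zeros(3, 1);
h = 1e-6;
for i = 1:3
    d = zeros(3, 1);
    d(i) = h;
    numeric(i) = (lossAt(raw + d) - lossAt(raw - d)) / (2 * h);
end
% numeric and analytic agree to ~1e-9 here; they can disagree only at the
% hinge points themselves, where the loss is not differentiable.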