
Commit

Quick test implementation of 'First Past' lattice.
Sam Bowman committed May 12, 2015
1 parent 1340534 commit 17f09b0
Showing 12 changed files with 121 additions and 42 deletions.
2 changes: 1 addition & 1 deletion AdaDeltaUpdate.m
@@ -7,7 +7,7 @@
if length(embGrad) > 0
% Set up a separate SumSqGrad tracker for the embeddings.
modelState.sumSqEmbGrad = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode);
modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode;
modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode);
end
end

2 changes: 0 additions & 2 deletions ComputeBatchSentenceClassificationCostAndGrad.m
@@ -5,7 +5,6 @@

% NOTE: This is reasonably well optimized. In normal cases, nearly all of the runtime here is spent inside the batch objects.

tic

B = length(data); % Batch size.

@@ -195,6 +194,5 @@
assert(~hasInf, 'Infs in computed gradient.');
end

toc

end
43 changes: 27 additions & 16 deletions LatticeBatch.m
@@ -167,10 +167,16 @@
lb.scores(1:row, :, row) = ComputeSlantLayer(lb.scores(1:row, :, row), hyperParams.latticeSlant);
end

% Softmax the scores.
[ merges, localConnectionCosts, probCorrect ] = ...
ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)');

if hyperParams.latticeFirstPastThreshold == 0
% Softmax the scores.
[ merges, localConnectionCosts, probCorrect ] = ...
ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)');
else
[ merges, probCorrect, localConnectionCosts ] = ComputeFirstPast(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, ...
lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), hyperParams.latticeFirstPastHardMax);
localConnectionCosts(~isfinite(localConnectionCosts)) = 0;
end

lb.connections(3, :, 1:row, row) = merges';

if hyperParams.latticeLocalCurriculum
@@ -345,19 +351,24 @@
if row > 1
merges = permute(lb.connections(3, :, 1:row, row), [3, 2, 1, 4]);

% Compute gradients for the scores wrt. the incoming deltas from above and to the right.
[ ~, incomingDeltasToScores ] = ...
ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu);

% Compute gradients for the scores wrt. the independent connection supervision signal.
[ ~, labelDeltasToScores ] = ...
ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ...
lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)');

deltasToScores = labelDeltasToScores + incomingDeltasToScores;

if hyperParams.latticeFirstPastThreshold == 0
% Compute gradients for the scores wrt. the incoming deltas from above and to the right.
[ ~, incomingDeltasToScores ] = ...
ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu);

% Compute gradients for the scores wrt. the independent connection supervision signal.
[ ~, labelDeltasToScores ] = ...
ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ...
lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)');

deltasToScores = labelDeltasToScores + incomingDeltasToScores;
else
deltasToScores = ComputeFirstPastGradient(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, merges, deltasToMerges, ...
lb.connectionLabels(:, row), hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)', hyperParams.latticeFirstPastHardMax);
end

% Overwrite 0/0 deltas from inactive nodes.
deltasToScores(isnan(deltasToScores)) = 0;
deltasToScores(~isfinite(deltasToScores)) = 0;

if hyperParams.latticeSlant > 0
deltasToScores = ComputeSlantLayerGradients(lb.slantInputs(1:row, :, row), ...
16 changes: 10 additions & 6 deletions RunExperiments/RunSSTExperiments.sh
@@ -57,17 +57,21 @@ export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td
export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td = 2; penult = 75; comp = 6; dropout = [0.9\, 0.9]; collo = 1; dataflag='dg-only-sst'; name='/scr/sbowman/updaterule4d'; TrainModel(''\, 1\, @Sick\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, comp\, dropout(1)\, dropout(2)\, collo\, 1\, conCost);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6

% Re-run, then manic tuning
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 3\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 2\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 75; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6




function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs)
export MATLABCMD="cd quant; lambda = 0.0001; dim = 40; ed = 300; td = 2; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, .3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 50; ed = 300; td = 3; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.00001; dim = 30; ed = 300; td = 4; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, .3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 5; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.00001; dim = 50; ed = 300; td = 5; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 3; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs)

26 changes: 12 additions & 14 deletions TrainModel.m
@@ -20,20 +20,18 @@ function TrainModel(pretrainingFilename, fold, ConfigFn, varargin)
addpath('layer-fns/')

% Set up parallelization
if false
if isempty(gcp('nocreate'))
c = parcluster();
t = tempname();
mkdir(t);
c.JobStorageLocation = t;
c.NumWorkers = 4;
if exist('parpool')
% >= 2013b
parpool(c, 4);
else
% < 2013b
matlabpool(c, 4);
end
if isempty(gcp('nocreate'))
c = parcluster();
t = tempname();
mkdir(t);
c.JobStorageLocation = t;
c.NumWorkers = 4;
if exist('parpool')
% >= 2013b
parpool(c, 4);
else
% < 2013b
matlabpool(c, 4);
end
end

1 change: 1 addition & 0 deletions config/ALCIR.m
@@ -26,6 +26,7 @@
hyperParams.latticeSlant = slant;
end

hyperParams.latticeConnectionHiddenDim = 25;

hyperParams.parensInSequences = 1;

2 changes: 1 addition & 1 deletion config/CompositionSetup.m
@@ -26,7 +26,7 @@
hyperParams.lstm = 0;
hyperParams.useTrees = 0;
hyperParams.useThirdOrderComposition = 0;
hyperParams.useThirdOrderMerge = 1;
hyperParams.useThirdOrderMerge = 0;
elseif composition == 5
hyperParams.useLattices = 1;
hyperParams.lstm = 0;
3 changes: 3 additions & 0 deletions config/Defaults.m
@@ -41,6 +41,9 @@

hyperParams.latticeSlant = 0;

hyperParams.latticeFirstPastThreshold = 0.45;
hyperParams.latticeFirstPastHardMax = 1;

% The number of embedding transform layers. topDepth = 1 means an NN layer will be
% added above the embedding matrix. This is likely to only be useful when
% learnWords is false, and so the embeddings do not exist in the same space
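Note on the two defaults added above: latticeFirstPastThreshold doubles as a feature switch in LatticeBatch.m, where a value of 0 keeps the existing softmax connection scoring and any nonzero value routes both the forward and backward passes through the new first-past functions; latticeFirstPastHardMax = 1 makes the resulting merge weights one-hot. A hypothetical config override (not part of this commit) that reverts an experiment to softmax scoring:

% Hypothetical override: a threshold of 0 selects the softmax path in LatticeBatch.m.
hyperParams.latticeFirstPastThreshold = 0;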
36 changes: 36 additions & 0 deletions layer-fns/ComputeFirstPast.m
@@ -0,0 +1,36 @@
function [ weights, correctWeight, loss ] = ComputeFirstPast(scores, threshold, labels, multipliers, hardMax)

% Sigmoid to get a bounded range.
scores = Sigmoid(scores);

% Lower the threshold if nothing passes it.
effectiveThreshold = min([repmat(threshold, size(scores, 2)); max(scores, [], 1)]);

% Choose the first entry to pass the threshold.
[ ~, bestIndex ] = max(scores >= repmat(effectiveThreshold, size(scores, 1), 1), [], 1);

loss = zeros(size(scores, 2), 1, 'like', scores);
paddedScores = [zeros(1, size(scores, 2)); scores];
for b = 1:size(scores, 2)
loss(b) = max(0, threshold - paddedScores(labels(b) + 1, b));

if labels(b) > 1
competingScores = scores(1:labels(b) - 1, b);
loss(b) = loss(b) + ...
sum(competingScores(competingScores >= repmat(threshold, labels(b) - 1, 1)) - threshold);
end
end

weights = zeros(size(scores), 'like', scores);
if hardMax
weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = 1;
else
weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = scores(sub2ind(size(scores), bestIndex, 1:size(scores, 2)));
end

paddedWeights = [zeros(1, size(weights, 2)); weights];
correctWeight = paddedWeights(sub2ind(size(paddedWeights), labels' + 1, 1:length(labels)));

loss = loss .* multipliers;

end
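A quick sketch of how the new forward rule behaves, with made-up values (it assumes the repo's Sigmoid.m computes the elementwise logistic function, as the derivative used in the gradient code below implies):

% Toy example: 3 candidate merge positions, 2 batch columns.
scores = [0.2, -1.0; 1.5, -0.5; 0.4, 2.0];  % raw pre-sigmoid scores
labels = [2; 3];                            % gold connection sites
multipliers = [1; 1];                       % per-example cost scaling
[weights, correctWeight, loss] = ComputeFirstPast(scores, 0.45, labels, multipliers, 1);
% With hardMax = 1, each column of weights is one-hot at the first position
% whose sigmoid score reaches the 0.45 threshold (the threshold is lowered
% to the column maximum when nothing reaches it). The loss penalizes a gold
% site that falls short of the threshold and any earlier competitor that
% clears it.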
24 changes: 24 additions & 0 deletions layer-fns/ComputeFirstPastGradient.m
@@ -0,0 +1,24 @@
function [ deltas ] = ComputeFirstPastGradient(scores, threshold, weights, deltasIn, labels, multipliers, hardMax)

% TODO: Store this.
scores = Sigmoid(scores);

deltas = zeros(size(deltasIn), 'like', deltasIn);

% Looping for now during development.
for b = 1:size(scores, 2)
if labels(b) > 0
deltas(labels(b), b) = deltas(labels(b), b) - (scores(labels(b), b) < threshold);
deltas(1:labels(b) - 1, b) = deltas(1:labels(b) - 1, b) + (scores(1:labels(b) - 1, b) >= repmat(threshold, labels(b) - 1, 1));
end
end

deltas = bsxfun(@times, deltas, multipliers');

if ~hardMax
deltas = deltas + deltasIn .* (weights > 0);
end

deltas = deltas .* SigmoidDeriv([], scores);

end
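Since this is explicitly a quick test ('Looping for now during development'), a finite-difference spot check is a natural companion. A sketch under the assumption hardMax = 1, so the deltasIn path is inactive and only the hinge loss from ComputeFirstPast contributes:

% Hypothetical gradient check of ComputeFirstPastGradient against the
% summed loss returned by ComputeFirstPast.
rawScores = randn(4, 3); labels = [2; 1; 4]; mult = ones(3, 1);
threshold = 0.45; deltasIn = zeros(4, 3); h = 1e-6;
[weights, ~, ~] = ComputeFirstPast(rawScores, threshold, labels, mult, 1);
analytic = ComputeFirstPastGradient(rawScores, threshold, weights, deltasIn, labels, mult, 1);
numeric = zeros(4, 3);
for i = 1:numel(rawScores)
    up = rawScores;   up(i) = up(i) + h;
    down = rawScores; down(i) = down(i) - h;
    [~, ~, lossUp] = ComputeFirstPast(up, threshold, labels, mult, 1);
    [~, ~, lossDown] = ComputeFirstPast(down, threshold, labels, mult, 1);
    numeric(i) = (sum(lossUp) - sum(lossDown)) / (2 * h);
end
% The two should agree away from hinge kinks where sigmoid(score) == threshold.
fprintf('max abs gradient diff: %g\n', max(abs(analytic(:) - numeric(:))));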
6 changes: 5 additions & 1 deletion layer-fns/SigmoidDeriv.m
@@ -2,6 +2,10 @@
function deriv = SigmoidDeriv (in, out)
% Compute the gradient of the sigmoid nonlinearity

assert(0, 'Not used.')
if isempty(out)
out = Sigmoid(in);
end

deriv = out .* (1.0 - out);

end
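With the assert removed, both call patterns are usable; a small sanity check with hypothetical values (again assuming Sigmoid.m is the elementwise logistic):

x = [-2, 0, 3];
d1 = SigmoidDeriv(x, []);           % derivative recomputed from raw inputs
d2 = SigmoidDeriv([], Sigmoid(x));  % derivative from precomputed activations
% Both equal sigmoid(x) .* (1 - sigmoid(x)) elementwise.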
2 changes: 1 addition & 1 deletion minFunc/autoDif/derivativeCheck.m
@@ -46,7 +46,7 @@
mergeMatrices, mergeMatrix, ...
softmaxMatrix, trainedWordFeatures, connectionMatrix, ...
softmaxMatrix, trainedWordFeatures, connectionMatrix, scoringVector, ...
compositionMatrix, classifierExtraMatrix, embeddingTransformMatrix ...
= stack2param(g, varargin{1});

fprintf('User gradient error (diff/abs+abs method, zeroing out tiny gradients):\n');
