
Commit

Quick test implementation of 'First Past' lattice.
Sam Bowman committed May 12, 2015
1 parent 1340534 commit 17f09b0
Showing 12 changed files with 121 additions and 42 deletions.
2 changes: 1 addition & 1 deletion AdaDeltaUpdate.m
@@ -7,7 +7,7 @@
if length(embGrad) > 0
% Set up a separate SumSqGrad tracker for the embeddings.
modelState.sumSqEmbGrad = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode);
modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode;
modelState.sumSqEmbDelta = fZeros(size(modelState.separateWordFeatures), hyperParams.gpu && ~hyperParams.largeVocabMode);
end
end

2 changes: 0 additions & 2 deletions ComputeBatchSentenceClassificationCostAndGrad.m
@@ -5,7 +5,6 @@

% NOTE: This is reasonably well optimized. In normal cases, nearly all of the runtime here is spent inside the batch objects.

tic

B = length(data); % Batch size.

@@ -195,6 +194,5 @@
assert(~hasInf, 'Infs in computed gradient.');
end

toc

end
43 changes: 27 additions & 16 deletions LatticeBatch.m
@@ -167,10 +167,16 @@
lb.scores(1:row, :, row) = ComputeSlantLayer(lb.scores(1:row, :, row), hyperParams.latticeSlant);
end

% Softmax the scores.
[ merges, localConnectionCosts, probCorrect ] = ...
ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)');

if hyperParams.latticeFirstPastThreshold == 0
% Softmax the scores.
[ merges, localConnectionCosts, probCorrect ] = ...
ComputeSoftmaxLayer(lb.scores(1:row, :, row), [], hyperParams, lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), lb.activeNode(:, 1:row, row)');
else
[ merges, probCorrect, localConnectionCosts ] = ComputeFirstPast(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, ...
lb.connectionLabels(:, row), hyperParams.connectionCostScale ./ (lb.wordCounts' - 2), hyperParams.latticeFirstPastHardMax);
localConnectionCosts(~isfinite(localConnectionCosts)) = 0;
end

lb.connections(3, :, 1:row, row) = merges';

if hyperParams.latticeLocalCurriculum
@@ -345,19 +351,24 @@
if row > 1
merges = permute(lb.connections(3, :, 1:row, row), [3, 2, 1, 4]);

% Compute gradients for the scores wrt. the incoming deltas from above and to the right.
[ ~, incomingDeltasToScores ] = ...
ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu);

% Compute gradients for the scores wrt. the independent connection supervision signal.
[ ~, labelDeltasToScores ] = ...
ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ...
lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)');

deltasToScores = labelDeltasToScores + incomingDeltasToScores;

if hyperParams.latticeFirstPastThreshold == 0
% Compute gradients for the scores wrt. the incoming deltas from above and to the right.
[ ~, incomingDeltasToScores ] = ...
ComputeBareSoftmaxGradients([], merges, deltasToMerges, lb.scores(1:row, :, row), hyperParams.gpu);

% Compute gradients for the scores wrt. the independent connection supervision signal.
[ ~, labelDeltasToScores ] = ...
ComputeSoftmaxClassificationGradients([], merges, lb.connectionLabels(:, row), ...
lb.scores(1:row, :, row), hyperParams, hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)');

deltasToScores = labelDeltasToScores + incomingDeltasToScores;
else
deltasToScores = ComputeFirstPastGradient(lb.scores(1:row, :, row), hyperParams.latticeFirstPastThreshold, merges, deltasToMerges, ...
lb.connectionLabels(:, row), hyperParams.connectionCostScale .* lb.supervisionWeights(:, row) ./ (lb.wordCounts - 2)', hyperParams.latticeFirstPastHardMax);
end

% Overwrite 0/0 deltas from inactive nodes.
deltasToScores(isnan(deltasToScores)) = 0;
deltasToScores(~isfinite(deltasToScores)) = 0;

if hyperParams.latticeSlant > 0
deltasToScores = ComputeSlantLayerGradients(lb.slantInputs(1:row, :, row), ...
16 changes: 10 additions & 6 deletions RunExperiments/RunSSTExperiments.sh
@@ -57,17 +57,21 @@ export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td
export MATLABCMD="cd quant; conCost = 5; lambda = 0.0005; dim = 50; ed = 200; td = 2; penult = 75; comp = 6; dropout = [0.9\, 0.9]; collo = 1; dataflag='dg-only-sst'; name='/scr/sbowman/updaterule4d'; TrainModel(''\, 1\, @Sick\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, comp\, dropout(1)\, dropout(2)\, collo\, 1\, conCost);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6

% Re-run, then manic tuning
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 3\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 2\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 75; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5e'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 1\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 3\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.000001; dim = 35; ed = 200; td = 2; penult = 0; dropout = [0.9\, 0.9]; tot = 4; collo = 1; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 1\, 1\, 8\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6




function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs)
export MATLABCMD="cd quant; lambda = 0.0001; dim = 40; ed = 300; td = 2; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, .3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 50; ed = 300; td = 3; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.00001; dim = 30; ed = 300; td = 4; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, .3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 5; penult = 0; dropout = [1\, .8]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.00001; dim = 50; ed = 300; td = 5; penult = 0; dropout = [.8\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 3);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
export MATLABCMD="cd quant; lambda = 0.001; dim = 40; ed = 300; td = 3; penult = 0; dropout = [1\, 1]; tot = 4; collo = 3; dataflag='sst-expanded'; name='/scr/sbowman/sst-expanded-5d'; TrainModel(''\, 1\, @SST\, name\, dataflag\, ed\, dim\, td\, penult\, lambda\, tot\, dropout(1)\, dropout(2)\, collo\, 4\, 1\, 0\, 1);" ; qsub -v MATLABCMD quant/run.sh -q john -l nodes=1:ppn=6
function [ hyperParams, options, wordMap, labelMap ] = SST(expName, dataflag, embDim, dim, topDepth, penult, lambda, composition, bottomDropout, topDropout, collo, conD, curr, mdn, ccs)

26 changes: 12 additions & 14 deletions TrainModel.m
@@ -20,20 +20,18 @@ function TrainModel(pretrainingFilename, fold, ConfigFn, varargin)
addpath('layer-fns/')

% Set up parallelization
if false
if isempty(gcp('nocreate'))
c = parcluster();
t = tempname();
mkdir(t);
c.JobStorageLocation = t;
c.NumWorkers = 4;
if exist('parpool')
% >= 2013b
parpool(c, 4);
else
% < 2013b
matlabpool(c, 4);
end
if isempty(gcp('nocreate'))
c = parcluster();
t = tempname();
mkdir(t);
c.JobStorageLocation = t;
c.NumWorkers = 4;
if exist('parpool')
% >= 2013b
parpool(c, 4);
else
% < 2013b
matlabpool(c, 4);
end
end

1 change: 1 addition & 0 deletions config/ALCIR.m
@@ -26,6 +26,7 @@
hyperParams.latticeSlant = slant;
end

hyperParams.latticeConnectionHiddenDim = 25;

hyperParams.parensInSequences = 1;

2 changes: 1 addition & 1 deletion config/CompositionSetup.m
@@ -26,7 +26,7 @@
hyperParams.lstm = 0;
hyperParams.useTrees = 0;
hyperParams.useThirdOrderComposition = 0;
hyperParams.useThirdOrderMerge = 1;
hyperParams.useThirdOrderMerge = 0;
elseif composition == 5
hyperParams.useLattices = 1;
hyperParams.lstm = 0;
3 changes: 3 additions & 0 deletions config/Defaults.m
@@ -41,6 +41,9 @@

hyperParams.latticeSlant = 0;

hyperParams.latticeFirstPastThreshold = 0.45;
hyperParams.latticeFirstPastHardMax = 1;

% The number of embedding transform layers. topDepth = 1 means an NN layer will be
% added above the embedding matrix. This is likely to only be useful when
% learnWords is false, and so the embeddings do not exist in the same space
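Note on the two defaults added above: latticeFirstPastThreshold doubles as a feature switch in LatticeBatch.m, where a value of 0 keeps the existing softmax connection scoring and any nonzero value routes both the forward and backward passes through the new first-past functions; latticeFirstPastHardMax = 1 makes the resulting merge weights one-hot. A hypothetical config override (not part of this commit) that reverts an experiment to softmax scoring:

% Hypothetical override: a threshold of 0 selects the softmax path in LatticeBatch.m.
hyperParams.latticeFirstPastThreshold = 0;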
36 changes: 36 additions & 0 deletions layer-fns/ComputeFirstPast.m
@@ -0,0 +1,36 @@
function [ weights, correctWeight, loss ] = ComputeFirstPast(scores, threshold, labels, multipliers, hardMax)

% Sigmoid to get a bounded range.
scores = Sigmoid(scores);

% Lower the threshold if nothing passes it.
effectiveThreshold = min([repmat(threshold, size(scores, 2)); max(scores, [], 1)]);

% Choose the first entry to pass the threshold.
[ ~, bestIndex ] = max(scores >= repmat(effectiveThreshold, size(scores, 1), 1), [], 1);

loss = zeros(size(scores, 2), 1, 'like', scores);
paddedScores = [zeros(1, size(scores, 2)); scores];
for b = 1:size(scores, 2)
loss(b) = max(0, threshold - paddedScores(labels(b) + 1, b));

if labels(b) > 1
competingScores = scores(1:labels(b) - 1, b);
loss(b) = loss(b) + ...
sum(competingScores(competingScores >= repmat(threshold, labels(b) - 1, 1)) - threshold);
end
end

weights = zeros(size(scores), 'like', scores);
if hardMax
weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = 1;
else
weights(sub2ind(size(scores), bestIndex, 1:size(scores, 2))) = scores(sub2ind(size(scores), bestIndex, 1:size(scores, 2)));
end

paddedWeights = [zeros(1, size(weights, 2)); weights];
correctWeight = paddedWeights(sub2ind(size(paddedWeights), labels' + 1, 1:length(labels)));

loss = loss .* multipliers;

end
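A quick sketch of how the new forward rule behaves, with made-up values (it assumes the repo's Sigmoid.m computes the elementwise logistic function, as the derivative used in the gradient code below implies):

% Toy example: 3 candidate merge positions, 2 batch columns.
scores = [0.2, -1.0; 1.5, -0.5; 0.4, 2.0];  % raw pre-sigmoid scores
labels = [2; 3];                            % gold connection sites
multipliers = [1; 1];                       % per-example cost scaling
[weights, correctWeight, loss] = ComputeFirstPast(scores, 0.45, labels, multipliers, 1);
% With hardMax = 1, each column of weights is one-hot at the first position
% whose sigmoid score reaches the 0.45 threshold (the threshold is lowered
% to the column maximum when nothing reaches it). The loss penalizes a gold
% site that falls short of the threshold and any earlier competitor that
% clears it.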
24 changes: 24 additions & 0 deletions layer-fns/ComputeFirstPastGradient.m
@@ -0,0 +1,24 @@
function [ deltas ] = ComputeFirstPastGradient(scores, threshold, weights, deltasIn, labels, multipliers, hardMax)

% TODO: Store this.
scores = Sigmoid(scores);

deltas = zeros(size(deltasIn), 'like', deltasIn);

% Looping for now during development.
for b = 1:size(scores, 2)
if labels(b) > 0
deltas(labels(b), b) = deltas(labels(b), b) - (scores(labels(b), b) < threshold);
deltas(1:labels(b) - 1, b) = deltas(1:labels(b) - 1, b) + (scores(1:labels(b) - 1, b) >= repmat(threshold, labels(b) - 1, 1));
end
end

deltas = bsxfun(@times, deltas, multipliers');

if ~hardMax
deltas = deltas + deltasIn .* (weights > 0);
end

deltas = deltas .* SigmoidDeriv([], scores);

end
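Since this is explicitly a quick test ('Looping for now during development'), a finite-difference spot check is a natural companion. A sketch under the assumption hardMax = 1, so the deltasIn path is inactive and only the hinge loss from ComputeFirstPast contributes:

% Hypothetical gradient check of ComputeFirstPastGradient against the
% summed loss returned by ComputeFirstPast.
rawScores = randn(4, 3); labels = [2; 1; 4]; mult = ones(3, 1);
threshold = 0.45; deltasIn = zeros(4, 3); h = 1e-6;
[weights, ~, ~] = ComputeFirstPast(rawScores, threshold, labels, mult, 1);
analytic = ComputeFirstPastGradient(rawScores, threshold, weights, deltasIn, labels, mult, 1);
numeric = zeros(4, 3);
for i = 1:numel(rawScores)
    up = rawScores;   up(i) = up(i) + h;
    down = rawScores; down(i) = down(i) - h;
    [~, ~, lossUp] = ComputeFirstPast(up, threshold, labels, mult, 1);
    [~, ~, lossDown] = ComputeFirstPast(down, threshold, labels, mult, 1);
    numeric(i) = (sum(lossUp) - sum(lossDown)) / (2 * h);
end
% The two should agree away from hinge kinks where sigmoid(score) == threshold.
fprintf('max abs gradient diff: %g\n', max(abs(analytic(:) - numeric(:))));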
6 changes: 5 additions & 1 deletion layer-fns/SigmoidDeriv.m
@@ -2,6 +2,10 @@
function deriv = SigmoidDeriv (in, out)
% Compute the gradient of the sigmoid nonlinearity

assert(0, 'Not used.')
if isempty(out)
out = Sigmoid(in);
end

deriv = out .* (1.0 - out);

end
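With the assert removed, both call patterns are usable; a small sanity check with hypothetical values (again assuming Sigmoid.m is the elementwise logistic):

x = [-2, 0, 3];
d1 = SigmoidDeriv(x, []);           % derivative recomputed from raw inputs
d2 = SigmoidDeriv([], Sigmoid(x));  % derivative from precomputed activations
% Both equal sigmoid(x) .* (1 - sigmoid(x)) elementwise.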
2 changes: 1 addition & 1 deletion minFunc/autoDif/derivativeCheck.m
@@ -46,7 +46,7 @@
mergeMatrices, mergeMatrix, ...
softmaxMatrix, trainedWordFeatures, connectionMatrix, ...
softmaxMatrix, trainedWordFeatures, connectionMatrix, scoringVector, ...
compositionMatrix, classifierExtraMatrix, embeddingTransformMatrix ...
= stack2param(g, varargin{1});

fprintf('User gradient error (diff/abs+abs method, zeroing out tiny gradients):\n');
