From adabdda0add5be6f8404a4c1ed01f34d9bd1a517 Mon Sep 17 00:00:00 2001
From: Sam Bowman
Date: Thu, 18 Jun 2015 08:51:41 -0700
Subject: [PATCH] Cleanup for workshop camera ready.

---
 writing/F'14 paper/cameraready/intro.tex | 2 +-
 writing/F'14 paper/cameraready/join.tex | 54 +++++++++----------
 writing/F'14 paper/cameraready/methods.tex | 10 ++--
 .../F'14 paper/cameraready/quantifiers.tex | 2 +-
 writing/F'14 paper/cameraready/recursion.tex | 2 +-
 writing/F'14 paper/cameraready/sick.tex | 41 +++++++-------
 6 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/writing/F'14 paper/cameraready/intro.tex b/writing/F'14 paper/cameraready/intro.tex
index ca5702a7..df20cf5d 100644
--- a/writing/F'14 paper/cameraready/intro.tex
+++ b/writing/F'14 paper/cameraready/intro.tex
@@ -1,6 +1,6 @@
 \section{Introduction}\label{sec:intro}

-Tree-structured recursive neural network models (TreeRNNs; \citealt{goller1996learning}) for sentence meaning
+Tree-structured recursive neural network models (TreeRNNs; \citealt{goller1996learning,socher2011semi}) for sentence meaning
 have been successful in an array of sophisticated language tasks,
 including sentiment analysis \cite{socher2011semi,irsoydeep},
 image description \cite{sochergrounded}, and paraphrase detection
diff --git a/writing/F'14 paper/cameraready/join.tex b/writing/F'14 paper/cameraready/join.tex
index 97b0ba87..ed76ba0f 100644
--- a/writing/F'14 paper/cameraready/join.tex
+++ b/writing/F'14 paper/cameraready/join.tex
@@ -32,7 +32,7 @@ \section{Reasoning about semantic relations}\label{sec:join}
 full set of sound such inferences on pairs of premise relations is depicted in Table~\ref{tab:jointable}.
 Though these basic inferences do not involve compositional sentence representations, any successful reasoning using compositional representations
-will rely on the ability to perform sound inferences of this kind, so our first experiment studies how well each model can learn to perform them them in isolation.
+will rely on the ability to perform sound inferences of this kind in order to use unseen relational facts within larger derivations. Our first experiment studies how well each model can learn to perform them in isolation.

% about the relations themselves that do not depend on the
% internal structure of the things being compared. For example, given
@@ -46,29 +46,6 @@ \section{Reasoning about semantic relations}\label{sec:join}
%  $a \natneg b$ and $b~|~c$ then $a \sqsupset c$.


-\paragraph{Experiments}
-We begin by creating a world model
-on which we will base the statements in the train and test sets.
-This takes the form of a small Boolean structure in which terms denote
-sets of entities from a small domain. Fig.~\ref{lattice-figure}a
-depicts a structure of this form with three entities ($a$, $b$, and $c$) and eight proposition terms ($p_1$--$p_8$). We then generate a
-relational statement for each pair of terms in the model, as shown in Fig.~\ref{lattice-figure}b.
-We divide these statements evenly into train and test sets, and delete the test set
- examples which cannot be proven from the train examples, for which there is not enough information for even an ideal system to choose a correct label.
-In each experimental run, we create a model with 80 terms over a domain of 7 elements, yielding a training set of 3200 examples and a test set of
-2960 examples.
-
-We trained models with both the NN and NTN comparison functions on these
-data sets.\footnote{Since this task relies crucially on the learning of a pair of vectors, no simpler version of our model is a viable baseline.} %+%
-In both cases, the models are implemented as
-described in \S\ref{methods}, but since the items being compared
-are single terms rather than full tree structures, the composition
-layer is not used, and the two models are not recursive. We simply present
-the models with the (randomly initialized) embedding vectors for each
-of two terms, ensuring that the model has no information about the terms
-being compared except for the relations between them that appear in training.
-
-
 \begin{figure}[t]
  \centering
  \begin{subfigure}[t]{0.45\textwidth}
@@ -106,7 +83,7 @@ \section{Reasoning about semantic relations}\label{sec:join}
    \labelednode{2.5}{0.5}{}{}
  \end{picture}}
-  \caption{Example boolean structure. The terms $p_1$--$p_8$ name the sets. Not all sets have names, and some sets have multiple names, so that learning $\nateq$ is non-trivial.}
+  \caption{Example boolean structure, shown with edges indicating inclusion. The terms $p_1$--$p_8$ name the sets. Not all sets have names, and some sets have multiple names, so that learning $\nateq$ is non-trivial.}
  \end{subfigure}
  \qquad\small
 \begin{subfigure}[t]{0.43\textwidth}
@@ -126,7 +103,7 @@ \section{Reasoning about semantic relations}\label{sec:join}
 \end{tabular}
 \caption{A few examples of atomic statements about the
-    model. Test statements that are not provable from the training data shown are
+    model depicted above. Test statements that are not provable from the training data shown are
    crossed out.}
 \end{subfigure}
 \caption{Small example structure and data for learning relation composition.}
@@ -150,6 +127,29 @@ \section{Reasoning about semantic relations}\label{sec:join}
 \label{joinresultstable}
 \end{table}

+\paragraph{Experiments}
+We begin by creating a world model
+on which we will base the statements in the train and test sets.
+This takes the form of a small Boolean structure in which terms denote
+sets of entities from a small domain. Fig.~\ref{lattice-figure}a
+depicts a structure of this form with three entities ($a$, $b$, and $c$) and eight proposition terms ($p_1$--$p_8$). We then generate a
+relational statement for each pair of terms in the model, as shown in Fig.~\ref{lattice-figure}b.
+We divide these statements evenly into train and test sets, and delete the test set
+ examples which cannot be proven from the train examples, since for these there is not enough information for even an ideal system to choose a correct label.
+In each experimental run, we create a model with 80 terms over a domain of 7 elements, yielding a training set of 3200 examples and a test set of
+2960 examples.
+
+We trained models with both the NN and NTN comparison functions on these
+data sets.\footnote{Since this task relies crucially on the learning of a pair of vectors, no simpler version of our model is a viable baseline.} %+%
+In both cases, the models are implemented as
+described in \S\ref{methods}, but since the items being compared
+are single terms rather than full tree structures, the composition
+layer is not used, and the two models are not recursive. We simply present
+the models with the (randomly initialized) embedding vectors for each
+of two terms, ensuring that the model has no information about the terms
+being compared except for the relations between them that appear in training.
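The data generation described in the Experiments paragraph above amounts to computing, for each pair of set-denoting terms, which of the seven relations holds between their denotations. The Python sketch below (hypothetical term names and sets, not the released generation code) illustrates one way to do this, assuming the standard MacCartney-style set-theoretic definitions of the relations used in the paper:

# A minimal sketch of relation labeling over set-denoting terms.
# Symbols: '=' equivalence, '<' forward entailment, '>' reverse entailment,
# '^' negation, '|' alternation, 'v' cover, '#' independence.

DOMAIN = frozenset(range(7))  # the experiments use a domain of 7 entities

def relation(x, y, domain=DOMAIN):
    """Return the natural logic relation holding between two sets."""
    x, y = frozenset(x), frozenset(y)
    if x == y:
        return '='                       # same denotation
    if x < y:
        return '<'                       # proper subset
    if x > y:
        return '>'                       # proper superset
    if not x & y and x | y == domain:
        return '^'                       # disjoint and exhaustive
    if not x & y:
        return '|'                       # disjoint, not exhaustive
    if x | y == domain:
        return 'v'                       # overlapping and exhaustive
    return '#'                           # no determinate relation

# Generate one relational statement per pair of named terms.
terms = {'p1': {0, 1}, 'p2': {0, 1, 2}, 'p3': {3}}
pairs = [(a, relation(sa, sb), b)
         for a, sa in terms.items()
         for b, sb in terms.items() if a < b]
print(pairs)  # e.g. [('p1', '<', 'p2'), ('p1', '|', 'p3'), ('p2', '|', 'p3')]

The even train/test split and the pruning of unprovable test items described above would then operate on the list of statements such a sketch produces.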
+ + \paragraph{Results} The results (Table \ref{joinresultstable}) show that NTN is able to accurately encode the relations between the terms in the geometric relations between their vectors, and is able to then use that information to recover relations that @@ -157,7 +157,7 @@ \section{Reasoning about semantic relations}\label{sec:join} but makes enough errors that it remains an open question whether it is capable of learning representations with these properties. It is not possible for us to rule out the possibility that different optimization techniques or -further hyperparameter tuning could lead an NN model to succeed here. +finer-grained hyperparameter tuning could lead an NN model to succeed. As an example from our test data, both models correctly labeled $p_1 \natfor p_3$, potentially learning from the training examples $\{p_1 \natfor p_{51},~p_3 \natrev p_{51}\}$ or $\{p_1\natfor p_{65},~p_3 \natrev p_{65} \}$. On another example involving comparably frequent relations, the NTN correctly labeled $p_6 \natrev p_{24}$, likely on the basis of the training examples $\{p_6 \natcov p_{28},~p_{28} \natneg p_{24}\}$, while the NN incorrectly assigned it $\natind$. diff --git a/writing/F'14 paper/cameraready/methods.tex b/writing/F'14 paper/cameraready/methods.tex index 1f180661..2efef6c1 100644 --- a/writing/F'14 paper/cameraready/methods.tex +++ b/writing/F'14 paper/cameraready/methods.tex @@ -6,8 +6,8 @@ \section{Tree-structured neural networks} \label{methods} compositionality}, which says that the meanings for complex expressions are derived from the meanings of their parts via specific composition functions \cite{Partee84,Janssen97}. In our -distributed setting, word meanings are embedding vectors of dimension $n$. A learned -composition function maps pairs of them to single phrase vectors of dimension $n$, +distributed setting, word meanings are embedding vectors of dimension $N$. A learned +composition function maps pairs of them to single phrase vectors of dimension $N$, which can then be merged again to represent more complex phrases, forming a tree structure. Once the entire sentence-level representation has been derived at the top of the tree, it serves as a fixed-dimensional input for some subsequent layer function. @@ -45,9 +45,9 @@ \section{Tree-structured neural networks} \label{methods} Here, $\vec{x}^{(l)}$ and $\vec{x}^{(r)}$ are the column vector representations for the left and right children of the node, and $\vec{y}$ is the node's output. The TreeRNN concatenates them, multiplies -them by an $n \times 2n$ matrix of learned weights, and adds a bias $\vec{b}$. +them by an $N \times 2N$ matrix of learned weights, and adds a bias $\vec{b}$. The TreeRNTN adds a learned full rank third-order tensor -$\mathbf{T}$, of dimension $n \times n \times n$, modeling +$\mathbf{T}$, of dimension $N \times N \times N$, modeling multiplicative interactions between the child vectors. The comparison layer uses the same layer function as the composition layers (either an NN layer or an NTN layer) with @@ -82,5 +82,5 @@ \section{Tree-structured neural networks} \label{methods} as the harmonic mean of average precision and average recall, both computed for all classes for which there is test data, setting precision to 0 where it is not defined.} -Source code and generated data will be released after the review period. +Source code and generated data can be downloaded from \url{http://stanford.edu/~sbowman/}. 
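The composition layers edited in methods.tex above can be made concrete with a short NumPy sketch (illustrative parameter names and initialization, tanh assumed as the nonlinearity; not the released implementation). The TreeRNN layer computes y = tanh(M[x_l; x_r] + b), and the TreeRNTN adds a term in which each output unit k also receives x_l^T T[k] x_r from the third-order tensor:

import numpy as np

N = 25                                       # stand-in vector dimension
rng = np.random.default_rng(0)
M = rng.normal(scale=0.1, size=(N, 2 * N))   # composition matrix (N x 2N)
b = np.zeros(N)                              # bias
T = rng.normal(scale=0.01, size=(N, N, N))   # third-order tensor (NTN only)

def treernn_compose(xl, xr):
    # Plain TreeRNN layer: y = tanh(M [xl; xr] + b)
    return np.tanh(M @ np.concatenate([xl, xr]) + b)

def treerntn_compose(xl, xr):
    # TreeRNTN layer: output unit k additionally receives xl^T T[k] xr
    quadratic = np.einsum('i,kij,j->k', xl, T, xr)
    return np.tanh(M @ np.concatenate([xl, xr]) + b + quadratic)

xl, xr = rng.normal(size=N), rng.normal(size=N)   # two child vectors
print(treernn_compose(xl, xr).shape, treerntn_compose(xl, xr).shape)  # (25,) (25,)

As methods.tex notes, the comparison layer reuses the same layer function (NN or NTN), taking the two term or sentence vectors as its inputs.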
diff --git a/writing/F'14 paper/cameraready/quantifiers.tex b/writing/F'14 paper/cameraready/quantifiers.tex index ef906a38..a63f1267 100644 --- a/writing/F'14 paper/cameraready/quantifiers.tex +++ b/writing/F'14 paper/cameraready/quantifiers.tex @@ -56,7 +56,7 @@ \section{Reasoning with quantifiers and negation}\label{sec:quantifiers} % yields 66k sentence pairs. Some examples of these data are provided % in Table~\ref{examplesofdata}. -In each run, we randomly partition the set of valid \textit{single sentences} into train and test, and then label all of the pairs from within each set to generate a training set of 27k pairs and a test set of 7k pairs. Because the model doesn't see the test sentences at training time, it cannot directly use the kind of reasoning described in \S\ref{sec:join} (treating sentences as unanalyzed symbols), and must instead infer the word-level relations and learn a complete reasoning system over them for our logic. +In each run, we randomly partition the set of valid \textit{single sentences} into train and test, and then label all of the pairs from within each set to generate a training set of 27k pairs and a test set of 7k pairs. Because the model doesn't see the test sentences at training time, it cannot directly use the kind of reasoning described in \S\ref{sec:join} at the sentence level (by treating sentences as unanalyzed symbols), and must instead jointly learn the word-level relations and a complete reasoning system over them for our logic. We use the same summing baseline as in \S\ref{sec:recursion}. The highly consistent sentence structure in this experiment means that this model diff --git a/writing/F'14 paper/cameraready/recursion.tex b/writing/F'14 paper/cameraready/recursion.tex index 679e1648..763a78e2 100644 --- a/writing/F'14 paper/cameraready/recursion.tex +++ b/writing/F'14 paper/cameraready/recursion.tex @@ -77,7 +77,7 @@ \section{Recursive structure}\label{sec:recursion} $\plneg\, (\plneg p_1 \pland \plneg p_2)$ & $\nateq$ & $(p_1 \plor p_2)$ \\ \bottomrule \end{tabular} - \caption{Examples of the type of statements used for training and testing. These are relations between + \caption{Short examples of the type of statements used for training and testing. These are relations between well-formed formulae, computed in terms of sets of satisfying interpretation functions $\sem{\cdot}$.}\label{tab:plexs} \end{subtable} diff --git a/writing/F'14 paper/cameraready/sick.tex b/writing/F'14 paper/cameraready/sick.tex index 63a8900c..fa8da0c1 100644 --- a/writing/F'14 paper/cameraready/sick.tex +++ b/writing/F'14 paper/cameraready/sick.tex @@ -3,12 +3,31 @@ \section{The SICK textual entailment challenge}\label{sec:sick} % TODO: Something on how we can pack enough The specific model architecture that we use is novel, and though the underlying tree structure approach has been validated elsewhere, our experiments so far do not guarantee that it viable model for handling inference over real -natural language data. To investigate our models' ability to handle the noisy labels and the diverse range of linguistic structures seen in typical natural language data, we use the SICK textual entailment challenge corpus \cite{marelli2014sick}. The corpus consists of about 10k natural language sentence pairs, labeled with \ii{entailment}, \ii{contradiction}, or \ii{neutral}. 
At only a few thousand distinct sentences (many of them variants on an even smaller set of template sentences), the corpus is not large enough to train a high quality learned model of general natural language, but it is the largest human-labeled entailment corpus that we are aware of, and our results nonetheless show that tree-structured NN models can learn to do inference in the real world.
+natural language data. To investigate our models' ability to handle the noisy labels and the diverse range of linguistic structures seen in typical natural language data, we use the SICK textual entailment challenge corpus \cite{marelli2014sick}. The corpus consists of about 10k natural language sentence pairs, labeled with \ii{entailment}, \ii{contradiction}, or \ii{neutral}. At only a few thousand distinct sentences (many of them variants on an even smaller set of template sentences), the corpus is not large enough to train a high quality learned model of general natural language, but it is the largest human-labeled entailment corpus that we are aware of, and our results nonetheless show that tree-structured NN models can learn to approximate natural logic-style inference in the real world.

Adapting to this task requires us to make a few additions to the techniques discussed in \S\ref{methods}. In order to better handle rare words, we initialized our word embeddings using 200 dimensional vectors trained with
-GloVe \cite{pennington2014glove} on data from Wikipedia. Since 200 dimensional vectors are too large to be practical in an TreeRNTN on a small dataset, a new embedding transformation layer is needed. Before any embedding is used as an input to a recursive layer, it is passed through an additional $\tanh$ neural network layer with the same output dimension as the recursive layer. This new layer aggregates any usable information from the embedding vectors into a more compact working representation. An identical layer is added to the SumNN between the word vectors and the comparison layer.
+GloVe \cite{pennington2014glove} on data from Wikipedia. Since 200 dimensional vectors are too large to be practical in a TreeRNTN on a small dataset, a new embedding transformation layer is needed. Before any embedding is used as an input to a recursive layer, it is passed through an additional $\tanh$ neural network layer with the same output dimension as the recursive layer. This new layer allows the model to choose which aspects of the 200 dimensional representations from the unsupervised source it most values, rather than relying on GloVe---which has no knowledge of the task---to do so, as would be the case were GloVe asked to directly produce vectors of the lower dimensionality. An identical layer is added to the SumNN between the word vectors and the comparison layer.

-We also supplemented the SICK training data\footnote{We tuned the model using performance on a held out development set, but report performance here for a version of the model trained on both the training and development data and tested on the 4,928 example SICK test set. We also report training accuracy on a small sample from each data source.} with 600k examples of entailment data from the Denotation Graph project (DG, \citealt{hodoshimage}, also used by the winning SICK submission), a corpus of noisy automatically labeled entailment examples over image captions, the same genre of text from which SICK was drawn.
We trained a single model on data from both sources, but used a separate set of softmax parameters for classifying into the labels from each source. We parsed the data from both sources with the Stanford PCFG Parser v.~3.3.1 \cite{klein2003accurate}. We also found that we were able to train a working model much more quickly with an additional technique: we collapse subtrees that were identical across both sentences in a pair by replacing them with a single head word. The training and test data on which we report performance are collapsed in this way, and both collapsed and uncollapsed copies of the training data are used in training. Finally, in order to improve regularization on the noisier data, we used dropout \cite{srivastava2014dropout} at the input to the comparison layer (10\%) and at the output from the embedding transform layer (25\%). + +\begin{table*}[tp] + \centering\small + \begin{tabular}{l@{~~~}cl} + \toprule + The patient is being helped by the doctor & \ii{entailment} & The doctor is helping the patient (\textsc{Passive})\\ + A little girl is playing the violin on a beach & \ii{contradiction} & There is no girl playing the violin on a beach (\textsc{Neg})\\ + + The yellow dog is drinking water from a bottle& \ii{contradiction} & The yellow dog is drinking water from a pot (\textsc{Subst})\\ + A woman is breaking two eggs in a bowl & \ii{neutral} &A man is mixing a few ingredients in a bowl (\textsc{MultiEd})\\ + Dough is being spread by a man & \ii{neutral} & A woman is slicing meat with a knife (\textsc{Diff})\\ + \bottomrule + \end{tabular} + \caption{\label{examplesofsickdata}Examples of each category used in error analysis from the SICK test data. } +\end{table*} + + +We also supplemented the SICK training data\footnote{We tuned the model using performance on a held out development set, but report performance here for a version of the model trained on both the training and development data and tested on the 4,928 example SICK test set. We also report training accuracy on a small sample from each data source.} (4500 examples) with 600k examples of approximate entailment data from the Denotation Graph project (DG, \citealt{hodoshimage}, also used by the winning SICK submission), a corpus of noisy automatically labeled entailment examples over image captions, the same genre of text from which SICK was drawn. We trained a single model on data from both sources, but used a separate set of softmax parameters for classifying into the labels from each source, and forced the model to sample SICK examples and DG examples about equally often during training. + +We parsed the data from both sources with the Stanford PCFG Parser v.~3.3.1 \cite{klein2003accurate}. We also found that we were able to train a working model much more quickly with an additional technique: we collapse subtrees that were identical across both sentences in a pair by replacing them with a single head word. The training and test data on which we report performance are collapsed in this way, and both collapsed and uncollapsed copies of the training data are used in training. Finally, in order to improve regularization on the noisier data, we used dropout \cite{srivastava2014dropout} at the input to the comparison layer (10\%) and at the output from the embedding transform layer (25\%). 
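The embedding transform layer and dropout rates described in the added sick.tex text can be sketched as follows (hypothetical dimensions and parameter names, inverted-dropout rescaling assumed; not the authors' implementation):

import numpy as np

GLOVE_DIM, HIDDEN_DIM = 200, 25     # 25 is a stand-in for the recursive layer size
rng = np.random.default_rng(1)
W_embed = rng.normal(scale=0.05, size=(HIDDEN_DIM, GLOVE_DIM))
b_embed = np.zeros(HIDDEN_DIM)

def embedding_transform(glove_vec, train=True, drop_rate=0.25):
    # tanh layer mapping a 200d GloVe vector down to the working dimension,
    # with dropout applied to its output during training (25% in the text above).
    h = np.tanh(W_embed @ glove_vec + b_embed)
    if train:
        mask = rng.random(HIDDEN_DIM) >= drop_rate   # keep ~75% of units
        h = h * mask / (1.0 - drop_rate)             # inverted-dropout rescaling
    return h

word_vec = rng.normal(size=GLOVE_DIM)        # stand-in for a pretrained GloVe vector
print(embedding_transform(word_vec).shape)   # (25,)

The same transform would sit between the word vectors and the comparison layer in the SumNN baseline, as described above.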
\begin{table}[tp] \centering \small @@ -34,22 +53,6 @@ \section{The SICK textual entailment challenge}\label{sec:sick} \label{sresultstable} \end{table} -\begin{table*}[htp] - \centering\small - \begin{tabular}{l@{~~~}cl} - \toprule - The patient is being helped by the doctor & \ii{entailment} & The doctor is helping the patient (\textsc{Passive})\\ - A little girl is playing the violin on a beach & \ii{contradiction} & There is no girl playing the violin on a beach (\textsc{Neg})\\ - - The yellow dog is drinking water from a bottle& \ii{contradiction} & The yellow dog is drinking water from a pot (\textsc{Subst})\\ - A woman is breaking two eggs in a bowl & \ii{neutral} &A man is mixing a few ingredients in a bowl (\textsc{MultiEd})\\ - Dough is being spread by a man & \ii{neutral} & A woman is slicing meat with a knife (\textsc{Diff})\\ - \bottomrule - \end{tabular} - \caption{\label{examplesofsickdata}Examples of each category used in error analysis from the SICK test data. } -\end{table*} - - \paragraph{Results} Despite the small amount of high quality training data available and the lack of resources for learning lexical relationships, the results (Table~\ref{sresultstable}) show that our tree-structured models perform competitively on textual entailment, beating a strong baseline. Neither model reached the performance of the winning system (84.6\%), but the TreeRNTN did exceed that of eight out of 18 submitted systems, including several which used sophisticated hand-engineered features and lexical resources specific to the version of the entailment task at hand. To better understand our results, we manually annotated a fraction of the SICK test set, using mutually exclusive categories for passive/active alternation pairs (\textsc{Passive}), pairs differing only by the presence of negation (\textsc{Neg}), pairs differing by a single word or phrase substitution (\textsc{Subst}), pairs differing by multiple edits (\textsc{MultiEd}), and pairs with little or no content word overlap (\textsc{Diff}). Examples of each are in Table \ref{examplesofsickdata}. We annotated 100 random examples to judge the frequency of each category, and continued selectively annotating until each category contained at least 25. We also use the category \textsc{Short} for pairs in which neither sentence contains more than ten words.