% writing/F'14 paper/cameraready/join.tex


\begin{table}[tp]
  \centering  \small
  \setlength{\arraycolsep}{8pt}
  \renewcommand{\arraystretch}{1.1}
  \newcommand{\UNK}{\cdot}  
  $\begin{array}[t]{c@{ \ }|*{7}{c}|}
    %\hline
    \multicolumn{1}{c}{}
             & \nateq     & \natfor     & \natrev     & \natneg    & \natalt     & \natcov     & \multicolumn{1}{c}{\natind} \\
    \cline{2-8}
    \nateq  & \nateq &   \natfor &  \natrev &  \natneg &   \natalt &  \natcov &  \natind \\
    \natfor & \natfor &  \natfor &  \UNK &  \natalt &   \natalt &  \UNK &  \UNK \\
    \natrev & \natrev &  \UNK &  \natrev &  \natcov &   \UNK &  \natcov &  \UNK \\
    \natneg & \natneg &  \natcov &  \natalt &  \nateq &    \natrev &  \natfor &  \natind \\
    \natalt & \natalt &  \UNK &  \natalt &  \natfor &   \UNK &  \natfor &  \UNK \\
    \natcov & \natcov &  \natcov &  \UNK &  \natrev &   \natrev &  \UNK &  \UNK \\
    \natind & \natind & \UNK &  \UNK &  \natind &  \UNK &  \UNK &  \UNK \\
    \cline{2-8}
  \end{array}$
  \caption{In \S\ref{sec:join}, we assess our models' ability to learn to do inference over pairs of relations using the rules represented here, which are derived from the definitions of the relations in Table~\ref{b-table}.  As an example, given that $p_1 \natfor p_2$ and $p_2 \natneg p_3$, the entry in the $\natfor$ row and the $\natneg$ column lets us conclude that $p_1 \natalt p_3$. Cells containing a dot correspond to situations for which no valid inference can be drawn.} 
  \label{tab:jointable}
\end{table}

\section{Reasoning about semantic relations}\label{sec:join}

The simplest kinds of deduction in natural logic involve atomic statements 
using the relations in Table~\ref{b-table}. 
For instance, from the relation $p_1 \natrev p_2$ between two propositions, 
one can infer the relation $p_2 \natfor p_1$ by applying the definitions of the relations directly. 
If one is also given the relation $p_2 \natrev p_3$, one can conclude that $p_1 \natrev p_3$ by basic set-theoretic reasoning (transitivity of $\natrev$). The
full set of such sound inferences on pairs of premise relations is depicted in
Table~\ref{tab:jointable}. Though these basic inferences do not involve compositional
sentence representations, any successful reasoning using compositional representations
will rely on the ability to perform sound inferences of this kind in order to be able to use unseen relational facts within larger derivations. Our first experiment studies how well each model can learn to perform them in isolation.

% about the relations themselves that do not depend on the
% internal structure of the things being compared. For example, given
% that $a\sqsupset b$ and $b\sqsupset c$ one can conclude that
% $a\sqsupset c$ by the transitivity of $\sqsupset$, even without
% understanding $a$, $b$, or $c$. These seven relations support more
% than just transitivity: MacCartney and Manning's
% \cite{maccartney2009extended} join table defines 32 valid inferences
% that can be made on the basis of pairs of relations of the form $a R
% b$ and $b R' c$, including several less intuitive ones such as that if
% $a \natneg b$ and $b~|~c$ then $a \sqsupset c$.


\begin{figure}[t]
  \centering
  \begin{subfigure}[t]{0.45\textwidth}
    \centering
    \newcommand{\labelednode}[4]{\put(#1,#2){\oval(1.5,1)}\put(#1,#2){\makebox(0,0){$\begin{array}{c}#3\\\{#4\}\end{array}$}}}
    \setlength{\unitlength}{1cm}\scalebox{0.8}{
    \begin{picture}(5,5.5)
      \labelednode{2.50}{5}{}{a,b,c}
      
      \put(0.75,4){\line(3,1){1.5}}
      \put(2.5,4){\line(0,1){0.5}}
      \put(4.25,4){\line(-3,1){1.5}}
      
      \labelednode{0.75}{3.5}{p_1,p_2}{a,b}
      \labelednode{2.50}{3.5}{p_3}{a,c}
      \labelednode{4.25}{3.5}{p_4}{b,c}
      
      \put(0.75,2.5){\line(0,1){0.5}}
      \put(0.75,2.5){\line(3,1){1.5}}
      
      \put(2.5,2.5){\line(-3,1){1.5}}
      \put(2.5,2.5){\line(3,1){1.5}}
      
      \put(4.25,2.5){\line(0,1){0.5}}
      \put(4.25,2.5){\line(-3,1){1.5}}
      

      \labelednode{0.75}{2}{p_5,p_6}{a}
      \labelednode{2.50}{2}{}{b}
      \labelednode{4.25}{2}{p_7,p_8}{c}
      
      \put(2.5,1){\line(-3,1){1.5}}
      \put(2.5,1){\line(0,1){0.5}}
      \put(2.5,1){\line(3,1){1.5}}
      
      \labelednode{2.5}{0.5}{}{}
    \end{picture}}
    \caption{Example Boolean structure, shown with edges indicating inclusion. The terms $p_1$--$p_8$ name the sets. Not all sets have names, and some sets have multiple names, so that learning $\nateq$ is non-trivial.}
  \end{subfigure}
  \qquad\small
    \begin{subfigure}[t]{0.43\textwidth}
    \centering \vspace{0.4cm}
    \setlength{\tabcolsep}{12pt}
    \begin{tabular}[b]{c  c}
      \toprule
      Train & Test \\
      \midrule
      $p_1 \nateq p_2$              & $p_2 \natneg p_7$ \\
      $p_1 \natrev p_5$              & $p_2 \natrev p_5$ \\
      $p_4 \natrev p_8$              & \strikeout{$p_5 \nateq p_6$} \\
      $p_5 \natalt p_7$              & \strikeout{$p_7 \natfor p_4$} \\
      $p_7 \natneg p_1$           & $p_8 \natfor p_4$ \\

      \bottomrule
    \end{tabular}

    \caption{A few examples of atomic statements about the
      model depicted above.  Test statements that are not provable from the training data shown are
      crossed out.}
  \end{subfigure}  
  \caption{Small example structure and data for learning relation composition.}
  \label{lattice-figure}
\end{figure} 

\begin{table}[tp]
  \centering\small
  \begin{tabular}{ l r@{ \ }r r@{ \ }r }
    \toprule
    ~&\multicolumn{2}{c}{Train} & \multicolumn{2}{c}{Test}\\
    \midrule
    $\natind$ only &53.8 &(10.5)    &53.8 &(10.5) \\
    15d NN &				99.8&	(99.0) &94.0&(87.0) \\
    15d NTN 				& \textbf{100} & \textbf{(100)} & \textbf{99.6} & \textbf{(95.5)}\\
    \bottomrule
  \end{tabular}
  
  
  \caption{Performance on the semantic relation experiments. These results and all other results on artificial data are reported as mean accuracy scores over five runs followed by mean macroaveraged F1 scores in parentheses. The ``$\natind$ only'' entries reflect the frequency of the most frequent class.}
  \label{joinresultstable}
\end{table}

\paragraph{Experiments}
We begin by creating a world model
on which we will base the statements in the train and test sets.
This takes the form of a small Boolean structure in which terms denote
sets of entities from a small domain.  Fig.~\ref{lattice-figure}a
depicts a structure of this form with three entities ($a$, $b$, and $c$) and eight proposition terms ($p_1$--$p_8$). We then generate a 
relational statement for each pair of terms in the model, as shown in Fig.~\ref{lattice-figure}b. 
We divide these statements evenly into train and test sets, and delete the test set
 examples that cannot be proven from the train examples, since for these there is not enough information for even an ideal system to choose a correct label.
In each experimental run, we create a model with 80 terms over a domain of 7 elements, yielding a training set of 3200 examples and a test set of 
2960 examples.

We trained models with both the NN and NTN comparison functions on these
data sets.\footnote{Since this task relies crucially on the learning of a pair of vectors, no simpler version of our model is a viable baseline.} %+%
In both cases, the models are implemented as
described in \S\ref{methods}, but since the items being compared
are single terms rather than full tree structures, the composition
layer is not used, and the two models are not recursive. We simply present
the models with the (randomly initialized) embedding vectors for each
of two terms, ensuring that the model has no information about the terms
being compared except for the relations between them that appear in training.


\paragraph{Results} 
The results (Table~\ref{joinresultstable}) show that the NTN is able to accurately encode the relations between the terms in the geometric relations between their vectors, 
and is able to then use that information to recover relations that 
are not overtly included in the training data. The NN also generalizes fairly well, 
but makes enough errors that it remains an open question whether 
it is capable of learning representations with these properties. 
It is not possible for us to rule out the possibility that different optimization techniques or
finer-grained hyperparameter tuning could lead an NN model to succeed.

As an example from our test data, both models correctly labeled $p_1 \natfor p_3$, potentially learning from the training examples $\{p_1 \natfor p_{51},~p_3 \natrev p_{51}\}$ or $\{p_1\natfor p_{65},~p_3 \natrev p_{65} \}$. On another example involving comparably frequent relations, the NTN correctly labeled $p_6 \natrev p_{24}$, likely on the basis of the training examples $\{p_6 \natcov p_{28},~p_{28} \natneg p_{24}\}$, while the NN incorrectly assigned it $\natind$.

% From train\test_1