stan-dev · syclik · Aug 2, 2013 · Jul 23, 2013 · Jul 23, 2013 · Jul 24, 2013
diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex
@@ -316,21 +316,40 @@ \subsubsection{Stan Functions}
 
 \section{Categorical Distribution}
 
-\subsubsection{Probability Mass Function}
+\subsubsection{Probability Mass Functions}
 
-If $N \in \nats$ and $\theta \in \mbox{$N$-simplex}$, then for $y \in
+If $N \in \nats$, $N > 0$, and $\theta \in \mbox{$N$-simplex}$, then for $y \in
 \setlist{1,\ldots,N}$, 
+%
+\[
+\distro{Categorical}(y|\theta) = \theta_y.
+\]
+%
+In addition, Stan provides a log-odds scaled categorical distribution, 
+%
 \[
-\distro{Categorical}(y|\theta) = \theta_n.
+\distro{CategoricalLogit}(y|\beta)
+= \distro{Categorical}(y|\mbox{softmax}(\beta)).
 \]
+%
+See \refsection{softmax} for the definition of the softmax function.
 
 
 \subsubsection{Stan Functions}
 
+All of the categorical distributions are vectorized so that the
+outcome \farg{y} can be a single integer (type \code{int}) or an array
+of integers (type \code{int[]}).
+
 \begin{description}
-\fitem{real}{categorical\_log}{int \farg{y}, vector \farg{theta}}{The
-  log categorical probability mass function with outcome \farg{y} in 
-$1:N$ given $N$-simplex distribution parameter \farg{theta}}
+\fitem{real}{categorical\_log}{ints \farg{y}, vector \farg{theta}}{The
+  log categorical probability mass function with outcome(s) \farg{y} in 
+$1:N$ given $N$-simplex distribution parameter \farg{theta}.}
+%
+\fitem{real}{categorical\_logit\_log}{ints \farg{y}, vector
+  \farg{beta}}{The log categorical probability mass function with
+  outcome(s) \farg{y} in $1:N$ given log-odds of outcomes \farg{beta}.}
+%
 \end{description}
 %
 \begin{description}

diff --git a/src/docs/stan-reference/functions.tex b/src/docs/stan-reference/functions.tex
@@ -1328,6 +1328,17 @@ \subsection{Specialized Products}
 
 \section{Reductions}
 
+\subsection{Log Sum of Exponentials}
+
+\begin{description}
+\fitem{real}{log\_sum\_exp}{vector \farg{x}}{
+The natural logarithm of the sum of the exponentials of the elements in \farg{x}}
+\fitem{real}{log\_sum\_exp}{row\_vector \farg{x}}{
+The natural logarithm of the sum of the exponentials of the elements in \farg{x}}
+\fitem{real}{log\_sum\_exp}{matrix \farg{x}}{
+The natural logarithm of the sum of the exponentials of the elements in \farg{x}}
+\end{description}
+
 \subsection{Minimum and Maximum}
 
 \begin{description}
@@ -1609,18 +1620,62 @@ \subsection{Transposition Postfix Operator}
 
 \section{Special Matrix Functions}\label{softmax.section}
 
-The softmax function maps $\reals^K$ to the $K$-simplex by
+The softmax function maps $y \in \reals^K$ to the $K$-simplex by
 \[
 \mbox{softmax}(y)
  = \frac{\exp(y)}
         {\sum_{k=1}^K \exp(y_k)},
 \]
 %
 where $\exp(y)$ is the componentwise exponentiation of $y$.
-
+%
+Softmax is usually calculated on the log scale, 
+\[
+\log \mbox{softmax}(y)
+ \ = \ y - \log \sum_{k=1}^K \exp(y_k)
+ \ = \ y - \mbox{log\_sum\_exp}(y).
+\]
+%
+The entries in the Jacobian of the softmax function are given by
+\[
+\begin{array}{l}
+\displaystyle
+\frac{\partial}{\partial y_m} \mbox{softmax}(y)[k]
+\\[8pt]
+\displaystyle
+\mbox{ } \ \ \ = \left\{ 
+\begin{array}{ll}
+\mbox{softmax}(y)[k] - \mbox{softmax}(y)[k] \times \mbox{softmax}(y)[m]
+& \mbox{ if } m = k, \mbox{ and}
+\\[6pt]
+- \mbox{softmax}(y)[k] \times \mbox{softmax}(y)[m]
+& \mbox{ if } m \neq k.
+\end{array}
+\right.
+\end{array}
+\]
+For the log softmax function, the entries are
+\[
+\frac{\partial}{\partial y_m} \log \mbox{softmax}(y)[k]
+= \left\{ 
+\begin{array}{ll}
+1 - \mbox{softmax}(y)[m]
+& \mbox{ if } m = k, \mbox{ and}
+\\[6pt]
+- \mbox{softmax}(y)[m]
+& \mbox{ if } m \neq k.
+\end{array}
+\right.
+\]
+%
+Stan provides the following functions for softmax and its log.
+%
 \begin{description}
 \fitem{vector}{softmax}{vector \farg{x}}{
 The softmax of \farg{x}}
+%
+\fitem{vector}{log\_softmax}{vector \farg{x}}{
+The natural logarithm of the softmax of \farg{x}}
 \end{description}
 %
 

diff --git a/src/docs/stan-reference/stan-manuals.sty b/src/docs/stan-reference/stan-manuals.sty
@@ -110,12 +110,12 @@
 \newcommand{\refnote}[1]{Footnote~\ref{#1.footnote}}
 
 \newcommand{\fitem}[4]{\item[\begin{minipage}{\textwidth}{\tt #1 {\bfseries #2}(#3)}\end{minipage}]\mbox{ }
- \\[4pt] #4\index{{\tt\bfseries #2 }!{\tt (#3):\,#1}|hyperpage}}
+ \\[-10pt] #4\index{{\tt\bfseries #2 }!{\tt (#3):\,#1}|hyperpage}}
 % need special command for items requiring escapes in index
 \newcommand{\fitemindex}[5]{\item[{\tt #1 {\bfseries #2}(#3)}]\mbox{ }
-  \\[4pt] #4\index{{\tt\bfseries #5 }!{\tt (#3):\,#1}|hyperpage}}
+  \\[-10pt] #4\index{{\tt\bfseries #5 }!{\tt (#3):\,#1}|hyperpage}}
 \newcommand{\fitemindexsort}[6]{\item[{\tt #1 {\bfseries #2}(#3)}]\mbox{ }
-  \\[4pt] #4\index{{\tt\bfseries #6 }@{\tt\bfseries #5 }!{\tt (#3):\,#1}|hyperpage}}
+  \\[-10pt] #4\index{{\tt\bfseries #6 }@{\tt\bfseries #5 }!{\tt (#3):\,#1}|hyperpage}}
 \newcommand{\farg}[1]{{\tt\slshape #1}}
 
 \newcommand{\cmdflag}[3]{\item[\tt {-}-#1] \mbox{ } \\ #2 \\ \hspace*{24pt}(#3)}

diff --git a/src/stan/agrad/fvar_matrix.hpp b/src/stan/agrad/fvar_matrix.hpp
@@ -10,6 +10,8 @@
 #include <stan/agrad/fwd/matrix/dot_self.hpp>
 #include <stan/agrad/fwd/matrix/inverse.hpp>
 #include <stan/agrad/fwd/matrix/log_determinant.hpp>
+#include <stan/agrad/fwd/matrix/log_softmax.hpp>
+#include <stan/agrad/fwd/matrix/log_sum_exp.hpp>
 #include <stan/agrad/fwd/matrix/mdivide_left_tri_low.hpp>
 #include <stan/agrad/fwd/matrix/mdivide_left.hpp>
 #include <stan/agrad/fwd/matrix/mdivide_right_tri_low.hpp>
@@ -18,6 +20,7 @@
 #include <stan/agrad/fwd/matrix/multiply.hpp>
 #include <stan/agrad/fwd/matrix/rows_dot_product.hpp>
 #include <stan/agrad/fwd/matrix/rows_dot_self.hpp>
+#include <stan/agrad/fwd/matrix/softmax.hpp>
 #include <stan/agrad/fwd/matrix/sort.hpp>
 #include <stan/agrad/fwd/matrix/sum.hpp>
 #include <stan/agrad/fwd/matrix/tcrossprod.hpp>

diff --git a/src/stan/agrad/fwd/log_sum_exp.hpp b/src/stan/agrad/fwd/log_sum_exp.hpp
@@ -43,6 +43,25 @@ namespace stan{
                   stan::return_type<T1,T2>::type>(log_sum_exp(x1.val_, x2),
                           x1.d_ * exp(x1.val_) / (exp(x1.val_) + exp(x2)));
     }
+
+    // Forward-mode log(sum(exp(v))): the value comes from the values of
+    // v, the tangent is the d_-weighted sum of exp(v[i].val_ - value).
+    template <typename T>
+    fvar<T>
+    log_sum_exp(const std::vector<fvar<T> >& v) {
+      using stan::math::log_sum_exp;
+      using std::exp;
+      std::vector<T> vals(v.size());
+      for (size_t i = 0; i < v.size(); ++i)
+        vals[i] = v[i].val_;
+      T lse = log_sum_exp(vals);
+      T deriv(0.0);
+      // exp(x_i - lse) == exp(x_i) / sum_j exp(x_j), with no overflow
+      for (size_t i = 0; i < v.size(); ++i)
+        deriv += v[i].d_ * exp(vals[i] - lse);
+      return fvar<T>(lse, deriv);
+    }
+
   }
 }
 #endif
diff --git a/src/stan/agrad/fwd/matrix/log_softmax.hpp b/src/stan/agrad/fwd/matrix/log_softmax.hpp
@@ -0,0 +1,59 @@
+#ifndef __STAN__AGRAD__FWD__MATRIX__LOG_SOFTMAX_HPP__
+#define __STAN__AGRAD__FWD__MATRIX__LOG_SOFTMAX_HPP__
+
+#include <stan/agrad/fwd/fvar.hpp>
+#include <stan/agrad/fwd/matrix/softmax.hpp>
+#include <stan/math/matrix/Eigen.hpp>
+#include <stan/math/matrix/log_softmax.hpp>
+#include <stan/math/matrix/softmax.hpp>
+
+namespace stan {
+  namespace agrad {
+
+    // Forward-mode log softmax of a column vector of fvar<T>.
+    //
+    // The values are log_softmax of the input values.  Because
+    //   d/d.alpha[m] log_softmax(alpha)[k]
+    //     = (m == k ? 1 : 0) - softmax(alpha)[m],
+    // the tangent of output k is
+    //   alpha[k].d_ - sum_m alpha[m].d_ * softmax(alpha)[m].
+    template <typename T>
+    inline
+    Eigen::Matrix<fvar<T>,Eigen::Dynamic,1>
+    log_softmax(const Eigen::Matrix<fvar<T>,Eigen::Dynamic,1>& alpha) {
+      using stan::math::softmax;
+      using stan::math::log_softmax;
+      using Eigen::Matrix;
+      using Eigen::Dynamic;
+
+      // strip the tangents to evaluate on the underlying values
+      Matrix<T,Dynamic,1> alpha_t(alpha.size());
+      for (int k = 0; k < alpha.size(); ++k)
+        alpha_t(k) = alpha(k).val_;
+
+      Matrix<T,Dynamic,1> softmax_alpha_t = softmax(alpha_t);
+      Matrix<T,Dynamic,1> log_softmax_alpha_t = log_softmax(alpha_t);
+
+      // the sum over input positions is shared by every output
+      // position, so accumulate it once rather than once per output
+      T sum_alpha_d_times_softmax_alpha_t(0.0);
+      for (int m = 0; m < alpha.size(); ++m)
+        sum_alpha_d_times_softmax_alpha_t
+          += alpha(m).d_ * softmax_alpha_t(m);
+
+      // assemble value and tangent for each output position
+      Matrix<fvar<T>,Dynamic,1> log_softmax_alpha(alpha.size());
+      for (int k = 0; k < alpha.size(); ++k) {
+        log_softmax_alpha(k).val_ = log_softmax_alpha_t(k);
+        log_softmax_alpha(k).d_
+          = alpha(k).d_ - sum_alpha_d_times_softmax_alpha_t;
+      }
+
+      return log_softmax_alpha;
+    }
+
+
+  }
+}
+
+#endif
diff --git a/src/stan/agrad/fwd/matrix/log_sum_exp.hpp b/src/stan/agrad/fwd/matrix/log_sum_exp.hpp
@@ -0,0 +1,35 @@
+#ifndef __STAN__AGRAD__FWD__MATRIX__LOG__SUM__EXP__HPP__
+#define __STAN__AGRAD__FWD__MATRIX__LOG__SUM__EXP__HPP__
+
+#include <vector>
+#include <stan/agrad/fwd/fvar.hpp>
+#include <stan/math/functions/log_sum_exp.hpp>
+#include <stan/math/matrix/Eigen.hpp>
+
+namespace stan{
+
+  namespace agrad{
+
+    // Forward-mode log(sum(exp(v))) over every coefficient of v.
+    // FIXME: cut-and-paste from fwd/log_sum_exp.hpp; should
+    // be able to generalize
+    template <typename T, int R, int C>
+    fvar<T>
+    log_sum_exp(const Eigen::Matrix<fvar<T>,R,C>& v) {
+      using stan::math::log_sum_exp;
+      using std::exp;
+      std::vector<T> vals(v.size());
+      // v(i) is a linear index, valid for vectors and matrices alike
+      for (int i = 0; i < v.size(); ++i)
+        vals[i] = v(i).val_;
+      T lse = log_sum_exp(vals);
+      T deriv(0.0);
+      // exp(x_i - lse) == exp(x_i) / sum_j exp(x_j), with no overflow
+      for (int i = 0; i < v.size(); ++i)
+        deriv += v(i).d_ * exp(vals[i] - lse);
+      return fvar<T>(lse, deriv);
+    }
+
+  }
+}
+#endif
diff --git a/src/stan/agrad/fwd/matrix/softmax.hpp b/src/stan/agrad/fwd/matrix/softmax.hpp
@@ -0,0 +1,58 @@
+#ifndef __STAN__AGRAD__FWD__MATRIX__SOFTMAX_HPP__
+#define __STAN__AGRAD__FWD__MATRIX__SOFTMAX_HPP__
+
+#include <stan/agrad/fwd/fvar.hpp>
+#include <stan/math/matrix/Eigen.hpp>
+#include <stan/math/matrix/softmax.hpp>
+
+namespace stan {
+  namespace agrad {
+
+    // Forward-mode softmax of a column vector of fvar<T>; the result
+    // has the same number of elements as alpha.
+    //
+    // The values are softmax of the input values.  Because
+    //   d/d.alpha[m] softmax(alpha)[k]
+    //     = softmax(alpha)[k] * ((m == k ? 1 : 0) - softmax(alpha)[m]),
+    // the tangent of output k is
+    //   softmax(alpha)[k]
+    //     * (alpha[k].d_ - sum_m alpha[m].d_ * softmax(alpha)[m]).
+    template <typename T>
+    inline
+    Eigen::Matrix<fvar<T>,Eigen::Dynamic,1>
+    softmax(const Eigen::Matrix<fvar<T>,Eigen::Dynamic,1>& alpha) {
+      using stan::math::softmax;
+      using Eigen::Matrix;
+      using Eigen::Dynamic;
+
+      // strip the tangents to evaluate on the underlying values
+      Matrix<T,Dynamic,1> alpha_t(alpha.size());
+      for (int k = 0; k < alpha.size(); ++k)
+        alpha_t(k) = alpha(k).val_;
+
+      Matrix<T,Dynamic,1> softmax_alpha_t = softmax(alpha_t);
+
+      // the sum over input positions is shared by every output
+      // position, so accumulate it once rather than once per output
+      T sum_alpha_d_times_softmax_alpha_t(0.0);
+      for (int m = 0; m < alpha.size(); ++m)
+        sum_alpha_d_times_softmax_alpha_t
+          += alpha(m).d_ * softmax_alpha_t(m);
+
+      // assemble value and tangent for each output position
+      Matrix<fvar<T>,Dynamic,1> softmax_alpha(alpha.size());
+      for (int k = 0; k < alpha.size(); ++k) {
+        softmax_alpha(k).val_ = softmax_alpha_t(k);
+        softmax_alpha(k).d_
+          = softmax_alpha_t(k)
+          * (alpha(k).d_ - sum_alpha_d_times_softmax_alpha_t);
+      }
+
+      return softmax_alpha;
+    }
+
+
+  }
+}
+
+#endif
diff --git a/src/stan/agrad/matrix.hpp b/src/stan/agrad/matrix.hpp
@@ -26,5 +26,8 @@
 #include <stan/agrad/rev/matrix/trace_quad_form.hpp>
 #include <stan/agrad/rev/matrix/trace_gen_quad_form.hpp>
 #include <stan/agrad/rev/matrix/crossprod.hpp>
+#include <stan/agrad/rev/matrix/softmax.hpp>
+#include <stan/agrad/rev/matrix/log_softmax.hpp>
+#include <stan/agrad/rev/matrix/log_sum_exp.hpp>
 
 #endif
diff --git a/src/stan/agrad/rev/exp.hpp b/src/stan/agrad/rev/exp.hpp
@@ -1,7 +1,7 @@
 #ifndef __STAN__AGRAD__REV__EXP_HPP__
 #define __STAN__AGRAD__REV__EXP_HPP__
 
-#include <valarray>
+#include <cmath>
 #include <stan/agrad/rev/var.hpp>
 #include <stan/agrad/rev/op/v_vari.hpp>