Merge pull request #136 from stan-dev/docs/updates

Docs/updates
stan-dev · Oct 15, 2019 · e03d1b3 · e03d1b3
2 parents 2fe26ab + c2aa91b
commit e03d1b3
Show file tree

Hide file tree

Showing 21 changed files with 1,114 additions and 376 deletions.
diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py
@@ -118,9 +118,7 @@ def exe_file(self) -> str:
         return self._exe_file
 
     def compile(
-        self,
-        opt_lvl: int = 3,
-        include_paths: List[str] = None,
+        self, opt_lvl: int = 3, include_paths: List[str] = None
     ) -> None:
         """
         Compile the given Stan program file.  Translates the Stan code to
@@ -204,10 +202,10 @@ def compile(
                     new_exec_name = (
                         os.path.basename(os.path.splitext(self._stan_file)[0])
                         + EXTENSION
-                        )
+                    )
                     self._exe_file = os.path.join(
                         original_target_dir, new_exec_name
-                        )
+                    )
                     shutil.copy(exe_file, self._exe_file)
                 else:
                     self._exe_file = exe_file
@@ -232,8 +230,9 @@ def optimize(
             either as a dictionary with entries matching the data variables,
             or as the path of a data file in JSON or Rdump format.
 
-        :param seed: The seed for random number generator Must be an integer
-            between 0 and 2^32 - 1. If unspecified, numpy.random.RandomState()
+        :param seed: The seed for random number generator. Must be an integer
+            between ``0`` and ``2^32 - 1``. If unspecified,
+            ``numpy.random.RandomState()``
             is used to generate a seed which will be used for all chains.
 
         :param inits:  Specifies how the sampler initializes parameter values.
@@ -242,7 +241,7 @@ def optimize(
             all parameters in the model.  The default initialization behavior
             will initialize all parameter values on range [-2, 2] on the
             _unconstrained_ support.  If the expected parameter values are
-            too far from this range, this option may improve adaptation.
+            too far from this range, this option may improve estimation.
             The following value types are allowed:
 
             * Single number ``n > 0`` - initialization range is [-n, n].
@@ -340,10 +339,11 @@ def sample(
             If none then set automatically to `chains` but no more
             than `total_cpu_count - 2`
 
-        :param seed: The seed for random number generator or a list of per-chain
-            seeds. Must be an integer between 0 and 2^32 - 1. If unspecified,
-            numpy.random.RandomState() is used to generate a seed which will be
-            used for all chains. When the same seed is used across all chains,
+        :param seed: The seed for random number generator. Must be an integer
+            between ``0`` and ``2^32 - 1``. If unspecified,
+            ``numpy.random.RandomState()``
+            is used to generate a seed which will be used for all chains.
+            When the same seed is used across all chains,
             the chain-id is used to advance the RNG to avoid dependent samples.
 
         :param chain_ids: The offset for the random number generator, either
@@ -408,14 +408,15 @@ def sample(
             It improves the effective sample size, but may increase the time
             per iteration.
 
-        :param fixed_param: Call CmdStan with argument "algorithm=fixed_param"
-            which runs the sampler without updating the Markov Chain, so that
-            the values of all parameters and transformed parameters are constant
-            across all draws and only those values in the generated quantities
-            block which are produced by RNG functions may change.  This provides
+        :param fixed_param: When True, call CmdStan with argument
+            "algorithm=fixed_param" which runs the sampler without
+            updating the Markov Chain, thus the values of all parameters and
+            transformed parameters are constant across all draws and
+            only those values in the generated quantities block that are
+            produced by RNG functions may change.  This provides
             a way to use Stan programs to generate simulated data via the
             generated quantities block.  This option must be used when the
-            parameters block is empty.
+            parameters block is empty.  Default value is False.
 
         :param csv_basename: A path or file name which will be used as the
             basename for the sampler output files.  The csv output files
@@ -635,8 +636,9 @@ def run_generated_quantities(
             fitting the model to the data, either using CmdStanPy's `sample`
             method or via another Stan interface.
 
-        :param seed: The seed for random number generator Must be an integer
-            between 0 and 2^32 - 1. If unspecified, numpy.random.RandomState()
+        :param seed: The seed for random number generator. Must be an integer
+            between ``0`` and ``2^32 - 1``. If unspecified,
+            ``numpy.random.RandomState()``
             is used to generate a seed which will be used for all chains.
             *NOTE: Specifying the seed will guarantee the same result for
             multiple invocations of this method with the same inputs.  However
@@ -708,16 +710,15 @@ def variational(
             either as a dictionary with entries matching the data variables,
             or as the path of a data file in JSON or Rdump format.
 
-        :param seed: The seed for random number generator or a list of per-chain
-            seeds. Must be an integer between 0 and 2^32 - 1. If unspecified,
-            numpy.random.RandomState() is used to generate a seed which will be
-            used for all chains. When the same seed is used across all chains,
-            the chain-id is used to advance the RNG to avoid dependent samples.
+        :param seed: The seed for random number generator. Must be an integer
+            between ``0`` and ``2^32 - 1``. If unspecified,
+            ``numpy.random.RandomState()``
+            is used to generate a seed which will be used for all chains.
 
         :param inits:  Specifies how the sampler initializes parameter values.
-            Initializiation is uniform random on a range centered on 0 with
-            default range of 2. Specifying a single number ``n > 0`` changes
-            the initialization range to [-n, n].
+            Initialization is uniform random on a range centered on ``0`` with
+            default range of ``2``. Specifying a single number ``n > 0`` changes
+            the initialization range to ``[-n, n]``.
 
         :param csv_basename:  A path or file name which will be used as the
             basename for the CmdStan output files.  The csv output files

diff --git a/cmdstanpy/stanfit.py b/cmdstanpy/stanfit.py
@@ -580,7 +580,7 @@ def generated_quantities(self) -> np.ndarray:
         so that the values for each parameter are stored contiguously
         in memory, likewise all draws from a chain are contiguous.
         """
-        if not (self.runset.method == Method.GENERATED_QUANTITIES):
+        if not (self.runset.method == Method.GENERATE_QUANTITIES):
             raise RuntimeError(
                 'Bad runset method {}.'.format(self.runset.method)
             )
@@ -631,7 +631,7 @@ def __init__(self, runset: RunSet) -> None:
         self.runset = runset
         self._column_names = ()
         self._variational_mean = {}
-        self._output_samples = None
+        self._variational_sample = None
 
     def __repr__(self) -> str:
         repr = 'StanVariational: model={}{}'.format(
@@ -648,7 +648,7 @@ def _set_variational_attrs(self, sample_csv_0: str) -> None:
         meta = scan_variational_csv(sample_csv_0)
         self._column_names = meta['column_names']
         self._variational_mean = meta['variational_mean']
-        self._output_samples = meta['output_samples']
+        self._variational_sample = meta['variational_sample']
 
     @property
     def columns(self) -> int:
@@ -689,11 +689,12 @@ def variational_params_dict(self) -> OrderedDict:
             self._set_variational_attrs(self.runset.csv_files[0])
         return OrderedDict(zip(self.column_names, self._variational_mean))
 
-    def output_samples(self) -> np.array:
+    @property
+    def variational_sample(self) -> np.array:
         """Returns the set of approximate posterior output draws."""
-        if self._output_samples is None:
+        if self._variational_sample is None:
             self._set_variational_attrs(self.runset.csv_files[0])
-        return self._output_samples
+        return self._variational_sample
 
     def save_csvfiles(self, dir: str = None, basename: str = None) -> None:
         """

diff --git a/cmdstanpy/utils.py b/cmdstanpy/utils.py
@@ -43,7 +43,7 @@ def get_latest_cmdstan(dot_dir: str) -> str:
         name.split('-')[1]
         for name in os.listdir(dot_dir)
         if os.path.isdir(os.path.join(dot_dir, name))
-        and name.startswith('cmdstan-')
+        and name.startswith('cmdstan-') and name[8].isdigit()
     ]
     versions.sort(key=lambda s: list(map(int, s.split('.'))))
     if len(versions) == 0:
@@ -506,7 +506,7 @@ def scan_variational_csv(path: str) -> Dict:
         xs = line.split(',')
         variational_mean = [float(x) for x in xs]
         dict['variational_mean'] = variational_mean
-        dict['output_samples'] = pd.read_csv(
+        dict['variational_sample'] = pd.read_csv(
             path, comment='#', skiprows=lineno, header=None
         )
     return dict

diff --git a/cmdstanpy_tutorial.ipynb b/cmdstanpy_tutorial.ipynb
@@ -96,7 +96,7 @@
    "source": [
     "import os\n",
     "import os.path\n",
-    "from cmdstanpy import Model, StanMCMC, cmdstan_path"
+    "from cmdstanpy import Model, cmdstan_path"
    ]
   },
   {

diff --git a/docs/advi.rst b/docs/advi.rst
@@ -2,39 +2,112 @@ Variational Inference
 =====================
 
 Variational inference is a scalable technique for approximate Bayesian inference.
-Unlike Stan's HMC-NUTS sampler, which produces a set of draws from the joint log
-probability density of the model conditioned on the data, Variational Inference
-produces a set of draws from a simpler, computationally tractable, probability density.
+In the Stan ecosystem, the terms "VI" and "VB" ("variational Bayes")
+are used synonymously.
+
 Stan implements an automatic variational inference algorithm,
 called Automatic Differentiation Variational Inference (ADVI)
 which searches over a family of simple densities to find the best
 approximate posterior density.
-
 ADVI produces an estimate of the parameter means together with a sample
 from the approximate posterior density.
 
 ADVI approximates the variational objective function, the evidence lower bound or ELBO,
 using stochastic gradient ascent.
-The algorithm ascends these gradients using an adaptive stepsize sequence,
-which has one parameter `eta`, which is adjusted during warmup.
-The number of draws used to approximate the ELBO is denoted by `elbo_samples`. 
+The algorithm ascends these gradients using an adaptive stepsize sequence
+that has one parameter ``eta`` which is adjusted during warmup.
+The number of draws used to approximate the ELBO is denoted by ``elbo_samples``. 
 ADVI heuristically determines a rolling window over which it computes
 the average and the median change of the ELBO.
-When this change falls below a threshold, denoted by `tol_rel_obj`,
+When this change falls below a threshold, denoted by ``tol_rel_obj``,
 the algorithm is considered to have converged.
 
-See:
 
- - Paper:  [Kucukelbir et al](http://arxiv.org/abs/1506.03431)
- - Stan manual:  https://mc-stan.org/docs/2_20/reference-manual/vi-algorithms-chapter.html
+ADVI configuration
+------------------
+
+- ``algorithm``: Algorithm to use. One of: "meanfield", "fullrank".
+
+- ``iter``: Maximum number of ADVI iterations.
+
+- ``grad_samples``: Number of MC draws for computing the gradient.
+
+- ``elbo_samples``: Number of MC draws for estimate of ELBO.
+
+- ``eta``: Stepsize scaling parameter.
+
+- ``adapt_iter``: Number of iterations for eta adaptation.
+
+- ``tol_rel_obj``: Relative tolerance parameter for convergence.
+
+- ``eval_elbo``: Number of iterations between ELBO evaluations.
+
+- ``output_samples``: Number of approximate posterior output draws to save.
+
+.. include:: common_config.rst
+
+All of these arguments are optional; when unspecified, the CmdStan defaults will be used.
+
+
+Example: variational inference for model ``bernoulli.stan``
+-----------------------------------------------------------
+
+In this example we use the CmdStan example model
+`bernoulli.stan <https://github.com/stan-dev/cmdstanpy/blob/master/test/data/bernoulli.stan>`__
+and data file
+`bernoulli.data.json <https://github.com/stan-dev/cmdstanpy/blob/master/test/data/bernoulli.data.json>`__.
+
+The :ref:`class_model` class method  ``variational`` returns a ``StanVariational`` object which provides properties
+to retrieve the estimate of the
+approximate posterior mean of all model parameters,
+and the returned set of draws from this approximate posterior (if any):
+
+- ``column_names``
+- ``variational_params_dict``
+- ``variational_params_np``
+- ``variational_params_pd``
+- ``variational_sample``
+
+- ``save_csvfiles()``
+
+In the following example, we instantiate a model and run variational inference using the default CmdStan settings:
+
+.. code:: ipython3
 
+    import os
+    from cmdstanpy.model import Model
+    from cmdstanpy.utils import cmdstan_path
+    
+    bernoulli_dir = os.path.join(cmdstan_path(), 'examples', 'bernoulli')
+    bernoulli_path = os.path.join(bernoulli_dir, 'bernoulli.stan')
+    bernoulli_data = os.path.join(bernoulli_dir, 'bernoulli.data.json')
+    # instantiate bernoulli model, compile Stan program
+    bernoulli_model = Model(stan_file=bernoulli_path)
+    bernoulli_model.compile()
+    # run CmdStan's variational inference method, returns object `StanVariational`
+    vi = bernoulli_model.variational(data=bernoulli_data)
+    print(vi.column_names)
+    print(vi.variational_params_dict)
+    vi.variational_sample.shape
 
+These estimates are only valid if the algorithm has converged to a good
+approximation. When the algorithm fails to do so, the ``variational``
+method will throw a ``RuntimeError``.
 
+.. code:: ipython3
 
+    fail_stan = os.path.join(datafiles_path, 'variational', 'eta_should_fail.stan')
+    fail_model = Model(stan_file=fail_stan)
+    fail_model.compile()
+    vi = fail_model.variational()
 
 
+References
+----------
 
 
+- Paper:  `Kucukelbir et al <http://arxiv.org/abs/1506.03431>`__
+- Stan manual:  https://mc-stan.org/docs/reference-manual/vi-algorithms-chapter.html
 
 
 

diff --git a/docs/api.rst b/docs/api.rst
@@ -6,31 +6,48 @@ API Reference
 Classes
 *******
 
+.. _class_model:
 
 Model
 =====
 
 .. autoclass:: cmdstanpy.Model
    :members:
 
+.. _class_runset:
+
+RunSet
+======
+
+.. autoclass:: cmdstanpy.stanfit.RunSet
+   :members:
+
+.. _class_stanmcmc:
+
 StanMCMC
-=======
+========
 
 .. autoclass:: cmdstanpy.StanMCMC
    :members:
 
+.. _class_stanmle:
+
 StanMLE
 =======
 
 .. autoclass:: cmdstanpy.StanMLE
    :members:
 
+.. _class_stanquantities:
+
 StanQuantities
 ==============
 
 .. autoclass:: cmdstanpy.StanQuantities
    :members:
-
+
+.. _class_stanvariational:
+
 StanVariational
 ===============
 

diff --git a/docs/common_config.rst b/docs/common_config.rst
@@ -0,0 +1,9 @@
+
+- ``data``: Values for all data variables in the model, specified either as a dictionary with entries matching the data variables, or as the path of a data file in JSON or Rdump format.
+
+- ``seed``: The seed for random number generator.
+
+- ``inits``:  Specifies how the sampler initializes parameter values.
+
+- ``csv_basename``:  A path or file name which will be used as the basename for the CmdStan output files.
+