diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae05803b..3c494343 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -55,7 +55,8 @@ jobs: make github cd .. - git commit -m "release/v${{ github.event.inputs.new_version }}: updating version numbers" -a + git add . + git commit -m "release/v${{ github.event.inputs.new_version }}: updating version numbers" git push -u origin release/v${{ github.event.inputs.new_version }} - name: Merge into develop diff --git a/.gitignore b/.gitignore index 7cfac24e..1b4d8a20 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,6 @@ travis.yml .pytest_cache dist htmlcov -*.png docsrc/_build .idea .DS_Store diff --git a/docs/_images/examples_MCMC_Sampling_18_1.png b/docs/_images/examples_MCMC_Sampling_18_1.png new file mode 100644 index 00000000..7d8e7193 Binary files /dev/null and b/docs/_images/examples_MCMC_Sampling_18_1.png differ diff --git a/docs/_modules/cmdstanpy/model.html b/docs/_modules/cmdstanpy/model.html deleted file mode 100644 index 1351f26a..00000000 --- a/docs/_modules/cmdstanpy/model.html +++ /dev/null @@ -1,1518 +0,0 @@ - - - - - - - - cmdstanpy.model — CmdStanPy 0.9.77 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
- - - - -
- -
- - - - - - -
- -
- -

Source code for cmdstanpy.model

-"""CmdStanModel"""
-
-import logging
-import os
-import platform
-import re
-import shutil
-import subprocess
-from collections import OrderedDict
-from concurrent.futures import ThreadPoolExecutor
-from multiprocessing import cpu_count
-from pathlib import Path
-from typing import Any, Dict, List, Mapping, Optional, Union
-
-from cmdstanpy.cmdstan_args import (
-    CmdStanArgs,
-    GenerateQuantitiesArgs,
-    OptimizeArgs,
-    SamplerArgs,
-    VariationalArgs,
-)
-from cmdstanpy.compiler_opts import CompilerOptions
-from cmdstanpy.stanfit import (
-    CmdStanGQ,
-    CmdStanMCMC,
-    CmdStanMLE,
-    CmdStanVB,
-    RunSet,
-    from_csv,
-)
-from cmdstanpy.utils import (
-    EXTENSION,
-    MaybeDictToFilePath,
-    TemporaryCopiedFile,
-    cmdstan_path,
-    do_command,
-    get_logger,
-)
-
-
-
class CmdStanModel:
    # overview, omitted from doc comment in order to improve Sphinx docs.
    # A CmdStanModel object encapsulates the Stan program and provides
    # methods for compilation and inference.
    """
    The constructor method allows model instantiation given either the
    Stan program source file or the compiled executable, or both.
    By default, the constructor will compile the Stan program on instantiation
    unless the argument ``compile=False`` is specified.
    The set of constructor arguments are:

    :param model_name: Model name, used for output file names.
        Optional, default is the base filename of the Stan program file.

    :param stan_file: Path to Stan program file.

    :param exe_file: Path to compiled executable file.  Optional, unless
        no Stan program file is specified.  If both the program file and
        the compiled executable file are specified, the base filenames
        must match, (but different directory locations are allowed).

    :param compile: Whether or not to compile the model.  Default is ``True``.

    :param stanc_options: Options for stanc compiler, specified as a Python
        dictionary containing Stanc3 compiler option name, value pairs.
        Optional.

    :param cpp_options: Options for C++ compiler, specified as a Python
        dictionary containing C++ compiler option name, value pairs.
        Optional.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        stan_file: Optional[str] = None,
        exe_file: Optional[str] = None,
        compile: bool = True,
        stanc_options: Optional[Dict[str, Any]] = None,
        cpp_options: Optional[Dict[str, Any]] = None,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize object given constructor args.

        :param model_name: Model name, used for output file names.
        :param stan_file: Path to Stan program file.
        :param exe_file: Path to compiled executable file.
        :param compile: Whether or not to compile the model.
        :param stanc_options: Options for stanc compiler.
        :param cpp_options: Options for C++ compiler.
        :param logger: Deprecated and otherwise ignored; passing a value only
            triggers a deprecation warning.

        :raises ValueError: if ``model_name`` is blank, if neither
            ``stan_file`` nor ``exe_file`` is given, if a given file does not
            exist, if ``stan_file`` does not have a ``.stan`` name, if the
            executable's base name differs from the model name, or if
            compilation was requested and produced no executable.
        """
        self._name = ''
        self._stan_file = None
        self._exe_file = None
        self._compiler_options = CompilerOptions(
            stanc_options=stanc_options, cpp_options=cpp_options
        )
        if logger is not None:
            get_logger().warning(
                "Parameter 'logger' is deprecated."
                " Control logging behavior via logging.getLogger('cmdstanpy')"
            )

        if model_name is not None:
            if not model_name.strip():
                raise ValueError(
                    'Invalid value for argument model name, found "{}"'.format(
                        model_name
                    )
                )
            self._name = model_name.strip()

        if stan_file is None:
            if exe_file is None:
                raise ValueError(
                    'Missing model file arguments, you must specify '
                    'either Stan source or executable program file or both.'
                )
        else:
            self._stan_file = os.path.realpath(os.path.expanduser(stan_file))
            if not os.path.exists(self._stan_file):
                raise ValueError('no such file {}'.format(self._stan_file))
            _, filename = os.path.split(stan_file)
            # '.stan' alone is 5 characters, so a valid name needs at least 6.
            if len(filename) < 6 or not filename.endswith('.stan'):
                raise ValueError(
                    'invalid stan filename {}'.format(self._stan_file)
                )
            if not self._name:
                self._name, _ = os.path.splitext(filename)
            # if program has include directives, record path
            with open(self._stan_file, 'r') as fd:
                program = fd.read()
            if '#include' in program:
                path, _ = os.path.split(self._stan_file)
                # self._compiler_options is always set above, so only the
                # stanc option dictionary can still be missing here.
                if self._compiler_options._stanc_options is None:
                    self._compiler_options._stanc_options = {
                        'include_paths': [path]
                    }
                else:
                    self._compiler_options.add_include_path(path)

        if exe_file is not None:
            self._exe_file = os.path.realpath(os.path.expanduser(exe_file))
            if not os.path.exists(self._exe_file):
                raise ValueError('no such file {}'.format(self._exe_file))
            _, exename = os.path.split(self._exe_file)
            if not self._name:
                self._name, _ = os.path.splitext(exename)
            elif self._name != os.path.splitext(exename)[0]:
                raise ValueError(
                    'Name mismatch between Stan file and compiled'
                    ' executable, expecting basename: {}'
                    ' found: {}.'.format(self._name, exename)
                )

        self._compiler_options.validate()

        if platform.system() == 'Windows':
            try:
                do_command(['where.exe', 'tbb.dll'])
            except RuntimeError:
                # Add tbb to the $PATH on Windows
                libtbb = os.environ.get('STAN_TBB')
                if libtbb is None:
                    libtbb = os.path.join(
                        cmdstan_path(), 'stan', 'lib', 'stan_math', 'lib', 'tbb'
                    )
                get_logger().debug("Adding TBB (%s) to PATH", libtbb)
                # OrderedDict.fromkeys drops duplicate entries while keeping
                # the first occurrence, so libtbb ends up first in PATH.
                os.environ['PATH'] = ';'.join(
                    OrderedDict.fromkeys(
                        [libtbb] + os.environ.get('PATH', '').split(';')
                    )
                )
            else:
                get_logger().debug("TBB already found in load path")

        if compile and self._exe_file is None:
            self.compile()
            if self._exe_file is None:
                raise ValueError(
                    'Unable to compile Stan model file: {}.'.format(
                        self._stan_file
                    )
                )

    def __repr__(self) -> str:
        """Return a multi-line summary of name, files and compiler options."""
        text = 'CmdStanModel: name={}'.format(self._name)
        text = '{}\n\t stan_file={}'.format(text, self._stan_file)
        text = '{}\n\t exe_file={}'.format(text, self._exe_file)
        text = '{}\n\t compiler_options={}'.format(text, self._compiler_options)
        return text

    @property
    def name(self) -> str:
        """
        Model name used in output filename templates.  Default is basename
        of Stan program or exe file, unless specified in call to constructor
        via argument ``model_name``.
        """
        return self._name

    @property
    def stan_file(self) -> Optional[str]:
        """Full path to Stan program file."""
        return self._stan_file

    @property
    def exe_file(self) -> Optional[str]:
        """Full path to Stan exe file."""
        return self._exe_file

    @property
    def stanc_options(self) -> Dict[str, Union[bool, int, str]]:
        """Options to stanc compilers."""
        return self._compiler_options._stanc_options

    @property
    def cpp_options(self) -> Dict[str, Union[bool, int]]:
        """Options to C++ compilers."""
        return self._compiler_options._cpp_options
[docs] def code(self) -> Optional[str]: - """Return Stan program as a string.""" - if not self._stan_file: - raise RuntimeError('Please specify source file') - - code = None - try: - with open(self._stan_file, 'r') as fd: - code = fd.read() - except IOError: - get_logger().error( - 'Cannot read file Stan file: %s', self._stan_file - ) - return code
- -
    def compile(
        self,
        force: bool = False,
        stanc_options: Optional[Dict[str, Any]] = None,
        cpp_options: Optional[Dict[str, Any]] = None,
        override_options: bool = False,
    ) -> None:
        """
        Compile the given Stan program file.  Translates the Stan code to
        C++, then calls the C++ compiler.

        By default, this function compares the timestamps on the source and
        executable files; if the executable is newer than the source file, it
        will not recompile the file, unless argument ``force`` is ``True``.

        A failed build does not raise: the failure is logged and the
        executable path recorded on the model is left as it was.

        :param force: When ``True``, always compile, even if the executable file
            is newer than the source file.  Used for Stan models which have
            ``#include`` directives in order to force recompilation when changes
            are made to the included files.

        :param stanc_options: Options for stanc compiler.
        :param cpp_options: Options for C++ compiler.

        :param override_options: When ``True``, override existing option.
            When ``False``, add/replace existing options.  Default is ``False``.

        :raises RuntimeError: if the model has no Stan program file.
        """
        if not self._stan_file:
            raise RuntimeError('Please specify source file')

        # Fold any options passed to this call into the model's options.
        compiler_options = None
        if not (stanc_options is None and cpp_options is None):
            compiler_options = CompilerOptions(
                stanc_options=stanc_options, cpp_options=cpp_options
            )
            compiler_options.validate()
            if self._compiler_options is None:
                self._compiler_options = compiler_options
            elif override_options:
                self._compiler_options = compiler_options
            else:
                self._compiler_options.add(compiler_options)

        # check if exe file exists in original location
        exe_file, _ = os.path.splitext(os.path.abspath(self._stan_file))
        exe_file = Path(exe_file).as_posix() + EXTENSION
        do_compile = True
        if os.path.exists(exe_file):
            src_time = os.path.getmtime(self._stan_file)
            exe_time = os.path.getmtime(exe_file)
            if exe_time > src_time and not force:
                # Existing executable is newer than the source: reuse it.
                do_compile = False
                get_logger().info('found newer exe file, not recompiling')
                self._exe_file = exe_file
                get_logger().info('compiled model file: %s', self._exe_file)
        if do_compile:
            compilation_failed = False
            # The context manager yields the path to build from and a flag
            # saying whether that path is a copy of the original file.
            # NOTE(review): when/why a copy is made is decided by
            # TemporaryCopiedFile, which is not visible here.
            with TemporaryCopiedFile(self._stan_file) as (stan_file, is_copied):
                exe_file, _ = os.path.splitext(os.path.abspath(stan_file))
                exe_file = Path(exe_file).as_posix() + EXTENSION
                do_compile = True
                # Repeat the timestamp check against the (possibly copied)
                # build location.
                if os.path.exists(exe_file):
                    src_time = os.path.getmtime(self._stan_file)
                    exe_time = os.path.getmtime(exe_file)
                    if exe_time > src_time and not force:
                        do_compile = False
                        get_logger().info(
                            'found newer exe file, not recompiling'
                        )

                if do_compile:
                    get_logger().info(
                        'compiling stan program, exe file: %s', exe_file
                    )
                    if self._compiler_options is not None:
                        self._compiler_options.validate()
                        get_logger().info(
                            'compiler options: %s', self._compiler_options
                        )
                    # The MAKE environment variable overrides the
                    # platform-specific default make program.
                    make = os.getenv(
                        'MAKE',
                        'make'
                        if platform.system() != 'Windows'
                        else 'mingw32-make',
                    )
                    cmd = [make]
                    if self._compiler_options is not None:
                        cmd.extend(self._compiler_options.compose())
                    cmd.append(Path(exe_file).as_posix())
                    try:
                        # Run make with the CmdStan directory as second
                        # argument (presumably the working directory;
                        # verify against do_command).
                        msg = do_command(cmd, cmdstan_path())
                        if msg is not None and 'Warning or error:' in msg:
                            # Keep only the text after the marker.
                            msg = msg.split("Warning or error:", 1)[1].strip()
                            get_logger().warning(
                                "stanc3 has produced warnings:\n%s", msg
                            )

                    except RuntimeError as e:
                        get_logger().error(
                            'file %s, exception %s', stan_file, str(e)
                        )
                        if 'PCH file' in str(e):
                            get_logger().warning(
                                "%s, %s",
                                "CmdStan's precompiled header (PCH) files ",
                                "may need to be rebuilt.",
                            )
                            get_logger().warning(
                                "%s %s",
                                "If your model failed to compile please run ",
                                "install_cmdstan(overwrite=True).",
                            )
                            get_logger().warning(
                                "If the issue persists please open a bug report"
                            )

                        compilation_failed = True

                if not compilation_failed:
                    if is_copied:
                        # Built from a copy: place the executable next to
                        # the original Stan file, named after it.
                        original_target_dir = os.path.dirname(
                            os.path.abspath(self._stan_file)
                        )
                        new_exec_name = (
                            os.path.basename(
                                os.path.splitext(self._stan_file)[0]
                            )
                            + EXTENSION
                        )
                        self._exe_file = os.path.join(
                            original_target_dir, new_exec_name
                        )
                        shutil.copy(exe_file, self._exe_file)
                    else:
                        self._exe_file = exe_file
                    get_logger().info('compiled model file: %s', self._exe_file)
                else:
                    get_logger().error('model compilation failed')
- -
[docs] def optimize( - self, - data: Union[Mapping[str, Any], str, None] = None, - seed: Optional[int] = None, - inits: Union[Dict[str, float], float, str, None] = None, - output_dir: Optional[str] = None, - sig_figs: Optional[int] = None, - save_profile: bool = False, - algorithm: Optional[str] = None, - init_alpha: Optional[float] = None, - tol_obj: Optional[float] = None, - tol_rel_obj: Optional[float] = None, - tol_grad: Optional[float] = None, - tol_rel_grad: Optional[float] = None, - tol_param: Optional[float] = None, - history_size: Optional[int] = None, - iter: Optional[int] = None, - refresh: Optional[int] = None, - ) -> CmdStanMLE: - """ - Run the specified CmdStan optimize algorithm to produce a - penalized maximum likelihood estimate of the model parameters. - - This function validates the specified configuration, composes a call to - the CmdStan ``optimize`` method and spawns one subprocess to run the - optimizer and waits for it to run to completion. - Unspecified arguments are not included in the call to CmdStan, i.e., - those arguments will have CmdStan default values. - - The :class:`CmdStanMLE` object records the command, the return code, - and the paths to the optimize method output csv and console files. - The output files are written either to a specified output directory - or to a temporary directory which is deleted upon session exit. - - Output files are either written to a temporary directory or to the - specified output directory. Ouput filenames correspond to the template - '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'. 
- - :param data: Values for all data variables in the model, specified - either as a dictionary with entries matching the data variables, - or as the path of a data file in JSON or Rdump format. - - :param seed: The seed for random number generator. Must be an integer - between 0 and 2^32 - 1. If unspecified, - :class:`numpy.random.RandomState` is used to generate a seed. - - :param inits: Specifies how the sampler initializes parameter values. - Initialization is either uniform random on a range centered on 0, - exactly 0, or a dictionary or file of initial values for some or - all parameters in the model. The default initialization behavior - will initialize all parameter values on range [-2, 2] on the - *unconstrained* support. If the expected parameter values are - too far from this range, this option may improve estimation. - The following value types are allowed: - - * Single number, n > 0 - initialization range is [-n, n]. - * 0 - all parameters are initialized to 0. - * dictionary - pairs parameter name : initial value. - * string - pathname to a JSON or Rdump data file. - - :param output_dir: Name of the directory to which CmdStan output - files are written. If unspecified, output files will be written - to a temporary directory which is deleted upon session exit. - - :param sig_figs: Numerical precision used for output CSV and text files. - Must be an integer between 1 and 18. If unspecified, the default - precision for the system file I/O is used; the usual value is 6. - Introduced in CmdStan-2.25. - - :param save_profile: Whether or not to profile auto-diff operations in - labelled blocks of code. If True, csv outputs are written to a file - '<model_name>-<YYYYMMDDHHMM>-profile-<chain_id>'. - Introduced in CmdStan-2.26. - - :param algorithm: Algorithm to use. 
One of: 'BFGS', 'LBFGS', 'Newton' - - :param init_alpha: Line search step size for first iteration - - :param tol_obj: Convergence tolerance on changes in objective - function value - - :param tol_rel_obj: Convergence tolerance on relative changes - in objective function value - - :param tol_grad: Convergence tolerance on the norm of the gradient - - :param tol_rel_grad: Convergence tolerance on the relative - norm of the gradient - - :param tol_param: Convergence tolerance on changes in parameter value - - :param history_size: Size of the history for LBFGS Hessian - approximation. The value should be less than the dimensionality - of the parameter space. 5-10 usually sufficient - - :param iter: Total number of iterations - - :param refresh: Specify the number of iterations cmdstan will take - between progress messages. Default value is 100. - - :return: CmdStanMLE object - """ - optimize_args = OptimizeArgs( - algorithm=algorithm, - init_alpha=init_alpha, - tol_obj=tol_obj, - tol_rel_obj=tol_rel_obj, - tol_grad=tol_grad, - tol_rel_grad=tol_rel_grad, - tol_param=tol_param, - history_size=history_size, - iter=iter, - ) - - with MaybeDictToFilePath(data, inits) as (_data, _inits): - args = CmdStanArgs( - self._name, - self._exe_file, - chain_ids=None, - data=_data, - seed=seed, - inits=_inits, - output_dir=output_dir, - sig_figs=sig_figs, - save_profile=save_profile, - method_args=optimize_args, - refresh=refresh, - ) - - dummy_chain_id = 0 - runset = RunSet(args=args, chains=1) - self._run_cmdstan(runset, dummy_chain_id) - - if not runset._check_retcodes(): - msg = 'Error during optimization:\n{}'.format(runset.get_err_msgs()) - msg = '{}Command and output files:\n{}'.format( - msg, runset.__repr__() - ) - raise RuntimeError(msg) - mle = CmdStanMLE(runset) - return mle
- - # pylint: disable=too-many-arguments -
[docs] def sample( - self, - data: Union[Mapping[str, Any], str, None] = None, - chains: Optional[int] = None, - parallel_chains: Optional[int] = None, - threads_per_chain: Optional[int] = None, - seed: Union[int, List[int], None] = None, - chain_ids: Union[int, List[int], None] = None, - inits: Union[Dict[str, float], float, str, List[str], None] = None, - iter_warmup: Optional[int] = None, - iter_sampling: Optional[int] = None, - save_warmup: bool = False, - thin: Optional[int] = None, - max_treedepth: Optional[int] = None, - metric: Union[str, List[str], None] = None, - step_size: Union[float, List[float], None] = None, - adapt_engaged: bool = True, - adapt_delta: Optional[float] = None, - adapt_init_phase: Optional[int] = None, - adapt_metric_window: Optional[int] = None, - adapt_step_size: Optional[int] = None, - fixed_param: bool = False, - output_dir: Optional[str] = None, - sig_figs: Optional[int] = None, - save_diagnostics: bool = False, - save_profile: bool = False, - show_progress: Union[bool, str] = False, - refresh: Optional[int] = None, - ) -> CmdStanMCMC: - """ - Run or more chains of the NUTS-HMC sampler to produce a set of draws - from the posterior distribution of a model conditioned on some data. - - This function validates the specified configuration, composes a call to - the CmdStan ``sample`` method and spawns one subprocess per chain to run - the sampler and waits for all chains to run to completion. - Unspecified arguments are not included in the call to CmdStan, i.e., - those arguments will have CmdStan default values. - - For each chain, the :class:`CmdStanMCMC` object records the command, - the return code, the sampler output file paths, and the corresponding - console outputs, if any. The output files are written either to a - specified output directory or to a temporary directory which is deleted - upon session exit. - - Output files are either written to a temporary directory or to the - specified output directory. 
Ouput filenames correspond to the template - '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'. - - :param data: Values for all data variables in the model, specified - either as a dictionary with entries matching the data variables, - or as the path of a data file in JSON or Rdump format. - - :param chains: Number of sampler chains, must be a positive integer. - - :param parallel_chains: Number of processes to run in parallel. Must be - a positive integer. Defaults to :func:`multiprocessing.cpu_count`. - - :param threads_per_chain: The number of threads to use in parallelized - sections within an MCMC chain (e.g., when using the Stan functions - ``reduce_sum()`` or ``map_rect()``). This will only have an effect - if the model was compiled with threading support. The total number - of threads used will be ``parallel_chains * threads_per_chain``. - - :param seed: The seed for random number generator. Must be an integer - between 0 and 2^32 - 1. If unspecified, - :class:`numpy.random.RandomState` - is used to generate a seed which will be used for all chains. - When the same seed is used across all chains, - the chain-id is used to advance the RNG to avoid dependent samples. - - :param chain_ids: The offset for the random number generator, either - an integer or a list of unique per-chain offsets. If unspecified, - chain ids are numbered sequentially starting from 1. - - :param inits: Specifies how the sampler initializes parameter values. - Initialization is either uniform random on a range centered on 0, - exactly 0, or a dictionary or file of initial values for some or all - parameters in the model. 
The default initialization behavior will - initialize all parameter values on range [-2, 2] on the - *unconstrained* support. If the expected parameter values are - too far from this range, this option may improve adaptation. - The following value types are allowed: - - * Single number n > 0 - initialization range is [-n, n]. - * 0 - all parameters are initialized to 0. - * dictionary - pairs parameter name : initial value. - * string - pathname to a JSON or Rdump data file. - * list of strings - per-chain pathname to data file. - - :param iter_warmup: Number of warmup iterations for each chain. - - :param iter_sampling: Number of draws from the posterior for each - chain. - - :param save_warmup: When ``True``, sampler saves warmup draws as part of - the Stan csv output file. - - :param thin: Period between recorded iterations. Default is 1, i.e., - all iterations are recorded. - - :param max_treedepth: Maximum depth of trees evaluated by NUTS sampler - per iteration. - - :param metric: Specification of the mass matrix, either as a - vector consisting of the diagonal elements of the covariance - matrix ('diag' or 'diag_e') or the full covariance matrix - ('dense' or 'dense_e'). - - If the value of the metric argument is a string other than - 'diag', 'diag_e', 'dense', or 'dense_e', it must be - a valid filepath to a JSON or Rdump file which contains an entry - 'inv_metric' whose value is either the diagonal vector or - the full covariance matrix. - - If the value of the metric argument is a list of paths, its - length must match the number of chains and all paths must be - unique. - - :param step_size: Initial step size for HMC sampler. The value is - either a single number or a list of numbers which will be used - as the global or per-chain initial step size, respectively. - The length of the list of step sizes must match the number of - chains. - - :param adapt_engaged: When True, adapt step size and metric. 
- - :param adapt_delta: Adaptation target Metropolis acceptance rate. - The default value is 0.8. Increasing this value, which must be - strictly less than 1, causes adaptation to use smaller step sizes - which improves the effective sample size, but may increase the time - per iteration. - - :param adapt_init_phase: Iterations for initial phase of adaptation - during which step size is adjusted so that the chain converges - towards the typical set. - - :param adapt_metric_window: The second phase of adaptation tunes - the metric and step size in a series of intervals. This parameter - specifies the number of iterations used for the first tuning - interval; window size increases for each subsequent interval. - - :param adapt_step_size: Number of iterations given over to adjusting - the step size given the tuned metric during the final phase of - adaptation. - - :param fixed_param: When ``True``, call CmdStan with argument - ``algorithm=fixed_param`` which runs the sampler without - updating the Markov Chain, thus the values of all parameters and - transformed parameters are constant across all draws and - only those values in the generated quantities block that are - produced by RNG functions may change. This provides - a way to use Stan programs to generate simulated data via the - generated quantities block. This option must be used when the - parameters block is empty. Default value is ``False``. - - :param output_dir: Name of the directory to which CmdStan output - files are written. If unspecified, output files will be written - to a temporary directory which is deleted upon session exit. - - :param sig_figs: Numerical precision used for output CSV and text files. - Must be an integer between 1 and 18. If unspecified, the default - precision for the system file I/O is used; the usual value is 6. - Introduced in CmdStan-2.25. - - :param save_diagnostics: Whether or not to output the position and - momentum information for each parameter. 
If True, - csv outputs are written to an output file using filename - template '<model_name>-<YYYYMMDDHHMM>-diagnostic-<chain_id>', - e.g. 'bernoulli-201912081451-diagnostic-1.csv'. - - :param save_profile: Whether or not to profile auto-diff operations in - labelled blocks of code. If True, csv outputs are written to a file - '<model_name>-<YYYYMMDDHHMM>-profile-<chain_id>'. - Introduced in CmdStan-2.26. - - :param show_progress: Use tqdm progress bar to show sampling progress. - If show_progress=='notebook' use tqdm_notebook - (needs nodejs for jupyter). - - :param refresh: Specify the number of iterations cmdstan will take - between progress messages. Default value is 100. - - :return: CmdStanMCMC object - """ - if chains is None: - if fixed_param: - chains = 1 - else: - chains = 4 - if chains < 1: - raise ValueError( - 'Chains must be a positive integer value, found {}.'.format( - chains - ) - ) - if chain_ids is None: - chain_ids = [x + 1 for x in range(chains)] - else: - if isinstance(chain_ids, int): - if chain_ids < 1: - raise ValueError( - 'Chain_id must be a positive integer value,' - ' found {}.'.format(chain_ids) - ) - chain_ids = [chain_ids + i for i in range(chains)] - else: - if not len(chain_ids) == chains: - raise ValueError( - 'Chain_ids must correspond to number of chains' - ' specified {} chains, found {} chain_ids.'.format( - chains, len(chain_ids) - ) - ) - for chain_id in chain_ids: - if chain_id < 0: - raise ValueError( - 'Chain_id must be a non-negative integer value,' - ' found {}.'.format(chain_id) - ) - if parallel_chains is None: - parallel_chains = max(min(cpu_count(), chains), 1) - elif parallel_chains > chains: - get_logger().info( - 'Requesting %u parallel_chains for %u chains,' - ' running all chains in parallel.', - parallel_chains, - chains, - ) - parallel_chains = chains - elif parallel_chains < 1: - raise ValueError( - 'Argument parallel_chains must be a positive integer value, ' - 'found {}.'.format(parallel_chains) - ) - if 
threads_per_chain is None: - threads_per_chain = 1 - if threads_per_chain < 1: - raise ValueError( - 'Argument threads_per_chain must be a positive integer value, ' - 'found {}.'.format(threads_per_chain) - ) - get_logger().debug( - 'total threads: %u', parallel_chains * threads_per_chain - ) - os.environ['STAN_NUM_THREADS'] = str(threads_per_chain) - - if show_progress: - try: - import tqdm - - get_logger().propagate = False - except ImportError: - get_logger().warning( - ( - 'Package tqdm not installed, cannot show progress ' - 'information. Please install tqdm with ' - "'pip install tqdm'" - ) - ) - show_progress = False - - # TODO: issue 49: inits can be initialization function - - sampler_args = SamplerArgs( - iter_warmup=iter_warmup, - iter_sampling=iter_sampling, - save_warmup=save_warmup, - thin=thin, - max_treedepth=max_treedepth, - metric=metric, - step_size=step_size, - adapt_engaged=adapt_engaged, - adapt_delta=adapt_delta, - adapt_init_phase=adapt_init_phase, - adapt_metric_window=adapt_metric_window, - adapt_step_size=adapt_step_size, - fixed_param=fixed_param, - ) - with MaybeDictToFilePath(data, inits) as (_data, _inits): - args = CmdStanArgs( - self._name, - self._exe_file, - chain_ids=chain_ids, - data=_data, - seed=seed, - inits=_inits, - output_dir=output_dir, - sig_figs=sig_figs, - save_diagnostics=save_diagnostics, - save_profile=save_profile, - method_args=sampler_args, - refresh=refresh, - ) - runset = RunSet(args=args, chains=chains, chain_ids=chain_ids) - pbar = None - all_pbars = [] - - with ThreadPoolExecutor(max_workers=parallel_chains) as executor: - for i in range(chains): - if show_progress: - if ( - isinstance(show_progress, str) - and show_progress.lower() == 'notebook' - ): - try: - tqdm_pbar = tqdm.tqdm_notebook - except ImportError: - msg = ( - 'Cannot import tqdm.tqdm_notebook.\n' - 'Functionality is only supported on the ' - 'Jupyter Notebook and compatible platforms' - '.\nPlease follow the instructions in ' - 
'https://github.com/tqdm/tqdm/issues/394#' - 'issuecomment-384743637 and remember to ' - 'stop & start your jupyter server.' - ) - get_logger().warning(msg) - tqdm_pbar = tqdm.tqdm - else: - tqdm_pbar = tqdm.tqdm - # enable dynamic_ncols for advanced users - # currently hidden feature - dynamic_ncols_raw = os.environ.get( - 'TQDM_DYNAMIC_NCOLS', 'False' - ) - if dynamic_ncols_raw.lower() in ['0', 'false']: - dynamic_ncols = False - else: - dynamic_ncols = True - pbar = tqdm_pbar( - desc='Chain {} - warmup'.format(i + 1), - position=i, - total=1, # Will set total from Stan's output - dynamic_ncols=dynamic_ncols, - ) - all_pbars.append(pbar) - executor.submit(self._run_cmdstan, runset, i, pbar) - - # Closing all progress bars - for pbar in all_pbars: - pbar.close() - if show_progress: - # re-enable logger for console - get_logger().propagate = True - - if not runset._check_retcodes(): - msg = 'Error during sampling:\n{}'.format(runset.get_err_msgs()) - msg = '{}Command and output files:\n{}'.format( - msg, runset.__repr__() - ) - raise RuntimeError(msg) - - mcmc = CmdStanMCMC(runset) - return mcmc
- -
def generate_quantities(
    self,
    data: Union[Mapping[str, Any], str, None] = None,
    mcmc_sample: Union[CmdStanMCMC, List[str], None] = None,
    seed: Optional[int] = None,
    gq_output_dir: Optional[str] = None,
    sig_figs: Optional[int] = None,
    refresh: Optional[int] = None,
) -> CmdStanGQ:
    """
    Run CmdStan's ``generate_quantities`` method, which evaluates the
    generated quantities block of the model for every draw of an
    existing sample.

    The sample is given either as a :class:`CmdStanMCMC` object or as a
    list of Stan CSV files; in the latter case the files are first parsed
    into a fit object. One CmdStan process is launched per chain of the
    sample, using a thread pool sized to the available CPUs.

    The returned :class:`CmdStanGQ` object records the command, the return
    code, and the paths to the output csv and console files. Files are
    written to ``gq_output_dir`` if given, otherwise to a temporary
    directory which is deleted upon session exit. Filenames follow the
    template '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the suffix
    '.csv' for CmdStan output or '.txt' for console messages,
    e.g. 'bernoulli-201912081451-1.csv'. Files in the temporary directory
    carry an additional 8-character random string,
    e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param mcmc_sample: Either a :class:`CmdStanMCMC` object returned
        by the :meth:`sample` method or a non-empty list of Stan CSV files
        produced by fitting the model to the data with any Stan interface.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        :class:`numpy.random.RandomState` is used to generate a seed
        which will be used for all chains.
        *NOTE: a fixed seed makes repeated calls of this method with the
        same inputs reproducible, but it does not reproduce the results of
        the sample method, because the RNG will be in a different state.*

    :param gq_output_dir: Name of the directory in which the CmdStan output
        files are saved. If unspecified, files will be written to a
        temporary directory which is deleted upon session exit.

    :param sig_figs: Numerical precision used for output CSV and text files.
        Must be an integer between 1 and 18. If unspecified, the default
        precision for the system file I/O is used; the usual value is 6.
        Introduced in CmdStan-2.25.

    :param refresh: Specify the number of iterations cmdstan will take
        between progress messages. Default value is 100.

    :return: CmdStanGQ object

    :raises ValueError: if ``mcmc_sample`` is neither a CmdStanMCMC object
        nor a list, if the list is empty, or if the CSV files cannot be
        parsed into a valid sample.
    :raises RuntimeError: if any chain finishes with a non-zero return code.
    """
    # Anything other than a fit object or a list is rejected immediately.
    if not isinstance(mcmc_sample, (CmdStanMCMC, list)):
        raise ValueError(
            'MCMC sample must be either CmdStanMCMC object'
            ' or list of paths to sample Stan CSV files.'
        )

    if isinstance(mcmc_sample, CmdStanMCMC):
        mcmc_fit = mcmc_sample
        sample_csv_files = mcmc_sample.runset.csv_files
    else:
        if not mcmc_sample:
            raise ValueError(
                'Expecting list of Stan CSV files, found empty list'
            )
        sample_csv_files = mcmc_sample
        try:
            mcmc_fit = from_csv(sample_csv_files)  # type: ignore
        except ValueError as e:
            raise ValueError(
                'Invalid sample from Stan CSV files, error:\n\t{}\n\t'
                ' while processing files\n\t{}'.format(
                    repr(e), '\n\t'.join(mcmc_sample)
                )
            ) from e

    chains = mcmc_fit.chains
    chain_ids = mcmc_fit.chain_ids
    if mcmc_fit.metadata.cmdstan_config['save_warmup']:
        get_logger().warning(
            'Sample contains saved warmup draws which will be used '
            'to generate additional quantities of interest.'
        )

    gq_args = GenerateQuantitiesArgs(csv_files=sample_csv_files)
    gq_args.validate(chains)

    # NOTE(review): the runs are kept inside the context manager on the
    # assumption that it owns a temporary data file - confirm.
    with MaybeDictToFilePath(data, None) as (data_file, _):
        cmdstan_args = CmdStanArgs(
            self._name,
            self._exe_file,
            chain_ids=chain_ids,
            data=data_file,
            seed=seed,
            output_dir=gq_output_dir,
            sig_figs=sig_figs,
            method_args=gq_args,
            refresh=refresh,
        )
        runset = RunSet(args=cmdstan_args, chains=chains, chain_ids=chain_ids)

        # Leave two cores free, but always use at least one worker and
        # never more workers than there are chains.
        max_workers = max(min(cpu_count() - 2, chains), 1)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for chain_idx in range(chains):
                executor.submit(self._run_cmdstan, runset, chain_idx)

        if not runset._check_retcodes():
            msg = 'Error during generate_quantities:\n{}'.format(
                runset.get_err_msgs()
            )
            msg = '{}Command and output files:\n{}'.format(
                msg, runset.__repr__()
            )
            raise RuntimeError(msg)
        quantities = CmdStanGQ(runset=runset, mcmc_sample=mcmc_fit)
    return quantities
- -
def variational(
    self,
    data: Union[Mapping[str, Any], str, None] = None,
    seed: Optional[int] = None,
    inits: Optional[float] = None,
    output_dir: Optional[str] = None,
    sig_figs: Optional[int] = None,
    save_diagnostics: bool = False,
    save_profile: bool = False,
    algorithm: Optional[str] = None,
    iter: Optional[int] = None,
    grad_samples: Optional[int] = None,
    elbo_samples: Optional[int] = None,
    eta: Optional[float] = None,
    adapt_engaged: bool = True,
    adapt_iter: Optional[int] = None,
    tol_rel_obj: Optional[float] = None,
    eval_elbo: Optional[int] = None,
    output_samples: Optional[int] = None,
    require_converged: bool = True,
    refresh: Optional[int] = None,
) -> CmdStanVB:
    """
    Run CmdStan's variational inference algorithm to approximate
    the posterior distribution of the model conditioned on the data.

    The configuration is validated, a call to the CmdStan ``variational``
    method is composed, and a single subprocess is run to completion.
    Arguments left unspecified are omitted from the call, so CmdStan
    applies its own defaults for them.

    After the run, the console transcript is scanned for the message
    "The algorithm may not have converged."; depending on
    ``require_converged`` this either raises or logs a warning.

    The returned :class:`CmdStanVB` object records the command, the return
    code, and the paths to the output csv and console files. Files are
    written to ``output_dir`` if given, otherwise to a temporary directory
    which is deleted upon session exit. Filenames follow the template
    '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the suffix '.csv' for
    CmdStan output or '.txt' for console messages,
    e.g. 'bernoulli-201912081451-1.csv'. Files in the temporary directory
    carry an additional 8-character random string,
    e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        :class:`numpy.random.RandomState`
        is used to generate a seed which will be used for all chains.

    :param inits: Specifies how the sampler initializes parameter values.
        Initialization is uniform random on a range centered on 0 with
        default range of 2. Specifying a single number n > 0 changes
        the initialization range to [-n, n].

    :param output_dir: Name of the directory to which CmdStan output
        files are written. If unspecified, output files will be written
        to a temporary directory which is deleted upon session exit.

    :param sig_figs: Numerical precision used for output CSV and text files.
        Must be an integer between 1 and 18. If unspecified, the default
        precision for the system file I/O is used; the usual value is 6.
        Introduced in CmdStan-2.25.

    :param save_diagnostics: Whether or not to save diagnostics. If True,
        csv outputs are written to an output file using filename
        template '<model_name>-<YYYYMMDDHHMM>-diagnostic-<chain_id>',
        e.g. 'bernoulli-201912081451-diagnostic-1.csv'.

    :param save_profile: Whether or not to profile auto-diff operations in
        labelled blocks of code. If True, csv outputs are written to a file
        '<model_name>-<YYYYMMDDHHMM>-profile-<chain_id>'.
        Introduced in CmdStan-2.26.

    :param algorithm: Algorithm to use. One of: 'meanfield', 'fullrank'.

    :param iter: Maximum number of ADVI iterations.

    :param grad_samples: Number of MC draws for computing the gradient.

    :param elbo_samples: Number of MC draws for estimate of ELBO.

    :param eta: Step size scaling parameter.

    :param adapt_engaged: Whether eta adaptation is engaged.

    :param adapt_iter: Number of iterations for eta adaptation.

    :param tol_rel_obj: Relative tolerance parameter for convergence.

    :param eval_elbo: Number of iterations between ELBO evaluations.

    :param output_samples: Number of approximate posterior output draws
        to save.

    :param require_converged: Whether or not to raise an error if stan
        reports that "The algorithm may not have converged".

    :param refresh: Specify the number of iterations cmdstan will take
        between progress messages. Default value is 100.

    :return: CmdStanVB object

    :raises RuntimeError: if the transcript reports non-convergence while
        ``require_converged`` is True, or if the process return code is
        non-zero.
    """
    method_args = VariationalArgs(
        algorithm=algorithm,
        iter=iter,
        grad_samples=grad_samples,
        elbo_samples=elbo_samples,
        eta=eta,
        adapt_engaged=adapt_engaged,
        adapt_iter=adapt_iter,
        tol_rel_obj=tol_rel_obj,
        eval_elbo=eval_elbo,
        output_samples=output_samples,
    )

    # Variational inference is a single run, addressed as chain index 0.
    dummy_chain_id = 0
    with MaybeDictToFilePath(data, inits) as (data_file, inits_file):
        cmdstan_args = CmdStanArgs(
            self._name,
            self._exe_file,
            chain_ids=None,
            data=data_file,
            seed=seed,
            inits=inits_file,
            output_dir=output_dir,
            sig_figs=sig_figs,
            save_diagnostics=save_diagnostics,
            save_profile=save_profile,
            method_args=method_args,
            refresh=refresh,
        )
        runset = RunSet(args=cmdstan_args, chains=1)
        self._run_cmdstan(runset, dummy_chain_id)

    # treat failure to converge as failure
    pat = re.compile(r'The algorithm may not have converged.', re.M)
    with open(runset.stdout_files[dummy_chain_id], 'r') as transcript:
        console_text = transcript.read()
    if len(pat.findall(console_text)) > 0:
        if require_converged:
            raise RuntimeError(
                'The algorithm may not have converged.\n'
                'If you would like to inspect the output, '
                're-call with require_converged=False'
            )
        get_logger().warning(
            '%s\n%s',
            'The algorithm may not have converged.',
            'Proceeding because require_converged is set to False',
        )

    if not runset._check_retcodes():
        msg = 'Error during variational inference:\n{}'.format(
            runset.get_err_msgs()
        )
        msg = '{}Command and output files:\n{}'.format(
            msg, runset.__repr__()
        )
        raise RuntimeError(msg)
    # pylint: disable=invalid-name
    vb = CmdStanVB(runset)
    return vb
def _run_cmdstan(
    self, runset: RunSet, idx: int = 0, pbar: Any = None
) -> None:
    """
    Encapsulates call to CmdStan.
    Spawn process, capture console output to file, record returncode.

    :param runset: RunSet holding the per-chain commands and the paths of
        the stdout/stderr transcript files; its return code for chain
        ``idx`` is updated in place.
    :param idx: Zero-based index of the chain to run.
    :param pbar: Optional progress bar object. When given, console output
        is consumed line by line by :meth:`_read_progress` while the
        process runs so that the bar can be updated.

    :raises RuntimeError: if spawning the process or writing the
        transcript files fails with an ``OSError``.
    """
    cmd = runset.cmds[idx]
    get_logger().info('start chain %u', idx + 1)
    get_logger().debug(
        'threads: %s', str(os.environ.get('STAN_NUM_THREADS'))
    )
    get_logger().debug('sampling: %s', cmd)
    # Test identity, not truthiness: a progress bar object may define its
    # own boolean value (tqdm's depends on ``total``), which must not
    # decide whether already-captured console output is kept.
    show_progress = pbar is not None
    try:
        proc = subprocess.Popen(
            cmd,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=os.environ,
        )
        stdout_pbar = b''
        if show_progress:
            stdout_pbar = self._read_progress(proc, pbar, idx)
        stdout, stderr = proc.communicate()
        # Lines consumed by the progress reader come before whatever
        # communicate() still had to drain.
        stdout = stdout_pbar + stdout

        get_logger().info('finish chain %u', idx + 1)
        runset._set_retcode(idx, proc.returncode)
        if stdout:
            with open(runset.stdout_files[idx], 'w+') as fd:
                contents = stdout.decode('utf-8')  # bugfix 425
                if 'running fixed_param sampler' in contents:
                    # CmdStan reported the fixed_param sampler; record
                    # that on the sampler arguments.
                    sampler_args = runset._args.method_args
                    assert isinstance(
                        sampler_args, SamplerArgs
                    )  # make the typechecker happy
                    sampler_args.fixed_param = True
                fd.write(contents)
        console_error = ''
        if stderr:
            console_error = stderr.decode('utf-8')
            with open(runset.stderr_files[idx], 'w+') as fd:
                fd.write(console_error)

        if proc.returncode != 0:
            if proc.returncode < 0:
                msg = 'Chain {} terminated by signal {}'.format(
                    idx + 1, proc.returncode
                )
            else:
                msg = 'Chain {} processing error'.format(idx + 1)
                msg = '{}, non-zero return code {}'.format(
                    msg, proc.returncode
                )
            if len(console_error) > 0:
                msg = '{}\n error message:\n\t{}'.format(msg, console_error)
            get_logger().error(msg)

    except OSError as e:
        msg = 'Chain {} encountered error: {}\n'.format(idx + 1, str(e))
        raise RuntimeError(msg) from e


# pylint: disable=no-self-use
def _read_progress(
    self,
    proc: subprocess.Popen,  # [] - Popen is only generic in 3.9
    pbar: Any,
    idx: int,
) -> bytes:
    """
    Update tqdm progress bars according to CmdStan console progress msgs.
    Poll process to get CmdStan console outputs,
    check for output lines that start with 'Iteration: '.
    NOTE: if CmdStan output messages change, this will break.

    :param proc: The running CmdStan process; its stdout pipe is read
        line by line for as long as the process has not exited.
    :param pbar: Progress bar object to update.
    :param idx: Zero-based chain index, shown one-based in descriptions.

    :return: All bytes read from the process stdout by this method, so the
        caller can prepend them to the remaining output.
    """
    pattern = (
        r'^Iteration\:\s*(\d+)\s*/\s*(\d+)\s*\[\s*\d+%\s*\]\s*\((\S*)\)$'
    )
    pattern_compiled = re.compile(pattern, flags=re.IGNORECASE)
    previous_count = 0
    stdout = b''
    changed_description = False  # Changed from 'warmup' to 'sample'
    pbar.set_description(desc=f'Chain {idx + 1} - warmup', refresh=True)

    try:
        # iterate while process is sampling
        while proc.poll() is None and proc.stdout is not None:
            output = proc.stdout.readline()
            stdout += output
            output = output.decode('utf-8').strip()
            if not output.startswith('Iteration'):
                continue
            match = pattern_compiled.search(output)
            if not match:
                continue
            current_count = int(match.group(1))
            total_count = int(match.group(2))

            # The bar's total is set from the first progress message
            # (and again whenever the reported total changes).
            if pbar.total != total_count:
                pbar.reset(total=total_count)

            if (
                match.group(3).lower() == 'sampling'
                and not changed_description
            ):
                pbar.set_description(f'Chain {idx + 1} - sample')
                changed_description = True

            pbar.update(current_count - previous_count)
            previous_count = current_count

        pbar.set_description(f'Chain {idx + 1} - done', refresh=True)

        if 'notebook' in type(pbar).__name__:
            # In Jupyter make the bar green by closing it
            pbar.close()

    except Exception as e:  # pylint: disable=broad-except
        # Progress display is best effort: report and carry on so the
        # bytes read so far are still returned. The chain is logged
        # one-based, like every other message for this chain.
        get_logger().warning(
            'Chain %s: Failed to read the progress on the fly. Error: %s',
            idx + 1,
            repr(e),
        )

    return stdout
-
- -
- - -
- - -
- -
- - -
-
- - - - - - \ No newline at end of file diff --git a/docs/_modules/cmdstanpy/stanfit.html b/docs/_modules/cmdstanpy/stanfit.html deleted file mode 100644 index f239b4ca..00000000 --- a/docs/_modules/cmdstanpy/stanfit.html +++ /dev/null @@ -1,2512 +0,0 @@ - - - - - - - - cmdstanpy.stanfit — CmdStanPy 0.9.77 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
- - - - -
- -
- - - - - - -
- -
- -

Source code for cmdstanpy.stanfit

-"""Container objects for results of CmdStan run(s)."""
-
-import copy
-import glob
-import logging
-import math
-import os
-import re
-import shutil
-from collections import Counter, OrderedDict
-from datetime import datetime
-from time import time
-from typing import (
-    Any,
-    Dict,
-    Hashable,
-    List,
-    MutableMapping,
-    Optional,
-    Tuple,
-    Union,
-)
-
-import numpy as np
-import pandas as pd
-
-try:
-    import xarray as xr
-
-    XARRAY_INSTALLED = True
-except ImportError:
-    XARRAY_INSTALLED = False
-
-from cmdstanpy import _CMDSTAN_SAMPLING, _CMDSTAN_THIN, _CMDSTAN_WARMUP, _TMPDIR
-from cmdstanpy.cmdstan_args import (
-    CmdStanArgs,
-    Method,
-    OptimizeArgs,
-    SamplerArgs,
-    VariationalArgs,
-)
-from cmdstanpy.utils import (
-    EXTENSION,
-    check_sampler_csv,
-    cmdstan_path,
-    cmdstan_version_at,
-    create_named_text_file,
-    do_command,
-    flatten_chains,
-    get_logger,
-    parse_method_vars,
-    parse_stan_vars,
-    scan_config,
-    scan_generated_quantities_csv,
-    scan_optimize_csv,
-    scan_variational_csv,
-)
-
-
-
class RunSet:
    """
    Encapsulates the configuration and results of a call to any CmdStan
    inference method. Records the method return code and locations of
    all console, error, and output files.
    """

    def __init__(
        self,
        args: 'CmdStanArgs',
        chains: int = 4,
        chain_ids: Optional[List[int]] = None,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize object: validate the chain configuration, choose the
        output file names and compose one CmdStan command per chain.

        :param args: CmdStan arguments; supplies the model name, output
            directory, optional-output flags and the command composer.
        :param chains: Number of chains, must be at least 1.
        :param chain_ids: Ids used in file names, one per chain.
            Defaults to ``1..chains``.
        :param logger: Deprecated, only triggers a warning.

        :raises ValueError: if ``chains`` is less than 1 or the number of
            ``chain_ids`` differs from ``chains``.
        """
        self._args = args
        self._chains = chains
        if logger is not None:
            get_logger().warning(
                "Parameter 'logger' is deprecated."
                " Control logging behavior via logging.getLogger('cmdstanpy')"
            )
        if chains < 1:
            raise ValueError(
                'Chains must be positive integer value, '
                'found {}'.format(chains)
            )
        if chain_ids is None:
            chain_ids = list(range(1, chains + 1))
        elif len(chain_ids) != chains:
            raise ValueError(
                'Mismatch between number of chains and chain_ids, '
                'found {} chains, but {} chain_ids'.format(
                    chains, len(chain_ids)
                )
            )
        self._chain_ids = chain_ids
        # -1 marks a chain which has not reported a return code yet
        self._retcodes = [-1] * chains

        # stdout, stderr are written to text files
        # prefix: ``<model_name>-<YYYYMMDDHHMM>-<chain_id>``
        # suffixes: ``-stdout.txt``, ``-stderr.txt``
        now_str = datetime.now().strftime('%Y%m%d%H%M')
        file_basename = '-'.join([args.model_name, now_str])
        self._csv_files = [''] * chains
        self._diagnostic_files = [''] * chains
        self._profile_files = [''] * chains
        self._stdout_files = [''] * chains
        self._stderr_files = [''] * chains
        self._cmds = []
        for i, chain_id in enumerate(chain_ids):
            csv_file = self._output_file(
                '{}-{}'.format(file_basename, chain_id)
            )
            self._csv_files[i] = csv_file
            # console transcripts share the csv file's name stem
            console_stem = os.path.splitext(csv_file)[0]
            self._stdout_files[i] = console_stem + '-stdout.txt'
            self._stderr_files[i] = console_stem + '-stderr.txt'

            # optional output files: diagnostics, profiling; only the
            # requested ones are passed on to the command composer
            optional_files = {}
            if args.save_diagnostics:
                self._diagnostic_files[i] = self._output_file(
                    '{}-diagnostic-{}'.format(file_basename, chain_id)
                )
                optional_files['diagnostic_file'] = self._diagnostic_files[i]
            if args.save_profile:
                self._profile_files[i] = self._output_file(
                    '{}-profile-{}'.format(file_basename, chain_id)
                )
                optional_files['profile_file'] = self._profile_files[i]
            self._cmds.append(
                args.compose_command(i, csv_file, **optional_files)
            )

    def _output_file(self, stem: str) -> str:
        """
        Return the path of a '.csv' output file for name stem ``stem``.

        Without a user-specified output directory the file is obtained
        from ``create_named_text_file`` in the temporary directory, with
        ``stem`` plus a dash as prefix; otherwise the path is simply
        ``<output_dir>/<stem>.csv``.
        """
        if self._args.output_dir is None:
            return create_named_text_file(
                dir=_TMPDIR, prefix='{}-'.format(stem), suffix='.csv'
            )
        return os.path.join(self._args.output_dir, '{}.csv'.format(stem))

    def __repr__(self) -> str:
        # Optional sections are listed only if the first file of that kind
        # exists on disk.
        text = 'RunSet: chains={}'.format(self._chains)
        text = '{}\n cmd:\n\t{}'.format(text, self._cmds[0])
        text = '{}\n retcodes={}'.format(text, self._retcodes)
        if os.path.exists(self._csv_files[0]):
            text = '{}\n csv_files:\n\t{}'.format(
                text, '\n\t'.join(self._csv_files)
            )
        if self._args.save_diagnostics and os.path.exists(
            self._diagnostic_files[0]
        ):
            text = '{}\n diagnostics_files:\n\t{}'.format(
                text, '\n\t'.join(self._diagnostic_files)
            )
        if self._args.save_profile and os.path.exists(self._profile_files[0]):
            text = '{}\n profile_files:\n\t{}'.format(
                text, '\n\t'.join(self._profile_files)
            )
        if os.path.exists(self._stdout_files[0]):
            text = '{}\n console_msgs:\n\t{}'.format(
                text, '\n\t'.join(self._stdout_files)
            )
        if os.path.exists(self._stderr_files[0]):
            text = '{}\n error_msgs:\n\t{}'.format(
                text, '\n\t'.join(self._stderr_files)
            )
        return text

    @property
    def model(self) -> str:
        """Stan model name."""
        return self._args.model_name

    @property
    def method(self) -> 'Method':
        """CmdStan method used to generate this fit."""
        return self._args.method

    @property
    def chains(self) -> int:
        """Number of chains."""
        return self._chains

    @property
    def chain_ids(self) -> List[int]:
        """Chain ids."""
        return self._chain_ids

    @property
    def cmds(self) -> List[List[str]]:
        """List of call(s) to CmdStan, one call per-chain."""
        return self._cmds

    @property
    def csv_files(self) -> List[str]:
        """List of paths to CmdStan output files."""
        return self._csv_files

    @property
    def stdout_files(self) -> List[str]:
        """List of paths to CmdStan stdout transcripts."""
        return self._stdout_files

    @property
    def stderr_files(self) -> List[str]:
        """List of paths to CmdStan stderr transcripts."""
        return self._stderr_files

    def _check_retcodes(self) -> bool:
        """Returns ``True`` when all chains have retcode 0."""
        return all(retcode == 0 for retcode in self._retcodes)

    @property
    def diagnostic_files(self) -> List[str]:
        """List of paths to CmdStan hamiltonian diagnostic files."""
        return self._diagnostic_files

    @property
    def profile_files(self) -> List[str]:
        """List of paths to CmdStan profiler files."""
        return self._profile_files

    def _retcode(self, idx: int) -> int:
        """Get retcode for chain[idx]."""
        return self._retcodes[idx]

    def _set_retcode(self, idx: int, val: int) -> None:
        """Set retcode for chain[idx] to val."""
        self._retcodes[idx] = val

    def get_err_msgs(self) -> str:
        """
        Checks console messages for each chain.

        :return: The non-empty stderr transcripts of all chains, each
            labelled with its chain id, joined by newlines. For CmdStan
            versions before 2.27, lines of the stdout transcript starting
            with "Er" or "Ex" are included as well.
        """
        msgs = []
        for i in range(self._chains):
            if (
                os.path.exists(self._stderr_files[i])
                and os.stat(self._stderr_files[i]).st_size > 0
            ):
                with open(self._stderr_files[i], 'r') as fd:
                    msgs.append(
                        'chain_id {}:\n{}\n'.format(
                            self._chain_ids[i], fd.read()
                        )
                    )
            # pre 2.27, all msgs sent to stdout, including errors
            if (
                not cmdstan_version_at(2, 27)
                and os.path.exists(self._stdout_files[i])
                and os.stat(self._stdout_files[i]).st_size > 0
            ):
                with open(self._stdout_files[i], 'r') as fd:
                    contents = fd.read()
                # pattern matches initial "Exception" or "Error" msg
                pat = re.compile(r'^E[rx].*$', re.M)
                errors = re.findall(pat, contents)
                if len(errors) > 0:
                    msgs.append(
                        'chain_id {}:\n\t{}\n'.format(
                            self._chain_ids[i], '\n\t'.join(errors)
                        )
                    )
        return '\n'.join(msgs)

    def save_csvfiles(self, dir: Optional[str] = None) -> None:
        """
        Moves csvfiles to specified directory.

        Files which live in the temporary directory lose the last
        dash-separated component of their name on the way, and the
        recorded csv paths are updated to the new locations.

        :param dir: directory path, defaults to the current directory.

        :raises Exception: if the directory cannot be created or written to.
        :raises ValueError: if a csv file is missing, a target file already
            exists, or a file cannot be moved.

        See Also
        --------
        cmdstanpy.from_csv
        """
        if dir is None:
            dir = os.path.realpath('.')
        # Probe that the directory is writable by creating a scratch file.
        test_path = os.path.join(dir, str(time()))
        try:
            os.makedirs(dir, exist_ok=True)
            with open(test_path, 'w'):
                pass
            os.remove(test_path)  # cleanup
        except (IOError, OSError, PermissionError) as exc:
            raise Exception('Cannot save to path: {}'.format(dir)) from exc

        for i in range(self.chains):
            if not os.path.exists(self._csv_files[i]):
                raise ValueError(
                    'Cannot access csv file {}'.format(self._csv_files[i])
                )

            path, filename = os.path.split(self._csv_files[i])
            if path == _TMPDIR:  # cleanup tmpstr in filename
                root, ext = os.path.splitext(filename)
                rlist = root.split('-')
                root = '-'.join(rlist[:-1])
                filename = ''.join([root, ext])

            to_path = os.path.join(dir, filename)
            if os.path.exists(to_path):
                raise ValueError(
                    'File exists, not overwriting: {}'.format(to_path)
                )
            try:
                get_logger().debug(
                    'saving tmpfile: "%s" as: "%s"', self._csv_files[i], to_path
                )
                shutil.move(self._csv_files[i], to_path)
                self._csv_files[i] = to_path
            except (IOError, OSError, PermissionError) as e:
                raise ValueError(
                    'Cannot save to file: {}'.format(to_path)
                ) from e
- - -
class InferenceMetadata:
    """
    Holds the CmdStan configuration and the description of the output
    columns, as parsed from the header comments and column header of a
    Stan CSV file. The CSV files are assumed to be valid.
    """

    def __init__(self, config: Dict[str, Any]) -> None:
        """Initialize object from CSV headers"""
        column_names = config['column_names']
        self._cmdstan_config = config
        self._method_vars_cols = parse_method_vars(names=column_names)
        self._stan_vars_dims, self._stan_vars_cols = parse_stan_vars(
            names=column_names
        )

    def __repr__(self) -> str:
        return 'Metadata:\n{}\n'.format(self._cmdstan_config)

    @property
    def cmdstan_config(self) -> Dict[str, Any]:
        """
        Dictionary of name, value pairs parsed out of the Stan CSV file
        header: the command configuration together with the CSV header
        row information.
        A deep copy is returned, so changes made by the caller do not
        affect this object.
        """
        config_copy = copy.deepcopy(self._cmdstan_config)
        return config_copy

    @property
    def method_vars_cols(self) -> Dict[str, Tuple[int, ...]]:
        """
        Map from each Stan inference method variable to a tuple of
        column indices in the inference engine's output array.
        Method variable names always end in `__`, e.g. `lp__`.
        A deep copy is returned, so changes made by the caller do not
        affect this object.
        """
        cols_copy = copy.deepcopy(self._method_vars_cols)
        return cols_copy

    @property
    def stan_vars_cols(self) -> Dict[str, Tuple[int, ...]]:
        """
        Map from each Stan program variable name to a tuple of the
        column indices in the vector or matrix of estimates produced
        by a CmdStan inference method.
        A deep copy is returned, so changes made by the caller do not
        affect this object.
        """
        cols_copy = copy.deepcopy(self._stan_vars_cols)
        return cols_copy

    @property
    def stan_vars_dims(self) -> Dict[str, Tuple[int, ...]]:
        """
        Map from Stan program variable names to variable dimensions.
        Scalar types are mapped to the empty tuple, e.g.,
        program variable ``int foo`` has dimension ``()`` and
        program variable ``vector[10] bar`` has single dimension ``(10)``.
        A deep copy is returned, so changes made by the caller do not
        affect this object.
        """
        dims_copy = copy.deepcopy(self._stan_vars_dims)
        return dims_copy
- - -
[docs]class CmdStanMCMC: - """ - Container for outputs from CmdStan sampler run. - Provides methods to summarize and diagnose the model fit - and accessor methods to access the entire sample or - individual items. Created by :meth:`CmdStanModel.sample` - - The sample is lazily instantiated on first access of either - the resulting sample or the HMC tuning parameters, i.e., the - step size and metric. - """ - - # pylint: disable=too-many-public-methods - def __init__( - self, - runset: RunSet, - logger: Optional[logging.Logger] = None, - ) -> None: - """Initialize object.""" - if not runset.method == Method.SAMPLE: - raise ValueError( - 'Wrong runset method, expecting sample runset, ' - 'found method {}'.format(runset.method) - ) - self.runset = runset - if logger is not None: - get_logger().warning( - "Parameter 'logger' is deprecated." - " Control logging behavior via logging.getLogger('cmdstanpy')" - ) - # info from runset to be exposed - sampler_args = self.runset._args.method_args - assert isinstance( - sampler_args, SamplerArgs - ) # make the typechecker happy - iter_sampling = sampler_args.iter_sampling - if iter_sampling is None: - self._iter_sampling = _CMDSTAN_SAMPLING - else: - self._iter_sampling = iter_sampling - iter_warmup = sampler_args.iter_warmup - if iter_warmup is None: - self._iter_warmup = _CMDSTAN_WARMUP - else: - self._iter_warmup = iter_warmup - thin = sampler_args.thin - if thin is None: - self._thin: int = _CMDSTAN_THIN - else: - self._thin = thin - self._is_fixed_param = sampler_args.fixed_param - self._save_warmup = sampler_args.save_warmup - self._sig_figs = runset._args.sig_figs - # info from CSV values, instantiated lazily - self._metric = np.array(()) - self._step_size = np.array(()) - self._draws = np.array(()) - # info from CSV initial comments and header - config = self._validate_csv_files() - self._metadata: InferenceMetadata = InferenceMetadata(config) - - def __repr__(self) -> str: - repr = 'CmdStanMCMC: model={} 
chains={}{}'.format( - self.runset.model, - self.runset.chains, - self.runset._args.method_args.compose(0, cmd=[]), - ) - repr = '{}\n csv_files:\n\t{}\n output_files:\n\t{}'.format( - repr, - '\n\t'.join(self.runset.csv_files), - '\n\t'.join(self.runset.stdout_files), - ) - # TODO - hamiltonian, profiling files - return repr - - @property - def chains(self) -> int: - """Number of chains.""" - return self.runset.chains - - @property - def chain_ids(self) -> List[int]: - """Chain ids.""" - return self.runset.chain_ids - - @property - def num_draws_warmup(self) -> int: - """Number of warmup draws per chain, i.e., thinned warmup iterations.""" - return int(math.ceil((self._iter_warmup) / self._thin)) - - @property - def num_draws_sampling(self) -> int: - """ - Number of sampling (post-warmup) draws per chain, i.e., - thinned sampling iterations. - """ - return int(math.ceil((self._iter_sampling) / self._thin)) - - @property - def metadata(self) -> InferenceMetadata: - """ - Returns object which contains CmdStan configuration as well as - information about the names and structure of the inference method - and model output variables. - """ - return self._metadata - - @property - def sampler_vars_cols(self) -> Dict[str, Tuple[int, ...]]: - """ - Deprecated - use "metadata.method_vars_cols" instead - """ - get_logger().warning( - 'Property "sampler_vars_cols" has been deprecated, ' - 'use "metadata.method_vars_cols" instead.' - ) - return self.metadata.method_vars_cols - - @property - def stan_vars_cols(self) -> Dict[str, Tuple[int, ...]]: - """ - Deprecated - use "metadata.stan_vars_cols" instead - """ - get_logger().warning( - 'Property "stan_vars_cols" has been deprecated, ' - 'use "metadata.stan_vars_cols" instead.' 
- ) - return self.metadata.stan_vars_cols - - @property - def stan_vars_dims(self) -> Dict[str, Tuple[int, ...]]: - """ - Deprecated - use "metadata.stan_vars_dims" instead - """ - get_logger().warning( - 'Property "stan_vars_dims" has been deprecated, ' - 'use "metadata.stan_vars_dims" instead.' - ) - return self.metadata.stan_vars_dims - - @property - def column_names(self) -> Tuple[str, ...]: - """ - Names of all outputs from the sampler, comprising sampler parameters - and all components of all model parameters, transformed parameters, - and quantities of interest. Corresponds to Stan CSV file header row, - with names munged to array notation, e.g. `beta[1]` not `beta.1`. - """ - return self._metadata.cmdstan_config['column_names'] # type: ignore - - @property - def num_unconstrained_params(self) -> int: - """ - Count of _unconstrained_ model parameters. This is the metric size; - for metric `diag_e`, the length of the diagonal vector, for metric - `dense_e` this is the size of the full covariance matrix. - - If the parameter variables in a model are - constrained parameter types, the number of constrained and - unconstrained parameters may differ. The sampler reports the - constrained parameters and computes with the unconstrained parameters. - E.g. a model with 2 parameter variables, ``real alpha`` and - ``vector[3] beta`` has 4 constrained and 4 unconstrained parameters, - however a model with variables ``real alpha`` and ``simplex[3] beta`` - has 4 constrained and 3 unconstrained parameters. - """ - if self._is_fixed_param: - return 0 - return self._metadata.cmdstan_config[ # type: ignore - 'num_unconstrained_params' - ] - - @property - def metric_type(self) -> Optional[str]: - """ - Metric type used for adaptation, either 'diag_e' or 'dense_e'. - When sampler algorithm 'fixed_param' is specified, metric_type is None. 
- """ - if self._is_fixed_param: - return None - # cmdstan arg name - return self._metadata.cmdstan_config['metric'] # type: ignore - - @property - def metric(self) -> Optional[np.ndarray]: - """ - Metric used by sampler for each chain. - When sampler algorithm 'fixed_param' is specified, metric is None. - """ - if self._is_fixed_param: - return None - if self._metric.shape == (0,): - self._assemble_draws() - return self._metric - - @property - def step_size(self) -> Optional[np.ndarray]: - """ - Step size used by sampler for each chain. - When sampler algorithm 'fixed_param' is specified, step size is None. - """ - if self._is_fixed_param: - return None - if self._step_size.shape == (0,): - self._assemble_draws() - return self._step_size - - @property - def thin(self) -> int: - """ - Period between recorded iterations. (Default is 1). - """ - return self._thin - -
def draws(
    self, *, inc_warmup: bool = False, concat_chains: bool = False
) -> np.ndarray:
    """
    Returns a numpy.ndarray over all draws from all chains which is
    stored column major so that the values for a parameter are contiguous
    in memory, likewise all draws from a chain are contiguous.
    The default result is a 3D array arranged (draws, chains, columns).
    With ``concat_chains=True`` the chains are flattened, in chain order,
    into a 2D array: given M chains of N draws, the first N rows are the
    draws from chain 1, and so on up to the last N rows from chain M.

    The draws are assembled from the CSV files on first use.

    :param inc_warmup: When ``True`` and the warmup draws are present in
        the output, i.e., the sampler was run with ``save_warmup=True``,
        then the warmup draws are included. Default value is ``False``.
        Asking for warmup draws which were not saved only logs a warning.

    :param concat_chains: When ``True`` return a 2D array flattening
        all draws from all chains. Default value is ``False``.

    See Also
    --------
    CmdStanMCMC.draws_pd
    CmdStanMCMC.draws_xr
    CmdStanGQ.draws
    """
    if self._draws.size == 0:
        self._assemble_draws()

    if inc_warmup and not self._save_warmup:
        get_logger().warning(
            "Sample doesn't contain draws from warmup iterations,"
            ' rerun sampler with "save_warmup=True".'
        )

    # Saved warmup draws come first in the array; skip them unless the
    # caller asked for them.
    skip_warmup = self._save_warmup and not inc_warmup
    first_draw = self.num_draws_warmup if skip_warmup else 0
    selected = self._draws[first_draw:, :, :]

    if not concat_chains:
        return selected  # type: ignore
    return flatten_chains(selected)
@property
def sample(self) -> np.ndarray:
    """
    Deprecated - use method "draws()" instead.
    """
    get_logger().warning(
        'Method "sample" has been deprecated, use method "draws" instead.'
    )
    return self.draws()


@property
def warmup(self) -> np.ndarray:
    """
    Deprecated - use "draws(inc_warmup=True)"
    """
    get_logger().warning(
        'Method "warmup" has been deprecated, instead use method'
        ' "draws(inc_warmup=True)", returning draws from both'
        ' warmup and sampling iterations.'
    )
    return self.draws(inc_warmup=True)


def _validate_csv_files(self) -> Dict[str, Any]:
    """
    Checks that Stan CSV output files for all chains are consistent
    and returns dict containing config and column names.

    The config of the first chain is the reference; every other chain
    must agree with it on all keys except those listed below.

    :return: dict produced by ``check_sampler_csv`` for the first chain.
    :raises ValueError: when inconsistencies are detected.
    """
    # Config entries that are allowed to differ from chain to chain.
    per_chain_keys = (
        'id',
        'diagnostic_file',
        'metric_file',
        'profile_file',
        'stepsize',
        'init',
        'seed',
        'start_datetime',
    )
    dzero: Dict[str, Any] = {}
    for i in range(self.chains):
        config = check_sampler_csv(
            path=self.runset.csv_files[i],
            is_fixed_param=self._is_fixed_param,
            iter_sampling=self._iter_sampling,
            iter_warmup=self._iter_warmup,
            save_warmup=self._save_warmup,
            thin=self._thin,
        )
        if i == 0:
            dzero = config
            continue
        for key in dzero:
            if key not in per_chain_keys and dzero[key] != config[key]:
                # Report the value found in file i first, then the
                # reference value taken from the first chain.
                raise ValueError(
                    'CmdStan config mismatch in Stan CSV file {}: '
                    'arg {} is {}, expected {}'.format(
                        self.runset.csv_files[i],
                        key,
                        config[key],
                        dzero[key],
                    )
                )
    return dzero


def _assemble_draws(self) -> None:
    """
    Allocates and populates the step size, metric, and sample arrays
    by parsing the validated stan_csv files.

    Does nothing when the draws array has already been populated.

    :raises ValueError: when a CSV file ends before the
        ``# Adaptation terminated`` marker is found.
    """
    if self._draws.shape != (0,):
        return

    num_draws = self.num_draws_sampling
    sampling_iter_start = 0
    if self._save_warmup:
        # Warmup rows are stored ahead of the sampling rows.
        num_draws += self.num_draws_warmup
        sampling_iter_start = self.num_draws_warmup
    self._draws = np.empty(
        (num_draws, self.chains, len(self.column_names)),
        dtype=float,
        order='F',
    )
    if not self._is_fixed_param:
        self._step_size = np.empty(self.chains, dtype=float)
        if self.metric_type == 'diag_e':
            metric_shape = (self.chains, self.num_unconstrained_params)
        else:
            metric_shape = (
                self.chains,
                self.num_unconstrained_params,
                self.num_unconstrained_params,
            )
        self._metric = np.empty(metric_shape, dtype=float)

    for chain in range(self.chains):
        csv_path = self.runset.csv_files[chain]
        with open(csv_path, 'r') as fd:
            # skip initial comments, up to columns header
            line = fd.readline().strip()
            while len(line) > 0 and line.startswith('#'):
                line = fd.readline().strip()
            # at columns header
            if not self._is_fixed_param:
                if self._save_warmup:
                    for i in range(self.num_draws_warmup):
                        xs = fd.readline().strip().split(',')
                        self._draws[i, chain, :] = [float(x) for x in xs]
                # read to adaptation msg
                raw = fd.readline()
                while raw.strip() != '# Adaptation terminated':
                    if not raw:
                        # readline() returns '' only at end of file;
                        # without this check the scan would never end.
                        raise ValueError(
                            'Adaptation information not found in'
                            ' Stan CSV file {}'.format(csv_path)
                        )
                    raw = fd.readline()
                line = fd.readline().strip()  # step_size
                _, step_size = line.split('=')
                self._step_size[chain] = float(step_size.strip())
                fd.readline()  # metric header
                # process metric
                if self.metric_type == 'diag_e':
                    xs = fd.readline().lstrip(' #\t').strip().split(',')
                    self._metric[chain, :] = [float(x) for x in xs]
                else:
                    for i in range(self.num_unconstrained_params):
                        xs = fd.readline().lstrip(' #\t').strip().split(',')
                        self._metric[chain, i, :] = [float(x) for x in xs]
            # process draws
            for i in range(sampling_iter_start, num_draws):
                xs = fd.readline().strip().split(',')
                self._draws[i, chain, :] = [float(x) for x in xs]
    assert self._draws is not None
def summary(
    self,
    percentiles: Optional[List[int]] = None,
    sig_figs: Optional[int] = None,
) -> pd.DataFrame:
    """
    Run cmdstan/bin/stansummary over all output csv files, assemble
    summary into DataFrame object; first row contains summary statistics
    for total joint log probability `lp__`, remaining rows contain summary
    statistics for all parameters, transformed parameters, and generated
    quantities variables listed in the order in which they were declared
    in the Stan program.

    :param percentiles: Ordered non-empty list of percentiles to report.
        Must be integers from (1, 99), inclusive.

    :param sig_figs: Number of significant figures to report.
        Must be an integer between 1 and 18. When unspecified,
        ``--sig_figs=2`` is passed to stansummary. A warning is logged
        when more digits are requested than the CSV files hold
        (``self._sig_figs``, or 6 when that is unset).

    :return: pandas.DataFrame
    :raises ValueError: on an invalid ``percentiles`` or ``sig_figs``.
    """
    # --- percentiles flag -------------------------------------------
    if percentiles is None:
        percentiles_str = '--percentiles=5,50,95'
    else:
        if len(percentiles) == 0:
            raise ValueError(
                'Invalid percentiles argument, must be ordered'
                ' non-empty list from (1, 99), inclusive.'
            )
        previous = 0
        for pct in percentiles:
            # each entry must be strictly increasing and at most 99
            if not (previous < pct and pct <= 99):
                raise ValueError(
                    'Invalid percentiles spec, must be ordered'
                    ' non-empty list from (1, 99), inclusive.'
                )
            previous = pct
        percentiles_str = '--percentiles=' + ','.join(
            str(x) for x in percentiles
        )

    # --- sig_figs flag ----------------------------------------------
    if sig_figs is None:
        sig_figs_str = '--sig_figs=2'
    else:
        in_range = isinstance(sig_figs, int) and 1 <= sig_figs <= 18
        if not in_range:
            raise ValueError(
                'Keyword "sig_figs" must be an integer between 1 and 18,'
                ' found {}'.format(sig_figs)
            )
        csv_sig_figs = self._sig_figs or 6
        if sig_figs > csv_sig_figs:
            get_logger().warning(
                'Requesting %d significant digits of output, but CSV files'
                ' only have %d digits of precision.',
                sig_figs,
                csv_sig_figs,
            )
        sig_figs_str = '--sig_figs={}'.format(sig_figs)

    # --- run stansummary, writing its table to a temporary CSV file --
    cmd_path = os.path.join(cmdstan_path(), 'bin', 'stansummary' + EXTENSION)
    tmp_csv_path = create_named_text_file(
        dir=_TMPDIR,
        prefix='stansummary-{}-'.format(self.runset._args.model_name),
        suffix='.csv',
        name_only=True,
    )
    # the name of the output-file flag depends on the CmdStan version
    if cmdstan_version_at(2, 24):
        csv_str = '--csv_filename={}'.format(tmp_csv_path)
    else:
        csv_str = '--csv_file={}'.format(tmp_csv_path)
    cmd = [cmd_path, percentiles_str, sig_figs_str, csv_str]
    cmd = cmd + self.runset.csv_files
    do_command(cmd)

    with open(tmp_csv_path, 'rb') as fd:
        summary_data = pd.read_csv(
            fd,
            delimiter=',',
            header=0,
            index_col=0,
            comment='#',
            float_precision='high',
        )
    # keep `lp__` plus every row whose name does not end in `__`
    mask = [
        name == 'lp__' or not name.endswith('__')
        for name in summary_data.index
    ]
    return summary_data[mask]
- -
def diagnose(self) -> Optional[str]:
    """
    Run cmdstan/bin/diagnose over all output csv files.
    Returns output of diagnose (stdout/stderr).

    The diagnose utility reads the outputs of all chains
    and checks for the following potential problems:

    + Transitions that hit the maximum treedepth
    + Divergent transitions
    + Low E-BFMI values (sampler transitions HMC potential energy)
    + Low effective sample sizes
    + High R-hat values

    :return: whatever ``do_command`` returns for the diagnose run; a
        non-empty result is also logged at INFO level.
    """
    executable = os.path.join(cmdstan_path(), 'bin', 'diagnose' + EXTENSION)
    output = do_command(cmd=[executable] + self.runset.csv_files)
    if not output:
        return output
    get_logger().info(output)
    return output
- -
def draws_pd(
    self,
    vars: Union[List[str], str, None] = None,
    inc_warmup: bool = False,
    *,
    params: Union[List[str], str, None] = None,
) -> pd.DataFrame:
    """
    Returns the sample draws as a pandas DataFrame.
    Flattens all chains into single column. Container variables
    (array, vector, matrix) will span multiple columns, one column
    per element. E.g. variable 'matrix[2,2] foo' spans 4 columns:
    'foo[1,1], ... foo[2,2]'.

    Columns are returned in the order the variables were requested;
    a variable named more than once is included once.

    :param vars: optional list of variable names.

    :param inc_warmup: When ``True`` and the warmup draws are present in
        the output, i.e., the sampler was run with ``save_warmup=True``,
        then the warmup draws are included. Default value is ``False``.

    :param params: deprecated alias of ``vars``.

    :raises ValueError: when both ``vars`` and ``params`` are given, or
        when a requested variable is unknown.

    See Also
    --------
    CmdStanMCMC.draws
    CmdStanMCMC.draws_xr
    CmdStanGQ.draws_pd
    """
    if params is not None:
        if vars is not None:
            raise ValueError("Cannot use both vars and (deprecated) params")
        get_logger().warning(
            'Keyword "params" is deprecated, use "vars" instead.'
        )
        vars = params

    if inc_warmup and not self._save_warmup:
        get_logger().warning(
            'Draws from warmup iterations not available,'
            ' must run sampler with "save_warmup=True".'
        )

    self._assemble_draws()
    if vars is None:
        cols = list(self.column_names)
    else:
        vars_list = [vars] if isinstance(vars, str) else vars
        cols = []
        # dict.fromkeys() drops repeats but keeps the request order; a
        # set would make the column order differ from run to run.
        for var in dict.fromkeys(vars_list):
            if var in self.metadata.method_vars_cols:
                cols.append(var)
            elif var in self.metadata.stan_vars_cols:
                for idx in self.metadata.stan_vars_cols[var]:
                    cols.append(self.column_names[idx])
            else:
                raise ValueError('Unknown variable: {}'.format(var))

    # concat_chains=True yields the flattened (draws * chains, columns)
    # array.
    return pd.DataFrame(
        data=self.draws(inc_warmup=inc_warmup, concat_chains=True),
        columns=self.column_names,
    )[cols]
- -
def draws_xr(
    self, vars: Union[str, List[str], None] = None, inc_warmup: bool = False
) -> "xr.Dataset":
    """
    Returns the sampler draws as a xarray Dataset.

    :param vars: optional list of variable names. When omitted, every
        Stan variable in the metadata is included.

    :param inc_warmup: When ``True`` and the warmup draws are present in
        the output, i.e., the sampler was run with ``save_warmup=True``,
        then the warmup draws are included. Default value is ``False``.

    :raises RuntimeError: when package "xarray" is not installed.

    See Also
    --------
    CmdStanMCMC.draws
    CmdStanMCMC.draws_pd
    CmdStanGQ.draws_xr
    """
    if not XARRAY_INSTALLED:
        raise RuntimeError(
            'Package "xarray" is not installed, cannot produce draws array.'
        )
    if inc_warmup and not self._save_warmup:
        get_logger().warning(
            "Draws from warmup iterations not available,"
            ' must run sampler with "save_warmup=True".'
        )
    if vars is None:
        vars_list = list(self.metadata.stan_vars_cols.keys())
    elif isinstance(vars, str):
        vars_list = [vars]
    else:
        vars_list = vars

    self._assemble_draws()

    num_draws = self.num_draws_sampling
    meta = self._metadata.cmdstan_config
    attrs: MutableMapping[Hashable, Any] = {
        "stan_version": f"{meta['stan_version_major']}."
        f"{meta['stan_version_minor']}.{meta['stan_version_patch']}",
        "model": meta["model"],
        "num_unconstrained_params": self.num_unconstrained_params,
        "num_draws_sampling": num_draws,
    }
    if inc_warmup and self._save_warmup:
        num_draws += self.num_draws_warmup
        attrs["num_draws_warmup"] = self.num_draws_warmup

    data: MutableMapping[Hashable, Any] = {}
    coordinates: MutableMapping[Hashable, Any] = {
        "chain": self.chain_ids,
        "draw": np.arange(num_draws),
    }

    # Fetch the draws once: every variable reads from the same array,
    # and calling draws() per variable would repeat its warning about
    # missing warmup draws once for each variable.
    draws_array = self.draws(inc_warmup=inc_warmup)
    for var in vars_list:
        build_xarray_data(
            data,
            var,
            self._metadata.stan_vars_dims[var],
            self._metadata.stan_vars_cols[var],
            0,
            draws_array,
        )

    return xr.Dataset(data, coords=coordinates, attrs=attrs).transpose(
        'chain', 'draw', ...
    )
- -
def stan_variable(
    self,
    var: Optional[str] = None,
    inc_warmup: bool = False,
    *,
    name: Optional[str] = None,
) -> np.ndarray:
    """
    Return a numpy.ndarray which contains the set of draws
    for the named Stan program variable. Flattens the chains,
    leaving the draws in chain order. The first array dimension
    corresponds to number of draws or post-warmup draws in the sample,
    per argument ``inc_warmup``. The remaining dimensions correspond to
    the shape of the Stan program variable.

    Underlyingly draws are in chain order, i.e., for a sample with
    N chains of M draws each, the first M array elements are from chain 1,
    the next M are from chain 2, and the last M elements are from chain N.

    For example, if the Stan program variable ``theta`` is a 3x3 matrix,
    and the sample consists of 4 chains with 1000 post-warmup draws,
    this function will return a numpy.ndarray with shape (4000,3,3).

    :param var: variable name

    :param inc_warmup: When ``True`` and the warmup draws are present in
        the output, i.e., the sampler was run with ``save_warmup=True``,
        then the warmup draws are included. Default value is ``False``.

    :param name: deprecated alias of ``var``.

    :raises ValueError: when both ``var`` and ``name`` are given, when
        no variable name is given, or when the variable is unknown.

    See Also
    --------
    CmdStanMCMC.stan_variables
    CmdStanMLE.stan_variable
    CmdStanVB.stan_variable
    CmdStanGQ.stan_variable
    """
    if name is not None:
        if var is not None:
            raise ValueError(
                'Cannot use both "var" and (deprecated) "name"'
            )
        get_logger().warning(
            'Keyword "name" is deprecated, use "var" instead.'
        )
        var = name
    if var is None:
        raise ValueError('No variable name specified.')
    if var not in self._metadata.stan_vars_dims:
        raise ValueError('Unknown variable name: {}'.format(var))

    self._assemble_draws()

    # Row window: skip saved warmup rows unless they were requested.
    first_row = 0
    rows_per_chain = self.num_draws_sampling
    if self._save_warmup:
        if inc_warmup:
            rows_per_chain += self.num_draws_warmup
        else:
            first_row = self.num_draws_warmup

    # Target shape: all chains stacked along the first axis, followed by
    # the variable's own dimensions.
    columns = self._metadata.stan_vars_cols[var]
    shape = [rows_per_chain * self.chains]
    if len(columns) > 0:
        shape.extend(self._metadata.stan_vars_dims[var])

    selected = self._draws[first_row:, :, columns]
    # pylint: disable=redundant-keyword-arg
    return selected.reshape(shape, order='F')  # type: ignore
- -
def stan_variables(self) -> Dict[str, np.ndarray]:
    """
    Return a dictionary mapping Stan program variables names
    to the corresponding numpy.ndarray containing the inferred values.

    One entry is produced per key of the metadata's ``stan_vars_dims``,
    each obtained by calling ``stan_variable`` with that name.

    See Also
    --------
    CmdStanMCMC.stan_variable
    CmdStanMLE.stan_variables
    CmdStanVB.stan_variables
    CmdStanGQ.stan_variables
    """
    return {
        var_name: self.stan_variable(var_name)
        for var_name in self._metadata.stan_vars_dims
    }
- -
def method_variables(self) -> Dict[str, np.ndarray]:
    """
    Returns a dictionary of all sampler variables, i.e., all
    output column names ending in `__`. Assumes that all variables
    are scalar variables where column name is variable name.
    Maps each column name to the slice of the draws array for that
    column, containing per-draw diagnostic values.
    """
    self._assemble_draws()
    column_indices = (
        idx
        for idxs in self.metadata.method_vars_cols.values()
        for idx in idxs
    )
    return {
        self.column_names[idx]: self._draws[:, :, idx]
        for idx in column_indices
    }
- -
def sampler_variables(self) -> Dict[str, np.ndarray]:
    """
    Deprecated, use "method_variables" instead.

    Logs a deprecation warning and returns the result of
    ``method_variables()``.
    """
    deprecation_notice = (
        'Method "sampler_variables" has been deprecated, '
        'use method "method_variables" instead.'
    )
    get_logger().warning(deprecation_notice)
    return self.method_variables()
- -
def sampler_diagnostics(self) -> Dict[str, np.ndarray]:
    """
    Deprecated, use "method_variables" instead.

    Logs a deprecation warning and returns the result of
    ``method_variables()``.
    """
    deprecation_notice = (
        'Method "sampler_diagnostics" has been deprecated, '
        'use method "method_variables" instead.'
    )
    get_logger().warning(deprecation_notice)
    return self.method_variables()
- -
def save_csvfiles(self, dir: Optional[str] = None) -> None:
    """
    Move output csvfiles to specified directory. If files were
    written to the temporary session directory, clean filename.
    E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as
    'bernoulli-201912081451-1.csv'.

    The work is delegated to the run set's ``save_csvfiles``.

    :param dir: directory path

    See Also
    --------
    stanfit.RunSet.save_csvfiles
    cmdstanpy.from_csv
    """
    runset = self.runset
    runset.save_csvfiles(dir)
- - -
class CmdStanMLE:
    """
    Container for outputs from CmdStan optimization.
    Created by :meth:`CmdStanModel.optimize`.
    """

    def __init__(self, runset: "RunSet") -> None:
        """
        Initialize object from an optimize run set.

        :raises ValueError: when the run set's method is not optimize.
        """
        if not runset.method == Method.OPTIMIZE:
            raise ValueError(
                'Wrong runset method, expecting optimize runset, '
                'found method {}'.format(runset.method)
            )
        self.runset = runset
        self._set_mle_attrs(runset.csv_files[0])

    def __repr__(self) -> str:
        header = 'CmdStanMLE: model={}{}'.format(
            self.runset.model, self.runset._args.method_args.compose(0, cmd=[])
        )
        # TODO - profiling files
        return '{}\n csv_file:\n\t{}\n output_file:\n\t{}'.format(
            header,
            '\n\t'.join(self.runset.csv_files),
            '\n\t'.join(self.runset.stdout_files),
        )

    def _set_mle_attrs(self, sample_csv_0: str) -> None:
        """Scan the optimize CSV file and cache metadata and estimates."""
        meta = scan_optimize_csv(sample_csv_0)
        self._metadata = InferenceMetadata(meta)
        self._column_names: Tuple[str, ...] = meta['column_names']
        self._mle: List[float] = meta['mle']

    @property
    def column_names(self) -> Tuple[str, ...]:
        """
        Names of estimated quantities, includes joint log probability,
        and all parameters, transformed parameters, and generated quantities.
        """
        return self._column_names

    @property
    def metadata(self) -> "InferenceMetadata":
        """
        Returns object which contains CmdStan configuration as well as
        information about the names and structure of the inference method
        and model output variables.
        """
        return self._metadata

    @property
    def optimized_params_np(self) -> np.ndarray:
        """Returns optimized params as numpy array."""
        return np.asarray(self._mle)

    @property
    def optimized_params_pd(self) -> pd.DataFrame:
        """Returns optimized params as pandas DataFrame."""
        return pd.DataFrame([self._mle], columns=self.column_names)

    @property
    def optimized_params_dict(self) -> Dict[str, float]:
        """Returns optimized params as Dict."""
        return OrderedDict(zip(self.column_names, self._mle))

    def stan_variable(
        self, var: Optional[str] = None, *, name: Optional[str] = None
    ) -> np.ndarray:
        """
        Return a numpy.ndarray which contains the estimates for
        the named Stan program variable where the dimensions of the
        numpy.ndarray match the shape of the Stan program variable.

        :param var: variable name

        :param name: deprecated alias of ``var``.

        :raises ValueError: when both ``var`` and ``name`` are given,
            when no variable name is given, or when the variable is
            unknown.

        See Also
        --------
        CmdStanMLE.stan_variables
        CmdStanMCMC.stan_variable
        CmdStanVB.stan_variable
        CmdStanGQ.stan_variable
        """
        if name is not None:
            if var is not None:
                raise ValueError(
                    'Cannot use both "var" and (deprecated) "name".'
                )
            get_logger().warning(
                'Keyword "name" is deprecated, use "var" instead.'
            )
            var = name
        if var is None:
            raise ValueError('no variable name specified.')
        if var not in self._metadata.stan_vars_dims:
            raise ValueError('unknown variable name: {}'.format(var))
        col_idxs = list(self._metadata.stan_vars_cols[var])
        estimates = [self._mle[idx] for idx in col_idxs]
        # Always reshape to the declared dims so a zero-size variable
        # yields an empty array instead of failing on reshape(()).
        shape: Tuple[int, ...] = self._metadata.stan_vars_dims[var]
        # order='F' matches CmdStanMCMC.stan_variable: element columns
        # of a container are laid out column major.
        return np.array(estimates).reshape(shape, order='F')

    def stan_variables(self) -> Dict[str, np.ndarray]:
        """
        Return a dictionary mapping Stan program variables names
        to the corresponding numpy.ndarray containing the inferred values.

        See Also
        --------
        CmdStanMLE.stan_variable
        CmdStanMCMC.stan_variables
        CmdStanVB.stan_variables
        CmdStanGQ.stan_variables
        """
        return {
            var_name: self.stan_variable(var_name)
            for var_name in self._metadata.stan_vars_dims
        }

    def save_csvfiles(self, dir: Optional[str] = None) -> None:
        """
        Move output csvfiles to specified directory. If files were
        written to the temporary session directory, clean filename.
        E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as
        'bernoulli-201912081451-1.csv'.

        :param dir: directory path

        See Also
        --------
        stanfit.RunSet.save_csvfiles
        cmdstanpy.from_csv
        """
        self.runset.save_csvfiles(dir)
- - -
class CmdStanGQ:
    """
    Container for outputs from CmdStan generate_quantities run.
    Created by :meth:`CmdStanModel.generate_quantities`.
    """

    def __init__(
        self,
        runset: "RunSet",
        mcmc_sample: "CmdStanMCMC",
    ) -> None:
        """
        Initialize object.

        :raises ValueError: when the run set's method is not
            generate_quantities, or the chains' CSV files disagree.
        """
        if not runset.method == Method.GENERATE_QUANTITIES:
            raise ValueError(
                'Wrong runset method, expecting generate_quantities runset, '
                'found method {}'.format(runset.method)
            )
        self.runset = runset
        self.mcmc_sample = mcmc_sample
        # empty until the CSV files are parsed on first access
        self._draws = np.array(())
        config = self._validate_csv_files()
        self._metadata = InferenceMetadata(config)

    def __repr__(self) -> str:
        header = 'CmdStanGQ: model={} chains={}{}'.format(
            self.runset.model,
            self.chains,
            self.runset._args.method_args.compose(0, cmd=[]),
        )
        return '{}\n csv_files:\n\t{}\n output_files:\n\t{}'.format(
            header,
            '\n\t'.join(self.runset.csv_files),
            '\n\t'.join(self.runset.stdout_files),
        )

    def _validate_csv_files(self) -> dict:
        """
        Checks that Stan CSV output files for all chains are consistent
        and returns dict containing config and column names.

        The config of the first chain is the reference; every other
        chain must agree with it on all keys except those listed below.

        :raises ValueError: when inconsistencies are detected.
        """
        # Config entries that are allowed to differ between chains.
        per_chain_keys = (
            'id',
            'fitted_params',
            'diagnostic_file',
            'metric_file',
            'profile_file',
            'init',
            'seed',
            'start_datetime',
        )
        dzero: dict = {}
        for i in range(self.chains):
            config = scan_generated_quantities_csv(
                path=self.runset.csv_files[i],
            )
            if i == 0:
                dzero = config
                continue
            for key in dzero:
                if key not in per_chain_keys and dzero[key] != config[key]:
                    # value found in file i first, reference value second
                    raise ValueError(
                        'CmdStan config mismatch in Stan CSV file {}: '
                        'arg {} is {}, expected {}'.format(
                            self.runset.csv_files[i],
                            key,
                            config[key],
                            dzero[key],
                        )
                    )
        return dzero

    @property
    def chains(self) -> int:
        """Number of chains."""
        return self.runset.chains

    @property
    def chain_ids(self) -> List[int]:
        """Chain ids."""
        return self.runset.chain_ids

    @property
    def column_names(self) -> Tuple[str, ...]:
        """
        Names of generated quantities of interest.
        """
        return self._metadata.cmdstan_config['column_names']  # type: ignore

    @property
    def metadata(self) -> "InferenceMetadata":
        """
        Returns object which contains CmdStan configuration as well as
        information about the names and structure of the inference method
        and model output variables.
        """
        return self._metadata

    @property
    def generated_quantities(self) -> np.ndarray:
        """
        Deprecated - use method ``draws`` instead.
        """
        get_logger().warning(
            'Property "generated_quantities" has been deprecated, '
            'use method "draws" instead.'
        )
        if self._draws.size == 0:
            self._assemble_generated_quantities()
        return flatten_chains(self._draws)

    @property
    def generated_quantities_pd(self) -> pd.DataFrame:
        """
        Deprecated - use method ``draws_pd`` instead.
        """
        get_logger().warning(
            'Property "generated_quantities_pd" has been deprecated, '
            'use method "draws_pd" instead.'
        )
        if self._draws.size == 0:
            self._assemble_generated_quantities()
        return pd.DataFrame(
            data=flatten_chains(self._draws),
            columns=self.column_names,
        )

    @property
    def sample_plus_quantities(self) -> pd.DataFrame:
        """
        Deprecated - use method "draws_pd(inc_sample=True)" instead.
        """
        get_logger().warning(
            'Property "sample_plus_quantities" has been deprecated, '
            'use method "draws_pd(inc_sample=True)" instead.'
        )
        return self.draws_pd(inc_sample=True)

    def draws(
        self,
        *,
        inc_warmup: bool = False,
        concat_chains: bool = False,
        inc_sample: bool = False,
    ) -> np.ndarray:
        """
        Returns a numpy.ndarray over the generated quantities draws from
        all chains which is stored column major so that the values
        for a parameter are contiguous in memory, likewise all draws from
        a chain are contiguous. By default, returns a 3D array arranged
        (draws, chains, columns); parameter ``concat_chains=True`` will
        return a 2D array where all chains are flattened into a single
        column, preserving chain order, so that given M chains of N draws,
        the first N draws are from chain 1, ..., and the last N draws
        are from chain M.

        :param inc_warmup: When ``True`` and the warmup draws are present in
            the output, i.e., the sampler was run with ``save_warmup=True``,
            then the warmup draws are included. Default value is ``False``.

        :param concat_chains: When ``True`` return a 2D array flattening
            all draws from all chains. Default value is ``False``.

        :param inc_sample: When ``True`` include all columns in the
            mcmc_sample draws array as well, excepting columns whose name
            is already present in the generated quantities drawset; the
            sample columns come first. Default value is ``False``.

        See Also
        --------
        CmdStanGQ.draws_pd
        CmdStanGQ.draws_xr
        CmdStanMCMC.draws
        """
        if self._draws.size == 0:
            self._assemble_generated_quantities()
        save_warmup = self.mcmc_sample.metadata.cmdstan_config['save_warmup']
        if inc_warmup and not save_warmup:
            get_logger().warning(
                "Sample doesn't contain draws from warmup iterations,"
                ' rerun sampler with "save_warmup=True".'
            )

        result = self._draws
        if inc_sample:
            gq_names = set(self.column_names)
            # positions of the sample columns repeated in the gq output
            drop_cols = [
                idx
                for idx, col in enumerate(self.mcmc_sample.column_names)
                if col in gq_names
            ]
            # self._draws holds the warmup rows whenever they were saved,
            # so request the matching rows from the sample to keep both
            # arrays the same length.
            sample_draws = self.mcmc_sample.draws(
                inc_warmup=bool(save_warmup)
            )
            # columns live on the last axis of (draws, chains, columns)
            result = np.dstack(
                (np.delete(sample_draws, drop_cols, axis=2), self._draws)
            )

        start_idx = 0
        if not inc_warmup and save_warmup:
            start_idx = self.mcmc_sample.num_draws_warmup
        result = result[start_idx:, :, :]
        if concat_chains:
            return flatten_chains(result)
        return result  # type: ignore

    def draws_pd(
        self,
        vars: Union[List[str], str, None] = None,
        inc_warmup: bool = False,
        inc_sample: bool = False,
    ) -> pd.DataFrame:
        """
        Returns the generated quantities draws as a pandas DataFrame.
        Flattens all chains into single column. Container variables
        (array, vector, matrix) will span multiple columns, one column
        per element. E.g. variable 'matrix[2,2] foo' spans 4 columns:
        'foo[1,1], ... foo[2,2]'.

        :param vars: optional list of variable names.

        :param inc_warmup: When ``True`` and the warmup draws are present in
            the output, i.e., the sampler was run with ``save_warmup=True``,
            then the warmup draws are included. Default value is ``False``.

        :param inc_sample: When ``True`` variables of the MCMC sample may
            be requested too; with ``vars`` omitted, all sample columns
            not repeated in the generated quantities are prepended.
            Default value is ``False``.

        :raises ValueError: when a requested variable is unknown.

        See Also
        --------
        CmdStanGQ.draws
        CmdStanGQ.draws_xr
        CmdStanMCMC.draws_pd
        """
        if (
            inc_warmup
            and not self.mcmc_sample.metadata.cmdstan_config['save_warmup']
        ):
            get_logger().warning(
                'Draws from warmup iterations not available,'
                ' must run sampler with "save_warmup=True".'
            )
        self._assemble_generated_quantities()

        gq_cols: List[str] = []
        mcmc_vars: List[str] = []
        if vars is None:
            gq_cols = list(self.column_names)
        else:
            vars_list = [vars] if isinstance(vars, str) else vars
            # dict.fromkeys() drops repeats but keeps the request order;
            # a set would make the column order differ between runs.
            for var in dict.fromkeys(vars_list):
                if var in self.metadata.stan_vars_cols:
                    for idx in self.metadata.stan_vars_cols[var]:
                        gq_cols.append(self.column_names[idx])
                elif (
                    inc_sample
                    and var in self.mcmc_sample.metadata.stan_vars_cols
                ):
                    mcmc_vars.append(var)
                else:
                    raise ValueError('Unknown variable: {}'.format(var))

        def gq_frame() -> pd.DataFrame:
            # all generated quantities columns, chains flattened
            return pd.DataFrame(
                data=flatten_chains(self.draws(inc_warmup=inc_warmup)),
                columns=self.column_names,
            )

        if inc_sample and mcmc_vars:
            sample_frame = self.mcmc_sample.draws_pd(
                vars=mcmc_vars, inc_warmup=inc_warmup
            )
            if not gq_cols:
                return sample_frame
            return pd.concat(
                [sample_frame.reset_index(drop=True), gq_frame()[gq_cols]],
                axis='columns',
            )
        if inc_sample and vars is None:
            gq_names = set(self.column_names)
            dups = [
                col
                for col in self.mcmc_sample.column_names
                if col in gq_names
            ]
            # No ignore_index here: on a column-wise concat it would
            # replace the column names with 0..n-1.
            return pd.concat(
                [
                    self.mcmc_sample.draws_pd(inc_warmup=inc_warmup)
                    .drop(columns=dups)
                    .reset_index(drop=True),
                    gq_frame(),
                ],
                axis='columns',
            )
        if gq_cols:
            return gq_frame()[gq_cols]
        return gq_frame()

    def draws_xr(
        self,
        vars: Union[str, List[str], None] = None,
        inc_warmup: bool = False,
        inc_sample: bool = False,
    ) -> "xr.Dataset":
        """
        Returns the generated quantities draws as a xarray Dataset.

        :param vars: optional list of variable names.

        :param inc_warmup: When ``True`` and the warmup draws are present in
            the MCMC sample, then the warmup draws are included.
            Default value is ``False``.

        :param inc_sample: When ``True`` the variables of the MCMC sample
            that are not generated quantities are included as well.
            Default value is ``False``.

        :raises RuntimeError: when package "xarray" is not installed.
        :raises ValueError: when a requested variable is unknown.

        See Also
        --------
        CmdStanGQ.draws
        CmdStanGQ.draws_pd
        CmdStanMCMC.draws_xr
        """
        if not XARRAY_INSTALLED:
            raise RuntimeError(
                'Package "xarray" is not installed, cannot produce draws array.'
            )
        sample_vars_cols = self.mcmc_sample.metadata.stan_vars_cols
        mcmc_vars_list: List[str] = []
        if vars is None:
            vars_list = list(self.metadata.stan_vars_cols.keys())
        else:
            requested = [vars] if isinstance(vars, str) else vars
            # Build fresh lists so the caller's list is never modified.
            vars_list = []
            for var in requested:
                if var in self.metadata.stan_vars_cols:
                    vars_list.append(var)
                elif inc_sample and var in sample_vars_cols:
                    mcmc_vars_list.append(var)
                else:
                    raise ValueError('Unknown variable: {}'.format(var))
        if inc_sample:
            for var in sample_vars_cols.keys():
                if var not in vars_list and var not in mcmc_vars_list:
                    mcmc_vars_list.append(var)

        self._assemble_generated_quantities()

        num_draws = self.mcmc_sample.num_draws_sampling
        sample_config = self.mcmc_sample.metadata.cmdstan_config
        attrs: MutableMapping[Hashable, Any] = {
            "stan_version": f"{sample_config['stan_version_major']}."
            f"{sample_config['stan_version_minor']}."
            f"{sample_config['stan_version_patch']}",
            "model": sample_config["model"],
            "num_unconstrained_params": (
                self.mcmc_sample.num_unconstrained_params
            ),
            "num_draws_sampling": num_draws,
        }
        if inc_warmup and sample_config['save_warmup']:
            num_draws += self.mcmc_sample.num_draws_warmup
            attrs["num_draws_warmup"] = self.mcmc_sample.num_draws_warmup

        data: MutableMapping[Hashable, Any] = {}
        coordinates: MutableMapping[Hashable, Any] = {
            "chain": self.chain_ids,
            "draw": np.arange(num_draws),
        }

        # Fetch each draws array once instead of once per variable.
        gq_draws = self.draws(inc_warmup=inc_warmup)
        for var in vars_list:
            build_xarray_data(
                data,
                var,
                self._metadata.stan_vars_dims[var],
                self._metadata.stan_vars_cols[var],
                0,
                gq_draws,
            )
        if inc_sample:
            sample_draws = self.mcmc_sample.draws(inc_warmup=inc_warmup)
            for var in mcmc_vars_list:
                build_xarray_data(
                    data,
                    var,
                    self.mcmc_sample.metadata.stan_vars_dims[var],
                    sample_vars_cols[var],
                    0,
                    sample_draws,
                )

        return xr.Dataset(data, coords=coordinates, attrs=attrs).transpose(
            'chain', 'draw', ...
        )

    def stan_variable(
        self,
        var: Optional[str] = None,
        inc_warmup: bool = False,
        *,
        name: Optional[str] = None,
    ) -> np.ndarray:
        """
        Return a numpy.ndarray which contains the set of draws
        for the named Stan program variable. Flattens the chains,
        leaving the draws in chain order, i.e., for a sample with
        N chains of M draws each, the first M rows are from chain 1,
        the next M are from chain 2, and the last M rows are from chain N.

        For a generated quantities variable the result is 2D: one row
        per draw and one column per element of the variable. A variable
        found only in the MCMC sample is delegated to
        ``CmdStanMCMC.stan_variable``.

        :param var: variable name

        :param inc_warmup: When ``True`` and the warmup draws are present in
            the MCMC sample, then the warmup draws are included.
            Default value is ``False``.

        :param name: deprecated alias of ``var``.

        :raises ValueError: when both ``var`` and ``name`` are given, when
            no variable name is given, or when the variable is unknown.

        See Also
        --------
        CmdStanGQ.stan_variables
        CmdStanMCMC.stan_variable
        CmdStanMLE.stan_variable
        CmdStanVB.stan_variable
        """
        if name is not None:
            if var is not None:
                raise ValueError(
                    'Cannot use both "var" and (deprecated) "name"'
                )
            get_logger().warning(
                'Keyword "name" is deprecated, use "var" instead.'
            )
            var = name
        if var is None:
            raise ValueError('No variable name specified.')
        model_var_names = self.mcmc_sample.metadata.stan_vars_cols.keys()
        gq_var_names = self.metadata.stan_vars_cols.keys()
        if not (var in model_var_names or var in gq_var_names):
            raise ValueError('Unknown variable name: {}'.format(var))
        if var not in gq_var_names:
            return self.mcmc_sample.stan_variable(var, inc_warmup=inc_warmup)

        # is gq variable
        self._assemble_generated_quantities()
        col_idxs = self._metadata.stan_vars_cols[var]
        first_row = 0
        if (
            not inc_warmup
            and self.mcmc_sample.metadata.cmdstan_config['save_warmup']
        ):
            first_row = self.mcmc_sample.num_draws_warmup
        # Drop warmup rows from each chain before flattening: the
        # flattened array is chain-major, so slicing it afterwards would
        # remove whole chains instead of each chain's warmup.
        flattened = flatten_chains(self._draws[first_row:, :, :])
        return flattened[:, col_idxs]  # type: ignore

    def stan_variables(self, inc_warmup: bool = False) -> Dict[str, np.ndarray]:
        """
        Return a dictionary mapping Stan program variables names
        to the corresponding numpy.ndarray containing the inferred values.
        Generated quantities come first, followed by the MCMC sample
        variables that are not generated quantities.

        :param inc_warmup: When ``True`` and the warmup draws are present in
            the MCMC sample, then the warmup draws are included.
            Default value is ``False``

        See Also
        --------
        CmdStanGQ.stan_variable
        CmdStanMCMC.stan_variables
        CmdStanMLE.stan_variables
        CmdStanVB.stan_variables
        """
        result = {}
        sample_var_names = self.mcmc_sample.metadata.stan_vars_cols.keys()
        gq_var_names = self.metadata.stan_vars_cols.keys()
        for name in gq_var_names:
            result[name] = self.stan_variable(name, inc_warmup)
        for name in sample_var_names:
            if name not in gq_var_names:
                result[name] = self.stan_variable(name, inc_warmup)
        return result

    def _assemble_generated_quantities(self) -> None:
        """
        Parse the generated quantities CSV files into ``self._draws``,
        an array arranged (draws, chains, columns). The number of rows
        is taken from the MCMC sample, including its warmup draws when
        those were saved. Does nothing once the array is populated.
        """
        if self._draws.size != 0:
            # already parsed; avoid re-reading the files on every call
            return
        warmup = self.mcmc_sample.metadata.cmdstan_config['save_warmup']
        num_draws = self.mcmc_sample.draws(inc_warmup=warmup).shape[0]
        gq_sample = np.empty(
            (num_draws, self.chains, len(self.column_names)),
            dtype=float,
            order='F',
        )
        for chain in range(self.chains):
            with open(self.runset.csv_files[chain], 'r') as fd:
                # drop comment lines; skiprows=1 then skips the header
                lines = (line for line in fd if not line.startswith('#'))
                gq_sample[:, chain, :] = np.loadtxt(
                    lines, dtype=float, ndmin=2, skiprows=1, delimiter=','
                )
        self._draws = gq_sample
[docs] def save_csvfiles(self, dir: Optional[str] = None) -> None: - """ - Move output csvfiles to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. - - :param dir: directory path - - See Also - -------- - stanfit.RunSet.save_csvfiles - cmdstanpy.from_csv - """ - self.runset.save_csvfiles(dir)
- - -
[docs]class CmdStanVB: - """ - Container for outputs from CmdStan variational run. - Created by :meth:`CmdStanModel.variational`. - """ - - def __init__(self, runset: RunSet) -> None: - """Initialize object.""" - if not runset.method == Method.VARIATIONAL: - raise ValueError( - 'Wrong runset method, expecting variational inference, ' - 'found method {}'.format(runset.method) - ) - self.runset = runset - self._set_variational_attrs(runset.csv_files[0]) - - def __repr__(self) -> str: - repr = 'CmdStanVB: model={}{}'.format( - self.runset.model, self.runset._args.method_args.compose(0, cmd=[]) - ) - repr = '{}\n csv_file:\n\t{}\n output_file:\n\t{}'.format( - repr, - '\n\t'.join(self.runset.csv_files), - '\n\t'.join(self.runset.stdout_files), - ) - # TODO - diagnostic, profiling files - return repr - - def _set_variational_attrs(self, sample_csv_0: str) -> None: - meta = scan_variational_csv(sample_csv_0) - self._metadata = InferenceMetadata(meta) - # these three assignments don't grant type information - self._column_names: Tuple[str, ...] = meta['column_names'] - self._variational_mean: np.ndarray = meta['variational_mean'] - self._variational_sample: np.ndarray = meta['variational_sample'] - - @property - def columns(self) -> int: - """ - Total number of information items returned by sampler. - Includes approximation information and names of model parameters - and computed quantities. - """ - return len(self._column_names) - - @property - def column_names(self) -> Tuple[str, ...]: - """ - Names of information items returned by sampler for each draw. - Includes approximation information and names of model parameters - and computed quantities. - """ - return self._column_names - - @property - def variational_params_np(self) -> np.ndarray: - """ - Returns inferred parameter means as numpy array. - """ - return self._variational_mean - - @property - def variational_params_pd(self) -> pd.DataFrame: - """ - Returns inferred parameter means as pandas DataFrame. 
- """ - return pd.DataFrame([self._variational_mean], columns=self.column_names) - - @property - def variational_params_dict(self) -> Dict[str, np.ndarray]: - """Returns inferred parameter means as Dict.""" - return OrderedDict(zip(self.column_names, self._variational_mean)) - - @property - def metadata(self) -> InferenceMetadata: - """ - Returns object which contains CmdStan configuration as well as - information about the names and structure of the inference method - and model output variables. - """ - return self._metadata - -
[docs] def stan_variable( - self, var: Optional[str] = None, *, name: Optional[str] = None - ) -> np.ndarray: - """ - Return a numpy.ndarray which contains the estimates for the - for the named Stan program variable where the dimensions of the - numpy.ndarray match the shape of the Stan program variable. - - :param var: variable name - - See Also - -------- - CmdStanVB.stan_variables - CmdStanMCMC.stan_variable - CmdStanMLE.stan_variable - CmdStanGQ.stan_variable - """ - if name is not None: - if var is not None: - raise ValueError( - 'Cannot use both "var" and (deprecated) "name"' - ) - get_logger().warning( - 'Keyword "name" is deprecated, use "var" instead.' - ) - var = name - if var is None: - raise ValueError('No variable name specified.') - if var not in self._metadata.stan_vars_dims: - raise ValueError('Unknown variable name: {}'.format(var)) - col_idxs = list(self._metadata.stan_vars_cols[var]) - vals = list(self._variational_mean) - xs = [vals[x] for x in col_idxs] - shape: Tuple[int, ...] = () - if len(col_idxs) > 0: - shape = self._metadata.stan_vars_dims[var] - return np.array(xs).reshape(shape)
- -
[docs] def stan_variables(self) -> Dict[str, np.ndarray]: - """ - Return a dictionary mapping Stan program variables names - to the corresponding numpy.ndarray containing the inferred values. - - See Also - -------- - CmdStanVB.stan_variable - CmdStanMCMC.stan_variables - CmdStanMLE.stan_variables - CmdStanGQ.stan_variables - """ - result = {} - for name in self._metadata.stan_vars_dims.keys(): - result[name] = self.stan_variable(name) - return result
- - @property - def variational_sample(self) -> np.ndarray: - """Returns the set of approximate posterior output draws.""" - return self._variational_sample - -
[docs] def save_csvfiles(self, dir: Optional[str] = None) -> None: - """ - Move output csvfiles to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. - - :param dir: directory path - - See Also - -------- - stanfit.RunSet.save_csvfiles - cmdstanpy.from_csv - """ - self.runset.save_csvfiles(dir)
- - -
[docs]def from_csv( - path: Union[str, List[str], None] = None, method: Optional[str] = None -) -> Union[CmdStanMCMC, CmdStanMLE, CmdStanVB, None]: - """ - Instantiate a CmdStan object from a the Stan CSV files from a CmdStan run. - CSV files are specified from either a list of Stan CSV files or a single - filepath which can be either a directory name, a Stan CSV filename, or - a pathname pattern (i.e., a Python glob). The optional argument 'method' - checks that the CSV files were produced by that method. - Stan CSV files from CmdStan methods 'sample', 'optimize', and 'variational' - result in objects of class CmdStanMCMC, CmdStanMLE, and CmdStanVB, - respectively. - - :param path: directory path - :param method: method name (optional) - - :return: either a CmdStanMCMC, CmdStanMLE, or CmdStanVB object - """ - if path is None: - raise ValueError('Must specify path to Stan CSV files.') - if method is not None and method not in [ - 'sample', - 'optimize', - 'variational', - ]: - raise ValueError( - 'Bad method argument {}, must be one of: ' - '"sample", "optimize", "variational"'.format(method) - ) - - csvfiles = [] - if isinstance(path, list): - csvfiles = path - elif isinstance(path, str): - if '*' in path: - splits = os.path.split(path) - if splits[0] is not None: - if not (os.path.exists(splits[0]) and os.path.isdir(splits[0])): - raise ValueError( - 'Invalid path specification, {} ' - ' unknown directory: {}'.format(path, splits[0]) - ) - csvfiles = glob.glob(path) - elif os.path.exists(path) and os.path.isdir(path): - for file in os.listdir(path): - if file.endswith(".csv"): - csvfiles.append(os.path.join(path, file)) - elif os.path.exists(path): - csvfiles.append(path) - else: - raise ValueError('Invalid path specification: {}'.format(path)) - else: - raise ValueError('Invalid path specification: {}'.format(path)) - - if len(csvfiles) == 0: - raise ValueError('No CSV files found in directory {}'.format(path)) - for file in csvfiles: - if not 
(os.path.exists(file) and file.endswith('.csv')): - raise ValueError( - 'Bad CSV file path spec,' - ' includes non-csv file: {}'.format(file) - ) - - config_dict: Dict[str, Any] = {} - try: - with open(csvfiles[0], 'r') as fd: - scan_config(fd, config_dict, 0) - except (IOError, OSError, PermissionError) as e: - raise ValueError('Cannot read CSV file: {}'.format(csvfiles[0])) from e - if 'model' not in config_dict or 'method' not in config_dict: - raise ValueError("File {} is not a Stan CSV file.".format(csvfiles[0])) - if method is not None and method != config_dict['method']: - raise ValueError( - 'Expecting Stan CSV output files from method {}, ' - ' found outputs from method {}'.format( - method, config_dict['method'] - ) - ) - try: - if config_dict['method'] == 'sample': - chains = len(csvfiles) - sampler_args = SamplerArgs( - iter_sampling=config_dict['num_samples'], - iter_warmup=config_dict['num_warmup'], - thin=config_dict['thin'], - save_warmup=config_dict['save_warmup'], - ) - # bugfix 425, check for fixed_params output - try: - check_sampler_csv( - csvfiles[0], - iter_sampling=config_dict['num_samples'], - iter_warmup=config_dict['num_warmup'], - thin=config_dict['thin'], - save_warmup=config_dict['save_warmup'], - ) - except ValueError: - try: - check_sampler_csv( - csvfiles[0], - is_fixed_param=True, - iter_sampling=config_dict['num_samples'], - iter_warmup=config_dict['num_warmup'], - thin=config_dict['thin'], - save_warmup=config_dict['save_warmup'], - ) - sampler_args = SamplerArgs( - iter_sampling=config_dict['num_samples'], - iter_warmup=config_dict['num_warmup'], - thin=config_dict['thin'], - save_warmup=config_dict['save_warmup'], - fixed_param=True, - ) - except (ValueError) as e: - raise ValueError( - 'Invalid or corrupt Stan CSV output file, ' - ) from e - - cmdstan_args = CmdStanArgs( - model_name=config_dict['model'], - model_exe=config_dict['model'], - chain_ids=[x + 1 for x in range(chains)], - method_args=sampler_args, - ) - runset = 
RunSet(args=cmdstan_args, chains=chains) - runset._csv_files = csvfiles - for i in range(len(runset._retcodes)): - runset._set_retcode(i, 0) - fit = CmdStanMCMC(runset) - fit.draws() - return fit - elif config_dict['method'] == 'optimize': - if 'algorithm' not in config_dict: - raise ValueError( - "Cannot find optimization algorithm" - " in file {}.".format(csvfiles[0]) - ) - optimize_args = OptimizeArgs( - algorithm=config_dict['algorithm'], - ) - cmdstan_args = CmdStanArgs( - model_name=config_dict['model'], - model_exe=config_dict['model'], - chain_ids=None, - method_args=optimize_args, - ) - runset = RunSet(args=cmdstan_args) - runset._csv_files = csvfiles - for i in range(len(runset._retcodes)): - runset._set_retcode(i, 0) - return CmdStanMLE(runset) - elif config_dict['method'] == 'variational': - if 'algorithm' not in config_dict: - raise ValueError( - "Cannot find variational algorithm" - " in file {}.".format(csvfiles[0]) - ) - variational_args = VariationalArgs( - algorithm=config_dict['algorithm'], - iter=config_dict['iter'], - grad_samples=config_dict['grad_samples'], - elbo_samples=config_dict['elbo_samples'], - eta=config_dict['eta'], - tol_rel_obj=config_dict['tol_rel_obj'], - eval_elbo=config_dict['eval_elbo'], - output_samples=config_dict['output_samples'], - ) - cmdstan_args = CmdStanArgs( - model_name=config_dict['model'], - model_exe=config_dict['model'], - chain_ids=None, - method_args=variational_args, - ) - runset = RunSet(args=cmdstan_args) - runset._csv_files = csvfiles - for i in range(len(runset._retcodes)): - runset._set_retcode(i, 0) - return CmdStanVB(runset) - else: - get_logger().info( - 'Unable to process CSV output files from method %s.', - (config_dict['method']), - ) - return None - except (IOError, OSError, PermissionError) as e: - raise ValueError( - 'An error occured processing the CSV files:\n\t{}'.format(str(e)) - ) from e
- - -def build_xarray_data( - data: MutableMapping[Hashable, Tuple[Tuple[str, ...], np.ndarray]], - var_name: str, - dims: Tuple[int, ...], - col_idxs: Tuple[int, ...], - start_row: int, - drawset: np.ndarray, -) -> None: - """ - Adds Stan variable name, labels, and values to a dictionary - that will be used to construct an xarray DataSet. - """ - var_dims: Tuple[str, ...] = ('draw', 'chain') - if dims: - var_dims += tuple(f"{var_name}_dim_{i}" for i in range(len(dims))) - data[var_name] = (var_dims, drawset[start_row:, :, col_idxs]) - else: - data[var_name] = ( - var_dims, - np.squeeze(drawset[start_row:, :, col_idxs], axis=2), - ) -
- -
- - -
- - -
- -
- - -
-
- - - - - - \ No newline at end of file diff --git a/docs/_modules/cmdstanpy/utils.html b/docs/_modules/cmdstanpy/utils.html deleted file mode 100644 index fbef4123..00000000 --- a/docs/_modules/cmdstanpy/utils.html +++ /dev/null @@ -1,1464 +0,0 @@ - - - - - - - - cmdstanpy.utils — CmdStanPy 0.9.77 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
- - - - -
- -
- - - - - - -
- -
- -

Source code for cmdstanpy.utils

-"""
-Utility functions
-"""
-import contextlib
-import functools
-import logging
-import math
-import os
-import platform
-import re
-import shutil
-import subprocess
-import sys
-import tempfile
-from collections import OrderedDict
-from collections.abc import Collection
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterator,
-    List,
-    Mapping,
-    MutableMapping,
-    Optional,
-    TextIO,
-    Tuple,
-    Union,
-)
-
-import numpy as np
-import pandas as pd
-import ujson as json
-
-from cmdstanpy import (
-    _CMDSTAN_SAMPLING,
-    _CMDSTAN_THIN,
-    _CMDSTAN_WARMUP,
-    _DOT_CMDSTAN,
-    _DOT_CMDSTANPY,
-    _TMPDIR,
-)
-
-EXTENSION = '.exe' if platform.system() == 'Windows' else ''
-
-
-@functools.lru_cache(maxsize=None)
-def get_logger() -> logging.Logger:
-    """cmdstanpy logger"""
-    logger = logging.getLogger('cmdstanpy')
-    if len(logger.handlers) == 0:
-        logging.basicConfig(level=logging.INFO)
-    return logger
-
-
-def validate_dir(install_dir: str) -> None:
-    """Check that specified install directory exists, can write."""
-    if not os.path.exists(install_dir):
-        try:
-            os.makedirs(install_dir)
-        except (IOError, OSError, PermissionError) as e:
-            raise ValueError(
-                'Cannot create directory: {}'.format(install_dir)
-            ) from e
-    else:
-        if not os.path.isdir(install_dir):
-            raise ValueError(
-                'File exists, should be a directory: {}'.format(install_dir)
-            )
-        try:
-            with open('tmp_test_w', 'w'):
-                pass
-            os.remove('tmp_test_w')  # cleanup
-        except OSError as e:
-            raise ValueError(
-                'Cannot write files to directory {}'.format(install_dir)
-            ) from e
-
-
-def get_latest_cmdstan(cmdstan_dir: str) -> Optional[str]:
-    """
-    Given a valid directory path, find all installed CmdStan versions
-    and return highest (i.e., latest) version number.
-    Assumes directory populated via script `install_cmdstan`.
-    """
-    versions = [
-        ''.join(name.split('-')[1:])  # name may contain '-rc'
-        for name in os.listdir(cmdstan_dir)
-        if os.path.isdir(os.path.join(cmdstan_dir, name))
-        and name.startswith('cmdstan-')
-        and name[8].isdigit()
-    ]
-    # munge rc for sort, e.g. 2.25.0-rc1 -> 2.25.0.-99
-    for i in range(len(versions)):  # # pylint: disable=C0200
-        tmp = versions[i].split('rc')
-        if len(tmp) == 1:
-            versions[i] = '.'.join([tmp[0], '0'])
-        else:
-            rc_sortable = str(int(tmp[1]) - 100)
-            versions[i] = '.'.join([tmp[0], rc_sortable])
-
-    versions.sort(key=lambda s: list(map(int, s.split('.'))))
-    if len(versions) == 0:
-        return None
-    latest = 'cmdstan-{}'.format(versions[len(versions) - 1])
-
-    # unmunge
-    tmp = latest.split('.')
-    prefix = '.'.join(tmp[0:3])
-    if int(tmp[3]) == 0:
-        latest = prefix
-    else:
-        tmp[3] = 'rc' + str(int(tmp[3]) + 100)
-        latest = '-'.join([prefix, tmp[3]])
-    return latest
-
-
-def validate_cmdstan_path(path: str) -> None:
-    """
-    Validate that CmdStan directory exists and binaries have been built.
-    Throws exception if specified path is invalid.
-    """
-    if not os.path.isdir(path):
-        raise ValueError('no such CmdStan directory {}'.format(path))
-    if not os.path.exists(os.path.join(path, 'bin', 'stanc' + EXTENSION)):
-        raise ValueError(
-            'no CmdStan binaries found, '
-            'run command line script "install_cmdstan"'
-        )
-
-
-
[docs]def set_cmdstan_path(path: str) -> None: - """ - Validate, then set CmdStan directory path. - """ - validate_cmdstan_path(path) - os.environ['CMDSTAN'] = path
- - -
[docs]def set_make_env(make: str) -> None: - """ - set MAKE environmental variable. - """ - os.environ['MAKE'] = make
- - -
[docs]def cmdstan_path() -> str: - """ - Validate, then return CmdStan directory path. - """ - cmdstan = '' - if 'CMDSTAN' in os.environ: - cmdstan = os.environ['CMDSTAN'] - else: - cmdstan_dir = os.path.expanduser(os.path.join('~', _DOT_CMDSTAN)) - if not os.path.exists(cmdstan_dir): - cmdstan_dir = os.path.expanduser(os.path.join('~', _DOT_CMDSTANPY)) - if not os.path.exists(cmdstan_dir): - raise ValueError( - 'no CmdStan installation found, ' - 'run command line script "install_cmdstan"' - ) - get_logger().warning( - "Using ~/.cmdstanpy is deprecated and" - " will not be automatically detected in version 1.0!\n" - " Please rename to ~/.cmdstan" - ) - latest_cmdstan = get_latest_cmdstan(cmdstan_dir) - if latest_cmdstan is None: - raise ValueError( - 'no CmdStan installation found, ' - 'run command line script "install_cmdstan"' - ) - cmdstan = os.path.join(cmdstan_dir, latest_cmdstan) - os.environ['CMDSTAN'] = cmdstan - validate_cmdstan_path(cmdstan) - return cmdstan
- - -def cmdstan_version_at(maj: int, min: int) -> bool: - """ - Check that CmdStan version is at or above Maj.min version. - Parses version string out of CmdStan makefile in CmdStan path dir. - - :param maj: Major version number - :param min: Minor version number - - :return: True if version at or above, else False - """ - # pylint:disable=bare-except - try: - path = cmdstan_path() - makefile = os.path.join(path, 'makefile') - if not os.path.exists(makefile): - raise ValueError( - 'CmdStan installation {}: missing makefile'.format(path) - ) - version = None - with open(makefile, 'r') as fd: - contents = fd.read() - start_idx = contents.find('CMDSTAN_VERSION := ') + len( - 'CMDSTAN_VERSION := ' - ) - end_idx = contents.find('\n', start_idx) - version = contents[start_idx:end_idx] - if version is None: - raise ValueError( - 'Cannot parse version from makefile: {}'.format(makefile) - ) - splits = version.split('.') - if len(splits) < 2: - raise ValueError( - 'Cannot parse version from makefile: {}'.format(makefile) - ) - cur_maj = int(splits[0]) - cur_min = int(splits[1]) - - if cur_maj > maj or (cur_maj == maj and cur_min >= min): - return True - except: # noqa - pass - return False - - -def cxx_toolchain_path( - version: Optional[str] = None, install_dir: Optional[str] = None -) -> Tuple[str, ...]: - """ - Validate, then activate C++ toolchain directory path. 
- """ - if platform.system() != 'Windows': - raise RuntimeError( - 'Functionality is currently only supported on Windows' - ) - if version is not None and not isinstance(version, str): - raise TypeError('Format version number as a string') - logger = get_logger() - if 'CMDSTAN_TOOLCHAIN' in os.environ: - toolchain_root = os.environ['CMDSTAN_TOOLCHAIN'] - if os.path.exists(os.path.join(toolchain_root, 'mingw64')): - compiler_path = os.path.join( - toolchain_root, - 'mingw64' if (sys.maxsize > 2 ** 32) else 'mingw32', - 'bin', - ) - if os.path.exists(compiler_path): - tool_path = os.path.join(toolchain_root, 'usr', 'bin') - if not os.path.exists(tool_path): - tool_path = '' - compiler_path = '' - logger.warning( - 'Found invalid installion for RTools40 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - compiler_path = '' - logger.warning( - 'Found invalid installion for RTools40 on %s', - toolchain_root, - ) - toolchain_root = '' - - elif os.path.exists(os.path.join(toolchain_root, 'mingw_64')): - compiler_path = os.path.join( - toolchain_root, - 'mingw_64' if (sys.maxsize > 2 ** 32) else 'mingw_32', - 'bin', - ) - if os.path.exists(compiler_path): - tool_path = os.path.join(toolchain_root, 'bin') - if not os.path.exists(tool_path): - tool_path = '' - compiler_path = '' - logger.warning( - 'Found invalid installion for RTools35 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - compiler_path = '' - logger.warning( - 'Found invalid installion for RTools35 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - rtools40_home = os.environ.get('RTOOLS40_HOME') - cmdstan_dir = os.path.expanduser(os.path.join('~', _DOT_CMDSTAN)) - cmdstan_dir_old = os.path.expanduser(os.path.join('~', _DOT_CMDSTANPY)) - for toolchain_root in ( - ([rtools40_home] if rtools40_home is not None else []) - + ( - [ - os.path.join(install_dir, 'RTools40'), - os.path.join(install_dir, 'RTools35'), - os.path.join(install_dir, 'RTools30'), - os.path.join(install_dir, 
'RTools'), - ] - if install_dir is not None - else [] - ) - + [ - os.path.join(cmdstan_dir, 'RTools40'), - os.path.join(cmdstan_dir_old, 'RTools40'), - os.path.join(os.path.abspath("/"), "RTools40"), - os.path.join(cmdstan_dir, 'RTools35'), - os.path.join(cmdstan_dir_old, 'RTools35'), - os.path.join(os.path.abspath("/"), "RTools35"), - os.path.join(cmdstan_dir, 'RTools'), - os.path.join(cmdstan_dir_old, 'RTools'), - os.path.join(os.path.abspath("/"), "RTools"), - os.path.join(os.path.abspath("/"), "RBuildTools"), - ] - ): - compiler_path = '' - tool_path = '' - - if os.path.exists(toolchain_root): - if version not in ('35', '3.5', '3'): - compiler_path = os.path.join( - toolchain_root, - 'mingw64' if (sys.maxsize > 2 ** 32) else 'mingw32', - 'bin', - ) - if os.path.exists(compiler_path): - tool_path = os.path.join(toolchain_root, 'usr', 'bin') - if not os.path.exists(tool_path): - tool_path = '' - compiler_path = '' - logger.warning( - 'Found invalid installation for RTools40 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - break - else: - compiler_path = '' - logger.warning( - 'Found invalid installation for RTools40 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - compiler_path = os.path.join( - toolchain_root, - 'mingw_64' if (sys.maxsize > 2 ** 32) else 'mingw_32', - 'bin', - ) - if os.path.exists(compiler_path): - tool_path = os.path.join(toolchain_root, 'bin') - if not os.path.exists(tool_path): - tool_path = '' - compiler_path = '' - logger.warning( - 'Found invalid installation for RTools35 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - break - else: - compiler_path = '' - logger.warning( - 'Found invalid installation for RTools35 on %s', - toolchain_root, - ) - toolchain_root = '' - else: - toolchain_root = '' - - if not toolchain_root: - raise ValueError( - 'no RTools toolchain installation found, ' - 'run command line script ' - '"python -m cmdstanpy.install_cxx_toolchain"' - ) - logger.info('Add C++ toolchain to 
$PATH: %s', toolchain_root) - os.environ['PATH'] = ';'.join( - list( - OrderedDict.fromkeys( - [compiler_path, tool_path] + os.getenv('PATH', '').split(';') - ) - ) - ) - return compiler_path, tool_path - - -
[docs]def write_stan_json(path: str, data: Mapping[str, Any]) -> None: - """ - Dump a mapping of strings to data to a JSON file. - - Values can be any numeric type, a boolean (converted to int), - or any collection compatible with :func:`numpy.asarray`, e.g a - :class:`pandas.Series`. - - Produces a file compatible with the - `Json Format for Cmdstan - <https://mc-stan.org/docs/2_27/cmdstan-guide/json.html>`__ - - :param path: File path for the created json. Will be overwritten if - already in existence. - - :param data: A mapping from strings to values. This can be a dictionary - or something more exotic like an :class:`xarray.Dataset`. This will be - copied before type conversion, not modified - """ - data_out = {} - for key, val in data.items(): - if val is not None: - if isinstance(val, (str, bytes)) or ( - type(val).__module__ != 'numpy' - and not isinstance(val, (Collection, bool, int, float)) - ): - raise TypeError( - f"Invalid type '{type(val)}' provided to " - + f"write_stan_json for key '{key}'" - ) - try: - if not np.all(np.isfinite(val)): - raise ValueError( - "Input to write_stan_json has nan or infinite " - + f"values for key '{key}'" - ) - except TypeError: - # handles cases like val == ['hello'] - # pylint: disable=raise-missing-from - raise ValueError( - "Invalid type provided to " - + f"write_stan_json for key '{key}' " - + f"as part of collection {type(val)}" - ) - - if type(val).__module__ == 'numpy': - data_out[key] = val.tolist() - elif isinstance(val, Collection): - data_out[key] = np.asarray(val).tolist() - elif isinstance(val, bool): - data_out[key] = int(val) - else: - data_out[key] = val - - with open(path, 'w') as fd: - json.dump(data_out, fd)
- - -def rload(fname: str) -> Optional[Dict[str, Union[int, float, np.ndarray]]]: - """Parse data and parameter variable values from an R dump format file. - This parser only supports the subset of R dump data as described - in the "Dump Data Format" section of the CmdStan manual, i.e., - scalar, vector, matrix, and array data types. - """ - data_dict = {} - with open(fname, 'r') as fd: - lines = fd.readlines() - # Variable data may span multiple lines, parse accordingly - idx = 0 - while idx < len(lines) and '<-' not in lines[idx]: - idx += 1 - if idx == len(lines): - return None - start_idx = idx - idx += 1 - while True: - while idx < len(lines) and '<-' not in lines[idx]: - idx += 1 - next_var = idx - var_data = ''.join(lines[start_idx:next_var]).replace('\n', '') - lhs, rhs = [item.strip() for item in var_data.split('<-')] - lhs = lhs.replace('"', '') # strip optional Jags double quotes - rhs = rhs.replace('L', '') # strip R long int qualifier - data_dict[lhs] = parse_rdump_value(rhs) - if idx == len(lines): - break - start_idx = next_var - idx += 1 - return data_dict - - -def parse_rdump_value(rhs: str) -> Union[int, float, np.ndarray]: - """Process right hand side of Rdump variable assignment statement. - Value is either scalar, vector, or multi-dim structure. - Use regex to capture structure values, dimensions. 
- """ - pat = re.compile( - r'structure\(\s*c\((?P<vals>[^)]*)\)' - r'(,\s*\.Dim\s*=\s*c\s*\((?P<dims>[^)]*)\s*\))?\)' - ) - val: Union[int, float, np.ndarray] - try: - if rhs.startswith('structure'): - parse = pat.match(rhs) - if parse is None or parse.group('vals') is None: - raise ValueError(rhs) - vals = [float(v) for v in parse.group('vals').split(',')] - val = np.array(vals, order='F') - if parse.group('dims') is not None: - dims = [int(v) for v in parse.group('dims').split(',')] - val = np.array(vals).reshape(dims, order='F') - elif rhs.startswith('c(') and rhs.endswith(')'): - val = np.array([float(item) for item in rhs[2:-1].split(',')]) - elif '.' in rhs or 'e' in rhs: - val = float(rhs) - else: - val = int(rhs) - except TypeError as e: - raise ValueError('bad value in Rdump file: {}'.format(rhs)) from e - return val - - -def check_sampler_csv( - path: str, - is_fixed_param: bool = False, - iter_sampling: Optional[int] = None, - iter_warmup: Optional[int] = None, - save_warmup: bool = False, - thin: Optional[int] = None, -) -> Dict[str, Any]: - """Capture essential config, shape from stan_csv file.""" - meta = scan_sampler_csv(path, is_fixed_param) - if thin is None: - thin = _CMDSTAN_THIN - elif thin > _CMDSTAN_THIN: - if 'thin' not in meta: - raise ValueError( - 'bad Stan CSV file {}, ' - 'config error, expected thin = {}'.format(path, thin) - ) - if meta['thin'] != thin: - raise ValueError( - 'bad Stan CSV file {}, ' - 'config error, expected thin = {}, found {}'.format( - path, thin, meta['thin'] - ) - ) - draws_sampling = iter_sampling - if draws_sampling is None: - draws_sampling = _CMDSTAN_SAMPLING - draws_warmup = iter_warmup - if draws_warmup is None: - draws_warmup = _CMDSTAN_WARMUP - draws_warmup = int(math.ceil(draws_warmup / thin)) - draws_sampling = int(math.ceil(draws_sampling / thin)) - if meta['draws_sampling'] != draws_sampling: - raise ValueError( - 'bad Stan CSV file {}, expected {} draws, found {}'.format( - path, draws_sampling, 
meta['draws_sampling'] - ) - ) - if save_warmup: - if not ('save_warmup' in meta and meta['save_warmup'] == 1): - raise ValueError( - 'bad Stan CSV file {}, ' - 'config error, expected save_warmup = 1'.format(path) - ) - if meta['draws_warmup'] != draws_warmup: - raise ValueError( - 'bad Stan CSV file {}, ' - 'expected {} warmup draws, found {}'.format( - path, draws_warmup, meta['draws_warmup'] - ) - ) - return meta - - -def scan_sampler_csv(path: str, is_fixed_param: bool = False) -> Dict[str, Any]: - """Process sampler stan_csv output file line by line.""" - dict: Dict[str, Any] = {} - lineno = 0 - with open(path, 'r') as fd: - lineno = scan_config(fd, dict, lineno) - lineno = scan_column_names(fd, dict, lineno) - if not is_fixed_param: - lineno = scan_warmup_iters(fd, dict, lineno) - lineno = scan_metric(fd, dict, lineno) - lineno = scan_sampling_iters(fd, dict, lineno) - return dict - - -def scan_optimize_csv(path: str) -> Dict[str, Any]: - """Process optimizer stan_csv output file line by line.""" - dict: Dict[str, Any] = {} - lineno = 0 - with open(path, 'r') as fd: - lineno = scan_config(fd, dict, lineno) - lineno = scan_column_names(fd, dict, lineno) - line = fd.readline().lstrip(' #\t').rstrip() - xs = line.split(',') - dict['mle'] = [float(x) for x in xs] - return dict - - -def scan_generated_quantities_csv(path: str) -> Dict[str, Any]: - """ - Process standalone generated quantities stan_csv output file line by line. 
- """ - dict: Dict[str, Any] = {} - lineno = 0 - with open(path, 'r') as fd: - lineno = scan_config(fd, dict, lineno) - lineno = scan_column_names(fd, dict, lineno) - return dict - - -def scan_variational_csv(path: str) -> Dict[str, Any]: - """Process advi stan_csv output file line by line.""" - dict: Dict[str, Any] = {} - lineno = 0 - with open(path, 'r') as fd: - lineno = scan_config(fd, dict, lineno) - lineno = scan_column_names(fd, dict, lineno) - line = fd.readline().lstrip(' #\t').rstrip() - lineno += 1 - if line.startswith('Stepsize adaptation complete.'): - line = fd.readline().lstrip(' #\t\n') - lineno += 1 - if not line.startswith('eta'): - raise ValueError( - 'line {}: expecting eta, found:\n\t "{}"'.format( - lineno, line - ) - ) - line = fd.readline().lstrip(' #\t\n') - lineno += 1 - xs = line.split(',') - variational_mean = [float(x) for x in xs] - dict['variational_mean'] = variational_mean - dict['variational_sample'] = pd.read_csv( - path, - comment='#', - skiprows=lineno, - header=None, - float_precision='high', - ) - return dict - - -def scan_config(fd: TextIO, config_dict: Dict[str, Any], lineno: int) -> int: - """ - Scan initial stan_csv file comments lines and - save non-default configuration information to config_dict. 
- """ - cur_pos = fd.tell() - line = fd.readline().strip() - while len(line) > 0 and line.startswith('#'): - lineno += 1 - if line.endswith('(Default)'): - line = line.replace('(Default)', '') - line = line.lstrip(' #\t') - key_val = line.split('=') - if len(key_val) == 2: - if key_val[0].strip() == 'file' and not key_val[1].endswith('csv'): - config_dict['data_file'] = key_val[1].strip() - elif key_val[0].strip() != 'file': - raw_val = key_val[1].strip() - val: Union[int, float, str] - try: - val = int(raw_val) - except ValueError: - try: - val = float(raw_val) - except ValueError: - val = raw_val - config_dict[key_val[0].strip()] = val - cur_pos = fd.tell() - line = fd.readline().strip() - fd.seek(cur_pos) - return lineno - - -def scan_warmup_iters( - fd: TextIO, config_dict: Dict[str, Any], lineno: int -) -> int: - """ - Check warmup iterations, if any. - """ - if 'save_warmup' not in config_dict: - return lineno - cur_pos = fd.tell() - line = fd.readline().strip() - draws_found = 0 - while len(line) > 0 and not line.startswith('#'): - lineno += 1 - draws_found += 1 - cur_pos = fd.tell() - line = fd.readline().strip() - fd.seek(cur_pos) - config_dict['draws_warmup'] = draws_found - return lineno - - -def scan_column_names( - fd: TextIO, config_dict: MutableMapping[str, Any], lineno: int -) -> int: - """ - Process columns header, add to config_dict as 'column_names' - """ - line = fd.readline().strip() - lineno += 1 - names = line.split(',') - config_dict['column_names_raw'] = tuple(names) - config_dict['column_names'] = tuple(munge_varnames(names)) - return lineno - - -def munge_varnames(names: List[str]) -> List[str]: - """ - Change formatting for indices of container var elements - from use of dot separator to array-like notation, e.g., - rewrite label ``y_forecast.2.4`` to ``y_forecast[2,4]``. 
- """ - if names is None: - raise ValueError('missing argument "names"') - return [ - re.sub(r',([\d,]+)$', r'[\1]', column.replace('.', ',')) - for column in names - ] - - -def parse_method_vars(names: Tuple[str, ...]) -> Dict[str, Tuple[int, ...]]: - """ - Parses out names ending in `__` from list of CSV file column names. - Return a dict mapping sampler variable name to Stan CSV file column, using - zero-based column indexing. - Currently, (Stan 2.X) all CmdStan inference method vars are scalar, - the map entries are tuples of int to allow for structured variables. - """ - if names is None: - raise ValueError('missing argument "names"') - # note: method vars are currently all scalar so not checking for structure - return {v: tuple([k]) for (k, v) in enumerate(names) if v.endswith('__')} - - -def parse_stan_vars( - names: Tuple[str, ...] -) -> Tuple[Dict[str, Tuple[int, ...]], Dict[str, Tuple[int, ...]]]: - """ - Parses out Stan variable names (i.e., names not ending in `__`) - from list of CSV file column names. - Returns a pair of dicts which map variable names to dimensions and - variable names to columns, respectively, using zero-based column indexing. - Note: assumes: (a) munged varnames and (b) container vars are non-ragged - and dense; no checks size, indices. 
- """ - if names is None: - raise ValueError('missing argument "names"') - dims_map: Dict[str, Tuple[int, ...]] = {} - cols_map: Dict[str, Tuple[int, ...]] = {} - idxs = [] - dims: Union[List[str], List[int]] - for (idx, name) in enumerate(names): - idxs.append(idx) - var, *dims = name.split('[') - if var.endswith('__'): - idxs = [] - elif len(dims) == 0: - dims_map[var] = () - cols_map[var] = tuple(idxs) - idxs = [] - else: - if idx < len(names) - 1 and names[idx + 1].split('[')[0] == var: - continue - dims = [int(x) for x in dims[0][:-1].split(',')] - dims_map[var] = tuple(dims) - cols_map[var] = tuple(idxs) - idxs = [] - return (dims_map, cols_map) - - -def scan_metric(fd: TextIO, config_dict: Dict[str, Any], lineno: int) -> int: - """ - Scan step size, metric from stan_csv file comment lines, - set config_dict entries 'metric' and 'num_unconstrained_params' - """ - if 'metric' not in config_dict: - config_dict['metric'] = 'diag_e' - metric = config_dict['metric'] - line = fd.readline().strip() - lineno += 1 - if not line == '# Adaptation terminated': - raise ValueError( - 'line {}: expecting metric, found:\n\t "{}"'.format(lineno, line) - ) - line = fd.readline().strip() - lineno += 1 - label, step_size = line.split('=') - if not label.startswith('# Step size'): - raise ValueError( - 'line {}: expecting step size, ' - 'found:\n\t "{}"'.format(lineno, line) - ) - try: - float(step_size.strip()) - except ValueError as e: - raise ValueError( - 'line {}: invalid step size: {}'.format(lineno, step_size) - ) from e - line = fd.readline().strip() - lineno += 1 - if not ( - ( - metric == 'diag_e' - and line == '# Diagonal elements of inverse mass matrix:' - ) - or ( - metric == 'dense_e' and line == '# Elements of inverse mass matrix:' - ) - ): - raise ValueError( - 'line {}: invalid or missing mass matrix ' - 'specification'.format(lineno) - ) - line = fd.readline().lstrip(' #\t') - lineno += 1 - num_unconstrained_params = len(line.split(',')) - 
config_dict['num_unconstrained_params'] = num_unconstrained_params - if metric == 'diag_e': - return lineno - else: - for _ in range(1, num_unconstrained_params): - line = fd.readline().lstrip(' #\t') - lineno += 1 - if len(line.split(',')) != num_unconstrained_params: - raise ValueError( - 'line {}: invalid or missing mass matrix ' - 'specification'.format(lineno) - ) - return lineno - - -def scan_sampling_iters( - fd: TextIO, config_dict: Dict[str, Any], lineno: int -) -> int: - """ - Parse sampling iteration, save number of iterations to config_dict. - """ - draws_found = 0 - num_cols = len(config_dict['column_names']) - cur_pos = fd.tell() - line = fd.readline().strip() - while len(line) > 0 and not line.startswith('#'): - lineno += 1 - draws_found += 1 - data = line.split(',') - if len(data) != num_cols: - raise ValueError( - 'line {}: bad draw, expecting {} items, found {}'.format( - lineno, num_cols, len(line.split(',')) - ) - ) - cur_pos = fd.tell() - line = fd.readline().strip() - config_dict['draws_sampling'] = draws_found - fd.seek(cur_pos) - return lineno - - -def read_metric(path: str) -> List[int]: - """ - Read metric file in JSON or Rdump format. - Return dimensions of entry "inv_metric". - """ - if path.endswith('.json'): - with open(path, 'r') as fd: - metric_dict = json.load(fd) - if 'inv_metric' in metric_dict: - dims_np = np.asarray(metric_dict['inv_metric']) - return list(dims_np.shape) - else: - raise ValueError( - 'metric file {}, bad or missing' - ' entry "inv_metric"'.format(path) - ) - else: - dims = list(read_rdump_metric(path)) - if dims is None: - raise ValueError( - 'metric file {}, bad or missing' - ' entry "inv_metric"'.format(path) - ) - return dims - - -def read_rdump_metric(path: str) -> List[int]: - """ - Find dimensions of variable named 'inv_metric' in Rdump data file. 
- """ - metric_dict = rload(path) - if metric_dict is None or not ( - 'inv_metric' in metric_dict - and isinstance(metric_dict['inv_metric'], np.ndarray) - ): - raise ValueError( - 'metric file {}, bad or missing entry "inv_metric"'.format(path) - ) - return list(metric_dict['inv_metric'].shape) - - -def do_command( - cmd: List[str], - cwd: Optional[str] = None, -) -> Optional[str]: - """ - Spawn process, print stdout/stderr to console. - Throws RuntimeError on non-zero returncode. - """ - get_logger().debug('cmd: %s', cmd) - try: - proc = subprocess.Popen( - cmd, - cwd=cwd, - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=os.environ, - ) - stdout, stderr = proc.communicate() - if proc.returncode != 0: # problem, throw RuntimeError with msg - try: - serror = os.strerror(proc.returncode) - except ValueError as e: - pass - if proc.returncode < 0: - msg = 'Command: {}\nterminated by signal'.format(cmd) - elif proc.returncode <= 125: - msg = 'Command: {}\nfailed'.format(cmd) - elif proc.returncode == 127: - msg = 'Command: {}\nfailed, program not found'.format(cmd) - else: - msg = 'Command: {}\nmost likely crashed'.format(cmd) - msg = '{}, returncode: {}'.format(msg, proc.returncode) - if serror: - msg = '{}, error: {}'.format(msg, serror) - if stderr: - msg = '{}, stderr: {} '.format( - msg, stderr.decode('utf-8').strip() - ) - raise RuntimeError(msg) - if stdout or stderr: # success, return stdout, stderr, if any - msg = '' - if stdout: - msg = '{}'.format(stdout.decode('utf-8').strip()) - if stderr: - msg = '{}\nWarning or error:\t{}'.format( - msg, stderr.decode('utf-8').strip() - ) - return msg - except OSError as e: - msg = 'Command: {}\nfailed with error {}\n'.format(cmd, str(e)) - raise RuntimeError(msg) from e - return None # success - - -def windows_short_path(path: str) -> str: - """ - Gets the short path name of a given long path. 
- http://stackoverflow.com/a/23598461/200291 - - On non-Windows platforms, returns the path - - If (base)path does not exist, function raises RuntimeError - """ - if platform.system() != 'Windows': - return path - - if os.path.isfile(path) or ( - not os.path.isdir(path) and os.path.splitext(path)[1] != '' - ): - base_path, file_name = os.path.split(path) - else: - base_path, file_name = path, '' - - if not os.path.exists(base_path): - raise RuntimeError( - 'Windows short path function needs a valid directory. ' - 'Base directory does not exist: "{}"'.format(base_path) - ) - - import ctypes - from ctypes import wintypes - - # pylint: disable=invalid-name - _GetShortPathNameW = ( - ctypes.windll.kernel32.GetShortPathNameW # type: ignore - ) - - _GetShortPathNameW.argtypes = [ - wintypes.LPCWSTR, - wintypes.LPWSTR, - wintypes.DWORD, - ] - _GetShortPathNameW.restype = wintypes.DWORD - - output_buf_size = 0 - while True: - output_buf = ctypes.create_unicode_buffer(output_buf_size) - needed = _GetShortPathNameW(base_path, output_buf, output_buf_size) - if output_buf_size >= needed: - short_base_path = output_buf.value - break - else: - output_buf_size = needed - - short_path = ( - os.path.join(short_base_path, file_name) - if file_name - else short_base_path - ) - return short_path - - -def create_named_text_file( - dir: str, prefix: str, suffix: str, name_only: bool = False -) -> str: - """ - Create a named unique file, return filename. - Flag 'name_only' will create then delete the tmp file; - this lets us create filename args for commands which - disallow overwriting existing files (e.g., 'stansummary'). - """ - fd = tempfile.NamedTemporaryFile( - mode='w+', prefix=prefix, suffix=suffix, dir=dir, delete=name_only - ) - path = fd.name - fd.close() - return path - - -
[docs]def install_cmdstan( - version: Optional[str] = None, - dir: Optional[str] = None, - overwrite: bool = False, - verbose: bool = False, - compiler: bool = False, -) -> bool: - """ - Download and install a CmdStan release from GitHub by running - script ``install_cmdstan`` as a subprocess. Downloads the release - tar.gz file to temporary storage. Retries GitHub requests in order - to allow for transient network outages. Builds CmdStan executables - and tests the compiler by building example model ``bernoulli.stan``. - - :param version: CmdStan version string, e.g. "2.24.1". - Defaults to latest CmdStan release. - - :param dir: Path to install directory. Defaults to hidden directory - ``$HOME/.cmdstan``. - If no directory is specified and the above directory does not - exist, directory ``$HOME/.cmdstan`` will be created and populated. - - :param overwrite: Boolean value; when ``True``, will overwrite and - rebuild an existing CmdStan installation. Default is ``False``. - - :param verbose: Boolean value; when ``True``, output from CmdStan build - processes will be streamed to the console. Default is ``False``. - - :param compiler: Boolean value; when ``True`` on WINDOWS ONLY, use the - C++ compiler from the ``install_cxx_toolchain`` command or install - one if none is found. - - :return: Boolean value; ``True`` for success. 
- """ - logger = get_logger() - python = sys.executable - here = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(here, 'install_cmdstan.py') - cmd = [python, '-u', path] - if version is not None: - cmd.extend(['--version', version]) - if dir is not None: - cmd.extend(['--dir', dir]) - if overwrite: - cmd.append('--overwrite') - if verbose: - cmd.append('--verbose') - if compiler: - cmd.append('--compiler') - proc = subprocess.Popen( - cmd, - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=os.environ, - ) - while proc.poll() is None and proc.stdout: - print(proc.stdout.readline().decode('utf-8').strip()) - - _, stderr = proc.communicate() - if proc.returncode: - logger.warning('CmdStan installation failed.') - if stderr: - logger.warning(stderr.decode('utf-8').strip()) - return False - if dir is not None: - if version is not None: - set_cmdstan_path(os.path.join(dir, 'cmdstan-' + version)) - else: - set_cmdstan_path( - os.path.join(dir, get_latest_cmdstan(dir)) # type: ignore - ) - return True
- - -def flatten_chains(draws_array: np.ndarray) -> np.ndarray: - """ - Flatten a 3D array of draws X chains X variable into 2D array - where all chains are concatenated into a single column. - - :param draws_array: 3D array of draws - """ - if len(draws_array.shape) != 3: - raise ValueError( - 'Expecting 3D array, found array with {} dims'.format( - len(draws_array.shape) - ) - ) - - num_rows = draws_array.shape[0] * draws_array.shape[1] - num_cols = draws_array.shape[2] - return draws_array.reshape((num_rows, num_cols), order='F') - - -@contextlib.contextmanager -def pushd(new_dir: str) -> Iterator[None]: - """Acts like pushd/popd.""" - previous_dir = os.getcwd() - os.chdir(new_dir) - yield - os.chdir(previous_dir) - - -def wrap_progress_hook() -> Optional[Callable[[int, int, int], None]]: - try: - from tqdm import tqdm - - pbar = tqdm( - unit='B', - unit_scale=True, - unit_divisor=1024, - ) - - def download_progress_hook( - count: int, block_size: int, total_size: int - ) -> None: - if pbar.total is None: - pbar.total = total_size - pbar.reset() - downloaded_size = count * block_size - pbar.update(downloaded_size - pbar.n) - if pbar.n >= total_size: - pbar.close() - - except (ImportError, ModuleNotFoundError): - print("tqdm was not downloaded, progressbar not shown") - return None - - return download_progress_hook - - -class MaybeDictToFilePath: - """Context manager for json files.""" - - def __init__( - self, - *objs: Union[str, Mapping[str, Any], List[Any], int, float, None], - ): - self._unlink = [False] * len(objs) - self._paths: List[Any] = [''] * len(objs) - i = 0 - for obj in objs: - if isinstance(obj, Mapping): - data_file = create_named_text_file( - dir=_TMPDIR, prefix='', suffix='.json' - ) - get_logger().debug('input tempfile: %s', data_file) - write_stan_json(data_file, obj) - self._paths[i] = data_file - self._unlink[i] = True - elif isinstance(obj, str): - if not os.path.exists(obj): - raise ValueError("File doesn't exist {}".format(obj)) - 
self._paths[i] = obj - elif isinstance(obj, list): - err_msgs = [] - missing_obj_items = [] - for j, obj_item in enumerate(obj): - if not isinstance(obj_item, str): - err_msgs.append( - ( - 'List element {} must be a filename string,' - ' found {}' - ).format(j, obj_item) - ) - elif not os.path.exists(obj_item): - missing_obj_items.append( - "File doesn't exist: {}".format(obj_item) - ) - if err_msgs: - raise ValueError('\n'.join(err_msgs)) - if missing_obj_items: - raise ValueError('\n'.join(missing_obj_items)) - self._paths[i] = obj - elif obj is None: - self._paths[i] = None - elif i == 1 and isinstance(obj, (int, float)): - self._paths[i] = obj - else: - raise ValueError('data must be string or dict') - i += 1 - - def __enter__(self) -> List[str]: - return self._paths - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: # type: ignore - for can_unlink, path in zip(self._unlink, self._paths): - if can_unlink and path: - try: - os.remove(path) - except PermissionError: - pass - - -class TemporaryCopiedFile: - """Context manager for tmpfiles, handles spaces in filepath.""" - - def __init__(self, file_path: str): - self._tmpdir = None - if ' ' in os.path.abspath(file_path) and platform.system() == 'Windows': - base_path, file_name = os.path.split(os.path.abspath(file_path)) - os.makedirs(base_path, exist_ok=True) - try: - short_base_path = windows_short_path(base_path) - if os.path.exists(short_base_path): - file_path = os.path.join(short_base_path, file_name) - except RuntimeError: - pass - - if ' ' in os.path.abspath(file_path): - tmpdir = tempfile.mkdtemp() - if ' ' in tmpdir: - raise RuntimeError( - 'Unable to generate temporary path without spaces! \n' - + 'Please move your stan file to location without spaces.' 
- ) - - _, path = tempfile.mkstemp(suffix='.stan', dir=tmpdir) - - shutil.copy(file_path, path) - self._path = path - self._tmpdir = tmpdir - else: - self._path = file_path - - def __enter__(self) -> Tuple[str, bool]: - return self._path, self._tmpdir is not None - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: # type: ignore - if self._tmpdir: - shutil.rmtree(self._tmpdir, ignore_errors=True) -
- -
- - -
- - -
- -
- - -
-
- - - - - - \ No newline at end of file diff --git a/docs/_sources/api.rst.txt b/docs/_sources/api.rst.txt deleted file mode 100644 index bf3f8d0d..00000000 --- a/docs/_sources/api.rst.txt +++ /dev/null @@ -1,101 +0,0 @@ -.. py:currentmodule:: cmdstanpy - -############# -API Reference -############# - -******* -Classes -******* - -CmdStanModel -============ - -A CmdStanModel object encapsulates the Stan program. It manages program compilation and provides the following inference methods: - -:meth:`~CmdStanModel.sample` - runs the HMC-NUTS sampler to produce a set of draws from the posterior distribution. - -:meth:`~CmdStanModel.optimize` - produce a penalized maximum likelihood estimate (point estimate) of the model parameters. - -:meth:`~CmdStanModel.variational` - run CmdStan’s variational inference algorithm to approximate the posterior distribution. - -:meth:`~CmdStanModel.generate_quantities` - runs CmdStan’s generate_quantities method to produce additional quantities of interest based on draws from an existing sample. - -.. autoclass:: cmdstanpy.CmdStanModel - :members: - - -CmdStanMCMC -=========== - -.. autoclass:: cmdstanpy.CmdStanMCMC - :members: - -CmdStanMLE -========== - -.. autoclass:: cmdstanpy.CmdStanMLE - :members: - -CmdStanGQ -========= - -.. autoclass:: cmdstanpy.CmdStanGQ - :members: - -CmdStanVB -========= - -.. autoclass:: cmdstanpy.CmdStanVB - :members: - - -InferenceMetadata -================= - -.. autoclass:: cmdstanpy.InferenceMetadata - :members: - -RunSet -====== - -.. autoclass:: cmdstanpy.stanfit.RunSet - :members: - -********* -Functions -********* - -cmdstan_path -============ - -.. autofunction:: cmdstanpy.cmdstan_path - -install_cmdstan -=============== - -.. autofunction:: cmdstanpy.install_cmdstan - -set_cmdstan_path -================ - -.. autofunction:: cmdstanpy.set_cmdstan_path - -set_make_env -============ - -.. autofunction:: cmdstanpy.set_make_env - -from_csv -======== - -.. 
autofunction:: cmdstanpy.from_csv - -write_stan_json -=============== - -.. autofunction:: cmdstanpy.write_stan_json diff --git a/docs/_sources/hello_world.rst.txt b/docs/_sources/hello_world.rst.txt deleted file mode 100644 index 6565dd03..00000000 --- a/docs/_sources/hello_world.rst.txt +++ /dev/null @@ -1,227 +0,0 @@ -.. py:currentmodule:: cmdstanpy - -"Hello, World" --------------- - -Fitting a Stan model using the NUTS-HMC sampler -*********************************************** - -In order to verify the installation and also to demonstrate -the CmdStanPy workflow, we use CmdStanPy to fit the -the example Stan model ``bernoulli.stan`` -to the dataset ``bernoulli.data.json``. -This model and data are included with the CmdStan distribution -in subdirectory ``examples/bernoulli``. -This example allows the user to verify that CmdStanPy, CmdStan, -the StanC compiler, and the C++ toolchain have all been properly installed. -For substantive example models and -guidance on coding statistical models in Stan, see -the `CmdStan User's Guide `_. - - -The Stan model -^^^^^^^^^^^^^^ - -The model ``bernoulli.stan`` is a simple model for binary data: -given a set of N observations of i.i.d. binary data -`y[1] ... y[N]`, it calculates the Bernoulli chance-of-success `theta`. - -.. code:: - - data { - int N; - int y[N]; - } - parameters { - real theta; - } - model { - theta ~ beta(1,1); // uniform prior on interval 0,1 - y ~ bernoulli(theta); - } - -The :class:`CmdStanModel` class manages the Stan program and its corresponding compiled executable. -It provides properties and functions to inspect the model code and filepaths. -CmdStanPy, manages the environment variable ``CMDSTAN`` which specifies the path to -the local CmdStan installation. -The function :func:`~cmdstan_path` returns the value of this environment variable. - -.. 
ipython:: python - - # import packages - import os - from cmdstanpy import cmdstan_path, CmdStanModel - - # specify Stan program file - bernoulli_stan = os.path.join(cmdstan_path(), 'examples', 'bernoulli', 'bernoulli.stan') - - # instantiate the model; compiles the Stan program as needed. - bernoulli_model = CmdStanModel(stan_file=bernoulli_stan) - - # inspect model object - print(bernoulli_model) - - -Data inputs -^^^^^^^^^^^ - -CmdStanPy accepts input data either as a Python dictionary which maps data variable names -to values, or as the corresponding JSON file. - -The bernoulli model requires two inputs: the number of observations `N`, and -an N-length vector `y` of binary outcomes. -The data file `bernoulli.data.json` contains the following inputs: - -.. code:: - - { - "N" : 10, - "y" : [0,1,0,0,0,0,0,0,0,1] - } - - - -Fitting the model -^^^^^^^^^^^^^^^^^ - -The :meth:`~CmdStanModel.sample` method is used to do Bayesian inference -over the model conditioned on data using using Hamiltonian Monte Carlo -(HMC) sampling. It runs Stan's HMC-NUTS sampler on the model and data and -returns a :class:`CmdStanMCMC` object. The data can be specified -either as a filepath or a Python dictionary; in this example, we use the -example datafile `bernoulli.data.json`: - -By default, the :meth:`~CmdStanModel.sample` method runs 4 sampler chains. -The ``output_dir`` argument is an optional argument which specifies -the path to the output directory used by CmdStan. -If this argument is omitted, the output files are written -to a temporary directory which is deleted when the current Python session is terminated. - - -.. 
ipython:: python - - # specify data file - bernoulli_data = os.path.join(cmdstan_path(), 'examples', 'bernoulli', 'bernoulli.data.json') - - # fit the model - bernoulli_fit = bernoulli_model.sample(data=bernoulli_data) - - # printing the object reports sampler commands, output files - print(bernoulli_fit) - - -Accessing the sample -^^^^^^^^^^^^^^^^^^^^ - -The :meth:`~CmdStanModel.sample` method outputs are a set of per-chain -`Stan CSV files `__. -The filenames follow the template '--' -plus the file suffix '.csv'. -The :class:`CmdStanMCMC` class provides methods to assemble the contents -of these files in memory as well as methods to manage the disk files. - -Underlyingly, the draws from all chains are stored as an -a numpy.ndarray with dimensions: draws, chains, columns. -CmdStanPy provides accessor methods which return the sample -either in terms of the CSV file columns or in terms of the -sampler and Stan program variables. -The :meth:`~CmdStanMCMC.draws` and :meth:`~CmdStanMCMC.draws_pd` methods return the sample contents -in columnar format. - -The :meth:`~CmdStanMCMC.stan_variable` method to returns a numpy.ndarray object -which contains the set of all draws in the sample for the named Stan program variable. -The draws from all chains are flattened into a single drawset. -The first ndarray dimension is the number of draws X number of chains. -The remaining ndarray dimensions correspond to the Stan program variable dimension. -The :meth:`~CmdStanMCMC.stan_variables` method returns a Python dict over all Stan model variables. - -.. 
ipython:: python - - bernoulli_fit.draws().shape - bernoulli_fit.draws(concat_chains=True).shape - - draws_theta = bernoulli_fit.stan_variable(name='theta') - draws_theta.shape - - -CmdStan utilities: `stansummary`, `diagnose` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -CmdStan is distributed with a posterior analysis utility -`stansummary `__ -that reads the outputs of all chains and computes summary statistics -for all sampler and model parameters and quantities of interest. -The :class:`CmdStanMCMC` method :meth:`~CmdStanMCMC.summary` runs this utility and returns -summaries of the total joint log-probability density **lp__** plus -all model parameters and quantities of interest in a pandas.DataFrame: - -.. ipython:: python - - bernoulli_fit.summary() - -CmdStan is distributed with a second posterior analysis utility -`diagnose `__ -which analyzes the per-draw sampler parameters across all chains -looking for potential problems which indicate that the sample -isn't a representative sample from the posterior. -The :meth:`~CmdStanMCMC.diagnose` method runs this utility and prints the output to the console. - -.. ipython:: python - - print(bernoulli_fit.diagnose()) - -Managing Stan CSV files -^^^^^^^^^^^^^^^^^^^^^^^ - -The :class:`CmdStanMCMC` object keeps track of all output files produced -by the sampler run. -The :meth:`~CmdStanMCMC.save_csvfiles` function moves the CSV files -to a specified directory. - -.. ipython:: python - :verbatim: - - bernoulli_fit.save_csvfiles(dir='some/path') - - -.. comment - Progress bar - ^^^^^^^^^^^^ - - User can enable progress bar for the sampling if ``tqdm`` package - has been installed. - - .. code-block:: python - - bernoulli_fit = bernoulli_model.sample(data=bernoulli_data, show_progress=True) - - On Jupyter Notebook environment user should use notebook version - by using ``show_progress='notebook'``. - - .. 
code-block:: python - - bernoulli_fit = bernoulli_model.sample(data=bernoulli_data, show_progress='notebook') - - To enable javascript progress bar on Jupyter Lab Notebook user needs to install - nodejs and ipywidgets. Following the instructions in - `tqdm issue #394 ` - For ``conda`` users installing nodejs can be done with ``conda``. - - .. code-block:: bash - - conda install nodejs - - After nodejs has been installed, user needs to install ipywidgets and enable it. - - .. code-block:: bash - - pip install ipywidgets - jupyter nbextension enable --py widgetsnbextension - - Jupyter Lab still needs widgets manager. - - .. code-block:: bash - - jupyter labextension install @jupyter-widgets/jupyterlab-manager - - diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt deleted file mode 100644 index 87d549f5..00000000 --- a/docs/_sources/index.rst.txt +++ /dev/null @@ -1,34 +0,0 @@ -.. CmdStanPy documentation master file, created by - sphinx-quickstart on Wed Jun 6 13:32:52 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -=============================================== -:mod:`cmdstanpy` -- Python interface to CmdStan -=============================================== - -.. module:: cmdstanpy - :synopsis: A lightweight pure-Python interface to CmdStan which provides access to the Stan compiler and all inference algorithms. - -.. moduleauthor:: Stan Developement Team - -CmdStanPy is a lightweight interface to Stan for Python users which -provides the necessary objects and functions to do Bayesian inference -given a probability model and data. -It wraps the -`CmdStan `_ -command line interface in a small set of -Python classes which provide methods to do analysis and manage the resulting -set of model, data, and posterior estimates. - -.. 
toctree:: - :maxdepth: 4 - - overview - installation - hello_world - workflow - examples - api - -:ref:`genindex` diff --git a/docs/_sources/installation.rst.txt b/docs/_sources/installation.rst.txt deleted file mode 100644 index 49979912..00000000 --- a/docs/_sources/installation.rst.txt +++ /dev/null @@ -1,182 +0,0 @@ -Installation -============ - -CmdStanPy is a pure-Python3 package, but it relies on CmdStan for all -of its functionality. There are several ways to install CmdStan and CmdStanPy, -which depend on the kind of user you are. - - -Conda users (Recommended) -------------------------- - -If you use `conda `__, -installation of both can be done very simply. CmdStanPy -and CmdStan are both available via the -`conda-forge `__ repository. - -We recommend creating a new environment for CmdStan[Py]: - -.. code-block:: bash - - conda create -n cmdstan -c conda-forge cmdstanpy - -but installation is possible in an existing environment: - -.. code-block:: bash - - conda install -c conda-forge cmdstanpy - -These commands will install CmdStanPy, CmdStan, and the -required compilers for using CmdStan on your system inside -a conda environment. To use them, run ``conda activate cmdstan``, -or whichever name you used for your environment (following ``-n`` -above). - -Note that CmdStan is only available on conda for versions -2.27.0 and newer. If you require an older version, you must use -one of the following methods to install it. If you require a -version of CmdStan *newer* than 2.27.0, but not the latest, -you can install it in the standard conda way by specifying -``cmdstan==VERSION`` in the install command. - -Pip (non-Conda) users -------------------------- - -CmdStan can also be installed from PyPI via URL: https://pypi.org/project/cmdstanpy/ or from the -command line using ``pip``: - -.. 
code-block:: bash - - pip install --upgrade cmdstanpy - -The optional packages are - -* ``tqdm`` which allows for progress bar display during sampling -* ``xarray``, an n-dimension labeled dataset package which can be used for outputs - -To install CmdStanPy with all the optional packages: - -.. code-block:: bash - - pip install --upgrade cmdstanpy[all] - -To install the current develop branch from GitHub: - -.. code-block:: bash - - pip install -e git+https://github.com/stan-dev/cmdstanpy@/develop#egg=cmdstanpy - - -If you install CmdStanPy from GitHub, -**you must install CmdStan**. The recommended way for Pip users -to do so is via the ``install_cmdstan`` function -:ref:`described below` - -*Note for PyStan & RTools users:* PyStan and CmdStanPy should be installed in -separate environments if you are using the RTools toolchain (primarily Windows users). -If you already have PyStan installed, you should take care to install CmdStanPy in its own -virtual environment. - -Installing CmdStan ------------------- - -Prerequisites -^^^^^^^^^^^^^ - -CmdStanPy requires an installed C++ toolchain -consisting of a modern C++ compiler and the GNU-Make utility. - -+ Windows: CmdStanPy provides the function ``install_cxx_toolchain`` - -+ Linux: install g++ 4.9.3 or clang 6.0. (GNU-Make is the default ``make`` utility) - -+ maxOS: install XCode and Xcode command line tools via command: `xcode-select --install`. - -.. _install-cmdstan-fun: - -Function ``install_cmdstan`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -CmdStanPy provides the function :func:`~cmdstanpy.install_cmdstan` which -downloads CmdStan from GitHub and builds the CmdStan utilities. -It can be can be called from within Python or from the command line. - -The default install location is a hidden directory in the user ``$HOME`` directory -named ``.cmdstan``. This directory will be created by the install script. - -+ From Python - -.. 
code-block:: python - - import cmdstanpy - cmdstanpy.install_cmdstan() - -+ From the command line on Linux or MacOSX - -.. code-block:: bash - - install_cmdstan - ls -F ~/.cmdstan - -+ On Windows - -.. code-block:: bash - - python -m cmdstanpy.install_cmdstan - dir "%HOME%/.cmdstan" - -The named arguments: `-d ` and `-v ` -can be used to override these defaults: - -.. code-block:: bash - - install_cmdstan -d my_local_cmdstan -v 2.20.0 - ls -F my_local_cmdstan - -DIY Installation -^^^^^^^^^^^^^^^^ - -If you with to install CmdStan yourself, follow the instructions -in the `CmdStan User's Guide `__. - -Post Installation: Setting Environment Variables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The default for the CmdStan installation location -is a directory named ``.cmdstan`` in your ``$HOME`` directory. [1]_ -If you have installed CmdStan in a different directory, -then you can set the environment variable ``CMDSTAN`` to this -location and it will be picked up by CmdStanPy. `Note:` This is done -for you if you installed via ``conda``, as cmdstan will be installed -in the ``bin/`` subfolder of the environment directory. - -.. code-block:: bash - - export CMDSTAN='/path/to/cmdstan-2.27.0' - - -The CmdStanPy commands ``cmdstan_path`` and ``set_cmdstan_path`` -get and set this environment variable: - -.. code-block:: python - - from cmdstanpy import cmdstan_path, set_cmdstan_path - - oldpath = cmdstan_path() - set_cmdstan_path(os.path.join('path','to','cmdstan')) - newpath = cmdstan_path() - -To use custom ``make``-tool use ``set_make_env`` function. - -.. code-block:: python - - from cmdstanpy import set_make_env - set_make_env("mingw32-make.exe") # On Windows with mingw32-make - - -.. rubric:: Footnotes - -.. [1] In earlier versions, the hidden directory was named ``.cmdstanpy``; - use of this directory has been deprecated. 
- - diff --git a/docs/_sources/overview.rst.txt b/docs/_sources/overview.rst.txt deleted file mode 100644 index ce7de3f7..00000000 --- a/docs/_sources/overview.rst.txt +++ /dev/null @@ -1,24 +0,0 @@ -Overview -======== - -CmdStanPy is a lightweight interface to Stan for Python users which -provides the necessary objects and functions to do Bayesian inference -given a probability model and data. -It wraps the -`CmdStan `_ -command line interface in a small set of -Python classes which provide methods to do analysis and manage the resulting -set of model, data, and posterior estimates. - -CmdStanPy is a lightweight interface in that it is designed to use minimal -memory beyond what is used by CmdStan itself to do inference given -and model and data.It runs and records an analysis, but the user chooses -whether or not to instantiate the results in memory, -thus CmdStanPy has the potential to fit more complex models -to larger datasets than might be possible in PyStan or RStan. -It manages the set of CmdStan input and output files and provides -methods and options which allow the user to save these files -to a specific filepath. -By default, CmdStan output files are written to a temporary directory -in order to avoid filling up the user's filesystem. - diff --git a/docs/_modules/index.html b/docs/examples.html similarity index 53% rename from docs/_modules/index.html rename to docs/examples.html index e938afc3..1d4f8c27 100644 --- a/docs/_modules/index.html +++ b/docs/examples.html @@ -5,38 +5,40 @@ - Overview: module code — CmdStanPy 0.9.77 documentation + CmdStanPy Examples — CmdStanPy 0.9.77 documentation - - + + + href="_static/vendor/fontawesome/5.13.0/css/all.min.css"> + href="_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> + href="_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> - - - + + + - - - - - - - - + + + + + + + + - - + + + + @@ -70,37 +72,37 @@ - +