diff --git a/pymc3/backends/__init__.py b/pymc3/backends/__init__.py index d5cc8dba1d..2d91f228ea 100644 --- a/pymc3/backends/__init__.py +++ b/pymc3/backends/__init__.py @@ -7,8 +7,8 @@ 2. Text files (pymc3.backends.Text) 3. SQLite (pymc3.backends.SQLite) -The NumPy arrays and text files both hold the entire trace in memory, -whereas SQLite commits the trace to the database while sampling. +The NDArray backend holds the entire trace in memory, whereas the Text +and SQLite backends store the values while sampling. Selecting a backend ------------------- diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index c918e5df96..41b34d6d93 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -7,6 +7,10 @@ from ..model import modelcontext +class BackendError(Exception): + pass + + class BaseTrace(object): """Base trace object diff --git a/pymc3/backends/sqlite.py b/pymc3/backends/sqlite.py index bed3c602ce..348e7212a8 100644 --- a/pymc3/backends/sqlite.py +++ b/pymc3/backends/sqlite.py @@ -272,6 +272,8 @@ def close(self): self.connected = False +# TODO Consider merging `_create_colnames` and `_create_shape` with +# very similar functions in the csv backend. def _create_colnames(shape): """Return column names based on `shape`. diff --git a/pymc3/backends/text.py b/pymc3/backends/text.py index 1084b45dcf..8055abe152 100644 --- a/pymc3/backends/text.py +++ b/pymc3/backends/text.py @@ -1,42 +1,31 @@ """Text file trace backend -After sampling with NDArray backend, save results as text files. +Store sampling values as CSV files. -As this other backends, this can be used by passing the backend instance -to `sample`. +File format +----------- - >>> import pymc3 as pm - >>> db = pm.backends.Text('test') - >>> trace = pm.sample(..., trace=db) +Sampling values for each chain are saved in a separate file (under a +directory specified by the `name` argument). The rows correspond to +sampling iterations. The column names consist of variable names and +index labels. For example, the heading -Or sampling can be performed with the default NDArray backend and then -dumped to text files after. + x,y__0_0,y__0_1,y__1_0,y__1_1,y__2_0,y__2_1 - >>> from pymc3.backends import text - >>> trace = pm.sample(...) - >>> text.dump('test', trace) - -Database format ---------------- - -For each chain, a directory named `chain-N` is created. In this -directory, one file per variable is created containing the values of the -object. To deal with multidimensional variables, the array is reshaped -to one dimension before saving with `numpy.savetxt`. The shape and dtype -information is saved in a json file in the same directory and is used to -load the database back again using `numpy.loadtxt`. +represents two variables, x and y, where x is a scalar and y has a +shape of (3, 2). """ -import os -import glob -import json +from glob import glob import numpy as np +import os +import pandas as pd +import warnings from ..backends import base -from ..backends.ndarray import NDArray -class Text(NDArray): - """Text storage +class Text(base.BaseTrace): + """Text trace object Parameters ---------- @@ -53,102 +42,207 @@ def __init__(self, name, model=None, vars=None): os.mkdir(name) super(Text, self).__init__(name, model, vars) - def close(self): - super(Text, self).close() - _dump_trace(self.name, self) - - -def dump(name, trace, chains=None): - """Store NDArray trace as text database. - - Parameters - ---------- - name : str - Name of directory to store text files - trace : MultiTrace of NDArray traces - Result of MCMC run with default NDArray backend - chains : list - Chains to dump. If None, all chains are dumped. - """ - if not os.path.exists(name): - os.mkdir(name) - if chains is None: - chains = trace.chains - for chain in chains: - _dump_trace(name, trace._traces[chain]) - + self.flat_names = {v: _create_flat_names(v, shape) + for v, shape in self.var_shapes.items()} + + self.filename = None + self._fh = None + self.df = None + + ## Sampling methods + + def setup(self, draws, chain): + """Perform chain-specific setup. + + Parameters + ---------- + draws : int + Expected number of draws + chain : int + Chain number + """ + self.chain = chain + self.filename = os.path.join(self.name, 'chain-{}.csv'.format(chain)) + + cnames = [fv for v in self.varnames for fv in self.flat_names[v]] + + if os.path.exists(self.filename): + with open(self.filename) as fh: + prev_cnames = next(fh).strip().split(',') + if prev_cnames != cnames: + raise base.BackendError( + "Previous file '{}' has different variables names " + "than current model.".format(self.filename)) + self._fh = open(self.filename, 'a') + else: + self._fh = open(self.filename, 'w') + self._fh.write(','.join(cnames) + '\n') + + def record(self, point): + """Record results of a sampling iteration. + + Parameters + ---------- + point : dict + Values mapped to variable names + """ + vals = {} + for varname, value in zip(self.varnames, self.fn(point)): + vals[varname] = value.ravel() + columns = [str(val) for var in self.varnames for val in vals[var]] + self._fh.write(','.join(columns) + '\n') -def _dump_trace(name, trace): - """Dump a single-chain trace. + def close(self): + self._fh.close() + self._fh = None # Avoid serialization issue. + + ## Selection methods + + def _load_df(self): + if self.df is None: + self.df = pd.read_csv(self.filename) + + def __len__(self): + if self.filename is None: + return 0 + self._load_df() + return self.df.shape[0] + + def get_values(self, varname, burn=0, thin=1): + """Get values from trace. + + Parameters + ---------- + varname : str + burn : int + thin : int + + Returns + ------- + A NumPy array + """ + self._load_df() + var_df = self.df[self.flat_names[varname]] + shape = (self.df.shape[0],) + self.var_shapes[varname] + vals = var_df.values.ravel().reshape(shape) + return vals[burn::thin] + + def _slice(self, idx): + warnings.warn('Slice for Text backend has no effect.') + + def point(self, idx): + """Return dictionary of point values at `idx` for current chain + with variables names as keys. + """ + idx = int(idx) + self._load_df() + pt = {} + for varname in self.varnames: + vals = self.df[self.flat_names[varname]].iloc[idx] + pt[varname] = vals.reshape(self.var_shapes[varname]) + return pt + + +def _create_flat_names(varname, shape): + """Return flat variable names for `varname` of `shape`. + + Examples + -------- + >>> _create_flat_names('x', (5,)) + ['x__0', 'x__1', 'x__2', 'x__3', 'x__4'] + + >>> _create_flat_names('x', (2, 2)) + ['x__0_0', 'x__0_1', 'x__1_0', 'x__1_1'] """ - chain_name = 'chain-{}'.format(trace.chain) - chain_dir = os.path.join(name, chain_name) - os.mkdir(chain_dir) + if not shape: + return [varname] + labels = (np.ravel(xs).tolist() for xs in np.indices(shape)) + labels = (map(str, xs) for xs in labels) + return ['{}__{}'.format(varname, '_'.join(idxs)) for idxs in zip(*labels)] - info = {} - for varname in trace.varnames: - data = trace.get_values(varname) - if np.issubdtype(data.dtype, np.int): - fmt = '%i' - is_int = True - else: - fmt = '%g' - is_int = False - info[varname] = {'shape': data.shape, 'is_int': is_int} +def _create_shape(flat_names): + "Determine shape from `_create_flat_names` output." + try: + _, shape_str = flat_names[-1].rsplit('__', 1) + except ValueError: + return () + return tuple(int(i) + 1 for i in shape_str.split('_')) - var_file = os.path.join(chain_dir, varname + '.txt') - np.savetxt(var_file, data.reshape(-1, data.size), fmt=fmt) - ## Store shape and dtype information for reloading. - info_file = os.path.join(chain_dir, 'info.json') - with open(info_file, 'w') as sfh: - json.dump(info, sfh) - -def load(name, chains=None, model=None): - """Load text database. +def load(name, model=None): + """Load Text database. Parameters ---------- name : str - Path to root directory for text database - chains : list - Chains to load. If None, all chains are loaded. + Name of directory with files (one per chain) model : Model If None, the model is taken from the `with` context. Returns ------- - ndarray.Trace instance + A MultiTrace instance """ - chain_dirs = _get_chain_dirs(name) - if chains is None: - chains = list(chain_dirs.keys()) + files = glob(os.path.join(name, 'chain-*.csv')) traces = [] - for chain in chains: - chain_dir = chain_dirs[chain] - info_file = os.path.join(chain_dir, 'info.json') - with open(info_file, 'r') as sfh: - info = json.load(sfh) - samples = {} - for varname, info in info.items(): - var_file = os.path.join(chain_dir, varname + '.txt') - dtype = int if info['is_int'] else float - flat_data = np.loadtxt(var_file, dtype=dtype) - samples[varname] = flat_data.reshape(info['shape']) - trace = NDArray(model=model) - trace.samples = samples + for f in files: + chain = int(os.path.splitext(f)[0].rsplit('-', 1)[1]) + trace = Text(name, model=model) trace.chain = chain + trace.filename = f traces.append(trace) return base.MultiTrace(traces) -def _get_chain_dirs(name): - """Return mapping of chain number to directory.""" - return {_chain_dir_to_chain(chain_dir): chain_dir - for chain_dir in glob.glob(os.path.join(name, 'chain-*'))} +def dump(name, trace, chains=None): + """Store values from NDArray trace as CSV files. + + Parameters + ---------- + name : str + Name of directory to store CSV files in + trace : MultiTrace of NDArray traces + Result of MCMC run with default NDArray backend + chains : list + Chains to dump. If None, all chains are dumped. + """ + if not os.path.exists(name): + os.mkdir(name) + if chains is None: + chains = trace.chains + + var_shapes = trace._traces[chains[0]].var_shapes + flat_names = {v: _create_flat_names(v, shape) + for v, shape in var_shapes.items()} + + for chain in chains: + filename = os.path.join(name, 'chain-{}.csv'.format(chain)) + df = _trace_to_df(trace._traces[chain], flat_names) + df.to_csv(filename, index=False) + +def _trace_to_df(trace, flat_names=None): + """Convert single-chain trace to Pandas DataFrame. -def _chain_dir_to_chain(chain_dir): - return int(os.path.basename(chain_dir).split('-')[1]) + Parameters + ---------- + trace : NDarray trace + flat_names : dict or None + A dictionary that maps each variable name in `trace` to a list + of flat variable names (e.g., ['x__0', 'x__1', ...]) + """ + if flat_names is None: + flat_names = {v: _create_flat_names(v, shape) + for v, shape in trace.var_shapes.items()} + + var_dfs = [] + for varname, shape in trace.var_shapes.items(): + vals = trace[varname] + if len(shape) == 1: + flat_vals = vals + else: + flat_vals = vals.reshape(len(trace), np.prod(shape)) + var_dfs.append(pd.DataFrame(flat_vals, columns=flat_names[varname])) + return pd.concat(var_dfs, axis=1) diff --git a/pymc3/tests/test_text_backend.py b/pymc3/tests/test_text_backend.py index 263f9bec37..51181cafec 100644 --- a/pymc3/tests/test_text_backend.py +++ b/pymc3/tests/test_text_backend.py @@ -3,6 +3,42 @@ from pymc3.backends import ndarray, text +class TestText0dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = () + + +class TestText1dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = 2 + + +class TestText2dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = (2, 3) + + +class TestText0dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = () + + +class TestText1dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = 2 + + +class TestText2dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = (2, 3) + + class TestTextDumpLoad(bf.DumpLoadTestCase): backend = text.Text load_func = staticmethod(text.load) @@ -24,6 +60,62 @@ def setUpClass(cls): cls.mtrace1 = text.load(cls.name1) -def test__chain_dir_to_chain(): - assert text._chain_dir_to_chain('/path/to/chain-0') == 0 - assert text._chain_dir_to_chain('chain-0') == 0 +class TestTraceToDf(bf.ModelBackendSampledTestCase): + backend = ndarray.NDArray + name = 'text-db' + shape = (2, 3) + + def test_trace_to_df(self): + mtrace = self.mtrace + df = text._trace_to_df(mtrace._traces[0]) + self.assertEqual(len(mtrace), df.shape[0]) + + checked = False + for varname in self.test_point.keys(): + vararr = mtrace.get_values(varname, chains=0) + ## With `shape` above, only one variable has to have that + ## `shape`. + if vararr.shape[1:] != self.shape: + continue + npt.assert_equal(vararr[:, 0, 0], df[varname + '__0_0'].values) + npt.assert_equal(vararr[:, 1, 0], df[varname + '__1_0'].values) + npt.assert_equal(vararr[:, 1, 2], df[varname + '__1_2'].values) + checked = True + self.assertTrue(checked) + +class TestNDArrayTextEquality(bf.BackendEqualityTestCase): + backend0 = ndarray.NDArray + name0 = None + backend1 = text.Text + name1 = 'text-db' + shape = (2, 3) + + +def test_create_flat_names_0d(): + shape = () + result = text._create_flat_names('x', shape) + expected = ['x'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_1d(): + shape = 2, + result = text._create_flat_names('x', shape) + expected = ['x__0', 'x__1'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_2d(): + shape = 2, 3 + result = text._create_flat_names('x', shape) + expected = ['x__0_0', 'x__0_1', 'x__0_2', + 'x__1_0', 'x__1_1', 'x__1_2'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_3d(): + shape = 2, 3, 4 + assert text._create_shape(text._create_flat_names('x', shape)) == shape diff --git a/readme.md b/readme.md index 638bfa4dda..7885546975 100644 --- a/readme.md +++ b/readme.md @@ -46,11 +46,13 @@ Another option is to clone the repository and install PyMC using `python setup.p ## Dependencies -PyMC is tested on Python 2.7 and 3.3 and depends on Theano, NumPy, SciPy, and Matplotlib (see setup.py for version information). +PyMC is tested on Python 2.7 and 3.3 and depends on Theano, NumPy, +SciPy, Pandas, and Matplotlib (see setup.py for version information). ### Optional -The GLM submodule relies on Pandas and Patsy. +In addtion to the above dependencies, the GLM submodule relies on +Patsy. [`scikits.sparse`](https://github.com/njsmith/scikits-sparse) enables sparse scaling matrices which are useful for large problems. Installation on Ubuntu is easy: diff --git a/setup.py b/setup.py index bf20d02416..57eb7dc4b9 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'Operating System :: OS Independent'] install_reqs = ['numpy>=1.7.1', 'scipy>=0.12.0', 'matplotlib>=1.2.1', - 'Theano<=0.7.1dev'] + 'Theano<=0.7.1dev', 'pandas>=0.15.0'] test_reqs = ['nose'] if sys.version_info[0] == 2: # py3 has mock in stdlib