From 8da5beaaae9d741f29595bc49335f69018a26b87 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Sun, 31 May 2015 23:39:00 -0400 Subject: [PATCH 1/3] Add CSV backend This has a couple of advantages over the Text backend. 1. The values are stored as in a table per chain, not a file per variable. This is easier to inspect and work with directly if desired (e.g., with pd.read_csv). 2. Values are stored during sampling, not kept in memory. --- pymc3/backends/__init__.py | 11 +- pymc3/backends/base.py | 4 + pymc3/backends/csv.py | 248 ++++++++++++++++++++++++++++++++ pymc3/backends/sqlite.py | 2 + pymc3/tests/test_csv_backend.py | 121 ++++++++++++++++ 5 files changed, 382 insertions(+), 4 deletions(-) create mode 100644 pymc3/backends/csv.py create mode 100644 pymc3/tests/test_csv_backend.py diff --git a/pymc3/backends/__init__.py b/pymc3/backends/__init__.py index d5cc8dba1d..f0a6d1f2f0 100644 --- a/pymc3/backends/__init__.py +++ b/pymc3/backends/__init__.py @@ -4,11 +4,11 @@ ------------------ 1. NumPy array (pymc3.backends.NDArray) -2. Text files (pymc3.backends.Text) +2. Plain text formats (pymc3.backends.Text and pymc3.backends.CSV) 3. SQLite (pymc3.backends.SQLite) -The NumPy arrays and text files both hold the entire trace in memory, -whereas SQLite commits the trace to the database while sampling. +The NDArray and Text backend both hold the entire trace in memory, +whereas CSV and SQLite backends store the values while sampling. Selecting a backend ------------------- @@ -107,13 +107,16 @@ If the traces are stored on disk, then a `load` function should also be defined that returns a MultiTrace object. -For specific examples, see pymc3.backends.{ndarray,text,sqlite}.py. +For specific examples, see pymc3.backends.{ndarray,text,csv,sqlite}.py. """ from ..backends.ndarray import NDArray from ..backends.text import Text +from ..backends.csv import CSV from ..backends.sqlite import SQLite _shortcuts = {'text': {'backend': Text, 'name': 'mcmc'}, + 'csv': {'backend': CSV, + 'name': 'mcmc'}, 'sqlite': {'backend': SQLite, 'name': 'mcmc.sqlite'}} diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index c918e5df96..41b34d6d93 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -7,6 +7,10 @@ from ..model import modelcontext +class BackendError(Exception): + pass + + class BaseTrace(object): """Base trace object diff --git a/pymc3/backends/csv.py b/pymc3/backends/csv.py new file mode 100644 index 0000000000..95d9495a56 --- /dev/null +++ b/pymc3/backends/csv.py @@ -0,0 +1,248 @@ +"""CSV file trace backend + +Store and retrieve sampling values from CSV file. + +CSV file format +--------------- + +Sampling values for each chain are saved in a separate file (under a +directory specified by the `name` argument). The rows correspond to +sampling iterations. The column names consist of variable names and +index labels. For example, the heading + + x,y__0_0,y__0_1,y__1_0,y__1_1,y__2_0,y__2_1 + +represents two variables, x and y, where x is a scalar and y has a +shape of (3, 2). +""" +from glob import glob +import numpy as np +import os +import pandas as pd +import warnings + +from ..backends import base + + +class CSV(base.BaseTrace): + """CSV trace object + + Parameters + ---------- + name : str + Name of directory to store CSV files in + model : Model + If None, the model is taken from the `with` context. + vars : list of variables + Sampling values will be stored for these variables. If None, + `model.unobserved_RVs` is used. + """ + def __init__(self, name, model=None, vars=None): + if not os.path.exists(name): + os.mkdir(name) + super(CSV, self).__init__(name, model, vars) + + self.flat_names = {v: _create_flat_names(v, shape) + for v, shape in self.var_shapes.items()} + + self.filename = None + self._fh = None + self.df = None + + ## Sampling methods + + def setup(self, draws, chain): + """Perform chain-specific setup. + + Parameters + ---------- + draws : int + Expected number of draws + chain : int + Chain number + """ + self.chain = chain + self.filename = os.path.join(self.name, 'chain-{}.csv'.format(chain)) + + cnames = [fv for v in self.varnames for fv in self.flat_names[v]] + + if os.path.exists(self.filename): + with open(self.filename) as fh: + prev_cnames = next(fh).strip().split(',') + if prev_cnames != cnames: + raise base.BackendError( + "Previous file '{}' has different variables names " + "than current model.".format(self.filename)) + self._fh = open(self.filename, 'a') + else: + self._fh = open(self.filename, 'w') + self._fh.write(','.join(cnames) + '\n') + + def record(self, point): + """Record results of a sampling iteration. + + Parameters + ---------- + point : dict + Values mapped to variable names + """ + vals = {} + for varname, value in zip(self.varnames, self.fn(point)): + vals[varname] = value.ravel() + columns = [str(val) for var in self.varnames for val in vals[var]] + self._fh.write(','.join(columns) + '\n') + + def close(self): + self._fh.close() + self._fh = None # Avoid serialization issue. + + ## Selection methods + + def _load_df(self): + if self.df is None: + self.df = pd.read_csv(self.filename) + + def __len__(self): + if self.filename is None: + return 0 + self._load_df() + return self.df.shape[0] + + def get_values(self, varname, burn=0, thin=1): + """Get values from trace. + + Parameters + ---------- + varname : str + burn : int + thin : int + + Returns + ------- + A NumPy array + """ + self._load_df() + var_df = self.df[self.flat_names[varname]] + shape = (self.df.shape[0],) + self.var_shapes[varname] + vals = var_df.values.ravel().reshape(shape) + return vals[burn::thin] + + def _slice(self, idx): + warnings.warn('Slice for CSV backend has no effect.') + + def point(self, idx): + """Return dictionary of point values at `idx` for current chain + with variables names as keys. + """ + idx = int(idx) + self._load_df() + pt = {} + for varname in self.varnames: + vals = self.df[self.flat_names[varname]].iloc[idx] + pt[varname] = vals.reshape(self.var_shapes[varname]) + return pt + + +def _create_flat_names(varname, shape): + """Return flat variable names for `varname` of `shape`. + + Examples + -------- + >>> _create_flat_names('x', (5,)) + ['x__0', 'x__1', 'x__2', 'x__3', 'x__4'] + + >>> _create_flat_names('x', (2, 2)) + ['x__0_0', 'x__0_1', 'x__1_0', 'x__1_1'] + """ + if not shape: + return [varname] + labels = (np.ravel(xs).tolist() for xs in np.indices(shape)) + labels = (map(str, xs) for xs in labels) + return ['{}__{}'.format(varname, '_'.join(idxs)) for idxs in zip(*labels)] + + +def _create_shape(flat_names): + "Determine shape from `_create_flat_names` output." + try: + _, shape_str = flat_names[-1].rsplit('__', 1) + except ValueError: + return () + return tuple(int(i) + 1 for i in shape_str.split('_')) + + +def load(name, model=None): + """Load CSV database. + + Parameters + ---------- + name : str + Name of directory that stores CSV files. + model : Model + If None, the model is taken from the `with` context. + + Returns + ------- + A MultiTrace instance + """ + files = glob(os.path.join(name, 'chain-*.csv')) + + traces = [] + for f in files: + chain = int(os.path.splitext(f)[0].rsplit('-', 1)[1]) + trace = CSV(name, model=model) + trace.chain = chain + trace.filename = f + traces.append(trace) + return base.MultiTrace(traces) + + +def dump(name, trace, chains=None): + """Store values from NDArray trace as CSV files. + + Parameters + ---------- + name : str + Name of directory to store CSV files in + trace : MultiTrace of NDArray traces + Result of MCMC run with default NDArray backend + chains : list + Chains to dump. If None, all chains are dumped. + """ + if not os.path.exists(name): + os.mkdir(name) + if chains is None: + chains = trace.chains + + var_shapes = trace._traces[chains[0]].var_shapes + flat_names = {v: _create_flat_names(v, shape) + for v, shape in var_shapes.items()} + + for chain in chains: + filename = os.path.join(name, 'chain-{}.csv'.format(chain)) + df = _trace_to_df(trace._traces[chain], flat_names) + df.to_csv(filename, index=False) + + +def _trace_to_df(trace, flat_names=None): + """Convert single-chain trace to Pandas DataFrame. + + Parameters + ---------- + trace : NDarray trace + flat_names : dict or None + A dictionary that maps each variable name in `trace` to a list + of flat variable names (e.g., ['x__0', 'x__1', ...]) + """ + if flat_names is None: + flat_names = {v: _create_flat_names(v, shape) + for v, shape in trace.var_shapes.items()} + + var_dfs = [] + for varname, shape in trace.var_shapes.items(): + vals = trace[varname] + if len(shape) == 1: + flat_vals = vals + else: + flat_vals = vals.reshape(len(trace), np.prod(shape)) + var_dfs.append(pd.DataFrame(flat_vals, columns=flat_names[varname])) + return pd.concat(var_dfs, axis=1) diff --git a/pymc3/backends/sqlite.py b/pymc3/backends/sqlite.py index bed3c602ce..348e7212a8 100644 --- a/pymc3/backends/sqlite.py +++ b/pymc3/backends/sqlite.py @@ -272,6 +272,8 @@ def close(self): self.connected = False +# TODO Consider merging `_create_colnames` and `_create_shape` with +# very similar functions in the csv backend. def _create_colnames(shape): """Return column names based on `shape`. diff --git a/pymc3/tests/test_csv_backend.py b/pymc3/tests/test_csv_backend.py new file mode 100644 index 0000000000..4cbce6a6a7 --- /dev/null +++ b/pymc3/tests/test_csv_backend.py @@ -0,0 +1,121 @@ +import numpy.testing as npt +from pymc3.tests import backend_fixtures as bf +from pymc3.backends import ndarray, csv + + +class TestCSV0dSampling(bf.SamplingTestCase): + backend = csv.CSV + name = 'csv-db' + shape = () + + +class TestCSV1dSampling(bf.SamplingTestCase): + backend = csv.CSV + name = 'csv-db' + shape = 2 + + +class TestCSV2dSampling(bf.SamplingTestCase): + backend = csv.CSV + name = 'csv-db' + shape = (2, 3) + + +class TestCSV0dSelection(bf.SelectionNoSliceTestCase): + backend = csv.CSV + name = 'csv-db' + shape = () + + +class TestCSV1dSelection(bf.SelectionNoSliceTestCase): + backend = csv.CSV + name = 'csv-db' + shape = 2 + + +class TestCSV2dSelection(bf.SelectionNoSliceTestCase): + backend = csv.CSV + name = 'csv-db' + shape = (2, 3) + + +class TestCSVDumpLoad(bf.DumpLoadTestCase): + backend = csv.CSV + load_func = staticmethod(csv.load) + name = 'csv-db' + shape = (2, 3) + + +class TestCSVDumpFunction(bf.BackendEqualityTestCase): + backend0 = backend1 = ndarray.NDArray + name0 = None + name1 = 'csv-db' + shape = (2, 3) + + @classmethod + def setUpClass(cls): + super(TestCSVDumpFunction, cls).setUpClass() + csv.dump(cls.name1, cls.mtrace1) + with cls.model: + cls.mtrace1 = csv.load(cls.name1) + + +class TestTraceToDf(bf.ModelBackendSampledTestCase): + backend = ndarray.NDArray + name = 'csv-db' + shape = (2, 3) + + def test_trace_to_df(self): + mtrace = self.mtrace + df = csv._trace_to_df(mtrace._traces[0]) + self.assertEqual(len(mtrace), df.shape[0]) + + checked = False + for varname in self.test_point.keys(): + vararr = mtrace.get_values(varname, chains=0) + ## With `shape` above, only one variable has to have that + ## `shape`. + if vararr.shape[1:] != self.shape: + continue + npt.assert_equal(vararr[:, 0, 0], df[varname + '__0_0'].values) + npt.assert_equal(vararr[:, 1, 0], df[varname + '__1_0'].values) + npt.assert_equal(vararr[:, 1, 2], df[varname + '__1_2'].values) + checked = True + self.assertTrue(checked) + +class TestNDArrayCSVEquality(bf.BackendEqualityTestCase): + backend0 = ndarray.NDArray + name0 = None + backend1 = csv.CSV + name1 = 'csv-db' + shape = (2, 3) + + +def test_create_flat_names_0d(): + shape = () + result = csv._create_flat_names('x', shape) + expected = ['x'] + assert result == expected + assert csv._create_shape(result) == shape + + +def test_create_flat_names_1d(): + shape = 2, + result = csv._create_flat_names('x', shape) + expected = ['x__0', 'x__1'] + assert result == expected + assert csv._create_shape(result) == shape + + +def test_create_flat_names_2d(): + shape = 2, 3 + result = csv._create_flat_names('x', shape) + expected = ['x__0_0', 'x__0_1', 'x__0_2', + 'x__1_0', 'x__1_1', 'x__1_2'] + assert result == expected + assert csv._create_shape(result) == shape + + +def test_create_flat_names_3d(): + shape = 2, 3, 4 + assert csv._create_shape(csv._create_flat_names('x', shape)) == shape From 5b9093f78fa79a06b0db1fbdbf34c276da000af2 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Fri, 5 Jun 2015 21:00:03 -0400 Subject: [PATCH 2/3] Add pandas as hard dependency --- readme.md | 6 ++++-- setup.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/readme.md b/readme.md index 638bfa4dda..7885546975 100644 --- a/readme.md +++ b/readme.md @@ -46,11 +46,13 @@ Another option is to clone the repository and install PyMC using `python setup.p ## Dependencies -PyMC is tested on Python 2.7 and 3.3 and depends on Theano, NumPy, SciPy, and Matplotlib (see setup.py for version information). +PyMC is tested on Python 2.7 and 3.3 and depends on Theano, NumPy, +SciPy, Pandas, and Matplotlib (see setup.py for version information). ### Optional -The GLM submodule relies on Pandas and Patsy. +In addtion to the above dependencies, the GLM submodule relies on +Patsy. [`scikits.sparse`](https://github.com/njsmith/scikits-sparse) enables sparse scaling matrices which are useful for large problems. Installation on Ubuntu is easy: diff --git a/setup.py b/setup.py index bf20d02416..57eb7dc4b9 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'Operating System :: OS Independent'] install_reqs = ['numpy>=1.7.1', 'scipy>=0.12.0', 'matplotlib>=1.2.1', - 'Theano<=0.7.1dev'] + 'Theano<=0.7.1dev', 'pandas>=0.15.0'] test_reqs = ['nose'] if sys.version_info[0] == 2: # py3 has mock in stdlib From 9c084dd15253859c7886929a5d0bfc3f53f893b5 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Fri, 5 Jun 2015 20:53:04 -0400 Subject: [PATCH 3/3] Replace Text backend with CSV The main reason to keep the Text backend around was that pandas was an optional dependency. Now that this is no longer the case, the original Text backend doesn't offer any advantages over the CSV backend. (Even if someone prefers not to write to files while sampling, they can sample with the NDArray backend and then use the CSV dump function.) Rename CSV to Text. This name is more appropriate because the values being stored as plain text is the important feature, not which delimiter is used. --- pymc3/backends/__init__.py | 11 +- pymc3/backends/csv.py | 248 -------------------------- pymc3/backends/text.py | 296 ++++++++++++++++++++----------- pymc3/tests/test_csv_backend.py | 121 ------------- pymc3/tests/test_text_backend.py | 98 +++++++++- 5 files changed, 294 insertions(+), 480 deletions(-) delete mode 100644 pymc3/backends/csv.py delete mode 100644 pymc3/tests/test_csv_backend.py diff --git a/pymc3/backends/__init__.py b/pymc3/backends/__init__.py index f0a6d1f2f0..2d91f228ea 100644 --- a/pymc3/backends/__init__.py +++ b/pymc3/backends/__init__.py @@ -4,11 +4,11 @@ ------------------ 1. NumPy array (pymc3.backends.NDArray) -2. Plain text formats (pymc3.backends.Text and pymc3.backends.CSV) +2. Text files (pymc3.backends.Text) 3. SQLite (pymc3.backends.SQLite) -The NDArray and Text backend both hold the entire trace in memory, -whereas CSV and SQLite backends store the values while sampling. +The NDArray backend holds the entire trace in memory, whereas the Text +and SQLite backends store the values while sampling. Selecting a backend ------------------- @@ -107,16 +107,13 @@ If the traces are stored on disk, then a `load` function should also be defined that returns a MultiTrace object. -For specific examples, see pymc3.backends.{ndarray,text,csv,sqlite}.py. +For specific examples, see pymc3.backends.{ndarray,text,sqlite}.py. """ from ..backends.ndarray import NDArray from ..backends.text import Text -from ..backends.csv import CSV from ..backends.sqlite import SQLite _shortcuts = {'text': {'backend': Text, 'name': 'mcmc'}, - 'csv': {'backend': CSV, - 'name': 'mcmc'}, 'sqlite': {'backend': SQLite, 'name': 'mcmc.sqlite'}} diff --git a/pymc3/backends/csv.py b/pymc3/backends/csv.py deleted file mode 100644 index 95d9495a56..0000000000 --- a/pymc3/backends/csv.py +++ /dev/null @@ -1,248 +0,0 @@ -"""CSV file trace backend - -Store and retrieve sampling values from CSV file. - -CSV file format ---------------- - -Sampling values for each chain are saved in a separate file (under a -directory specified by the `name` argument). The rows correspond to -sampling iterations. The column names consist of variable names and -index labels. For example, the heading - - x,y__0_0,y__0_1,y__1_0,y__1_1,y__2_0,y__2_1 - -represents two variables, x and y, where x is a scalar and y has a -shape of (3, 2). -""" -from glob import glob -import numpy as np -import os -import pandas as pd -import warnings - -from ..backends import base - - -class CSV(base.BaseTrace): - """CSV trace object - - Parameters - ---------- - name : str - Name of directory to store CSV files in - model : Model - If None, the model is taken from the `with` context. - vars : list of variables - Sampling values will be stored for these variables. If None, - `model.unobserved_RVs` is used. - """ - def __init__(self, name, model=None, vars=None): - if not os.path.exists(name): - os.mkdir(name) - super(CSV, self).__init__(name, model, vars) - - self.flat_names = {v: _create_flat_names(v, shape) - for v, shape in self.var_shapes.items()} - - self.filename = None - self._fh = None - self.df = None - - ## Sampling methods - - def setup(self, draws, chain): - """Perform chain-specific setup. - - Parameters - ---------- - draws : int - Expected number of draws - chain : int - Chain number - """ - self.chain = chain - self.filename = os.path.join(self.name, 'chain-{}.csv'.format(chain)) - - cnames = [fv for v in self.varnames for fv in self.flat_names[v]] - - if os.path.exists(self.filename): - with open(self.filename) as fh: - prev_cnames = next(fh).strip().split(',') - if prev_cnames != cnames: - raise base.BackendError( - "Previous file '{}' has different variables names " - "than current model.".format(self.filename)) - self._fh = open(self.filename, 'a') - else: - self._fh = open(self.filename, 'w') - self._fh.write(','.join(cnames) + '\n') - - def record(self, point): - """Record results of a sampling iteration. - - Parameters - ---------- - point : dict - Values mapped to variable names - """ - vals = {} - for varname, value in zip(self.varnames, self.fn(point)): - vals[varname] = value.ravel() - columns = [str(val) for var in self.varnames for val in vals[var]] - self._fh.write(','.join(columns) + '\n') - - def close(self): - self._fh.close() - self._fh = None # Avoid serialization issue. - - ## Selection methods - - def _load_df(self): - if self.df is None: - self.df = pd.read_csv(self.filename) - - def __len__(self): - if self.filename is None: - return 0 - self._load_df() - return self.df.shape[0] - - def get_values(self, varname, burn=0, thin=1): - """Get values from trace. - - Parameters - ---------- - varname : str - burn : int - thin : int - - Returns - ------- - A NumPy array - """ - self._load_df() - var_df = self.df[self.flat_names[varname]] - shape = (self.df.shape[0],) + self.var_shapes[varname] - vals = var_df.values.ravel().reshape(shape) - return vals[burn::thin] - - def _slice(self, idx): - warnings.warn('Slice for CSV backend has no effect.') - - def point(self, idx): - """Return dictionary of point values at `idx` for current chain - with variables names as keys. - """ - idx = int(idx) - self._load_df() - pt = {} - for varname in self.varnames: - vals = self.df[self.flat_names[varname]].iloc[idx] - pt[varname] = vals.reshape(self.var_shapes[varname]) - return pt - - -def _create_flat_names(varname, shape): - """Return flat variable names for `varname` of `shape`. - - Examples - -------- - >>> _create_flat_names('x', (5,)) - ['x__0', 'x__1', 'x__2', 'x__3', 'x__4'] - - >>> _create_flat_names('x', (2, 2)) - ['x__0_0', 'x__0_1', 'x__1_0', 'x__1_1'] - """ - if not shape: - return [varname] - labels = (np.ravel(xs).tolist() for xs in np.indices(shape)) - labels = (map(str, xs) for xs in labels) - return ['{}__{}'.format(varname, '_'.join(idxs)) for idxs in zip(*labels)] - - -def _create_shape(flat_names): - "Determine shape from `_create_flat_names` output." - try: - _, shape_str = flat_names[-1].rsplit('__', 1) - except ValueError: - return () - return tuple(int(i) + 1 for i in shape_str.split('_')) - - -def load(name, model=None): - """Load CSV database. - - Parameters - ---------- - name : str - Name of directory that stores CSV files. - model : Model - If None, the model is taken from the `with` context. - - Returns - ------- - A MultiTrace instance - """ - files = glob(os.path.join(name, 'chain-*.csv')) - - traces = [] - for f in files: - chain = int(os.path.splitext(f)[0].rsplit('-', 1)[1]) - trace = CSV(name, model=model) - trace.chain = chain - trace.filename = f - traces.append(trace) - return base.MultiTrace(traces) - - -def dump(name, trace, chains=None): - """Store values from NDArray trace as CSV files. - - Parameters - ---------- - name : str - Name of directory to store CSV files in - trace : MultiTrace of NDArray traces - Result of MCMC run with default NDArray backend - chains : list - Chains to dump. If None, all chains are dumped. - """ - if not os.path.exists(name): - os.mkdir(name) - if chains is None: - chains = trace.chains - - var_shapes = trace._traces[chains[0]].var_shapes - flat_names = {v: _create_flat_names(v, shape) - for v, shape in var_shapes.items()} - - for chain in chains: - filename = os.path.join(name, 'chain-{}.csv'.format(chain)) - df = _trace_to_df(trace._traces[chain], flat_names) - df.to_csv(filename, index=False) - - -def _trace_to_df(trace, flat_names=None): - """Convert single-chain trace to Pandas DataFrame. - - Parameters - ---------- - trace : NDarray trace - flat_names : dict or None - A dictionary that maps each variable name in `trace` to a list - of flat variable names (e.g., ['x__0', 'x__1', ...]) - """ - if flat_names is None: - flat_names = {v: _create_flat_names(v, shape) - for v, shape in trace.var_shapes.items()} - - var_dfs = [] - for varname, shape in trace.var_shapes.items(): - vals = trace[varname] - if len(shape) == 1: - flat_vals = vals - else: - flat_vals = vals.reshape(len(trace), np.prod(shape)) - var_dfs.append(pd.DataFrame(flat_vals, columns=flat_names[varname])) - return pd.concat(var_dfs, axis=1) diff --git a/pymc3/backends/text.py b/pymc3/backends/text.py index 1084b45dcf..8055abe152 100644 --- a/pymc3/backends/text.py +++ b/pymc3/backends/text.py @@ -1,42 +1,31 @@ """Text file trace backend -After sampling with NDArray backend, save results as text files. +Store sampling values as CSV files. -As this other backends, this can be used by passing the backend instance -to `sample`. +File format +----------- - >>> import pymc3 as pm - >>> db = pm.backends.Text('test') - >>> trace = pm.sample(..., trace=db) +Sampling values for each chain are saved in a separate file (under a +directory specified by the `name` argument). The rows correspond to +sampling iterations. The column names consist of variable names and +index labels. For example, the heading -Or sampling can be performed with the default NDArray backend and then -dumped to text files after. + x,y__0_0,y__0_1,y__1_0,y__1_1,y__2_0,y__2_1 - >>> from pymc3.backends import text - >>> trace = pm.sample(...) - >>> text.dump('test', trace) - -Database format ---------------- - -For each chain, a directory named `chain-N` is created. In this -directory, one file per variable is created containing the values of the -object. To deal with multidimensional variables, the array is reshaped -to one dimension before saving with `numpy.savetxt`. The shape and dtype -information is saved in a json file in the same directory and is used to -load the database back again using `numpy.loadtxt`. +represents two variables, x and y, where x is a scalar and y has a +shape of (3, 2). """ -import os -import glob -import json +from glob import glob import numpy as np +import os +import pandas as pd +import warnings from ..backends import base -from ..backends.ndarray import NDArray -class Text(NDArray): - """Text storage +class Text(base.BaseTrace): + """Text trace object Parameters ---------- @@ -53,102 +42,207 @@ def __init__(self, name, model=None, vars=None): os.mkdir(name) super(Text, self).__init__(name, model, vars) - def close(self): - super(Text, self).close() - _dump_trace(self.name, self) - - -def dump(name, trace, chains=None): - """Store NDArray trace as text database. - - Parameters - ---------- - name : str - Name of directory to store text files - trace : MultiTrace of NDArray traces - Result of MCMC run with default NDArray backend - chains : list - Chains to dump. If None, all chains are dumped. - """ - if not os.path.exists(name): - os.mkdir(name) - if chains is None: - chains = trace.chains - for chain in chains: - _dump_trace(name, trace._traces[chain]) - + self.flat_names = {v: _create_flat_names(v, shape) + for v, shape in self.var_shapes.items()} + + self.filename = None + self._fh = None + self.df = None + + ## Sampling methods + + def setup(self, draws, chain): + """Perform chain-specific setup. + + Parameters + ---------- + draws : int + Expected number of draws + chain : int + Chain number + """ + self.chain = chain + self.filename = os.path.join(self.name, 'chain-{}.csv'.format(chain)) + + cnames = [fv for v in self.varnames for fv in self.flat_names[v]] + + if os.path.exists(self.filename): + with open(self.filename) as fh: + prev_cnames = next(fh).strip().split(',') + if prev_cnames != cnames: + raise base.BackendError( + "Previous file '{}' has different variables names " + "than current model.".format(self.filename)) + self._fh = open(self.filename, 'a') + else: + self._fh = open(self.filename, 'w') + self._fh.write(','.join(cnames) + '\n') + + def record(self, point): + """Record results of a sampling iteration. + + Parameters + ---------- + point : dict + Values mapped to variable names + """ + vals = {} + for varname, value in zip(self.varnames, self.fn(point)): + vals[varname] = value.ravel() + columns = [str(val) for var in self.varnames for val in vals[var]] + self._fh.write(','.join(columns) + '\n') -def _dump_trace(name, trace): - """Dump a single-chain trace. + def close(self): + self._fh.close() + self._fh = None # Avoid serialization issue. + + ## Selection methods + + def _load_df(self): + if self.df is None: + self.df = pd.read_csv(self.filename) + + def __len__(self): + if self.filename is None: + return 0 + self._load_df() + return self.df.shape[0] + + def get_values(self, varname, burn=0, thin=1): + """Get values from trace. + + Parameters + ---------- + varname : str + burn : int + thin : int + + Returns + ------- + A NumPy array + """ + self._load_df() + var_df = self.df[self.flat_names[varname]] + shape = (self.df.shape[0],) + self.var_shapes[varname] + vals = var_df.values.ravel().reshape(shape) + return vals[burn::thin] + + def _slice(self, idx): + warnings.warn('Slice for Text backend has no effect.') + + def point(self, idx): + """Return dictionary of point values at `idx` for current chain + with variables names as keys. + """ + idx = int(idx) + self._load_df() + pt = {} + for varname in self.varnames: + vals = self.df[self.flat_names[varname]].iloc[idx] + pt[varname] = vals.reshape(self.var_shapes[varname]) + return pt + + +def _create_flat_names(varname, shape): + """Return flat variable names for `varname` of `shape`. + + Examples + -------- + >>> _create_flat_names('x', (5,)) + ['x__0', 'x__1', 'x__2', 'x__3', 'x__4'] + + >>> _create_flat_names('x', (2, 2)) + ['x__0_0', 'x__0_1', 'x__1_0', 'x__1_1'] """ - chain_name = 'chain-{}'.format(trace.chain) - chain_dir = os.path.join(name, chain_name) - os.mkdir(chain_dir) + if not shape: + return [varname] + labels = (np.ravel(xs).tolist() for xs in np.indices(shape)) + labels = (map(str, xs) for xs in labels) + return ['{}__{}'.format(varname, '_'.join(idxs)) for idxs in zip(*labels)] - info = {} - for varname in trace.varnames: - data = trace.get_values(varname) - if np.issubdtype(data.dtype, np.int): - fmt = '%i' - is_int = True - else: - fmt = '%g' - is_int = False - info[varname] = {'shape': data.shape, 'is_int': is_int} +def _create_shape(flat_names): + "Determine shape from `_create_flat_names` output." + try: + _, shape_str = flat_names[-1].rsplit('__', 1) + except ValueError: + return () + return tuple(int(i) + 1 for i in shape_str.split('_')) - var_file = os.path.join(chain_dir, varname + '.txt') - np.savetxt(var_file, data.reshape(-1, data.size), fmt=fmt) - ## Store shape and dtype information for reloading. - info_file = os.path.join(chain_dir, 'info.json') - with open(info_file, 'w') as sfh: - json.dump(info, sfh) - -def load(name, chains=None, model=None): - """Load text database. +def load(name, model=None): + """Load Text database. Parameters ---------- name : str - Path to root directory for text database - chains : list - Chains to load. If None, all chains are loaded. + Name of directory with files (one per chain) model : Model If None, the model is taken from the `with` context. Returns ------- - ndarray.Trace instance + A MultiTrace instance """ - chain_dirs = _get_chain_dirs(name) - if chains is None: - chains = list(chain_dirs.keys()) + files = glob(os.path.join(name, 'chain-*.csv')) traces = [] - for chain in chains: - chain_dir = chain_dirs[chain] - info_file = os.path.join(chain_dir, 'info.json') - with open(info_file, 'r') as sfh: - info = json.load(sfh) - samples = {} - for varname, info in info.items(): - var_file = os.path.join(chain_dir, varname + '.txt') - dtype = int if info['is_int'] else float - flat_data = np.loadtxt(var_file, dtype=dtype) - samples[varname] = flat_data.reshape(info['shape']) - trace = NDArray(model=model) - trace.samples = samples + for f in files: + chain = int(os.path.splitext(f)[0].rsplit('-', 1)[1]) + trace = Text(name, model=model) trace.chain = chain + trace.filename = f traces.append(trace) return base.MultiTrace(traces) -def _get_chain_dirs(name): - """Return mapping of chain number to directory.""" - return {_chain_dir_to_chain(chain_dir): chain_dir - for chain_dir in glob.glob(os.path.join(name, 'chain-*'))} +def dump(name, trace, chains=None): + """Store values from NDArray trace as CSV files. + + Parameters + ---------- + name : str + Name of directory to store CSV files in + trace : MultiTrace of NDArray traces + Result of MCMC run with default NDArray backend + chains : list + Chains to dump. If None, all chains are dumped. + """ + if not os.path.exists(name): + os.mkdir(name) + if chains is None: + chains = trace.chains + + var_shapes = trace._traces[chains[0]].var_shapes + flat_names = {v: _create_flat_names(v, shape) + for v, shape in var_shapes.items()} + + for chain in chains: + filename = os.path.join(name, 'chain-{}.csv'.format(chain)) + df = _trace_to_df(trace._traces[chain], flat_names) + df.to_csv(filename, index=False) + +def _trace_to_df(trace, flat_names=None): + """Convert single-chain trace to Pandas DataFrame. -def _chain_dir_to_chain(chain_dir): - return int(os.path.basename(chain_dir).split('-')[1]) + Parameters + ---------- + trace : NDarray trace + flat_names : dict or None + A dictionary that maps each variable name in `trace` to a list + of flat variable names (e.g., ['x__0', 'x__1', ...]) + """ + if flat_names is None: + flat_names = {v: _create_flat_names(v, shape) + for v, shape in trace.var_shapes.items()} + + var_dfs = [] + for varname, shape in trace.var_shapes.items(): + vals = trace[varname] + if len(shape) == 1: + flat_vals = vals + else: + flat_vals = vals.reshape(len(trace), np.prod(shape)) + var_dfs.append(pd.DataFrame(flat_vals, columns=flat_names[varname])) + return pd.concat(var_dfs, axis=1) diff --git a/pymc3/tests/test_csv_backend.py b/pymc3/tests/test_csv_backend.py deleted file mode 100644 index 4cbce6a6a7..0000000000 --- a/pymc3/tests/test_csv_backend.py +++ /dev/null @@ -1,121 +0,0 @@ -import numpy.testing as npt -from pymc3.tests import backend_fixtures as bf -from pymc3.backends import ndarray, csv - - -class TestCSV0dSampling(bf.SamplingTestCase): - backend = csv.CSV - name = 'csv-db' - shape = () - - -class TestCSV1dSampling(bf.SamplingTestCase): - backend = csv.CSV - name = 'csv-db' - shape = 2 - - -class TestCSV2dSampling(bf.SamplingTestCase): - backend = csv.CSV - name = 'csv-db' - shape = (2, 3) - - -class TestCSV0dSelection(bf.SelectionNoSliceTestCase): - backend = csv.CSV - name = 'csv-db' - shape = () - - -class TestCSV1dSelection(bf.SelectionNoSliceTestCase): - backend = csv.CSV - name = 'csv-db' - shape = 2 - - -class TestCSV2dSelection(bf.SelectionNoSliceTestCase): - backend = csv.CSV - name = 'csv-db' - shape = (2, 3) - - -class TestCSVDumpLoad(bf.DumpLoadTestCase): - backend = csv.CSV - load_func = staticmethod(csv.load) - name = 'csv-db' - shape = (2, 3) - - -class TestCSVDumpFunction(bf.BackendEqualityTestCase): - backend0 = backend1 = ndarray.NDArray - name0 = None - name1 = 'csv-db' - shape = (2, 3) - - @classmethod - def setUpClass(cls): - super(TestCSVDumpFunction, cls).setUpClass() - csv.dump(cls.name1, cls.mtrace1) - with cls.model: - cls.mtrace1 = csv.load(cls.name1) - - -class TestTraceToDf(bf.ModelBackendSampledTestCase): - backend = ndarray.NDArray - name = 'csv-db' - shape = (2, 3) - - def test_trace_to_df(self): - mtrace = self.mtrace - df = csv._trace_to_df(mtrace._traces[0]) - self.assertEqual(len(mtrace), df.shape[0]) - - checked = False - for varname in self.test_point.keys(): - vararr = mtrace.get_values(varname, chains=0) - ## With `shape` above, only one variable has to have that - ## `shape`. - if vararr.shape[1:] != self.shape: - continue - npt.assert_equal(vararr[:, 0, 0], df[varname + '__0_0'].values) - npt.assert_equal(vararr[:, 1, 0], df[varname + '__1_0'].values) - npt.assert_equal(vararr[:, 1, 2], df[varname + '__1_2'].values) - checked = True - self.assertTrue(checked) - -class TestNDArrayCSVEquality(bf.BackendEqualityTestCase): - backend0 = ndarray.NDArray - name0 = None - backend1 = csv.CSV - name1 = 'csv-db' - shape = (2, 3) - - -def test_create_flat_names_0d(): - shape = () - result = csv._create_flat_names('x', shape) - expected = ['x'] - assert result == expected - assert csv._create_shape(result) == shape - - -def test_create_flat_names_1d(): - shape = 2, - result = csv._create_flat_names('x', shape) - expected = ['x__0', 'x__1'] - assert result == expected - assert csv._create_shape(result) == shape - - -def test_create_flat_names_2d(): - shape = 2, 3 - result = csv._create_flat_names('x', shape) - expected = ['x__0_0', 'x__0_1', 'x__0_2', - 'x__1_0', 'x__1_1', 'x__1_2'] - assert result == expected - assert csv._create_shape(result) == shape - - -def test_create_flat_names_3d(): - shape = 2, 3, 4 - assert csv._create_shape(csv._create_flat_names('x', shape)) == shape diff --git a/pymc3/tests/test_text_backend.py b/pymc3/tests/test_text_backend.py index 263f9bec37..51181cafec 100644 --- a/pymc3/tests/test_text_backend.py +++ b/pymc3/tests/test_text_backend.py @@ -3,6 +3,42 @@ from pymc3.backends import ndarray, text +class TestText0dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = () + + +class TestText1dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = 2 + + +class TestText2dSampling(bf.SamplingTestCase): + backend = text.Text + name = 'text-db' + shape = (2, 3) + + +class TestText0dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = () + + +class TestText1dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = 2 + + +class TestText2dSelection(bf.SelectionNoSliceTestCase): + backend = text.Text + name = 'text-db' + shape = (2, 3) + + class TestTextDumpLoad(bf.DumpLoadTestCase): backend = text.Text load_func = staticmethod(text.load) @@ -24,6 +60,62 @@ def setUpClass(cls): cls.mtrace1 = text.load(cls.name1) -def test__chain_dir_to_chain(): - assert text._chain_dir_to_chain('/path/to/chain-0') == 0 - assert text._chain_dir_to_chain('chain-0') == 0 +class TestTraceToDf(bf.ModelBackendSampledTestCase): + backend = ndarray.NDArray + name = 'text-db' + shape = (2, 3) + + def test_trace_to_df(self): + mtrace = self.mtrace + df = text._trace_to_df(mtrace._traces[0]) + self.assertEqual(len(mtrace), df.shape[0]) + + checked = False + for varname in self.test_point.keys(): + vararr = mtrace.get_values(varname, chains=0) + ## With `shape` above, only one variable has to have that + ## `shape`. + if vararr.shape[1:] != self.shape: + continue + npt.assert_equal(vararr[:, 0, 0], df[varname + '__0_0'].values) + npt.assert_equal(vararr[:, 1, 0], df[varname + '__1_0'].values) + npt.assert_equal(vararr[:, 1, 2], df[varname + '__1_2'].values) + checked = True + self.assertTrue(checked) + +class TestNDArrayTextEquality(bf.BackendEqualityTestCase): + backend0 = ndarray.NDArray + name0 = None + backend1 = text.Text + name1 = 'text-db' + shape = (2, 3) + + +def test_create_flat_names_0d(): + shape = () + result = text._create_flat_names('x', shape) + expected = ['x'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_1d(): + shape = 2, + result = text._create_flat_names('x', shape) + expected = ['x__0', 'x__1'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_2d(): + shape = 2, 3 + result = text._create_flat_names('x', shape) + expected = ['x__0_0', 'x__0_1', 'x__0_2', + 'x__1_0', 'x__1_1', 'x__1_2'] + assert result == expected + assert text._create_shape(result) == shape + + +def test_create_flat_names_3d(): + shape = 2, 3, 4 + assert text._create_shape(text._create_flat_names('x', shape)) == shape