Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pymc3/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
2. Text files (pymc3.backends.Text)
3. SQLite (pymc3.backends.SQLite)

The NumPy arrays and text files both hold the entire trace in memory,
whereas SQLite commits the trace to the database while sampling.
The NDArray backend holds the entire trace in memory, whereas the Text
and SQLite backends store the values while sampling.

Selecting a backend
-------------------
Expand Down
4 changes: 4 additions & 0 deletions pymc3/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from ..model import modelcontext


class BackendError(Exception):
pass


class BaseTrace(object):
"""Base trace object

Expand Down
2 changes: 2 additions & 0 deletions pymc3/backends/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ def close(self):
self.connected = False


# TODO Consider merging `_create_colnames` and `_create_shape` with
# very similar functions in the csv backend.
def _create_colnames(shape):
"""Return column names based on `shape`.

Expand Down
296 changes: 195 additions & 101 deletions pymc3/backends/text.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,31 @@
"""Text file trace backend

After sampling with NDArray backend, save results as text files.
Store sampling values as CSV files.

As this other backends, this can be used by passing the backend instance
to `sample`.
File format
-----------

>>> import pymc3 as pm
>>> db = pm.backends.Text('test')
>>> trace = pm.sample(..., trace=db)
Sampling values for each chain are saved in a separate file (under a
directory specified by the `name` argument). The rows correspond to
sampling iterations. The column names consist of variable names and
index labels. For example, the heading

Or sampling can be performed with the default NDArray backend and then
dumped to text files after.
x,y__0_0,y__0_1,y__1_0,y__1_1,y__2_0,y__2_1

>>> from pymc3.backends import text
>>> trace = pm.sample(...)
>>> text.dump('test', trace)

Database format
---------------

For each chain, a directory named `chain-N` is created. In this
directory, one file per variable is created containing the values of the
object. To deal with multidimensional variables, the array is reshaped
to one dimension before saving with `numpy.savetxt`. The shape and dtype
information is saved in a json file in the same directory and is used to
load the database back again using `numpy.loadtxt`.
represents two variables, x and y, where x is a scalar and y has a
shape of (3, 2).
"""
import os
import glob
import json
from glob import glob
import numpy as np
import os
import pandas as pd
import warnings

from ..backends import base
from ..backends.ndarray import NDArray


class Text(NDArray):
"""Text storage
class Text(base.BaseTrace):
"""Text trace object

Parameters
----------
Expand All @@ -53,102 +42,207 @@ def __init__(self, name, model=None, vars=None):
os.mkdir(name)
super(Text, self).__init__(name, model, vars)

def close(self):
super(Text, self).close()
_dump_trace(self.name, self)


def dump(name, trace, chains=None):
"""Store NDArray trace as text database.

Parameters
----------
name : str
Name of directory to store text files
trace : MultiTrace of NDArray traces
Result of MCMC run with default NDArray backend
chains : list
Chains to dump. If None, all chains are dumped.
"""
if not os.path.exists(name):
os.mkdir(name)
if chains is None:
chains = trace.chains
for chain in chains:
_dump_trace(name, trace._traces[chain])

self.flat_names = {v: _create_flat_names(v, shape)
for v, shape in self.var_shapes.items()}

self.filename = None
self._fh = None
self.df = None

## Sampling methods

def setup(self, draws, chain):
"""Perform chain-specific setup.

Parameters
----------
draws : int
Expected number of draws
chain : int
Chain number
"""
self.chain = chain
self.filename = os.path.join(self.name, 'chain-{}.csv'.format(chain))

cnames = [fv for v in self.varnames for fv in self.flat_names[v]]

if os.path.exists(self.filename):
with open(self.filename) as fh:
prev_cnames = next(fh).strip().split(',')
if prev_cnames != cnames:
raise base.BackendError(
"Previous file '{}' has different variables names "
"than current model.".format(self.filename))
self._fh = open(self.filename, 'a')
else:
self._fh = open(self.filename, 'w')
self._fh.write(','.join(cnames) + '\n')

def record(self, point):
"""Record results of a sampling iteration.

Parameters
----------
point : dict
Values mapped to variable names
"""
vals = {}
for varname, value in zip(self.varnames, self.fn(point)):
vals[varname] = value.ravel()
columns = [str(val) for var in self.varnames for val in vals[var]]
self._fh.write(','.join(columns) + '\n')

def _dump_trace(name, trace):
"""Dump a single-chain trace.
def close(self):
self._fh.close()
self._fh = None # Avoid serialization issue.

## Selection methods

def _load_df(self):
if self.df is None:
self.df = pd.read_csv(self.filename)

def __len__(self):
if self.filename is None:
return 0
self._load_df()
return self.df.shape[0]

def get_values(self, varname, burn=0, thin=1):
"""Get values from trace.

Parameters
----------
varname : str
burn : int
thin : int

Returns
-------
A NumPy array
"""
self._load_df()
var_df = self.df[self.flat_names[varname]]
shape = (self.df.shape[0],) + self.var_shapes[varname]
vals = var_df.values.ravel().reshape(shape)
return vals[burn::thin]

def _slice(self, idx):
warnings.warn('Slice for Text backend has no effect.')

def point(self, idx):
"""Return dictionary of point values at `idx` for current chain
with variables names as keys.
"""
idx = int(idx)
self._load_df()
pt = {}
for varname in self.varnames:
vals = self.df[self.flat_names[varname]].iloc[idx]
pt[varname] = vals.reshape(self.var_shapes[varname])
return pt


def _create_flat_names(varname, shape):
"""Return flat variable names for `varname` of `shape`.

Examples
--------
>>> _create_flat_names('x', (5,))
['x__0', 'x__1', 'x__2', 'x__3', 'x__4']

>>> _create_flat_names('x', (2, 2))
['x__0_0', 'x__0_1', 'x__1_0', 'x__1_1']
"""
chain_name = 'chain-{}'.format(trace.chain)
chain_dir = os.path.join(name, chain_name)
os.mkdir(chain_dir)
if not shape:
return [varname]
labels = (np.ravel(xs).tolist() for xs in np.indices(shape))
labels = (map(str, xs) for xs in labels)
return ['{}__{}'.format(varname, '_'.join(idxs)) for idxs in zip(*labels)]

info = {}
for varname in trace.varnames:
data = trace.get_values(varname)

if np.issubdtype(data.dtype, np.int):
fmt = '%i'
is_int = True
else:
fmt = '%g'
is_int = False
info[varname] = {'shape': data.shape, 'is_int': is_int}
def _create_shape(flat_names):
"Determine shape from `_create_flat_names` output."
try:
_, shape_str = flat_names[-1].rsplit('__', 1)
except ValueError:
return ()
return tuple(int(i) + 1 for i in shape_str.split('_'))

var_file = os.path.join(chain_dir, varname + '.txt')
np.savetxt(var_file, data.reshape(-1, data.size), fmt=fmt)
## Store shape and dtype information for reloading.
info_file = os.path.join(chain_dir, 'info.json')
with open(info_file, 'w') as sfh:
json.dump(info, sfh)


def load(name, chains=None, model=None):
"""Load text database.
def load(name, model=None):
"""Load Text database.

Parameters
----------
name : str
Path to root directory for text database
chains : list
Chains to load. If None, all chains are loaded.
Name of directory with files (one per chain)
model : Model
If None, the model is taken from the `with` context.

Returns
-------
ndarray.Trace instance
A MultiTrace instance
"""
chain_dirs = _get_chain_dirs(name)
if chains is None:
chains = list(chain_dirs.keys())
files = glob(os.path.join(name, 'chain-*.csv'))

traces = []
for chain in chains:
chain_dir = chain_dirs[chain]
info_file = os.path.join(chain_dir, 'info.json')
with open(info_file, 'r') as sfh:
info = json.load(sfh)
samples = {}
for varname, info in info.items():
var_file = os.path.join(chain_dir, varname + '.txt')
dtype = int if info['is_int'] else float
flat_data = np.loadtxt(var_file, dtype=dtype)
samples[varname] = flat_data.reshape(info['shape'])
trace = NDArray(model=model)
trace.samples = samples
for f in files:
chain = int(os.path.splitext(f)[0].rsplit('-', 1)[1])
trace = Text(name, model=model)
trace.chain = chain
trace.filename = f
traces.append(trace)
return base.MultiTrace(traces)


def _get_chain_dirs(name):
"""Return mapping of chain number to directory."""
return {_chain_dir_to_chain(chain_dir): chain_dir
for chain_dir in glob.glob(os.path.join(name, 'chain-*'))}
def dump(name, trace, chains=None):
"""Store values from NDArray trace as CSV files.

Parameters
----------
name : str
Name of directory to store CSV files in
trace : MultiTrace of NDArray traces
Result of MCMC run with default NDArray backend
chains : list
Chains to dump. If None, all chains are dumped.
"""
if not os.path.exists(name):
os.mkdir(name)
if chains is None:
chains = trace.chains

var_shapes = trace._traces[chains[0]].var_shapes
flat_names = {v: _create_flat_names(v, shape)
for v, shape in var_shapes.items()}

for chain in chains:
filename = os.path.join(name, 'chain-{}.csv'.format(chain))
df = _trace_to_df(trace._traces[chain], flat_names)
df.to_csv(filename, index=False)


def _trace_to_df(trace, flat_names=None):
"""Convert single-chain trace to Pandas DataFrame.

def _chain_dir_to_chain(chain_dir):
return int(os.path.basename(chain_dir).split('-')[1])
Parameters
----------
trace : NDarray trace
flat_names : dict or None
A dictionary that maps each variable name in `trace` to a list
of flat variable names (e.g., ['x__0', 'x__1', ...])
"""
if flat_names is None:
flat_names = {v: _create_flat_names(v, shape)
for v, shape in trace.var_shapes.items()}

var_dfs = []
for varname, shape in trace.var_shapes.items():
vals = trace[varname]
if len(shape) == 1:
flat_vals = vals
else:
flat_vals = vals.reshape(len(trace), np.prod(shape))
var_dfs.append(pd.DataFrame(flat_vals, columns=flat_names[varname]))
return pd.concat(var_dfs, axis=1)
6 changes: 5 additions & 1 deletion pymc3/distributions/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
from ..model import Model

__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete']
__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete', 'NoDistribution', 'TensorType']


class Distribution(object):
Expand Down Expand Up @@ -69,6 +69,10 @@ def getattr_value(self, val):
def TensorType(dtype, shape):
return t.TensorType(str(dtype), np.atleast_1d(shape) == 1)

class NoDistribution(Distribution):
def logp(self, x):
return 0

class Discrete(Distribution):
"""Base class for discrete distributions"""
def __init__(self, shape=(), dtype='int64', defaults=['mode'], *args, **kwargs):
Expand Down
Loading