Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ae610ae
simple draft of missing values
jsalvatier May 24, 2015
5187b05
enable c code for gammaln/psi on non floats
jsalvatier May 25, 2015
839ac29
missing vals use NoDistribution
jsalvatier May 25, 2015
20abbe5
test missing values
jsalvatier May 25, 2015
e5833ad
fix for python 3.4
jsalvatier May 25, 2015
1ac4d0d
handle missing data from pandas
jsalvatier May 25, 2015
9e10b75
Added disaster_model_missing (not working)
May 27, 2015
7e128dc
Added lasso model with missing covariates
May 28, 2015
9fd8fe7
split observedrv into observedrv and multiobserved
jsalvatier Jun 2, 2015
e6ab0ce
make example work
jsalvatier Jun 2, 2015
9d9be7e
fix non numpy data
jsalvatier Jun 4, 2015
1d3a9e4
fix multiobservedrv
jsalvatier Jun 4, 2015
7d27478
fix disasters missing
jsalvatier Jun 4, 2015
43b9263
bound bernoulli
jsalvatier Jun 4, 2015
251b72b
handle gradient when no vars are passed
jsalvatier Jun 4, 2015
11956ff
give missing variables the .model parameter
jsalvatier Jun 4, 2015
c31fb56
use the right datatype for the missing values
jsalvatier Jun 4, 2015
a73d53c
make lasso missing work
jsalvatier Jun 5, 2015
2a02b3a
add fake data for missing lasso example
jsalvatier Jun 5, 2015
ed6b5aa
need n='short' for auto running
jsalvatier Jun 5, 2015
b1bdda5
increase samples to get summary to work
jsalvatier Jun 5, 2015
fe825af
Removed test_scores.csv
Jun 5, 2015
2e113df
Revert "increase samples to get summary to work"
Jun 5, 2015
9f20c6a
Revert "Revert "increase samples to get summary to work""
Jun 5, 2015
674bc52
Revert "Removed test_scores.csv"
Jun 5, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pymc3/distributions/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ def logp(self, value):
p = self.p
return bound(
switch(value, log(p), log(1 - p)),
0 <= p, p <= 1)
0 <= p, p <= 1,
eq(value,0) | eq(value,1)
)


class Poisson(Discrete):
Expand Down
6 changes: 5 additions & 1 deletion pymc3/distributions/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
from ..model import Model

__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete']
__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete', 'NoDistribution', 'TensorType']


class Distribution(object):
Expand Down Expand Up @@ -64,6 +64,10 @@ def get_test_val(self, val, defaults):
def TensorType(dtype, shape):
return t.TensorType(str(dtype), np.atleast_1d(shape) == 1)

class NoDistribution(Distribution):
def logp(self, x):
return 0

class Discrete(Distribution):
"""Base class for discrete distributions"""
def __init__(self, shape=(), dtype='int64', *args, **kwargs):
Expand Down
13 changes: 5 additions & 8 deletions pymc3/distributions/special.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,7 @@ def grad(self, inp, grads):
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
if node.inputs[0].type in [scalar.float32, scalar.float64]:
return """%(z)s =
lgamma(%(x)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
return """%(z)s = lgamma(%(x)s);""" % locals()

def __eq__(self, other):
return type(self) == type(other)
Expand Down Expand Up @@ -119,10 +116,10 @@ def c_support_code(self):
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
if node.inputs[0].type in [scalar.float32, scalar.float64]:
return """%(z)s =
_psi(%(x)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
if node.inputs[0].type in scalar.complex_types:
raise NotImplementedError('type not supported', node.inputs[0].type)

return """%(z)s = _psi(%(x)s);""" % locals()

def __eq__(self, other):
return type(self) == type(other)
Expand Down
2 changes: 1 addition & 1 deletion pymc3/examples/arbitrary_stochastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def logp(failure, value):
return sum(failure * log(lam) - lam * value)

x = DensityDist('x', logp, observed=(failure, value))
x = DensityDist('x', logp, observed={'failure':failure, 'value':value})


def run (n=3000):
Expand Down
17 changes: 17 additions & 0 deletions pymc3/examples/data/test_scores.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
score,male,sib,synd_or_disab,age_test,mother_hs,ident_by_3
100,0,1,0,5,1,0
35,0,2,0,9,0,1
24,1,NaN,1,7,1,0
75,1,0,0,2,1,1
24,1,NaN,1,7,1,0
81,1,0,0,2,1,1
24,1,3,1,7,1,0
64,1,0,NaN,11,1,0
18,1,1,1,9,1,0
24,1,1,0,8,0,0
24,1,1,1,7,1,0
76,0,2,0,2,NaN,0
36,0,1,0,2,1,1
16,1,1,0,2,0,0
95,0,0,0,2,NaN,1
65,1,2,0,2,0,1
199 changes: 199 additions & 0 deletions pymc3/examples/disaster_model.ipynb

Large diffs are not rendered by default.

68 changes: 68 additions & 0 deletions pymc3/examples/disaster_model_missing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
A model for coal mining disasters data with a changepoint

switchpoint ~ U(0, 110)
early_mean ~ Exp(1.)
late_mean ~ Exp(1.)
disasters[t] ~ Po(early_mean if t <= switchpoint, late_mean otherwise)

"""

from pymc3 import *

import theano.tensor as t
from numpy import arange, array, ones, concatenate
from numpy.random import randint
from numpy.ma import masked_values

__all__ = ['disasters_data', 'switchpoint', 'early_mean', 'late_mean', 'rate',
'disasters']

# Time series of recorded coal mining disasters in the UK from 1851 to 1962
disasters_data = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
3, 3, 5, 4, 5, 3, 1, -1, 4, 1, 5, 5, 3, 4, 2, 5,
2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
0, 1, 0, 1, -1, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])
years = len(disasters_data)

masked_values = masked_values(disasters_data, value=-1)

with Model() as model:

# Prior for distribution of switchpoint location
switchpoint = DiscreteUniform('switchpoint', lower=0, upper=years)
# Priors for pre- and post-switch mean number of disasters
early_mean = Exponential('early_mean', lam=1.)
late_mean = Exponential('late_mean', lam=1.)

# Allocate appropriate Poisson rates to years before and after current
# switchpoint location
idx = arange(years)
rate = switch(switchpoint >= idx, early_mean, late_mean)

# Data likelihood
disasters = Poisson('disasters', rate, observed=masked_values)


def run(n=1000):
if n == "short":
n = 500
with model:

# Initial values for stochastic nodes
start = {'early_mean': 2., 'late_mean': 3.}

# Use slice sampler for means
step1 = Slice([early_mean, late_mean])
# Use Metropolis for switchpoint, since it accomodates discrete variables
step2 = Metropolis([switchpoint, disasters.missing_values ])

tr = sample(n, tune=500, start=start, step=[step1, step2])

summary(tr, vars=['disasters_missing'])

if __name__ == '__main__':
run()
53 changes: 53 additions & 0 deletions pymc3/examples/lasso_missing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from pymc3 import *
import numpy as np
import pandas as pd
from numpy.ma import masked_values

# Import data, filling missing values with sentinels (-999)
test_scores = pd.read_csv(get_data_file('pymc3.examples', 'data/test_scores.csv')).fillna(-999)

# Extract variables: test score, gender, number of siblings, previous disability, age,
# mother with HS education or better, hearing loss identified by 3 months of age
(score, male, siblings, disability,
age, mother_hs, early_ident) = test_scores[['score', 'male', 'sib',
'synd_or_disab', 'age_test',
'mother_hs', 'ident_by_3']].astype(float).values.T

with Model() as model:

# Impute missing values
sib_mean = Exponential('sib_mean', 1)
siblings_imp = Poisson('siblings_imp', sib_mean, observed=masked_values(siblings, value=-999))

p_disab = Beta('p_disab', 1, 1)
disability_imp = Bernoulli('disability_imp', p_disab, observed=masked_values(disability, value=-999))

p_mother = Beta('p_mother', 1, 1)
mother_imp = Bernoulli('mother_imp', p_mother, observed=masked_values(mother_hs, value=-999))

s = HalfCauchy('s', 5, testval=5)
beta = Laplace('beta', 0, 100, shape=7, testval=.1)

expected_score = (beta[0] + beta[1]*male + beta[2]*siblings_imp + beta[3]*disability_imp +
beta[4]*age + beta[5]*mother_imp + beta[6]*early_ident)

observed_score = Normal('observed_score', expected_score, s ** -2, observed=score)


with model:
start = find_MAP()
step1 = NUTS([beta, s, p_disab, p_mother, sib_mean], scaling=start)

step2 = Metropolis([mother_imp.missing_values,
disability_imp.missing_values,
siblings_imp.missing_values])

def run(n=5000):
if n == 'short':
n = 100
with model:
trace = sample(n, [step1, step2], start)


if __name__ == '__main__':
run()
104 changes: 88 additions & 16 deletions pymc3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def __init__(self):
self.observed_RVs = []
self.deterministics = []
self.potentials = []
self.missing_values = []
self.model = self

@property
Expand All @@ -101,7 +102,7 @@ def logpt(self):

@property
def vars(self):
"""List of unobserved random variables the model is defined in terms of (which excludes deterministics)."""
"""List of unobserved random variables used as inputs to the model (which excludes deterministics)."""
return self.free_RVs

@property
Expand All @@ -112,7 +113,7 @@ def basic_RVs(self):
@property
def unobserved_RVs(self):
"""List of all random variable, including deterministic ones."""
return self.free_RVs + self.deterministics
return self.vars + self.deterministics


@property
Expand Down Expand Up @@ -142,9 +143,20 @@ def Var(self, name, dist, data=None):
if data is None:
var = FreeRV(name=name, distribution=dist, model=self)
self.free_RVs.append(var)
else:

elif isinstance(data, dict):
var = MultiObservedRV(name=name, data=data, distribution=dist, model=self)
self.observed_RVs.append(var)
if var.missing_values:
self.free_RVs += var.missing_values
self.missing_values += var.missing_values
else:
var = ObservedRV(name=name, data=data, distribution=dist, model=self)
self.observed_RVs.append(var)
if var.missing_values:
self.free_RVs.append(var.missing_values)
self.missing_values.append(var.missing_values)

self.add_random_variable(var)
return var

Expand Down Expand Up @@ -322,8 +334,76 @@ def __init__(self, type=None, owner=None, index=None, name=None, distribution=No
self.logp_elemwiset = distribution.logp(self)
self.model = model

class ObservedRV(Factor):
"""Observed random variable that a model is specified in terms of."""
def pandas_to_array(data):
if hasattr(data, 'values'): #pandas
if data.isnull().any().any(): #missing values
return np.ma.MaskedArray(data.values, data.isnull().values)
else:
return data.values
elif hasattr(data, 'mask'):
return data
else:
return np.asarray(data)


def as_tensor(data, name,model, dtype):
data = pandas_to_array(data).astype(dtype)

if hasattr(data, 'mask'):
from .distributions import NoDistribution
fakedist = NoDistribution.dist(shape=data.mask.sum(), dtype=dtype, testval=data.mean().astype(dtype))
missing_values = FreeRV(name=name + '_missing', distribution=fakedist, model=model)

constant = t.as_tensor_variable(data.filled())

dataTensor = theano.tensor.set_subtensor(constant[data.mask.nonzero()], missing_values)
dataTensor.missing_values = missing_values
return dataTensor
else:
data = t.as_tensor_variable(data, name=name)
data.missing_values = None
return data

class ObservedRV(Factor, TensorVariable):
"""Observed random variable that a model is specified in terms of.
Potentially partially observed.
"""
def __init__(self, type=None, owner=None, index=None, name=None, data=None, distribution=None, model=None):
"""
Parameters
----------

type : theano type (optional)
owner : theano owner (optional)

name : str
distribution : Distribution
model : Model
"""
from .distributions import TensorType
if type is None:
data = pandas_to_array(data)
type = TensorType(distribution.dtype, data.shape)

super(TensorVariable, self).__init__(type, None, None, name)

if distribution is not None:
data = as_tensor(data, name,model,distribution.dtype)
self.missing_values = data.missing_values

self.logp_elemwiset = distribution.logp(data)
self.model = model
self.distribution = distribution

#make this RV a view on the combined missing/nonmissing array
theano.gof.Apply(theano.compile.view_op, inputs=[data], outputs=[self])

self.tag.test_value = theano.compile.view_op(data).tag.test_value

class MultiObservedRV(Factor):
"""Observed random variable that a model is specified in terms of.
Potentially partially observed.
"""
def __init__(self, name, data, distribution, model):
"""
Parameters
Expand All @@ -337,17 +417,11 @@ def __init__(self, name, data, distribution, model):
model : Model
"""
self.name = name
data = getattr(data, 'values', data) #handle pandas
args = as_iterargs(data)

if len(args) > 1:
params = getargspec(distribution.logp).args
args = [t.constant(d, name=name + "_" + param)
for d,param in zip(args,params) ]
else:
args = [t.constant(args[0], name=name)]
self.data = { name : as_tensor(data, name, model, distribution.dtype) for name, data in data.items()}

self.logp_elemwiset = distribution.logp(*args)
self.missing_values = [ data.missing_values for data in self.data.values() if data.missing_values is not None]
self.logp_elemwiset = distribution.logp(**self.data)
self.model = model
self.distribution = distribution

Expand Down Expand Up @@ -385,8 +459,6 @@ def Potential(name, var, model=None):
def as_iterargs(data):
if isinstance(data, tuple):
return data
if hasattr(data, 'columns'): # data frames
return [np.asarray(data[c]) for c in data.columns]
else:
return [data]

Expand Down
27 changes: 27 additions & 0 deletions pymc3/tests/test_missing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pymc3 import Model, Normal, Metropolis, MvNormal
from numpy import ma
import numpy
import pandas as pd

def test_missing():
data = ma.masked_values([1,2,-1,4,-1], value=-1)
with Model() as model:
x = Normal('x', 1, 1)
y = Normal('y', x, 1, observed=data)

y_missing, = model.missing_values
assert y_missing.tag.test_value.shape == (2,)

model.logp(model.test_point)


def test_missing_pandas():
data = pd.DataFrame([1,2,numpy.nan,4,numpy.nan])
with Model() as model:
x = Normal('x', 1, 1)
y = Normal('y', x, 1, observed=data)

y_missing, = model.missing_values
assert y_missing.tag.test_value.shape == (2,)

model.logp(model.test_point)
Loading