In [1]:
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
figsize(18, 6)

import pandas as pd
import scipy.stats as stats
import scipy.special as special

import pymc3 as pm
import theano
import theano.tensor as tt

Below, we simulate a process that generated missing data.

It is a totally contrived example:

- We have houses
- Each houses has a songbird
- Each songbird has a certain number of notes
- When a house has a cat the number of notes is lesser (below beta is negative)

In [2]:
N_houses = 100
alpha = 5 # avg nb of notes in houses
beta = -3 # slope for cat influence
k = 0.3 # probability of having a cat
r = 0.2 # probability that we don't know if a house has a cat

With that we can simulate the data

In [3]:
cat = stats.bernoulli.rvs(k,size=N_houses)
cat = cat.astype('float') ## to be able to replace array elements afterwards (having integers would cause an issue)
notes = stats.poisson.rvs(alpha + cat * beta ,size = N_houses)
notes = notes.astype('float')
R_C = stats.bernoulli.rvs(r,size=N_houses) ## house with known and unknown cats
cat_obs = cat
cat_obs[R_C == 1] = -9 ## arbitrary impossible replacement to signal a misssing value

cat = cat.astype('int')
notes = notes.astype('int')

From there we want to go the other way around and pretend we know `notes` while we want to know which are the houses having a `cat`.

If we call Ni the number of notes and Ci the presence of a cat in the house, we have:

Pr(Ni) = (proba of a cat) * (proba of Ni while having a cat) + (proba of no cat) * (proba of Ni while having not a cat)

Pr(Ni) = Pr(Ci = 1) * Pr(Ni | Ci = 1) + Pr(Ci = 0) * Pr(Ni | Ci = 0)




In [4]:
def log_sum_exp( x ):
    xmax = x.max()
    xsum = sum( np.exp( x - xmax ) )
    print(np.exp( x - xmax ))
    return xmax + np.log(xsum)

In [5]:
with pm.Model() as cat:
    
    k = pm.Beta("k",2,2,testval=0.4)
    cat_cond_RC0 = pm.Bernoulli("cat|RC==0",k)
    
    a = pm.Normal("a",0,1)
    b = pm.Normal("b",0,0.5)
    
    # cat NA
    
    yes_cat = tt.log(k) + pm.Poisson.dist(pm.math.exp(a+b)).logp(notes[cat_obs == -9])
    no_cat = tt.log(1-k) + pm.Poisson.dist(pm.math.exp(a)).logp(notes[cat_obs == -9])
        
    
    notes_cond_RC1 = pm.Potential("notes|RC==1",
        pm.math.exp(pm.math.logsumexp(
            tt.stack(yes_cat,no_cat),
            axis=0
        ))
    )
                              
    
    #cat known present / absent
    lambda_ = pm.math.exp(a + b * cat_obs[cat_obs != -9])
    
    notes_cond_RC0 = pm.Poisson("notes|RC==0",lambda_,observed=notes[cat_obs != -9])
    
    trace_cat = pm.sample(tune=1000,chains=4)

Multiprocess sampling (4 chains in 2 jobs)
CompoundStep
>NUTS: [b, a, k]
>BinaryGibbsMetropolis: [cat|RC==0]
Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [00:08<00:00, 735.55draws/s]


In "notes|RC==1" We are computing Pr(Ni). A probability! 

Let's verify that probas are returned

In [6]:
cat_1 = tt.log(k).tag.test_value + pm.Poisson.dist(pm.math.exp(a+b)).logp(notes[cat_obs == -9]).tag.test_value
cat_2 = tt.log(1-k).tag.test_value + pm.Poisson.dist(pm.math.exp(a)).logp(notes[cat_obs == -9]).tag.test_value

In [7]:
ps = np.exp(special.logsumexp([cat_1,cat_2],axis=0)).round(3)*100

In [8]:
val_proba = np.array((notes[cat_obs == -9],ps))

for i in np.arange(len(ps)):
    print("N{0} value is {1} and proba is: {2}".format(i,val_proba[0,i],val_proba[1,i]))

N0 value is 1.0 and proba is: 36.8
N1 value is 4.0 and proba is: 1.5
N2 value is 1.0 and proba is: 36.8
N3 value is 8.0 and proba is: 0.0
N4 value is 1.0 and proba is: 36.8
N5 value is 9.0 and proba is: 0.0
N6 value is 7.0 and proba is: 0.0
N7 value is 1.0 and proba is: 36.8
N8 value is 1.0 and proba is: 36.8
N9 value is 3.0 and proba is: 6.1
N10 value is 1.0 and proba is: 36.8
N11 value is 6.0 and proba is: 0.1
N12 value is 0.0 and proba is: 36.8
N13 value is 4.0 and proba is: 1.5
N14 value is 7.0 and proba is: 0.0
N15 value is 8.0 and proba is: 0.0
N16 value is 7.0 and proba is: 0.0
N17 value is 1.0 and proba is: 36.8
N18 value is 4.0 and proba is: 1.5


In [9]:
pm.summary(trace_cat)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
cat|RC==0,0.554,0.497,0.0,1.0,0.011,0.008,2068.0,2068.0,2068.0,2000.0,1.0
a,1.528,0.062,1.404,1.638,0.001,0.001,1795.0,1795.0,1805.0,1509.0,1.0
b,-0.729,0.137,-0.979,-0.468,0.003,0.002,1903.0,1880.0,1912.0,1122.0,1.0
k,0.546,0.225,0.148,0.929,0.005,0.004,1707.0,1707.0,1654.0,1444.0,1.0


Yet it seems notes_cond_RC1 has no impact on the inference. For instance k @ 0.5 well is our pain old Beta(2,2)

In [10]:
with cat:
    post = pm.sample_posterior_predictive(trace_cat,var_names=["notes|RC==1"])

100%|██████████| 2000/2000 [00:03<00:00, 523.40it/s]


In [11]:
post["notes|RC==1"][:,0,:]

array([[0.2263516 , 0.11120208, 0.2263516 , ..., 0.01862827, 0.2263516 ,
        0.11120208],
       [0.09845782, 0.16717425, 0.09845782, ..., 0.06637997, 0.09845782,
        0.16717425],
       [0.11478204, 0.15900626, 0.11478204, ..., 0.06044713, 0.11478204,
        0.15900626],
       ...,
       [0.14415637, 0.14448465, 0.14415637, ..., 0.05064736, 0.14415637,
        0.14448465],
       [0.17704449, 0.13186853, 0.17704449, ..., 0.03382716, 0.17704449,
        0.13186853],
       [0.14977875, 0.15337085, 0.14977875, ..., 0.03681212, 0.14977875,
        0.15337085]])