In [1]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns

In [2]:
# Location of the cars data, relative to this notebook.
CARS_CSV = '../../data/cars.csv'

# The first CSV column is used as the row index.
df_cars = pd.read_csv(CARS_CSV, index_col=0)
df_cars.head()

Unnamed: 0,speed,dist
1,4,2
2,4,10
3,7,4
4,7,22
5,8,16


In [3]:
# Sanity check on the loaded data: (number of rows, number of columns).
df_cars.shape

(50, 2)

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df_cars.describe()

Unnamed: 0,speed,dist
count,50.0,50.0
mean,15.4,42.98
std,5.287644,25.769377
min,4.0,2.0
25%,12.0,26.0
50%,15.0,36.0
75%,19.0,56.0
max,25.0,120.0


### Code 7.19

In [5]:
# Linear regression of stopping distance on speed:
#   dist_i ~ Normal(mu_i, sigma),   mu_i = a + b * speed_i
#
# The sampler is seeded so that Restart & Run All reproduces the same
# posterior draws, and therefore the same WAIC figures derived from them.
RANDOM_SEED = 8927

with pm.Model() as m:
    # Priors on the intercept and the slope.
    a = pm.Normal('a', mu=0, sigma=100)
    b = pm.Normal('b', mu=0, sigma=10)

    # Wrapped in Deterministic so the per-observation mean is recorded in the trace.
    mu = pm.Deterministic('mu', a + b * df_cars['speed'])
    sigma = pm.Exponential('sigma', lam=1)

    dist = pm.Normal('dist', mu=mu, sigma=sigma, observed=df_cars['dist'])

    # 4 chains x 250 draws each = 1000 posterior samples in total.
    trace_m = pm.sample(250, chains=4, tune=1000, random_seed=RANDOM_SEED)

Only 250 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, b, a]


Sampling 4 chains for 1_000 tune and 250 draw iterations (4_000 + 1_000 draws total) took 6 seconds.
The acceptance probability does not match the target. It is 0.8936239090807796, but should be close to 0.8. Try to increase the number of tuning steps.


In [6]:
# Flatten the sampled trace into a DataFrame: one row per posterior draw and
# one column per sampled value (each element of `mu` gets its own column).
df_m = pm.trace_to_dataframe(trace_m)
df_m.head()

Unnamed: 0,a,b,mu__0,mu__1,mu__2,mu__3,mu__4,mu__5,mu__6,mu__7,...,mu__41,mu__42,mu__43,mu__44,mu__45,mu__46,mu__47,mu__48,mu__49,sigma
0,-10.487176,3.508866,3.548289,3.548289,14.074888,14.074888,17.583754,21.092621,24.601487,24.601487,...,59.69015,59.69015,66.707883,70.216749,73.725616,73.725616,73.725616,73.725616,77.234482,14.341345
1,-22.320044,4.253327,-5.306735,-5.306735,7.453246,7.453246,11.706574,15.959901,20.213228,20.213228,...,62.7465,62.7465,71.253154,75.506481,79.759808,79.759808,79.759808,79.759808,84.013136,12.439642
2,-21.943634,4.22674,-5.036674,-5.036674,7.643546,7.643546,11.870286,16.097026,20.323766,20.323766,...,62.591166,62.591166,71.044646,75.271386,79.498126,79.498126,79.498126,79.498126,83.724865,15.051139
3,-7.370282,3.300896,5.833304,5.833304,15.735993,15.735993,19.036889,22.337785,25.638681,25.638681,...,58.647644,58.647644,65.249437,68.550333,71.85123,71.85123,71.85123,71.85123,75.152126,12.751333
4,-7.3345,3.298598,5.85989,5.85989,15.755683,15.755683,19.054281,22.352878,25.651476,25.651476,...,58.637451,58.637451,65.234646,68.533243,71.831841,71.831841,71.831841,71.831841,75.130438,13.264365


In [7]:
# Keep only the per-observation mean columns (mu__0, mu__1, ...).
# The pattern is anchored and requires a numeric suffix. The earlier 'mu__*'
# was a glob-style pattern which, read as a regex, matched any column name
# that merely contained "mu_".
df_mu = df_m.filter(regex=r'^mu__\d+$')

In [8]:
# Orient the means as (observations x posterior draws).
# The frame is rebuilt from df_m instead of transposing df_mu onto itself,
# so re-running this cell can never flip the orientation back.
df_mu = df_m.filter(regex=r'^mu__\d+$').T
df_mu.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
mu__0,3.548289,-5.306735,-5.036674,5.833304,5.85989,-0.927314,4.444826,4.162056,8.722967,5.249862,...,0.850852,-10.128599,-14.004751,-0.628703,-2.748037,0.609783,1.679113,-8.380728,-8.317011,-1.633552
mu__1,3.548289,-5.306735,-5.036674,5.833304,5.85989,-0.927314,4.444826,4.162056,8.722967,5.249862,...,0.850852,-10.128599,-14.004751,-0.628703,-2.748037,0.609783,1.679113,-8.380728,-8.317011,-1.633552
mu__2,14.074888,7.453246,7.643546,15.735993,15.755683,10.633575,14.107995,13.930974,18.727074,15.026776,...,11.870785,3.864796,-0.013334,10.472871,8.900293,11.795583,12.497536,4.575706,5.502067,10.876285
mu__3,14.074888,7.453246,7.643546,15.735993,15.755683,10.633575,14.107995,13.930974,18.727074,15.026776,...,11.870785,3.864796,-0.013334,10.472871,8.900293,11.795583,12.497536,4.575706,5.502067,10.876285
mu__4,17.583754,11.706574,11.870286,19.036889,19.054281,14.487205,17.329052,17.18728,22.061776,18.285747,...,15.544095,8.529262,4.650472,14.173396,12.783069,15.524183,16.103677,8.894518,10.108426,15.046231


In [9]:
# After the transpose, the column labels index the posterior draws.
df_mu.columns

RangeIndex(start=0, stop=1000, step=1)

In [10]:
# One sigma value per posterior draw. A Series is one-dimensional, so the
# former `.T` was a no-op and has been dropped.
# NOTE: this rebinds `sigma`, which until now referred to the random variable
# created inside the `with pm.Model()` block; from here on it is the Series of draws.
sigma = df_m['sigma']
sigma.head()

0    14.341345
1    12.439642
2    15.051139
3    12.751333
4    13.264365
Name: sigma, dtype: float64

### Code 7.20

Recall that the lppd of the $i$th observation is defined as:
$$
    \text{lppd}_i = \log \frac{1}{S} \sum_{s} p(y_i \mid \theta_s) = \log \sum_{s} p(y_i \mid \theta_s) - \log S,
$$
where $S$ is the number of samples of the parameter values. 

For numerical stability, this is computed on the log scale: first evaluate the log-probability density $\log p(y_i \mid \theta_s)$ for every sample, then combine the samples with the log-sum-exp trick:
$$
    \log \sum_{s} \exp \log p(y_i \mid \theta_s) - \log S
$$

In [11]:
from scipy import stats


# Both dimensions are read from the data, so changing the number of draws or
# chains passed to pm.sample cannot leave a stale hard-coded sample count behind.
N_OBS, N_SAMPLES = df_mu.shape
assert N_OBS == len(df_cars), "df_mu must be oriented as (observations x samples)"
assert N_SAMPLES == len(sigma), "expected exactly one sigma draw per posterior sample"

# logprob[i, s] = log p(y_i | theta_s), evaluated for all pairs in one call.
# Broadcasting: the observations form a column (N_OBS, 1), the means are
# (N_OBS, N_SAMPLES), and the sigma draws (N_SAMPLES,) vary across columns.
dist_obs = np.asarray(df_cars['dist'], dtype=float)[:, np.newaxis]
logprob = stats.norm.logpdf(dist_obs, loc=np.asarray(df_mu), scale=np.asarray(sigma))

In [12]:
# Peek at the top-left 5 x 5 corner of the log-density matrix.
logprob[:5, :5]

array([[-3.5879128 , -3.61233154, -3.73967862, -3.50976049, -3.54635925],
       [-3.68327571, -4.19686781, -4.1294316 , -3.51796221, -3.55272991],
       [-3.82884284, -3.47835773, -3.65969304, -3.8881183 , -3.8967485 ],
       [-3.73477179, -4.12355964, -4.08530237, -3.58523436, -3.61482656],
       [-3.58818284, -3.49938786, -3.66803409, -3.49293503, -3.53052994]])

### Code 7.21

In [13]:
from scipy.special import logsumexp

# lppd_i = log( (1/S) * sum_s p(y_i | theta_s) ), evaluated stably with
# log-sum-exp across the sample axis for every observation at once.
# S is taken from logprob itself so the divisor always matches the number
# of samples that were actually summed.
lppd = logsumexp(logprob, axis=1) - np.log(logprob.shape[1])

### Code 7.22

The WAIC penalty term for observation $y_i$ is defined as: 
$$
    \text{var}_{\theta} \log p(y_i \mid \theta),
$$
where the variance is taken over the log-probability values corresponding to the samples of the parameters $\theta$.

In [14]:
# the WAIC penalty: for each observation, the variance of its log-density
# across the posterior samples (axis 1), computed in one vectorized call
pWAIC = np.var(logprob, axis=1)

### Code 7.23

The WAIC itself is defined as:

$$
    -2 \cdot \left ( \sum_i \text{lppd}_i - \sum_i \text{var}_{\theta} \log p(y_i \mid \theta) \right ).
$$

In [15]:
# WAIC on the deviance scale: -2 * (total lppd - total penalty).
# np.sum reduces the arrays natively instead of iterating element by element.
-2 * (np.sum(lppd) - np.sum(pWAIC))

421.39844173678534

### Code 7.24

In [16]:
# Pointwise WAIC contributions, one per observation.
waic_vec = -2 * (lppd - pWAIC)

# Standard error of the total WAIC: sqrt(N_OBS * var(pointwise WAIC)).
waic_se = (N_OBS * np.var(waic_vec)) ** 0.5
waic_se

16.309706118235184