In [1]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns

In [2]:
# Read the cars data, using the first CSV column as the row index.
df_cars = pd.read_csv(
    '../../data/cars.csv',
    index_col=0,
)
# Preview the first rows.
df_cars.head()

Unnamed: 0,speed,dist
1,4,2
2,4,10
3,7,4
4,7,22
5,8,16


In [3]:
# Dimensions of the loaded table as (rows, columns).
df_cars.shape

(50, 2)

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df_cars.describe()

Unnamed: 0,speed,dist
count,50.0,50.0
mean,15.4,42.98
std,5.287644,25.769377
min,4.0,2.0
25%,12.0,26.0
50%,15.0,36.0
75%,19.0,56.0
max,25.0,120.0


### Code 7.19

In [5]:
# Linear regression of `dist` on `speed`:
#   dist_i ~ Normal(mu_i, sigma),   mu_i = a + b * speed_i
with pm.Model() as m:
    # Priors on the intercept and the slope.
    a = pm.Normal('a', mu=0, sigma=100)
    b = pm.Normal('b', mu=0, sigma=10)

    # Wrapped in a Deterministic so the per-observation mean is stored in the
    # trace (it is needed later to evaluate the pointwise log-likelihood).
    mu = pm.Deterministic('mu', a + b * df_cars['speed'])
    sigma = pm.Exponential('sigma', lam=1)

    dist = pm.Normal('dist', mu=mu, sigma=sigma, observed=df_cars['dist'])

    # 4 chains x 250 draws = 1000 posterior draws in total.
    # A fixed seed makes the draws (and every number derived from them below)
    # reproducible under Restart & Run All.
    trace_m = pm.sample(250, chains=4, tune=1000, random_seed=42)

Only 250 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, b, a]


Sampling 4 chains for 1_000 tune and 250 draw iterations (4_000 + 1_000 draws total) took 4 seconds.


In [6]:
# Flatten the trace into a DataFrame with one row per posterior draw; the
# vector-valued `mu` is expanded into one column per observation (mu__0, mu__1, ...).
df_m = pm.trace_to_dataframe(trace_m)
df_m.head()

Unnamed: 0,a,b,mu__0,mu__1,mu__2,mu__3,mu__4,mu__5,mu__6,mu__7,...,mu__41,mu__42,mu__43,mu__44,mu__45,mu__46,mu__47,mu__48,mu__49,sigma
0,-14.020351,3.833359,1.313083,1.313083,12.813159,12.813159,16.646517,20.479876,24.313235,24.313235,...,62.646821,62.646821,70.313538,74.146897,77.980255,77.980255,77.980255,77.980255,81.813614,12.29849
1,-25.803944,4.322635,-8.513405,-8.513405,4.454499,4.454499,8.777133,13.099768,17.422403,17.422403,...,60.64875,60.64875,69.294019,73.616654,77.939288,77.939288,77.939288,77.939288,82.261923,16.448034
2,-24.251112,4.25393,-7.235391,-7.235391,5.5264,5.5264,9.78033,14.03426,18.28819,18.28819,...,60.827492,60.827492,69.335353,73.589283,77.843213,77.843213,77.843213,77.843213,82.097143,15.086659
3,-14.071437,3.632792,0.459733,0.459733,11.35811,11.35811,14.990902,18.623695,22.256487,22.256487,...,58.584411,58.584411,65.849996,69.482788,73.11558,73.11558,73.11558,73.11558,76.748373,13.015358
4,-11.576985,3.628859,2.938452,2.938452,13.825029,13.825029,17.453888,21.082747,24.711606,24.711606,...,61.000197,61.000197,68.257916,71.886775,75.515634,75.515634,75.515634,75.515634,79.144493,12.79977


In [7]:
# Keep only the per-observation mean columns (mu__0, mu__1, ...).
# The pattern is anchored to exactly `mu__<digits>`: the looser 'mu__*' means
# "mu_" followed by zero or more underscores and would match any column whose
# name merely contains "mu_".
df_mu = df_m.filter(regex=r'^mu__\d+$')

In [8]:
# Orient the means as (observation, draw): one row per mu__i, one column per
# posterior draw. The frame is rebuilt from df_m rather than transposing df_mu
# in place, so re-running this cell cannot flip the orientation back.
df_mu = df_m.filter(regex=r'^mu__\d+$').T
df_mu.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
mu__0,1.313083,-8.513405,-7.235391,0.459733,2.938452,2.938452,4.983341,1.605433,-3.981233,-3.981233,...,-5.811632,-3.383203,-0.257234,-5.263791,-1.175943,2.335699,5.26407,1.741471,1.548722,2.872788
mu__1,1.313083,-8.513405,-7.235391,0.459733,2.938452,2.938452,4.983341,1.605433,-3.981233,-3.981233,...,-5.811632,-3.383203,-0.257234,-5.263791,-1.175943,2.335699,5.26407,1.741471,1.548722,2.872788
mu__2,12.813159,4.454499,5.5264,11.35811,13.825029,13.825029,15.189856,12.380578,8.36491,8.36491,...,7.033663,8.930605,10.904434,6.231058,11.230402,13.91923,15.846519,11.379295,12.228656,14.015621
mu__3,12.813159,4.454499,5.5264,11.35811,13.825029,13.825029,15.189856,12.380578,8.36491,8.36491,...,7.033663,8.930605,10.904434,6.231058,11.230402,13.91923,15.846519,11.379295,12.228656,14.015621
mu__4,16.646517,8.777133,9.78033,14.990902,17.453888,17.453888,18.592027,15.972292,12.480291,12.480291,...,11.315428,13.035207,14.624989,10.062675,15.36585,17.780406,19.374002,14.591903,15.788634,17.729899


In [9]:
# After the transpose, the column labels are the posterior draw indices.
df_mu.columns

RangeIndex(start=0, stop=1000, step=1)

In [10]:
# Posterior draws of sigma as a 1-D Series, indexed by draw.
# No transpose is applied: `.T` on a Series is a no-op and only suggested a
# 2-D object that does not exist.
# NOTE: this rebinds the notebook-level name `sigma`, which the model cell had
# bound to the PyMC random variable.
sigma = df_m['sigma']
sigma.head()

0    12.298490
1    16.448034
2    15.086659
3    13.015358
4    12.799770
Name: sigma, dtype: float64

### Code 7.20

In [11]:
from scipy import stats


# Take both dimensions from the data instead of hard-coding the draw count, so
# changing `draws`/`chains` in pm.sample cannot silently desynchronise this cell.
N_OBS, N_SAMPLES = df_mu.shape

# logprob[i, s] is the log-density of observation i under posterior draw s.
# Broadcasting an (N_OBS, 1) column of observations against the
# (N_OBS, N_SAMPLES) means and a (1, N_SAMPLES) row of sigmas evaluates every
# (observation, draw) pair in a single call instead of one call per draw.
logprob = stats.norm.logpdf(
    df_cars['dist'].to_numpy()[:, np.newaxis],
    loc=df_mu.to_numpy(),
    scale=sigma.to_numpy()[np.newaxis, :],
)

In [12]:
# Peek at the top-left 5x5 corner (first five observations x first five draws).
logprob[:5, :5]

array([[-3.42997487, -3.92342589, -3.82011696, -3.49207104, -3.4710535 ],
       [-3.67787322, -4.35259784, -4.28531791, -3.75371365, -3.62054852],
       [-3.68517637, -3.71952625, -3.63786763, -3.64487356, -3.76296616],
       [-3.70741158, -4.28809381, -4.22890673, -3.81933698, -3.6723225 ],
       [-3.42979679, -3.81556316, -3.71772975, -3.48807414, -3.47481675]])

### Code 7.21

In [13]:
from scipy.special import logsumexp

# Log pointwise predictive density: for each observation, the log of the
# likelihood averaged over posterior draws, computed stably as
# logsumexp(row) - log(number of draws). Reducing along axis 1 handles every
# observation at once, and the draw count is read from `logprob` itself.
lppd = logsumexp(logprob, axis=1) - np.log(logprob.shape[1])

### Code 7.22

In [14]:
# the WAIC penalty: for each observation, the variance of its log-likelihood
# across posterior draws (one reduction along the draw axis).
# NOTE(review): np.var defaults to ddof=0 (divide by S); a sample-variance
# definition (divide by S - 1) would need ddof=1. Confirm which is intended —
# with this many draws the difference is negligible.
pWAIC = np.var(logprob, axis=1)

### Code 7.23

In [15]:
# WAIC on the deviance scale: -2 * (total lppd - total penalty).
# ndarray.sum() is used instead of the builtin sum(), which iterates over the
# array element by element in Python.
-2 * (lppd.sum() - pWAIC.sum())

421.51015283841366