In [1]:
import arviz as az
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymc3 as pm
import seaborn as sns


az.style.use('arviz-darkgrid')

In [2]:
# Relative path of the folder that holds the CSV inputs.
DATA_FOLDER = '../../data'

# File names of the two data sets loaded further down.
HEIGHT_DATA, PRIMATE_DATA = 'howell1.csv', 'milk.csv'

In [3]:
# Read the height data set and preview its first rows.
height_csv = f'{DATA_FOLDER}/{HEIGHT_DATA}'
df = pd.read_csv(height_csv)
df.head()

Unnamed: 0,height,weight,age,male
0,151.765,47.825606,63.0,1
1,139.7,36.485807,63.0,0
2,136.525,31.864838,65.0,0
3,156.845,53.041914,41.0,1
4,145.415,41.276872,51.0,0


In [4]:
# Complementary 0/1 indicator columns derived from the `male` column.
male_mask = df['male'] == 1
df['is_male'] = male_mask.astype(int)
df['is_female'] = (~male_mask).astype(int)
df.head()

Unnamed: 0,height,weight,age,male,is_male,is_female
0,151.765,47.825606,63.0,1,1,0
1,139.7,36.485807,63.0,0,0,1
2,136.525,31.864838,65.0,0,0,1
3,156.845,53.041914,41.0,1,1,0
4,145.415,41.276872,51.0,0,0,1


In [5]:
# Check column dtypes and non-null counts after adding the indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   height     544 non-null    float64
 1   weight     544 non-null    float64
 2   age        544 non-null    float64
 3   male       544 non-null    int64  
 4   is_male    544 non-null    int64  
 5   is_female  544 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 25.6 KB


## Model With Indicator Variables

$$
\begin{align*}
    H_i      & \sim N(\mu_i, \sigma) \\
    \mu_i    & = \beta_f \cdot f_i + \beta_m \cdot m_i \\
    \beta_f  & \sim N(178, 20) \\
    \beta_m  & \sim N(178, 20) \\
    \sigma  & \sim \text{Uniform}(0, 50)
\end{align*}
$$

### Code 5.47

In [6]:
with pm.Model() as m_5_8:
    # Priors: one Normal(178, 20) coefficient per sex, Uniform(0, 50) noise scale.
    beta_f = pm.Normal('beta_f', mu=178, sigma=20)
    beta_m = pm.Normal('beta_m', mu=178, sigma=20)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    # The 0/1 indicators are complementary, so every row picks up exactly one
    # of the two coefficients as its mean.
    mu = pm.Deterministic('mu', beta_f * df['is_female'] + beta_m * df['is_male'])
    H = pm.Normal('height', mu=mu, sigma=sigma, observed=df['height'])

    # Fixed seed so that re-running the notebook reproduces the same draws.
    trace_5_8 = pm.sample(5000, tune=2000, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta_m, beta_f]


Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 11 seconds.


In [7]:
# Posterior summary of the two coefficients and sigma with an 89% HDI;
# the per-row deterministic `mu` is left out of the table.
az.summary(trace_5_8, var_names=['beta_f', 'beta_m', 'sigma'], hdi_prob=0.89)



Unnamed: 0,mean,sd,hdi_5.5%,hdi_94.5%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
beta_f,134.936,1.625,132.308,137.452,0.01,0.007,25863.0,25845.0,25854.0,14761.0,1.0
beta_m,142.582,1.717,139.87,145.309,0.011,0.008,25329.0,25292.0,25388.0,15090.0,1.0
sigma,27.419,0.828,26.068,28.705,0.005,0.004,25152.0,25000.0,25315.0,13980.0,1.0


In [8]:
# Convert the sampled trace into a DataFrame and preview the two
# coefficient columns.
df_tr_5_8 = pm.trace_to_dataframe(trace_5_8)
df_tr_5_8[['beta_f', 'beta_m']].head()

Unnamed: 0,beta_f,beta_m
0,133.751724,141.217282
1,133.751724,141.217282
2,137.112186,143.167468
3,131.734718,141.235762
4,135.870758,143.926389


In [9]:
# Row-wise contrast between the female and male coefficients.
df_tr_5_8['diff'] = df_tr_5_8['beta_f'].sub(df_tr_5_8['beta_m'])

# The 5.5% and 94.5% percentiles bracket the central 89% of each column.
summary_cols = ['beta_f', 'beta_m', 'diff']
df_tr_5_8.loc[:, summary_cols].describe(percentiles=[0.055, 0.945])

Unnamed: 0,beta_f,beta_m,diff
count,20000.0,20000.0,20000.0
mean,134.936356,142.581785,-7.645429
std,1.625362,1.71674,2.348622
min,128.653617,135.366323,-16.062242
5.5%,132.356593,139.861831,-11.404771
50%,134.928149,142.575925,-7.655799
94.5%,137.505852,145.30513,-3.89933
max,140.659911,148.942594,3.643682


## Categorical Variables in Primate Milk Example

In [10]:
# Read the primate milk data set and preview its first rows.
milk_csv = f'{DATA_FOLDER}/{PRIMATE_DATA}'
df_m = pd.read_csv(milk_csv)
df_m.head()

Unnamed: 0,clade,species,kcal.per.g,perc.fat,perc.protein,perc.lactose,mass,neocortex.perc
0,Strepsirrhine,Eulemur fulvus,0.49,16.6,15.42,67.98,1.95,55.16
1,Strepsirrhine,E macaco,0.51,19.27,16.91,63.82,2.09,
2,Strepsirrhine,E mongoz,0.46,14.11,16.85,69.04,2.51,
3,Strepsirrhine,E rubriventer,0.48,14.91,13.18,71.91,1.62,
4,Strepsirrhine,Lemur catta,0.6,27.28,19.5,53.22,2.19,


In [11]:
# List the distinct clade labels present in the data.
df_m['clade'].unique()

array(['Strepsirrhine', 'New World Monkey', 'Old World Monkey', 'Ape'],
      dtype=object)

In [12]:
# Preview the indicator (dummy) encoding: one column per clade label.
pd.get_dummies(df_m['clade']).head()

Unnamed: 0,Ape,New World Monkey,Old World Monkey,Strepsirrhine
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [13]:
# Map every clade label to its indicator column by NAME. Assigning the
# get_dummies frame to a list of new column names pairs them up purely by
# position, which silently depends on the alphabetical order of the labels.
clade_to_col = {
    'Ape': 'is_ape',
    'New World Monkey': 'is_nwm',
    'Old World Monkey': 'is_owm',
    'Strepsirrhine': 'is_str',
}

# dtype=int keeps the indicators as 0/1 integers; newer pandas versions
# return booleans by default.
clade_dummies = pd.get_dummies(df_m['clade'], dtype=int)

for clade, col in clade_to_col.items():
    df_m[col] = clade_dummies[clade]

df_m.head()

Unnamed: 0,clade,species,kcal.per.g,perc.fat,perc.protein,perc.lactose,mass,neocortex.perc,is_ape,is_nwm,is_owm,is_str
0,Strepsirrhine,Eulemur fulvus,0.49,16.6,15.42,67.98,1.95,55.16,0,0,0,1
1,Strepsirrhine,E macaco,0.51,19.27,16.91,63.82,2.09,,0,0,0,1
2,Strepsirrhine,E mongoz,0.46,14.11,16.85,69.04,2.51,,0,0,0,1
3,Strepsirrhine,E rubriventer,0.48,14.91,13.18,71.91,1.62,,0,0,0,1
4,Strepsirrhine,Lemur catta,0.6,27.28,19.5,53.22,2.19,,0,0,0,1


$$
\begin{align*}
    K_i      & \sim N(\mu_i, \sigma) \\
    \mu_i    & = \beta_a \cdot C_a + \beta_n \cdot C_n + \beta_o \cdot C_o + \beta_s \cdot C_s \\
    \beta_x  & \sim N(0, 0.5) \text{ for } x \in \{ a, n, o, s \} \\
    \sigma  & \sim \text{Exponential}(1)
\end{align*}
$$

In [14]:
from scipy.stats import zscore


# Standardized outcome: energy content of the milk.
df_m['kcal.per.g_stan'] = zscore(df_m['kcal.per.g'])

# Standardized log body mass.
df_m['log_mass_stan'] = zscore(np.log(df_m['mass']))

# `neocortex.perc` has missing values, and zscore propagates NaN by default,
# which would turn the ENTIRE standardized column into NaN. pandas' mean/std
# skip NaN, so the observed rows are standardized and only the missing rows
# stay NaN. ddof=0 matches the population standard deviation zscore uses.
neocortex = df_m['neocortex.perc']
df_m['neocortex.perc_stan'] = (neocortex - neocortex.mean()) / neocortex.std(ddof=0)

In [15]:
# Names of the per-clade coefficients, in the order they enter the model.
par_names = ['beta_a', 'beta_n', 'beta_o', 'beta_s']

# Coefficient name -> 0/1 indicator column it multiplies.
pars_to_cols = {'beta_a': 'is_ape',
                'beta_n': 'is_nwm',
                'beta_o': 'is_owm',
                'beta_s': 'is_str'}

with pm.Model() as m_5_9:
    sigma = pm.Exponential('sigma', lam=1)

    # One Normal(0, 0.5) prior per clade, keyed by coefficient name.
    pars_to_pymc3 = {p_name: pm.Normal(p_name, mu=0, sigma=0.5)
                     for p_name in par_names}

    # Linear predictor: each coefficient times its indicator column, summed.
    # The first term is passed as the start value so no constant 0 is added.
    terms = [pars_to_pymc3[p_name] * df_m[pars_to_cols[p_name]]
             for p_name in par_names]
    mu = pm.Deterministic('mu', sum(terms[1:], terms[0]))

    K = pm.Normal('kcal', mu=mu, sigma=sigma, observed=df_m['kcal.per.g_stan'])

    # Fixed seed so that re-running the notebook reproduces the same draws.
    trace_5_9 = pm.sample(5000, tune=2000, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_s, beta_o, beta_n, beta_a, sigma]


Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 16 seconds.


In [16]:
# Posterior summary of the four clade coefficients with an 89% HDI.
az.summary(trace_5_9, var_names=par_names, hdi_prob=0.89)



Unnamed: 0,mean,sd,hdi_5.5%,hdi_94.5%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
beta_a,-0.468,0.24,-0.84,-0.084,0.001,0.001,27436.0,24454.0,27651.0,14779.0,1.0
beta_n,0.353,0.242,-0.025,0.744,0.002,0.001,25957.0,21402.0,26211.0,13797.0,1.0
beta_o,0.644,0.285,0.179,1.086,0.002,0.001,25558.0,24386.0,25822.0,15370.0,1.0
beta_s,-0.553,0.299,-1.045,-0.092,0.002,0.001,24996.0,23216.0,25145.0,15587.0,1.0
