In [1]:
import arviz as az
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymc3 as pm
import seaborn as sns


# Select the 'arviz-darkgrid' style sheet for the figures drawn in this notebook.
az.style.use('arviz-darkgrid')

In [2]:
# Location of the input data: folder first, then the file inside it.
# The load cell joins the two as '<DATA_FOLDER>/<HEIGHT_DATA>'.
DATA_FOLDER = '../../data'
HEIGHT_DATA = 'howell1.csv'

In [3]:
# Read the height data set from '<DATA_FOLDER>/<HEIGHT_DATA>' and preview
# the first rows.
df = pd.read_csv('/'.join((DATA_FOLDER, HEIGHT_DATA)))
df.head()

Unnamed: 0,height,weight,age,male
0,151.765,47.825606,63.0,1
1,139.7,36.485807,63.0,0
2,136.525,31.864838,65.0,0
3,156.845,53.041914,41.0,1
4,145.415,41.276872,51.0,0


In [4]:
# Build a pair of complementary 0/1 indicator columns from `male`:
# `is_male` is 1 exactly where male == 1, and `is_female` is its complement,
# so every row has exactly one of the two set to 1.
df['is_male'] = np.where(df['male'].eq(1), 1, 0)
df['is_female'] = 1 - df['is_male']
df.head()

Unnamed: 0,height,weight,age,male,is_male,is_female
0,151.765,47.825606,63.0,1,1,0
1,139.7,36.485807,63.0,0,0,1
2,136.525,31.864838,65.0,0,0,1
3,156.845,53.041914,41.0,1,1,0
4,145.415,41.276872,51.0,0,0,1


In [5]:
# Print the column names, non-null counts and dtypes, including the two
# indicator columns added above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   height     544 non-null    float64
 1   weight     544 non-null    float64
 2   age        544 non-null    float64
 3   male       544 non-null    int64  
 4   is_male    544 non-null    int64  
 5   is_female  544 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 25.6 KB


## Model With Indicator Variables

$$
\begin{align*}
    H_i      & \sim N(\mu_i, \sigma) \\
    \mu_i    & = \beta_f \cdot f_i + \beta_m \cdot m_i \\
    \beta_f  & \sim N(178, 20) \\
    \beta_m  & \sim N(178, 20) \\
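    \sigma  & \sim \text{Uniform}(0, 50)
\end{align*}
$$

Here $f_i$ and $m_i$ are 0/1 indicator variables (the `is_female` and `is_male` columns): exactly one of them equals 1 for each individual $i$, so $\mu_i$ is $\beta_f$ for females and $\beta_m$ for males.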

### Code 5.47

In [6]:
# Model m_5_8 (Code 5.47): height modelled with two 0/1 indicator columns and
# no shared intercept, so each coefficient only contributes to the rows of its
# own group.
with pm.Model() as m_5_8:
    # Both group coefficients get the same Normal(178, 20) prior.
    beta_f = pm.Normal('beta_f', mu=178, sigma=20)
    beta_m = pm.Normal('beta_m', mu=178, sigma=20)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    # Linear predictor: beta_f is multiplied by the is_female column and
    # beta_m by the is_male column. Registering it as a Deterministic stores
    # `mu` in the trace alongside the sampled parameters.
    mu = pm.Deterministic('mu', beta_f * df['is_female'] + beta_m * df['is_male'])
    H = pm.Normal('height', mu=mu, sigma=sigma, observed=df['height'])

    # Pin the sampler seed so that Restart & Run All yields the same draws
    # (and therefore the same summary tables below) on every run.
    trace_5_8 = pm.sample(5000, tune=2000, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta_m, beta_f]


Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 9 seconds.


In [7]:
# Summarise only the three sampled parameters with an 89% HDI; `var_names`
# leaves out the `mu` Deterministic that the model also registers.
az.summary(trace_5_8, var_names=['beta_f', 'beta_m', 'sigma'], hdi_prob=0.89)



Unnamed: 0,mean,sd,hdi_5.5%,hdi_94.5%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
beta_f,134.93,1.62,132.307,137.448,0.01,0.007,26498.0,26484.0,26488.0,13724.0,1.0
beta_m,142.569,1.717,139.879,145.349,0.011,0.008,26087.0,26087.0,26086.0,14583.0,1.0
sigma,27.421,0.837,26.114,28.782,0.005,0.004,26741.0,26535.0,26967.0,13661.0,1.0


In [8]:
# Convert the trace to a DataFrame of posterior draws and preview the two
# group coefficients.
df_tr_5_8 = pm.trace_to_dataframe(trace_5_8)
df_tr_5_8[['beta_f', 'beta_m']].head()

Unnamed: 0,beta_f,beta_m
0,140.189733,139.254911
1,132.561391,146.413293
2,134.692619,141.942948
3,134.692619,141.942948
4,136.691656,142.265394


In [9]:
# Per-draw contrast between the two coefficients (beta_f minus beta_m),
# summarised together with the coefficients at the 5.5% / 94.5% percentiles.
df_tr_5_8['diff'] = df_tr_5_8['beta_f'].sub(df_tr_5_8['beta_m'])
df_tr_5_8.loc[:, ['beta_f', 'beta_m', 'diff']].describe(percentiles=[0.055, 0.945])

Unnamed: 0,beta_f,beta_m,diff
count,20000.0,20000.0,20000.0
mean,134.93017,142.568826,-7.638656
std,1.61981,1.717243,2.391734
min,128.554117,136.032007,-17.368563
5.5%,132.360799,139.840565,-11.479579
50%,134.919715,142.560139,-7.637808
94.5%,137.515286,145.322431,-3.833229
max,141.527395,149.681463,1.534426
