In [1]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from scipy.stats import norm


# Select the 'arviz-darkgrid' style through ArviZ's style interface.
az.style.use('arviz-darkgrid')

In [2]:
# Where the input lives: the data folder (relative path) and the CSV file name in it.
DATA_FOLDER = '../../data'
WAFFLE_DATA = 'waffle_divorce.csv'

In [3]:
# Join the configured folder and file name into the CSV path, load it,
# and preview the first five rows.
waffle_csv_path = '/'.join((DATA_FOLDER, WAFFLE_DATA))
df_waf = pd.read_csv(waffle_csv_path)
df_waf.head(5)

Unnamed: 0.1,Unnamed: 0,Location,Loc,Population,MedianAgeMarriage,Marriage,Marriage.SE,Divorce,Divorce.SE,WaffleHouses,South,Slaves1860,Population1860,PropSlaves1860
0,1,Alabama,AL,4.78,25.3,20.2,1.27,12.7,0.79,128,1,435080,964201,0.45
1,2,Alaska,AK,0.71,25.2,26.0,2.93,12.5,2.05,0,0,0,0,0.0
2,3,Arizona,AZ,6.33,25.8,20.3,0.98,10.8,0.74,18,0,0,0,0.0
3,4,Arkansas,AR,2.92,24.3,26.4,1.7,13.5,1.22,41,1,111115,435450,0.26
4,5,California,CA,37.25,26.8,19.1,0.39,8.0,0.24,0,0,0,379994,0.0


In [4]:
# Standardize each variable to a z-score: subtract the column mean, then divide
# by the column standard deviation (pandas' default `.std()`).
for raw_col, stan_col in (
    ('MedianAgeMarriage', 'age_stan'),
    ('Divorce', 'divorce_stan'),
    ('Marriage', 'marriage_stan'),
):
    centered = df_waf[raw_col] - df_waf[raw_col].mean()
    df_waf[stan_col] = centered / df_waf[raw_col].std()

# Predictor Residual Plots

## Marriage rate against median age

$$
\begin{align*}
    M_i     & \sim N(\mu_i, \sigma) \\
    \mu_i   & = \alpha + \beta_A \cdot A_i\\
    \alpha  & \sim N(0, 0.2) \\
    \beta_A & \sim N(0, 0.5) \\
    \sigma  & \sim \text{Exponential}(1)
\end{align*}
$$

### Code 5.13

In [None]:
# Model 5.4: regress the standardized marriage rate on the standardized median age.
with pm.Model() as m_5_4:
    # Priors for the intercept, the slope on median age, and the noise scale.
    alpha = pm.Normal('alpha', mu=0.0, sigma=0.2)
    beta_A = pm.Normal('beta_A', mu=0.0, sigma=0.5)
    sigma = pm.Exponential('sigma', lam=1.0)

    # Linear predictor, one value per row of df_waf. It is registered as a
    # Deterministic so that it is stored in the trace; the predictor is passed
    # as a plain NumPy array.
    mu = pm.Deterministic('mu', alpha + beta_A * df_waf['age_stan'].values)
    M = pm.Normal('marriage', mu=mu, sigma=sigma, observed=df_waf['marriage_stan'])

    # A fixed seed makes the posterior draws reproducible under
    # "Restart Kernel -> Run All".
    trace_5_4 = pm.sample(5000, tune=2000, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta_A, alpha]


Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 11 seconds.


In [None]:
# Convert the sampled trace of model 5.4 into a DataFrame and preview it.
df_tr_5_4 = pm.trace_to_dataframe(trace=trace_5_4)
df_tr_5_4.head(5)

In [None]:
# Keep only the columns whose name matches the `mu` pattern.
mu_column_pattern = '(mu.*)'
df_mu_5_4 = df_tr_5_4.filter(regex=mu_column_pattern)
df_mu_5_4.head(5)

In [None]:
# Average every `mu` column over the rows of the trace DataFrame,
# then show the first five means.
mu_means_5_4 = df_mu_5_4.apply(np.mean, axis='index')
mu_means_5_4.iloc[:5]

In [None]:
# Residual = observed standardized marriage rate minus the mean of `mu`.
observed_marriage = df_waf['marriage_stan'].to_numpy()
predicted_marriage = mu_means_5_4.to_numpy()
residuals_marriage = observed_marriage - predicted_marriage
residuals_marriage[:5]

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: observed marriage rate against median age, the mean of `mu`
# as a line, and a grey segment per point running from that mean to the
# observed value (its length is the residual).
ax1.scatter(df_waf['age_stan'], df_waf['marriage_stan'], marker='.')
ax1.plot(df_waf['age_stan'], mu_means_5_4, 'k-', alpha=0.5)
ax1.vlines(df_waf['age_stan'], mu_means_5_4.values, mu_means_5_4.values + residuals_marriage, colors='grey')

ax1.set_title('Marriage Rate vs. Age')
ax1.set_xlabel('Age Standardized')
ax1.set_ylabel('Marriage Rate Standardized')
ax1.set_aspect('equal')

# Right panel: divorce rate against the marriage-rate residuals, with a
# dashed vertical reference line at a residual of zero.
ax2.scatter(residuals_marriage, df_waf['divorce_stan'], marker='.')
ax2.axvline(x=0.0, ls='--', color='k', alpha=0.3)

ax2.set_title('Divorce vs. Residuals')
ax2.set_xlabel('Marriage Rate Residuals')
ax2.set_ylabel('Divorce Rate Standardized')
ax2.set_aspect('equal')

# Degree-1 least-squares fit of divorce rate on the residuals, evaluated
# with a vectorized expression rather than a Python loop.
m, c = np.polyfit(residuals_marriage, df_waf['divorce_stan'], deg=1)
fitted_line = m * residuals_marriage + c

ax2.plot(residuals_marriage, fitted_line, 'k-', alpha=0.5)

plt.show()

The plot of the divorce rate against the marriage rate residuals can be thought of as a plot of the divorce rate against the marriage rate after controlling for the median age at marriage. As the plot shows, the divorce rate varies very little with the residuals, suggesting that the divorce rate is not causally related to the marriage rate (that is, there is no directed arrow from $M$ to $D$).

## Median age against marriage rate
$$
\begin{align*}
    A_i     & \sim N(\mu_i, \sigma) \\
    \mu_i   & = \alpha + \beta_M \cdot M_i\\
    \alpha  & \sim N(0, 0.2) \\
    \beta_M & \sim N(0, 0.5) \\
    \sigma  & \sim \text{Exponential}(1)
\end{align*}
$$

In [None]:
# Model 5.5: regress the standardized median age on the standardized marriage rate.
with pm.Model() as m_5_5:
    # Priors for the intercept, the slope on marriage rate, and the noise scale.
    alpha = pm.Normal('alpha', mu=0.0, sigma=0.2)
    beta_M = pm.Normal('beta_M', mu=0.0, sigma=0.5)
    sigma = pm.Exponential('sigma', lam=1.0)

    # Linear predictor, one value per row of df_waf. It is registered as a
    # Deterministic so that it is stored in the trace; the predictor is passed
    # as a plain NumPy array.
    mu = pm.Deterministic('mu', alpha + beta_M * df_waf['marriage_stan'].values)
    A = pm.Normal('age', mu=mu, sigma=sigma, observed=df_waf['age_stan'])

    # A fixed seed makes the posterior draws reproducible under
    # "Restart Kernel -> Run All".
    trace_5_5 = pm.sample(5000, tune=2000, random_seed=42)

In [None]:
# Convert the sampled trace of model 5.5 into a DataFrame and preview it.
df_tr_5_5 = pm.trace_to_dataframe(trace=trace_5_5)
df_tr_5_5.head(5)

In [None]:
# Select the `mu` columns and average each one over the rows of the trace DataFrame.
mu_column_pattern_5_5 = '(mu.*)'
df_mu_5_5 = df_tr_5_5.filter(regex=mu_column_pattern_5_5)
mu_means_5_5 = df_mu_5_5.apply(np.mean, axis='index')

# Residual = observed standardized median age minus the mean of `mu`.
residuals_age = df_waf['age_stan'].to_numpy() - mu_means_5_5.to_numpy()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: observed median age against marriage rate, the mean of `mu`
# as a line, and a grey segment per point running from that mean to the
# observed value (its length is the residual).
ax1.scatter(df_waf['marriage_stan'], df_waf['age_stan'], marker='.')
ax1.plot(df_waf['marriage_stan'], mu_means_5_5, 'k-', alpha=0.5)
ax1.vlines(df_waf['marriage_stan'], mu_means_5_5.values, mu_means_5_5.values + residuals_age, colors='grey')

ax1.set_title('Age vs. Marriage Rate')
ax1.set_ylabel('Age Standardized')
ax1.set_xlabel('Marriage Rate Standardized')
ax1.set_aspect('equal')

# Right panel: divorce rate against the median-age residuals, with a
# dashed vertical reference line at a residual of zero.
ax2.scatter(residuals_age, df_waf['divorce_stan'], marker='.')
ax2.axvline(x=0.0, ls='--', color='k', alpha=0.3)

ax2.set_title('Divorce vs. Residuals')
ax2.set_xlabel('Median Age Residuals')
ax2.set_ylabel('Divorce Rate Standardized')
ax2.set_aspect('equal')

# Degree-1 least-squares fit of divorce rate on the residuals, evaluated
# with a vectorized expression rather than a Python loop.
m, c = np.polyfit(residuals_age, df_waf['divorce_stan'], deg=1)
fitted_line = m * residuals_age + c

ax2.plot(residuals_age, fitted_line, 'k-', alpha=0.5)

plt.show()

## Posterior Prediction Plots Model 5.3

$$
\begin{align*}
    D_i     & \sim N(\mu_i, \sigma) \\
    \mu_i   & = \alpha + \beta_A \cdot A_i + \beta_M \cdot M_i\\
    \alpha  & \sim N(0, 0.2) \\
    \beta_A & \sim N(0, 0.5) \\
    \beta_M & \sim N(0, 0.5) \\
    \sigma  & \sim \text{Exponential}(1)
\end{align*}
$$

### Code 5.15 and 5.16

In [None]:
# Model 5.3: regress the standardized divorce rate on both the standardized
# median age and the standardized marriage rate.
with pm.Model() as m_5_3:
    # Priors for the intercept, the two slopes, and the noise scale.
    alpha = pm.Normal('alpha', mu=0.0, sigma=0.2)
    beta_A = pm.Normal('beta_A', mu=0.0, sigma=0.5)
    beta_M = pm.Normal('beta_M', mu=0.0, sigma=0.5)
    sigma = pm.Exponential('sigma', lam=1.0)

    # Linear predictor, one value per row of df_waf. It is registered as a
    # Deterministic so that it is stored in the trace; the predictors are
    # passed as plain NumPy arrays.
    mu = pm.Deterministic(
        'mu',
        alpha + beta_A * df_waf['age_stan'].values + beta_M * df_waf['marriage_stan'].values,
    )
    D = pm.Normal('divorce', mu=mu, sigma=sigma, observed=df_waf['divorce_stan'])

    # A fixed seed makes the posterior draws reproducible under
    # "Restart Kernel -> Run All".
    trace_5_3 = pm.sample(5000, tune=2000, random_seed=42)

In [None]:
# Trace of model 5.3 as a DataFrame, reduced to its `mu` columns and averaged
# column by column.
df_tr_5_3 = pm.trace_to_dataframe(trace=trace_5_3)
df_mu_5_3 = df_tr_5_3.filter(regex='(mu.*)')
mu_means_5_3 = df_mu_5_3.apply(np.mean, axis='index')

# ArviZ summary table for the same trace, using an 89% HDI.
summary_all = az.summary(data=trace_5_3, hdi_prob=0.89)
summary_all.head(5)

In [None]:
# Show the row labels of the summary table; the cells below select rows by
# these labels (e.g. 'mu[0]').
summary_all.index

In [None]:
# Show the full summary row labelled 'mu[0]'.
summary_all.loc['mu[0]']

In [None]:
# Select the `mu[...]` rows by their label. The previous open-ended slice
# from 'mu[0]' also picked up any row that happened to follow the mu rows.
is_mu_row = summary_all.index.str.startswith('mu[')
summary_mu = summary_all.loc[is_mu_row].copy()

# Bounds of the 89% HDI for each `mu` row.
lower = summary_mu['hdi_5.5%']
upper = summary_mu['hdi_94.5%']

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1)

observed_divorce = df_waf['divorce_stan']

# Mean of `mu` against the observed divorce rate, with a grey segment per
# point spanning the lower and upper HDI bounds.
ax.scatter(observed_divorce, mu_means_5_3.values, marker='.')
ax.vlines(observed_divorce, lower.values, upper.values, colors='grey')

# Identity line: points on it have a mean prediction equal to the observation.
ax.plot(observed_divorce, observed_divorce, color='black', alpha=0.3)

ax.set(xlabel='Divorce Rate Standardized', ylabel='Mean Predicted Rate')

plt.show()

## Simulating Spurious Associations

In [None]:
# Simulate a real predictor, a spurious predictor and an outcome.
# `norm` comes from the notebook's import cell. One seeded random state is
# shared by all three draws so the simulation is reproducible.
N = 1000
SIM_SEED = 42
rng = np.random.RandomState(SIM_SEED)

# x_real: N standard-normal draws.
x_real = norm.rvs(loc=0, scale=1, size=N, random_state=rng)
# x_spur and y are both normal draws centred on x_real (scale 1); neither is
# generated from the other.
x_spur = norm.rvs(loc=x_real, scale=1, random_state=rng)
y = norm.rvs(loc=x_real, scale=1, random_state=rng)

### The Model

$$
\begin{align*}
    y_i     & \sim N(\mu_i, \sigma) \\
    \mu_i   & = \alpha + \beta_r \cdot x_r + \beta_s \cdot x_s\\
    \alpha  & \sim N(0, 0.2) \\
    \beta_r & \sim N(0, 0.5) \\
    \beta_s & \sim N(0, 0.5) \\
    \sigma  & \sim \text{Exponential}(1)
\end{align*}
$$


where $x_r$ stands for the real causal predictor and $x_s$ is the spurious predictor.

In [None]:
# Regress the simulated outcome on both the real and the spurious predictor.
with pm.Model() as m_spur:
    # Priors for the intercept, the two slopes, and the noise scale.
    alpha = pm.Normal('alpha', mu=0.0, sigma=0.2)
    beta_r = pm.Normal('beta_r', mu=0.0, sigma=0.5)
    beta_s = pm.Normal('beta_s', mu=0.0, sigma=0.5)
    sigma = pm.Exponential('sigma', lam=1.0)

    # Linear predictor, registered as a Deterministic so it is stored in the trace.
    mu = pm.Deterministic('mu', alpha + beta_r * x_real + beta_s * x_spur)
    Y = pm.Normal('outcome', mu=mu, sigma=sigma, observed=y)

    # A fixed seed makes the posterior draws reproducible under
    # "Restart Kernel -> Run All".
    trace_spur = pm.sample(5000, tune=1000, random_seed=42)

In [None]:
# Summarize only the intercept and the two slopes, using an 89% HDI.
coefficient_names = ['alpha', 'beta_r', 'beta_s']
summary = az.summary(trace_spur, var_names=coefficient_names, hdi_prob=0.89)
summary

As expected, the posterior mean of $\beta_s$ is close to $0$, whereas that of $\beta_r$ is close to $1$.