# A simpl(er) Introduction to Hierarchical Linear Regressions 
### Naive Bayesians, 2020

By using synthetically generated data we will compare pooled, unpooled (individual) models and Bayesian hierarchical linear regressions 

In [54]:
A = [1, 2, 3]

A.pop(0)
A.insert(3, "lala")
A

[2, 3, 'lala']

<IPython.core.display.Javascript object>

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import pymc3 as pm
from sklearn.linear_model import LinearRegression
from typing import Tuple
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff

<IPython.core.display.Javascript object>

In [3]:
plt.rcParams["figure.figsize"] = (8, 4)
plt.rcParams["font.size"] = 12

<IPython.core.display.Javascript object>

In [4]:
def create_dataset() -> pd.DataFrame:
    """
    Assume there are 2 sites with different regression coefficients 
    
    """

    n = 20
    beta_opt_1 = 2.0
    beta_opt_2 = -5.0

    std_noise_x = 0.5
    std_noise_y = 1

    x_1 = std_noise_x * np.random.randn(n)
    y_1 = beta_opt_1 * x_1 + std_noise_y * np.random.randn(n)

    x_2 = std_noise_x * np.random.randn(n)
    y_2 = beta_opt_2 * x_2 + std_noise_y * np.random.randn(n)

    df_1 = pd.DataFrame({"y": y_1, "x": x_1}).assign(
        **{"site": "site_A", "site_idx": 0}
    )
    df_2 = pd.DataFrame({"y": y_2, "x": x_2}).assign(
        **{"site": "site_B", "site_idx": 1}
    )

    df = pd.concat([df_1, df_2], axis="rows")

    return df


data = create_dataset()
data.sample(5)

Unnamed: 0,y,x,site,site_idx
1,-0.263065,0.066934,site_A,0
13,4.204973,-0.827595,site_B,1
14,-0.263547,0.339652,site_A,0
13,1.778967,0.47997,site_A,0
8,-0.390537,-0.088923,site_A,0


<IPython.core.display.Javascript object>

In [5]:
n_sites = len(data["site"].unique())
site_indices = data["site_idx"].values

print(f"Number of sites: {n_sites}\n")
print("Indicator to match each data point to a site:\n", site_indices)

Number of sites: 2

Indicator to match each data point to a site:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


<IPython.core.display.Javascript object>

In [6]:
# Pooled model: Fitting a regression for both models
pooled_lin_mod = LinearRegression(fit_intercept=False)
pooled_lin_mod.fit(data[["x"]], data[["y"]])
print("Pooled model for both sites", pooled_lin_mod.coef_[0])


# Fitting a regular regression model for each site separately
for site_name, site_data in data.groupby("site"):
    lin_mod = LinearRegression(fit_intercept=False)
    lin_mod.fit(site_data[["x"]], site_data[["y"]])
    print(f"Individual model for: {site_name}", lin_mod.coef_[0])

Pooled model for both sites [-0.50500524]
Individual model for: site_A [2.79127743]
Individual model for: site_B [-5.09981824]


<IPython.core.display.Javascript object>

### Pooled Model

In [7]:
with pm.Model() as pooled_model:

    # Define priors
    sigma_pooled = pm.HalfCauchy("sigma_pooled", beta=5)
    beta_pooled = pm.Normal("beta_pooled", mu=0, sd=20)

    # Define likelihood
    likelihood = pm.Normal(
        "y_hat", mu=beta_pooled * data["x"], sd=sigma_pooled, observed=data["y"]
    )

    # Inference!
    trace_pooled = pm.sample(draws=3000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [beta_pooled, sigma_pooled]


Sampling 2 chains for 1_000 tune and 3_000 draw iterations (2_000 + 6_000 draws total) took 56 seconds.


<IPython.core.display.Javascript object>

In [25]:
posterior_beta_pooled = trace_pooled.get_values("beta_pooled", burn=1000, chains=[0])
fig = ff.create_distplot(
    posterior_beta_pooled.reshape(1, -1),
    ["Pooled Estimate"],
    show_hist=False,
    show_rug=False,
)
fig.show()

<IPython.core.display.Javascript object>

In [35]:
all_posteriors = posterior_beta_pooled.copy().reshape(1, -1)

<IPython.core.display.Javascript object>

### Individual (Unpooled) Model


Instead of specifying 2 separate models for each site, we can build a "single" model but ensure that the data for each site is fitted independently. This is done using the array called `site_indices` which specifies data points to the sites

In [10]:
with pm.Model() as individual_model:

    # Define priors now with mu
    sigma_individual = pm.HalfCauchy("sigma_individual", beta=5, shape=n_sites)
    beta_individual = pm.Normal("beta_individual", mu=0, sd=20, shape=n_sites)

    # Define likelihood
    likelihood = pm.Normal(
        "y_individual",
        mu=beta_individual[site_indices] * data["x"],
        sd=sigma_individual[site_indices],
        observed=data["y"],
    )

    # Inference!
    trace_individual = pm.sample(draws=3000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [beta_individual, sigma_individual]


Sampling 2 chains for 1_000 tune and 3_000 draw iterations (2_000 + 6_000 draws total) took 54 seconds.


<IPython.core.display.Javascript object>

In [36]:
posterior_beta_individual = trace_individual.get_values(
    "beta_individual", burn=1000, chains=[0]
)

all_posteriors = np.concatenate([all_posteriors, posterior_beta_individual.T])


fig = ff.create_distplot(
    all_posteriors,
    ["Pooled", "Individual Estimates_A", "Individual Estimates_B"],
    show_hist=False,
    show_rug=False,
)
fig.show()

<IPython.core.display.Javascript object>

In [12]:
posterior_beta_pooled

array([-0.97948039, -1.69681752, -1.89923144, ..., -1.97870578,
        1.04260188,  0.75237113])

<IPython.core.display.Javascript object>

### Hierarchical Model


In [34]:
with pm.Model() as hierarchical_model:

    sigma_hierarchical = pm.HalfCauchy("sigma_hierarchical", beta=5, shape=n_sites)

    # The step that makes it hierarchical
    # We only assume beta is linked to the global
    mu_global = pm.Normal("mu_global", mu=0, sd=1)
    #     sd_global = pm.HalfCauchy("sd_global", beta=0.1)
    beta_hierarchical = pm.Normal(
        "beta_hierarchical", mu=mu_global, sd=0.5, shape=n_sites
    )

    # Define likelihood
    likelihood = pm.Normal(
        "y_hierarchical",
        mu=beta_hierarchical[site_indices] * data["x"],
        sigma=sigma_hierarchical[site_indices],
        observed=data["y"],
    )

    # Inference!
    trace_hierarchical = pm.sample(draws=3000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [beta_hierarchical, mu_global, sigma_hierarchical]


Sampling 2 chains for 1_000 tune and 3_000 draw iterations (2_000 + 6_000 draws total) took 43 seconds.


<IPython.core.display.Javascript object>

In [37]:
posterior_beta_hierarchical = trace_hierarchical.get_values(
    "beta_hierarchical", burn=1000, chains=[1]
)

all_posteriors = np.concatenate([all_posteriors, posterior_beta_hierarchical.T])


fig = ff.create_distplot(
    all_posteriors,
    ["Pooled", "Individual Estimates_A", "Individual Estimates_B"]
    + ["Hierarchical Estimates_A", "Hierarchical Estimates_B"],
    show_hist=False,
    show_rug=False,
)
fig.show()

<IPython.core.display.Javascript object>

In [15]:
# fig.add_trace(
#       go.Scatter(
#                x=df_posterior.index,
#                y=df_posterior[col],
#                name="lala")

# layout = go.Layout(
#     title="",
#     font=dict(family="Arial", size=14),
#     paper_bgcolor="rgba(0,0,0,0)",
#     plot_bgcolor="rgba(0,0,0,0)",
# )


# fig = go.Figure(layout=layout)
# fig.update_layout(title="Title")
# fig.update_xaxes(linecolor="black", title="Beta", showgrid=True, gridcolor="LightGray",)
# fig.update_yaxes(ticksuffix="", linecolor="black", title="Density",)
# fig.show()

data_radon = pd.read_csv(pm.get_data("radon.csv"))
# data_radon["log_radon"] = data_radon["log_radon"].astype(theano.config.floatX)
# county_names = data_radon.county.unique()
# county_idx = data_radon.county_code.values
data_radon

Unnamed: 0.1,Unnamed: 0,idnum,state,state2,stfips,zip,region,typebldg,floor,room,...,pcterr,adjwt,dupflag,zipflag,cntyfips,county,fips,Uppm,county_code,log_radon
0,0,5081.0,MN,MN,27.0,55735,5.0,1.0,1.0,3.0,...,9.7,1146.499190,1.0,0.0,1.0,AITKIN,27001.0,0.502054,0,0.832909
1,1,5082.0,MN,MN,27.0,55748,5.0,1.0,0.0,4.0,...,14.5,471.366223,0.0,0.0,1.0,AITKIN,27001.0,0.502054,0,0.832909
2,2,5083.0,MN,MN,27.0,55748,5.0,1.0,0.0,4.0,...,9.6,433.316718,0.0,0.0,1.0,AITKIN,27001.0,0.502054,0,1.098612
3,3,5084.0,MN,MN,27.0,56469,5.0,1.0,0.0,4.0,...,24.3,461.623670,0.0,0.0,1.0,AITKIN,27001.0,0.502054,0,0.095310
4,4,5085.0,MN,MN,27.0,55011,3.0,1.0,0.0,4.0,...,13.8,433.316718,0.0,0.0,3.0,ANOKA,27003.0,0.428565,1,1.163151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,914,5995.0,MN,MN,27.0,55363,5.0,1.0,0.0,4.0,...,4.5,1146.499190,0.0,0.0,171.0,WRIGHT,27171.0,0.913909,83,1.871802
915,915,5996.0,MN,MN,27.0,55376,5.0,1.0,0.0,7.0,...,8.3,1105.956867,0.0,0.0,171.0,WRIGHT,27171.0,0.913909,83,1.526056
916,916,5997.0,MN,MN,27.0,55376,5.0,1.0,0.0,4.0,...,5.2,1214.922779,0.0,0.0,171.0,WRIGHT,27171.0,0.913909,83,1.629241
917,917,5998.0,MN,MN,27.0,56297,5.0,1.0,0.0,4.0,...,9.6,1177.377355,0.0,0.0,173.0,YELLOW MEDICINE,27173.0,1.426590,84,1.335001


<IPython.core.display.Javascript object>

In [16]:
len(data_radon.county_code.unique())

85

<IPython.core.display.Javascript object>

In [17]:
with pm.Model() as unpooled_model:

    # Independent parameters for each county
    b = pm.Normal("b", 0, sigma=100, shape=2)

<IPython.core.display.Javascript object>

In [18]:
b[[0, 0, 0]]

AdvancedSubtensor1.0

<IPython.core.display.Javascript object>

Hi all, 

I’ve attempted to create the simplest possible example illustrate hierarchical models for linear regression (even simpler than Gellman et. al.’s [Radon Gas case study](https://docs.pymc.io/notebooks/GLM-hierarchical.html)). 

**Problem Setup**
The data has 2 univariate Regressions (scalar regression coefficient + no intercept) 

$$
\begin{align}
x_{1} {} & \sim \text{Normal}(\mu = 0, \sigma^2 = 0.25)  \\
x_{2} {} & \sim \text{Normal}(\mu = 0, \sigma^2 = 0.25)  \\
\eta {} & \sim \text{Normal}(\mu = 0, \sigma^2 = 1)  \\
\beta_1 {} & =  2, \hspace{2mm} \beta_2 = -5 \\
y_1 {} & = \beta_1x_1 + \eta \\
y_2 {} & = \beta_2x_1 + \eta 
\end{align}
$$

Then I concatenated $y_1, y_2$ and $x_1, x_2$ into a single vectors e.g. $\mathbf{y} = [y_1^{(0)} \dots y_2^{N_1}, y_2^{(0)} \dots y_2^{N_2}]$ and labelled added an inidcator variable. 



Then, I performed inference using 3 different schemes (similar to the Radon example) 
* Pooled
* Unpooled (Individual regressions for $y_1$, and $y_2$)
* Hierarachical Regression

Pooled
```python
with pm.Model() as pooled_model:

    # Define priors
    sigma_pooled = pm.HalfCauchy("sigma_pooled", beta=5)
    beta_pooled = pm.Normal("beta_pooled", mu=0, sd=20)

    # Define likelihood
    likelihood = pm.Normal(
        "y_hat", mu=beta_pooled * data["x"], sd=sigma_pooled, observed=data["y"]
    )

    # Inference!
    trace_pooled = pm.sample(draws=3000)
    
    # Getting one chain
    posterior_beta_pooled = trace_pooled.get_values("beta_pooled", burn=1000, chains=[0])

```
Unpooled

```python
with pm.Model() as individual_model:

    # Define priors now with mu
    sigma_individual = pm.HalfCauchy("sigma_individual", beta=5, shape=n_sites)
    beta_individual = pm.Normal("beta_individual", mu=0, sd=20, shape=n_sites)

    # Define likelihood
    likelihood = pm.Normal(
        "y_individual",
        mu=beta_individual[site_indices] * data["x"],
        sd=sigma_individual[site_indices],
        observed=data["y"],
    )

    # Inference!
    trace_individual = pm.sample(draws=3000)
    
    posterior_beta_individual = trace_individual.get_values(
        "beta_individual", burn=1000, chains=[0]
    )
```
Hierarachical
```python
with pm.Model() as hierarchical_model:

    sigma_hierarchical = pm.HalfCauchy("sigma_hierarchical", beta=5, shape=n_sites)

    # The step that makes it hierarchical
    # We only assume beta is linked to the global
    mu_global = pm.Normal("mu_global", mu=0, sd=0.1)
    sd_global = pm.HalfCauchy("sd_global", beta=0.1)
    beta_hierarchical = pm.Normal(
        "beta_hierarchical", mu=mu_global, sd=sd_global, shape=n_sites
    )

    # Define likelihood
    likelihood = pm.Normal(
        "y_hierarchical",
        mu=beta_hierarchical[site_indices] * data["x"],
        sigma=sigma_hierarchical[site_indices],
        observed=data["y"],
    )

    # Inference!
    trace_hierarchical = pm.sample(draws=3000)
    
    posterior_beta_hierarchical = trace_hierarchical.get_values(
        "beta_hierarchical", burn=1000, chains=[1]
    )
```

The goal of the excercies was to show that the hierarchical estimates sits somewhere in between the pooled and individual estimates. And this difference is effected by a few things 

1) The confidence in the hyper-prior (paramet
2) The amount of data available for the different "sites" (i.e. confidence in the data)


But I find the hierarchical model to always yield posteriors closer to the individual estimates. 