In [1]:
import pandas as pd
import numpy as np
import altair as alt

import pymc as pm
import arviz as az

data = pd.read_csv('../data/discrimination.csv', index_col=0)

data.head()

Unnamed: 0,gender,age,ethnicity,party,conservative,discrimination
0,male,36,estonian,Keskerakond,1,0
1,female,50,estonian,SDE,0,0
2,male,67,estonian,Isamaa,1,0
3,male,58,estonian,Reform,0,0
4,male,62,estonian,Reform,1,0


In [2]:
data.groupby('discrimination').conservative.value_counts(normalize=True)

discrimination  conservative
0               1               0.518341
                0               0.481659
1               1               0.774799
                0               0.225201
Name: proportion, dtype: float64

In [3]:
alt.Chart(data.replace({0: 'No', 1: 'Yes'}), width=300).mark_bar().encode(
    x='conservative:N',
    xOffset='discrimination:N',
    y='count()',
    color='discrimination:N'
)

In [4]:
### Let's code the model

coords = {"predictors": ["conservative"]}

with pm.Model(coords=coords) as model:
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta_conservative = pm.Normal("beta_conservative", mu=0, sigma=10)
    
    p = pm.math.sigmoid(alpha + beta_conservative * data["conservative"])
    likelihood = pm.Bernoulli("discrimination", p=p, observed=data["discrimination"])
    
    trace = pm.sample()

print("NModel summary:")
pm.summary(trace)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [alpha, beta_conservative]


Output()

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 20 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


NModel summary:


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,-1.278,0.086,-1.424,-1.097,0.004,0.003,481.0,646.0,1.0
beta_conservative,1.161,0.104,0.949,1.336,0.005,0.003,462.0,596.0,1.0
