# INLA: Create models and assess predictions
## Models per quartile

In [837]:
#library(rstanarm)
library(INLA)
library(brinla)
library(data.table)
library(ggplot2)
options(repr.plot.width=3, repr.plot.height=3)

In [838]:
# read data
df = fread('../data/le_cov_sel.csv')
nrow(df)

In [839]:
print(names(df))

 [1] "le"                   "z_relative_mob"       "z_gini"              
 [4] "county"               "gender"               "income_q"            
 [7] "county_name"          "stateabbrv"           "statename"           
[10] "log_population"       "log_income"           "z_segregation_income"
[13] "log_unemployment"     "z_uninsured"          "z_medicare_expenses" 
[16] "log_crime_rate"       "log_pct_black"        "log_pct_hispanic"    
[19] "z_obesity"            "z_smoking"            "z_exercise"          


In [840]:
df[, state := .GRP, by = statename]
df[, cty := .GRP, by = county]
df[, income_qr := .GRP, by = income_q]

In [841]:
table(df[, .(income_qr, income_q)]) # ok, right!

         income_q
income_qr   Q1   Q2   Q3   Q4
        1 3000    0    0    0
        2    0 3000    0    0
        3    0    0 3000    0
        4    0    0    0 3000

# INLA Models (using PC prior)

In [842]:
# create auxiliary variables
df[, state_mob := state]
df[, state_gini := state]
# df[, cty := county]
# df[, cty_mob := county]
# df[, cty_gini := county]
# df[, q_mob := income_qr]
# df[, q_gini := income_qr]
# df[, q_exercise := income_qr]

In [843]:
female = df[gender=='F']
male = df[gender=='M']

# Baseline model

### Male

In [844]:
# define PC prior
# same prior per quartile
lmod <- lm(le ~ z_relative_mob  + z_gini + log_population + log_income, male)

# pc prior
sdres <- sd(residuals(lmod))
pcprior <- list(prec = list(prior="pc.prec", param = c(3*sdres,0.01)))

In [845]:
# run models per income quartile
for (i in 1:4) {
    formula = le ~ z_relative_mob  + z_gini + log_population + log_income + 
       f(state, model = "iid", hyper = pcprior)
    model = inla(formula, family = "gaussian", data = male[income_qr==i],
#           control.predictor=list(compute = TRUE),
          control.compute = list(config = TRUE, dic = TRUE,
                                 waic = TRUE, cpo = TRUE), 
#           control.inla = list(strategy ="gaussian"),
          verbose = TRUE)
    
    model_name = paste0('m1_', i)
    assign(model_name, model)  
    }

In [846]:
m1_1$summary.fixed

Unnamed: 0,mean,sd,0.025quant,0.5quant,0.975quant,mode,kld
(Intercept),36.422132,0.1250425,36.1765209,36.4217652,36.6693715,36.4210382,0.0
z_relative_mob,-0.4865725,0.04037959,-0.565848,-0.4865785,-0.4073375,-0.4865871,8.511308e-15
z_gini,0.3189739,0.03733459,0.2456577,0.3189744,0.3922186,0.3189788,2.737979e-14
log_population,-0.2542587,0.03939759,-0.3315967,-0.2542682,-0.1769404,-0.2542838,2.235223e-14
log_income,1.1578335,0.18952917,0.7856597,1.1578311,1.5296743,1.1578424,2.318031e-14


In [847]:
bri.hyperpar.summary(m1_1)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.081663,0.01997705,1.04317,1.0813494,1.1216,1.0806705
SD for state,0.815244,0.09513868,0.6473607,0.8082699,1.020416,0.7934126


In [848]:
# simulate values
source('functions/simulation_no_random_effects.R')

In [849]:
# create data for prediction
# all values in their means except for constrast: income mobility
nrep = 2 # 4 quartiles for 2 contrast values
relative_mob_pred_data = data.table(
    z_relative_mob = c(0.0, 1.0),
    z_gini = rep(0, nrep),
    log_population = rep(0, nrep), 
    log_income = rep(0, nrep))

In [851]:
# simulate values per quartile
sim_male_m1 = data.table()

for (i in 1:4) {
    model_name = paste0('m1_', i)
    s = simulate_pred_no_re(model=get(model_name), 
                                          data=relative_mob_pred_data, 
                                          contrast='z_relative_mob', 
                                          nsim = 2000)
    d = s[, .(q = i, fd = diff(pred)), by = sim][, .(q, fd)]
    sim_male_m1 = rbind(sim_male_m1, d)
    }

In [852]:
fwrite(sim_male_m1, file = '../data/sim_male_m1.csv')

### Female

In [853]:
# define PC prior
# same prior per quartile
lmod <- lm(le ~ z_relative_mob  + z_gini + log_population + log_income, female)

sdres <- sd(residuals(lmod))
pcprior <- list(prec = list(prior="pc.prec", param = c(3*sdres,0.01)))

In [854]:
# run models per income quartile
for (i in 1:4) {
    formula = le ~ z_relative_mob  + z_gini + log_population + log_income + 
       f(state, model = "iid", hyper = pcprior)
    model = inla(formula, family = "gaussian", data = female[income_qr==i],
#           control.predictor=list(compute = TRUE),
          control.compute = list(config = TRUE, dic = TRUE,
                                 waic = TRUE, cpo = TRUE), 
#           control.inla = list(strategy ="gaussian"),
          verbose = TRUE)
    
    model_name = paste0('f1_', i)
    assign(model_name, model)   
    }

In [855]:
f1_1$summary.fixed

Unnamed: 0,mean,sd,0.025quant,0.5quant,0.975quant,mode,kld
(Intercept),41.9454232,0.10680204,41.7348957,41.9453584,42.1559971,41.9452361,3.1933139999999996e-19
z_relative_mob,-0.3462273,0.03967026,-0.4241215,-0.3462297,-0.2683918,-0.3462314,1.984141e-14
z_gini,0.2673735,0.03730377,0.1941225,0.2673725,0.3405617,0.2673737,2.243867e-14
log_population,-0.2453682,0.03932484,-0.3225767,-0.245373,-0.1682047,-0.2453794,2.243501e-14
log_income,0.6489384,0.18938874,0.2770556,0.6489308,1.0205169,0.6489318,2.205397e-14


In [856]:
bri.hyperpar.summary(f1_1)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.0845787,0.02001332,1.045974,1.0842817,1.1245466,1.0836591
SD for state,0.6833374,0.08319916,0.5361704,0.6774347,0.8624172,0.6656124


In [857]:
# simulate values per quartile
sim_female_f1 = data.table()

for (i in 1:4) {
    model_name = paste0('f1_', i)
    s = simulate_pred_no_re(model=get(model_name), 
                                          data=relative_mob_pred_data, 
                                          contrast='z_relative_mob', 
                                          nsim = 2000)
    d = s[, .(q = i, fd = diff(pred)), by = sim][, .(q, fd)]
    sim_female_f1 = rbind(sim_female_f1, d)
    }

In [858]:
fwrite(sim_female_f1, file = '../data/sim_female_f1.csv')

# Adjusting for contextual variables

### Male

In [859]:
# -Demographic: % Black, Hispanic (since Chetty uses this to race adjust)
# -Social: Crime rate, segregation
# -Economic: Gini, possibly unemployment
# -Health Access: uninsured, medicare expenses

In [860]:
# define PC prior
lmod <- lm(le ~ z_relative_mob  + z_gini + log_population + log_income + 
           log_crime_rate + z_segregation_income +  log_pct_black + log_pct_hispanic + 
           log_unemployment +  z_uninsured + z_medicare_expenses, male)


sdres <- sd(residuals(lmod))
pcprior <- list(prec = list(prior="pc.prec", param = c(3*sdres,0.01)))

In [861]:
# models per quartile
for (i in 1:4) {
    formula = le ~ z_relative_mob  + z_gini + log_population + log_income + 
        log_crime_rate + z_segregation_income +  log_pct_black + log_pct_hispanic + 
        log_unemployment +  z_uninsured + z_medicare_expenses +
        f(state, model = "iid", hyper = pcprior)
    model = inla(formula, family = "gaussian", data = male[income_qr==i],
#           control.predictor=list(compute = TRUE),
          control.compute = list(config = TRUE, dic = TRUE,
                                 waic = TRUE, cpo = TRUE), 
#           control.inla = list(strategy ="gaussian"),
          verbose = TRUE)
    
    model_name = paste0('m2_', i)
    assign(model_name, model)
    
    }

In [862]:
m2_1$summary.fixed

Unnamed: 0,mean,sd,0.025quant,0.5quant,0.975quant,mode,kld
(Intercept),36.4081921,0.12772455,36.158657553,36.40731864,36.662404222,36.40558614,6.968851e-12
z_relative_mob,-0.39106851,0.04700954,-0.483389234,-0.391066,-0.298847603,-0.39105696,1.88395e-14
z_gini,0.22616832,0.04252952,0.142669129,0.22616268,0.309621075,0.22615496,1.822228e-14
log_population,-0.07253597,0.0557673,-0.181987264,-0.07255615,0.036925218,-0.072592,1.185305e-14
log_income,1.66689946,0.22158534,1.23176714,1.66690066,2.101620575,1.66692189,2.035029e-14
log_crime_rate,-0.21794905,0.06325348,-0.342137247,-0.21795708,-0.093831923,-0.21796784,2.254575e-14
z_segregation_income,-0.02413533,0.05258644,-0.127425531,-0.02412653,0.079010155,-0.02410438,1.574158e-14
log_pct_black,-0.0672101,0.03084784,-0.127765877,-0.06721701,-0.00667235,-0.0672283,1.845762e-14
log_pct_hispanic,-0.08308339,0.04337508,-0.168292717,-0.08307208,0.001984155,-0.08304563,2.143742e-14
log_unemployment,0.28562925,0.14912812,-0.007288275,0.2856545,0.578134788,0.2857179,1.872071e-14


In [863]:
bri.hyperpar.summary(m2_1)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.0409372,0.01930077,1.003781,1.0406196,1.079555,1.0399182
SD for state,0.8279269,0.10227067,0.648109,0.8202341,1.049101,0.8040011


In [864]:
# create data for predictions
nrep =  2 # 2 contrast values
relative_mob_pred_data = data.table(
    z_relative_mob       = c(0.0, 1.0),
    z_gini               = rep(0, nrep),
    log_population       = rep(0, nrep), 
    log_income           = rep(0, nrep),
    log_crime_rate       = rep(0, nrep),
#     log_poverty          = rep(0, nrep),
#     log_mig_inflow       = rep(0, nrep),
#     log_mig_outflow      = rep(0, nrep),
#     log_foreign          = rep(0, nrep),
    log_pct_black        = rep(0, nrep),
    log_pct_hispanic     = rep(0, nrep),
#     log_house_value      = rep(0, nrep),
#     log_local_gov_exp    = rep(0, nrep),
    log_unemployment     = rep(0, nrep),
    z_segregation_income = rep(0, nrep),
#     z_religion           = rep(0, nrep),
#     z_labor_force        = rep(0, nrep),
#     z_college            = rep(0, nrep),
#     z_middle_class       = rep(0, nrep),
    z_uninsured          = rep(0, nrep), 
    z_medicare_expenses  = rep(0, nrep))

In [865]:
# simulate by quartile
sim_male_m2 = data.table()

for (i in 1:4) {
    model_name = paste0('m2_', i)
    s = simulate_pred_no_re(model=get(model_name), 
                                          data=relative_mob_pred_data, 
                                          contrast='z_relative_mob', 
                                          nsim = 2000)
    d = s[, .(q = i, fd = diff(pred)), by = sim][, .(q, fd)]
    sim_male_m2 = rbind(sim_male_m2, d)
    }

In [866]:
fwrite(sim_male_m2, file = '../data/sim_male_m2.csv')

### Female

In [867]:
lmod <- lm(le ~ z_relative_mob  + z_gini + log_population + log_income + 
       log_crime_rate + z_segregation_income +  log_pct_black + log_pct_hispanic + 
       log_unemployment +  z_uninsured + z_medicare_expenses, female)

# pc prior
sdres <- sd(residuals(lmod))
pcprior <- list(prec = list(prior="pc.prec", param = c(3*sdres,0.01)))

In [None]:
for (i in 1:4) {
    formula = le ~ z_relative_mob  + z_gini + log_population + log_income + 
        log_crime_rate + z_segregation_income +  log_pct_black + log_pct_hispanic + 
        log_unemployment +  z_uninsured + z_medicare_expenses +
        f(state, model = "iid", hyper = pcprior)
    model = inla(formula, family = "gaussian", data = female[income_qr==i],
#           control.predictor=list(compute = TRUE),
          control.compute = list(config = TRUE, dic = TRUE,
                                 waic = TRUE, cpo = TRUE), 
#           control.inla = list(strategy ="gaussian"),
          verbose = TRUE)
    
    model_name = paste0('f2_', i)
    assign(model_name, model)
    
    }

In [None]:
f2_1$summary.fixed

In [None]:
bri.hyperpar.summary(f2_1)

In [None]:
# simulate per quartile
sim_female_f2 = data.table()

for (i in 1:4) {
    model_name = paste0('f2_', i)
    s = simulate_pred_no_re(model=get(model_name), 
                                          data=relative_mob_pred_data, 
                                          contrast='z_relative_mob', 
                                          nsim = 2000)
    d = s[, .(q = i, fd = diff(pred)), by = sim][, .(q, fd)]
    sim_female_f2 = rbind(sim_female_f2, d)
    }

In [None]:
fwrite(sim_female_f2, file = '../data/sim_female_f2.csv')

# Table by income quartile

In [873]:
library(texreg)
source('functions/extract_inla.R')

In [874]:
for (i in 1:4) {
    cmodels <- rep(c('Women', 'Men'), 2)
    models <- list(get(paste0('f1_', i)),
                   get(paste0('m1_', i)),
                   get(paste0('f2_', i)),
                   get(paste0('m2_', i)))

    cnames <- list("(Intercept)" = 'Constant',
                   'z_relative_mob' = 'Standardized Income mobility (Rank-Rank Slope)',
                   'z_gini' = 'Standardized Gini',
                   "sd for state" = "SD states",
                   "sd for the Gaussian observations" = "SD observations")

    # screenreg(models)
    t = texreg(models,
                include.dic = TRUE, include.waic = TRUE,
                ci.test = FALSE,
                float.pos = "htp",
                caption = "Life Expectancy (40) Models",
                booktabs = TRUE,
                use.packages = FALSE,
                dcolumn = TRUE,
                caption.above = TRUE,
                scalebox = 0.65,
                label = "inla_models",
                # sideways = TRUE,
                digits = 2,
                custom.model.names = cmodels,
                custom.coef.map = cnames,
                groups = list("Random Effects" = c(4:5)),
                custom.note = "Note: Selected coefficients 
                (mean of marginal posterior distribution). 95\\% credibility intervals.")
    
    assign(paste0('tab_', i), t)
    
}

In [875]:
heading = '\\renewcommand{\\arraystretch}{1.2}\n
\\begin{table}[htp]\n
\\begin{threeparttable}\n
\\caption{Life Expectancy (40) Models\\tnote{1}}\\label{inla_models}\n
\\centering\n
\\setlength{\\tabcolsep}{1pt}\n
\\scriptsize\n
\\begin{tabular}{l D{.}{.}{5.11} D{.}{.}{5.11} D{.}{.}{5.11} D{.}{.}{5.11} }\n
\\toprule\n
& \\multicolumn{2}{c}{Baseline\\tnote{2}} & \\multicolumn{2}{c}{Social Indicators\\tnote{3}} \\\\\n
& \\multicolumn{1}{c}{Women} & \\multicolumn{1}{c}{Men} & \\multicolumn{1}{c}{Women} & \\multicolumn{1}{c}{Men} \\\\\n
\\midrule\n'

In [876]:
heading =  gsub("\n\n", "\n", heading)

In [877]:
bottom = '\\addlinespace[5pt]\n
\\bottomrule\n
\\end{tabular}
\n\\begin{tablenotes}[flushleft]\n
\\scriptsize\n
\\item [1] Four separated models (one per income quartile). Selected coefficients, mean of marginal posterior distribution and 95\\% credibility intervals in brackets.\n
\\item [2] Baseline model adjusts for log population and log income.\n
\\item [3] Social indicators model adjusts for log population, log income, log crime rate, log \\% black, log \\% hispanic, log unemployment, z-score income segregation, z-score \\% uninsured, and z-score medicare expenses.\n\\end{tablenotes}\n\\end{threeparttable}\n
\\end{table}\n'

In [878]:
bottom =  gsub("\n\n", "\n", bottom)

In [879]:
sep = NA
for (i in 1:4) {
  sep[i] = paste0("\\addlinespace[10pt]\n\\multicolumn{5}{l}{\\textbf{Income Quartile ", i, "}} \\\\\n\\addlinespace[10pt]\n")
}

In [880]:
tabs = list(tab_1, tab_2, tab_3, tab_4)

In [881]:
out = list()
for (i in 1:4) {
    out[[i]] = gsub('(.+)(Constant.+)(Random.+)', '\\2', tabs[[i]])
}

In [882]:
# export table
cat(heading, 
    sep[[1]], out[[1]], 
    sep[[2]], out[[2]], 
    sep[[3]], out[[3]], 
    sep[[4]], out[[4]], 
    bottom, 
    file = 'tables/inla_models_quartile.tex')

# Prediction per county

In [883]:
library(sdazar)
library(ggridges)
library(forcats)

In [884]:
# order data by le
setorder(male, le)
setorder(female, le)

In [886]:
# county state labes
df[, cn := paste0(county_name, ", ", stateabbrv)]
cn = df[, .(cn, state, county)]
cn = cn[!duplicated(cn)]
head(cn)

cn,state,county
"Autauga, AL",1,1001
"Baldwin, AL",1,1003
"Barbour, AL",1,1005
"Blount, AL",1,1009
"Calhoun, AL",1,1015
"Chambers, AL",1,1017


In [889]:
# selected variables
vars = c('le', 'county', 'state', 'z_relative_mob', 'z_gini', 'log_population', 'log_income' ,  
        'log_crime_rate', 'z_segregation_income', 'log_pct_black', 'log_pct_hispanic', 
        'log_unemployment', 'z_uninsured', 'z_medicare_expenses')

### Male

In [890]:
min_mob = min(male$z_relative_mob)

In [891]:
bottom_male = head(male[income_qr==1, vars, with=FALSE], 10)
bottom_male_c = copy(bottom_male)
bottom_male_c[, z_relative_mob := min_mob]
tot_bottom_male = rbind(bottom_male, bottom_male_c) 
tot_bottom_male[, ranking := .GRP, by = .(county, state)]
tot_bottom_male[, order := seq_len(.N), by = .(county, state)]
anyDuplicated(tot_bottom_male[,.(state, county, z_relative_mob)])

In [892]:
# simulate values
source('functions/simulation_random_intercept.R')

In [893]:
sim_male_bottom = simulate_predictions(m2_1, nsim=2000, tot_bottom, contrast='z_relative_mob', 
                  random_intercept='state')

sim_male_bottom = merge(sim_male_bottom, cn, by = c('county', 'state'))
setorder(sim_male_bottom, ranking)
head(sim_male_bottom)

county,state,pred,sim,z_relative_mob,le,ranking,order,cn
48025,41,34.368,1,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,36.04396,1,-3.53846431,31.51167,1,2,"Bee, TX"
48025,41,34.37797,2,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,35.11921,2,-3.53846431,31.51167,1,2,"Bee, TX"
48025,41,34.7391,3,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,35.54174,3,-3.53846431,31.51167,1,2,"Bee, TX"


In [894]:
sim_male_bottom[, fcn := fct_rev(factor(cn))]

In [895]:
sim_male_bottom[fcn=='Bee, TX', le][1]

In [896]:
sim_male_bottom[fcn=='Shelby, IN', le][1]

In [905]:
# plot
savepdf('plots/counties_male')
plot = ggplot(sim_male_bottom[ranking<9], aes(y =fcn)) + 
  geom_density_ridges(aes(x = pred, fill = paste(order)), 
           alpha = .5, color = "white", from = 33, to = 38, scale = 1) +
  labs(x = "E(40)",
       y = "") +  theme_ridges()+ 
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0))  +
  scale_fill_cyclical(values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"), 
                     guide= 'legend', name='', 
                     labels = c('Observed Rank-Rank Slope','Lowest Rank-Rank Slope')) + 
  theme(legend.position = "top", 
        axis.title=element_text(size=10), 
        axis.text.x = element_text(size=10), 
        axis.text.y = element_text(size=10))
print(plot)
dev.off()

Picking joint bandwidth of 0.0427


### Female

In [898]:
min_mob = min(female$z_relative_mob)

In [900]:
bottom_female = head(female[income_qr==1, vars, with=FALSE], 10)
bottom_female_c = copy(bottom_female)
bottom_female_c[, z_relative_mob := min_mob]
tot_bottom_female = rbind(bottom_female, bottom_female_c) 
tot_bottom_female[, ranking := .GRP, by = .(county, state)]
tot_bottom_female[, order := seq_len(.N), by = .(county, state)]
anyDuplicated(tot_bottom_female[,.(state, county, z_relative_mob)])

In [901]:
sim_female_bottom = simulate_predictions(f2_1, nsim=2000, tot_bottom, contrast='z_relative_mob', 
                  random_intercept='state')

sim_female_bottom = merge(sim_female_bottom, cn, by = c('county', 'state'))
setorder(sim_female_bottom, ranking)
head(sim_female_bottom)

county,state,pred,sim,z_relative_mob,le,ranking,order,cn
48025,41,39.76154,1,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,41.18461,1,-3.53846431,31.51167,1,2,"Bee, TX"
48025,41,39.9169,2,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,41.21468,2,-3.53846431,31.51167,1,2,"Bee, TX"
48025,41,40.26671,3,0.03526187,31.51167,1,1,"Bee, TX"
48025,41,41.86972,3,-3.53846431,31.51167,1,2,"Bee, TX"


In [902]:
sim_female_bottom[, fcn := fct_rev(factor(cn))]

In [906]:
# plot
savepdf('plots/counties_female')
plot = ggplot(sim_female_bottom[ranking<9], aes(y = fcn)) + 
  geom_density_ridges(aes(x = pred, fill = paste(order)), 
           alpha = .5, color = "white", from = 39, to = 44, scale = 1) +
  labs(x = "E(40)",
       y = "") +  theme_ridges()+ 
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(expand = c(0.01, 0))  +
  scale_fill_cyclical(values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"), 
                     guide= 'legend', name='', 
                     labels = c('Observed Rank-Rank Slope','Lowest Rank-Rank Slope')) + 
  theme(legend.position = "top", 
        axis.title=element_text(size=10), 
        axis.text.x = element_text(size=10), 
        axis.text.y = element_text(size=10))
print(plot)
dev.off()

Picking joint bandwidth of 0.0423
