# INLA: Create models and assess predictions

In [2]:
#library(rstanarm)
library(INLA)
library(brinla)
library(data.table)
library(ggplot2)
options(repr.plot.width=3, repr.plot.height=3)

Loading required package: sp
Loading required package: Matrix
This is INLA_17.05.31 built 2017-05-31 13:53:28 UTC.
See www.r-inla.org/contact-us for how to get help.


In [3]:
# read the input table from disk and show how many rows it has
df <- fread('../data/le_cov_tr.csv')
dim(df)[1L]

In [4]:
# list the available columns
print(colnames(df))

 [1] "county"               "gender"               "income_q"            
 [4] "le"                   "county_name"          "population"          
 [7] "statename"            "stateabbrv"           "density"             
[10] "gini"                 "relative_mob"         "absolute_mob"        
[13] "segregation_income"   "segregation_race"     "income"              
[16] "poverty"              "middle_class"         "mig_inflow"          
[19] "mig_outflow"          "foreign"              "religion"            
[22] "crime_rate"           "uninsured"            "labor_force"         
[25] "unemployment"         "pct_black"            "pct_hispanic"        
[28] "obesity"              "smoking"              "exercise"            
[31] "house_value"          "college"              "medicare_expenses"   
[34] "local_gov_exp"        "male"                 "q2"                  
[37] "q3"                   "q4"                   "log_population"      
[40] "log_crime_rate"       "log_pover

In [5]:
# integer id per state (numbered in order of first appearance)
df[, state := .GRP, by = statename]
# Integer id per income quartile, taken from the sorted labels. `.GRP` numbers
# groups by first appearance, so it maps Q1 -> 1, ..., Q4 -> 4 only when the rows
# happen to arrive in that order; the simulation code further down labels the
# quartile effects 1..4, so the mapping is made explicit here.
df[, income_qr := as.integer(factor(income_q))]

In [6]:
with(df, table(income_qr, income_q)) # ok, right! each id maps to exactly one quartile label

         income_q
income_qr   Q1   Q2   Q3   Q4
        1 3100    0    0    0
        2    0 3102    0    0
        3    0    0 3098    0
        4    0    0    0 3100

# INLA Models (using PC prior)

In [14]:
#names(dat)

In [7]:
# create auxiliary variables: extra copies of the state, county and quartile ids
# (the model formulas below use a separate index column in each f() term)
df[, `:=`(
    state_mob  = state,
    state_gini = state,
    cty        = county,
    cty_mob    = county,
    cty_gini   = county,
    q_mob      = income_qr,
    q_gini     = income_qr
)]

In [8]:
# one subset per gender; the models below are fitted separately on each
male <- df[gender == 'M']
female <- df[gender == 'F']

# Baseline model

### Male

In [17]:
# plain linear fit (male, baseline covariates); only its residual SD is used
lmod <- lm(
    le ~ z_relative_mob + z_gini + log_population + log_income,
    data = male
)

# pc prior on the precision, scaled by three times the residual SD
# (INLA's pc.prec takes param = c(u, alpha), i.e. P(sd > u) = alpha)
sdres <- sd(residuals(lmod))
pcprior <- list(
    prec = list(prior = "pc.prec", param = c(3 * sdres, 0.01))
)

In [None]:
# Baseline model, male: iid random intercepts for state, county and income
# quartile, plus quartile-specific iid terms weighted by z_relative_mob and z_gini.
formula <- le ~
    z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid", hyper = pcprior) +
    f(cty, model = "iid", hyper = pcprior) +
    f(income_qr, model = "iid", hyper = pcprior) +
    f(q_mob, z_relative_mob, model = "iid", hyper = pcprior) +
    f(q_gini, z_gini, model = "iid", hyper = pcprior)

# config = TRUE is kept so the fit can be passed to inla.posterior.sample() later.
# Options left disabled in this cell:
#   control.predictor = list(compute = TRUE)
#   control.inla = list(strategy = "gaussian")
m1 <- inla(
    formula,
    family = "gaussian",
    data = male,
    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, cpo = TRUE),
    verbose = TRUE
)

In [19]:
# posterior summaries of the fixed effects of m1
m1[["summary.fixed"]]


Call:
c("inla(formula = formula, family = \"gaussian\", data = male, verbose = TRUE, ",  "    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, ",  "        cpo = TRUE))")

Time used:
 Pre-processing    Running inla Post-processing           Total 
         2.8433         47.4151          2.2996         52.5580 

Fixed effects:
                  mean     sd 0.025quant 0.5quant 0.975quant    mode kld
(Intercept)    81.2476 2.0756    77.0603  81.2476    85.4367 81.2478   0
z_relative_mob -0.3054 0.1346    -0.5660  -0.3054    -0.0433 -0.3052   0
z_gini          0.2244 0.1874    -0.1420   0.2243     0.5913  0.2245   0
log_population -0.2283 0.0280    -0.2832  -0.2283    -0.1734 -0.2283   0
log_income      1.2320 0.1338     0.9692   1.2320     1.4946  1.2321   0

Random effects:
Name	  Model
 state   IID model 
cty   IID model 
income_qr   IID model 
q_mob   IID model 
q_gini   IID model 

Model hyperparameters:
                                           mean      sd 0.025quan

In [20]:
# brinla summary of the hyperparameters of m1 (output rows are labelled "SD for ...")
bri.hyperpar.summary(m1)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.1317247,0.01211908,1.10658445,1.1323237,1.15398,1.1347625
SD for state,0.6942886,0.08535555,0.55672067,0.6830516,0.8895159,0.6548753
SD for cty,0.5370075,0.03005964,0.47129057,0.5403433,0.5859431,0.5551366
SD for income_qr,3.8519119,1.32871292,1.90993057,3.6248208,7.0737405,3.2183385
SD for q_mob,0.2205882,0.13244463,0.07811555,0.1834379,0.576016,0.1349803
SD for q_gini,0.2977208,0.17325169,0.10889687,0.2496264,0.7618955,0.1857306


### Female

In [37]:
# plain linear fit (female, baseline covariates); only its residual SD is used
lmod <- lm(
    le ~ z_relative_mob + z_gini + log_population + log_income,
    data = female
)

# pc prior on the precision, scaled by three times the residual SD
# (INLA's pc.prec takes param = c(u, alpha), i.e. P(sd > u) = alpha)
sdres <- sd(residuals(lmod))
pcprior <- list(
    prec = list(prior = "pc.prec", param = c(3 * sdres, 0.01))
)

In [38]:
# Baseline model, female: same specification as the male baseline model.
formula <- le ~
    z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid", hyper = pcprior) +
    f(cty, model = "iid", hyper = pcprior) +
    f(income_qr, model = "iid", hyper = pcprior) +
    f(q_mob, z_relative_mob, model = "iid", hyper = pcprior) +
    f(q_gini, z_gini, model = "iid", hyper = pcprior)

# config = TRUE is kept so the fit can be passed to inla.posterior.sample() later.
# Options left disabled in this cell:
#   control.predictor = list(compute = TRUE)
#   control.inla = list(strategy = "gaussian")
f1 <- inla(
    formula,
    family = "gaussian",
    data = female,
    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, cpo = TRUE),
    verbose = TRUE
)

In [23]:
# posterior summaries of the fixed effects of f1 (f1 must have been fitted first)
f1[["summary.fixed"]]

ERROR: Error in eval(expr, envir, enclos): object 'f1' not found


In [27]:
# brinla summary of the hyperparameters of f1 (output rows are labelled "SD for ...")
bri.hyperpar.summary(f1)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.15963278,0.01191264,1.13689746,1.15934386,1.183652,1.1585153
SD for state,0.5469012,0.07004732,0.43149425,0.53858846,0.7051523,0.51870145
SD for cty,0.52466497,0.02203057,0.48181655,0.52455596,0.5682688,0.52498093
SD for income_qr,2.54320251,0.94900487,1.26164343,2.34395349,4.9316836,2.00805761
SD for q_mob,0.08795318,0.05720506,0.02551899,0.07236143,0.2409428,0.05153076
SD for q_gini,0.27765261,0.15396563,0.10137775,0.23704051,0.6855605,0.18042773


## First differences (simulation, baseline)

In [78]:
# sort `male` in place by life expectancy, highest first, and keep the first 11 rows
setorderv(male, "le", order = -1L)
top_male <- male[1:11, .(county, state, le)]

In [79]:
# sort `male` in place by life expectancy, lowest first, and keep the first 11 rows
setorderv(male, "le", order = 1L)
bottom_male <- male[1:11, .(county, state, le, income_q)]

In [9]:
# Simulate predicted values from the joint posterior of a fitted INLA model.
#
# Arguments:
#   model     fitted inla object. It is passed to inla.posterior.sample(), which
#             needs a fit made with control.compute = list(config = TRUE).
#   data      prediction grid with one column per fixed-effect covariate. Rows are
#             assigned to the levels of `random` cyclically (row 1 -> level 1,
#             row 2 -> level 2, ...), so it must hold a whole number of cycles.
#   nsim      number of posterior draws.
#   contrast  name of the fixed effect whose coefficient gets the random slope added.
#   random    regular expression selecting the random-slope rows of the latent field.
#
# Returns the melt()-ed simulations: a long data.table with columns q (level id),
# <contrast> (the contrast values found in `data`), variable (one label per draw)
# and value (simulated prediction).
#
# Only the fixed effects and the level-specific slope enter the prediction; the
# income_qr random intercept and the state/county effects are not added.
simulate_predictions <- function(model, data, nsim = 1000, contrast = 'z_relative_mob',
                                 random = 'q_mob') {
    stopifnot(nsim >= 1)

    # simulate posterior distribution
    simData <- inla.posterior.sample(n = nsim, result = model)

    # design matrix, with its columns put in the order of the model's fixed
    # effects so each coefficient multiplies its own covariate
    coefNames <- rownames(model$summary.fixed)
    f <- formula(paste0('~ ', paste0(names(data), collapse = ' + ')))
    X <- model.matrix(f, data = data)
    stopifnot(all(coefNames %in% colnames(X)))
    X <- X[, coefNames, drop = FALSE]
    N <- nrow(data)

    indc <- match(contrast, coefNames)
    stopifnot(!is.na(indc))

    # locate the latent-field rows once (the layout is assumed to be the same in
    # every draw); logical masks stay correct even when a pattern matches nothing
    rnames <- rownames(simData[[1]]$latent)
    is_slope <- grepl(random, rnames)
    is_fixed <- !grepl("^Pred|^state|^cty|^obs|^income_qr|q_mob|q_gini", rnames)
    n_levels <- sum(is_slope)
    stopifnot(sum(is_fixed) == length(coefNames),
              n_levels > 0,
              N %% n_levels == 0)

    Ysim <- matrix(nrow = N, ncol = nsim)
    for (i in seq_len(nsim)) {
        latent <- simData[[i]]$latent
        betas <- latent[is_fixed]
        # slope offset of the level each row belongs to
        slopes <- rep(latent[is_slope], length.out = N)
        # fixed part plus the level-specific addition to the contrast coefficient
        Ysim[, i] <- as.vector(X %*% betas) + X[, indc] * slopes
    }

    sims <- data.table(Ysim)
    sims[, q := rep(seq_len(n_levels), length.out = N)]
    sims[, (contrast) := data[[contrast]]]
    return(melt(sims, id.vars = c('q', contrast)))
}

# First differences of simulated values within each group and simulation.
#
# Arguments:
#   simulated_data      long data.table of simulations.
#   value_variable      name of the column holding the simulated values.
#   constrast_variable  name of the contrast column. When it names a column of
#                       `simulated_data`, values are sorted by it before
#                       differencing; otherwise the existing row order is used.
#   simulation_index    name of the column identifying the simulation draw.
#   group_variable      name of the grouping column.
#
# Returns a data.table with columns sim, q (the group value) and diff, stacked
# group by group in order of first appearance.
first_difference <- function(simulated_data, value_variable, constrast_variable,
                             simulation_index, group_variable) {
    sort_by_contrast <- isTRUE(constrast_variable %in% names(simulated_data))
    groups <- simulated_data[, unique(get(group_variable))]

    # collect the per-group tables in a list and bind once at the end
    pieces <- vector("list", length(groups))
    k <- 0L
    for (g in groups) {
        k <- k + 1L
        pieces[[k]] <- simulated_data[get(group_variable) == g, {
            v <- get(value_variable)
            if (sort_by_contrast) {
                v <- v[order(get(constrast_variable))]
            }
            list(q = g, diff = diff(v))
        }, by = .(sim = get(simulation_index))]
    }
    return(rbindlist(pieces))
}

# first differences between groups

# Differences between groups in the simulated effect of `contrast`.
# For each pair of the groups 1..4 (1-2, 1-3, ..., 3-4) the effect within a group
# is value at contrast == 1 minus value at contrast == 0; the result holds, per
# pair, the effect in the first group minus the effect in the second.
# Returns one data.table with columns type, contrast (pair label), model, values.
first_difference_between_groups <- function(data, contrast = 'z_gini', group = 'q', model = 'm1') {
    pairs <- gtools::combinations(n = 4, r = 2, v = c(1:4), repeats.allowed = FALSE)

    # effect of moving the contrast from 0 to 1 inside one group
    effect_in <- function(g) {
        data[get(group) == g & get(contrast) == 1, value] -
            data[get(group) == g & get(contrast) == 0, value]
    }

    per_pair <- lapply(seq_len(nrow(pairs)), function(k) {
        first <- pairs[k, 1]
        second <- pairs[k, 2]
        data.table(type = contrast,
                   contrast = paste0(first, '-', second),
                   model = model,
                   values = (effect_in(first) - effect_in(second)))
    })
    return(rbindlist(per_pair))
}

### Income mobility 

In [29]:
# prediction grid for the mobility contrast: rows 1-4 have z_relative_mob = 0,
# rows 5-8 have z_relative_mob = 1; the other covariates are held at 0
nrep <- 4 * 2 # 4 quartiles for 2 contrast values
relative_mob_pred_data <- data.table(
    z_relative_mob = rep(c(0.0, 1.0), each = 4),
    z_gini         = numeric(nrep),
    log_population = numeric(nrep),
    log_income     = numeric(nrep)
)

In [31]:
# 2000 posterior simulations of the mobility contrast, baseline male model
sim_mob_male <- simulate_predictions(
    model = m1, data = relative_mob_pred_data, nsim = 2000,
    contrast = 'z_relative_mob', random = 'q_mob'
)

In [49]:
# per-quartile first differences of the simulations, saved to disk
sim_mob_male_m1 <- first_difference(
    simulated_data = sim_mob_male, value_variable = 'value',
    constrast_variable = 'z_relative_mob', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_mob_male_m1, file = '../data/sim_mob_male_m1.csv')

In [41]:
# 2000 posterior simulations of the mobility contrast, baseline female model
sim_mob_female <- simulate_predictions(
    model = f1, data = relative_mob_pred_data, nsim = 2000,
    contrast = 'z_relative_mob', random = 'q_mob'
)

In [53]:
# per-quartile first differences of the simulations, saved to disk
sim_mob_female_m1 <- first_difference(
    simulated_data = sim_mob_female, value_variable = 'value',
    constrast_variable = 'z_relative_mob', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_mob_female_m1, file = '../data/sim_mob_female_m1.csv')

### Inequality


In [55]:
# prediction grid for the gini contrast: rows 1-4 have z_gini = 0,
# rows 5-8 have z_gini = 1; the other covariates are held at 0
nrep <- 4 * 2 # 4 quartiles for 2 contrast values
gini_pred_data <- data.table(
    z_relative_mob = numeric(nrep),
    z_gini         = rep(c(0.0, 1.0), each = 4),
    log_population = numeric(nrep),
    log_income     = numeric(nrep)
)

In [None]:
# 2000 posterior simulations of the gini contrast, baseline male model
sim_gini_male <- simulate_predictions(
    model = m1, data = gini_pred_data, nsim = 2000,
    contrast = 'z_gini', random = 'q_gini'
)

In [57]:
# per-quartile first differences of the simulations, saved to disk
sim_gini_male_m1 <- first_difference(
    simulated_data = sim_gini_male, value_variable = 'value',
    constrast_variable = 'z_gini', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_gini_male_m1, file = '../data/sim_gini_male_m1.csv')

In [None]:
# 2000 posterior simulations of the gini contrast, baseline female model
sim_gini_female <- simulate_predictions(
    model = f1, data = gini_pred_data, nsim = 2000,
    contrast = 'z_gini', random = 'q_gini'
)

In [59]:
# per-quartile first differences of the simulations, saved to disk
sim_gini_female_m1 <- first_difference(
    simulated_data = sim_gini_female, value_variable = 'value',
    constrast_variable = 'z_gini', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_gini_female_m1, file = '../data/sim_gini_female_m1.csv')

# Adjusting for contextual variables

### Male

In [10]:
# plain linear fit (male, baseline + contextual covariates); only its residual
# SD is used
lmod <- lm(
    le ~ z_relative_mob + z_gini + log_population + log_income +
        log_crime_rate + log_poverty + log_mig_inflow + log_mig_outflow +
        log_foreign + log_pct_black + log_pct_hispanic + log_house_value +
        log_local_gov_exp + log_unemployment + z_segregation_income +
        z_religion + z_labor_force + z_college + z_middle_class,
    data = male
)

# pc prior on the precision, scaled by three times the residual SD
# (INLA's pc.prec takes param = c(u, alpha), i.e. P(sd > u) = alpha)
sdres <- sd(residuals(lmod))
pcprior <- list(
    prec = list(prior = "pc.prec", param = c(3 * sdres, 0.01))
)

In [11]:
# Adjusted model, male: the baseline specification plus contextual covariates,
# with the same iid random effects.
formula <- le ~
    z_relative_mob + z_gini + log_population + log_income +
    log_crime_rate + log_poverty + log_mig_inflow + log_mig_outflow +
    log_foreign + log_pct_black + log_pct_hispanic + log_house_value +
    log_local_gov_exp + log_unemployment + z_segregation_income + z_religion +
    z_labor_force + z_college + z_middle_class +
    f(state, model = "iid", hyper = pcprior) +
    f(cty, model = "iid", hyper = pcprior) +
    f(income_qr, model = "iid", hyper = pcprior) +
    f(q_mob, z_relative_mob, model = "iid", hyper = pcprior) +
    f(q_gini, z_gini, model = "iid", hyper = pcprior)

# config = TRUE is kept so the fit can be passed to inla.posterior.sample() later
m2 <- inla(
    formula,
    family = "gaussian",
    data = male,
    control.predictor = list(compute = TRUE),
    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, cpo = TRUE),
    verbose = TRUE
)

In [27]:
# brinla summary of the hyperparameters of m2 (output rows are labelled "SD for ...")
bri.hyperpar.summary(m2)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.1486979,0.0104465,1.12655089,1.1494305,1.1672743,1.1523909
SD for state,0.5060942,0.06764672,0.39735464,0.4971061,0.661206,0.4752792
SD for cty,0.4160783,0.03679549,0.36103255,0.4099513,0.5024892,0.3909014
SD for income_qr,3.8934809,1.3980736,1.94939541,3.6183372,7.3686851,3.1442084
SD for q_mob,0.2005374,0.10677314,0.07448484,0.1733685,0.4812912,0.134194
SD for q_gini,0.2954147,0.17246655,0.11101168,0.2465936,0.7588431,0.1822891


In [26]:
# posterior summaries of the fixed effects of m2
m2[["summary.fixed"]]

Unnamed: 0,mean,sd,0.025quant,0.5quant,0.975quant,mode,kld
(Intercept),81.143784271,1.92308108,77.288145831,81.143695036,84.99841074,81.143784521,3.238176e-10
z_relative_mob,0.001129495,0.11030376,-0.22073487,0.00112547,0.22305296,0.001154498,3.92297e-06
z_gini,0.122126913,0.15387469,-0.185590148,0.122110226,0.429880277,0.122163537,6.429428e-05
log_population,-0.138195533,0.03658584,-0.210038351,-0.138198574,-0.066403552,-0.138201594,9.007183e-14
log_income,-0.116869698,0.24852252,-0.604822402,-0.11691021,0.370850261,-0.116968795,8.10157e-14
log_crime_rate,0.0015932,0.03121518,-0.059683515,0.001587991,0.06284215,0.001579938,2.406434e-15
log_poverty,0.16400054,0.1285574,-0.088395716,0.163973439,0.416310031,0.16393067,8.92185e-14
log_mig_inflow,0.016120583,0.09113387,-0.162845027,0.016116587,0.194940173,0.016116419,8.166385e-14
log_mig_outflow,-0.329586464,0.11210406,-0.549734881,-0.329590713,-0.109621577,-0.329589774,8.199246e-14
log_foreign,0.067402169,0.05167286,-0.034055609,0.067394498,0.16880721,0.06738366,8.543402e-14


#### Variables

```
[1] "county"               "gender"               "income_q"            
 [4] "le"                   "county_name"          "population"          
 [7] "statename"            "stateabbrv"           "density"             
[10] "gini"                 "relative_mob"         "absolute_mob"        
[13] "segregation_income"   "segregation_race"     "income"              
[16] "poverty"              "middle_class"         "mig_inflow"          
[19] "mig_outflow"          "foreign"              "religion"            
[22] "crime_rate"           "uninsured"            "labor_force"         
[25] "unemployment"         "pct_black"            "pct_hispanic"        
[28] "obesity"              "smoking"              "exercise"            
[31] "house_value"          "college"              "medicare_expenses"   
[34] "local_gov_exp"        "male"                 "q2"                  
[37] "q3"                   "q4"                   "log_population"      
[40] "log_crime_rate"       "log_poverty"          "log_mig_inflow"      
[43] "log_mig_outflow"      "log_foreign"          "log_pct_black"       
[46] "log_pct_hispanic"     "log_house_value"      "log_local_gov_exp"   
[49] "log_unemployment"     "log_income"           "z_gini"              
[52] "z_relative_mob"       "z_absolute_mob"       "z_middle_class"      
[55] "z_segregation_income" "z_religion"           "z_labor_force"       
[58] "z_uninsured"          "z_medicare_expenses"  "z_college"           
[61] "z_obesity"            "z_smoking"            "z_exercise"
```

In [29]:
nrep <- 4 * 2 # 4 quartiles for 2 contrast values

# fixed-effect covariates of the adjusted models, in formula order
adjusted_covariates <- c(
    "z_relative_mob", "z_gini", "log_population", "log_income",
    "log_crime_rate", "log_poverty", "log_mig_inflow", "log_mig_outflow",
    "log_foreign", "log_pct_black", "log_pct_hispanic", "log_house_value",
    "log_local_gov_exp", "log_unemployment", "z_segregation_income",
    "z_religion", "z_labor_force", "z_college", "z_middle_class"
)

# Build a prediction grid in which every covariate is 0 except `contrast`,
# which is 0 for rows 1-4 and 1 for rows 5-8. Both grids come from this one
# definition so they cannot drift apart.
make_contrast_grid <- function(contrast) {
    stopifnot(contrast %in% adjusted_covariates)
    cols <- lapply(adjusted_covariates, function(v) {
        if (v == contrast) {
            rep(c(0.0, 1.0), times = 1, each = 4)
        } else {
            rep(0, nrep)
        }
    })
    names(cols) <- adjusted_covariates
    as.data.table(cols)
}

relative_mob_pred <- make_contrast_grid("z_relative_mob")
gini_pred <- make_contrast_grid("z_gini")

In [30]:
# simulate the mobility contrast from the adjusted male model, reduce the draws
# to per-quartile first differences and save them
sim_mob_male <- simulate_predictions(
    model = m2, data = relative_mob_pred, nsim = 2000,
    contrast = 'z_relative_mob', random = 'q_mob'
)

sim_mob_male_m2 <- first_difference(
    simulated_data = sim_mob_male, value_variable = 'value',
    constrast_variable = 'z_relative_mob', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_mob_male_m2, file = '../data/sim_mob_male_m2.csv')

In [34]:
# simulate the gini contrast from the adjusted male model, reduce the draws
# to per-quartile first differences and save them
sim_gini_male <- simulate_predictions(
    model = m2, data = gini_pred, nsim = 2000,
    contrast = 'z_gini', random = 'q_gini'
)

sim_gini_male_m2 <- first_difference(
    simulated_data = sim_gini_male, value_variable = 'value',
    constrast_variable = 'z_gini', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_gini_male_m2, file = '../data/sim_gini_male_m2.csv')

### Female

In [18]:
# plain linear fit (female, baseline + contextual covariates); only its residual
# SD is used
lmod <- lm(
    le ~ z_relative_mob + z_gini + log_population + log_income +
        log_crime_rate + log_poverty + log_mig_inflow + log_mig_outflow +
        log_foreign + log_pct_black + log_pct_hispanic + log_house_value +
        log_local_gov_exp + log_unemployment + z_segregation_income +
        z_religion + z_labor_force + z_college + z_middle_class,
    data = female
)

# pc prior on the precision, scaled by three times the residual SD
# (INLA's pc.prec takes param = c(u, alpha), i.e. P(sd > u) = alpha)
sdres <- sd(residuals(lmod))
pcprior <- list(
    prec = list(prior = "pc.prec", param = c(3 * sdres, 0.01))
)

In [19]:
# Adjusted model, female: the baseline specification plus contextual covariates,
# with the same iid random effects as the other fits.
formula <- le ~ z_relative_mob + z_gini + log_population + log_income +
    log_crime_rate + log_poverty + log_mig_inflow + log_mig_outflow +
    log_foreign + log_pct_black + log_pct_hispanic + log_house_value +
    log_local_gov_exp + log_unemployment + z_segregation_income + z_religion +
    z_labor_force + z_college + z_middle_class +
    f(state, model = "iid", hyper = pcprior) +
    f(cty, model = "iid", hyper = pcprior) +
    f(income_qr, model = "iid", hyper = pcprior) +
    f(q_mob, z_relative_mob, model = "iid", hyper = pcprior) +
    f(q_gini, z_gini, model = "iid", hyper = pcprior)


# `data` is spelled out in full (it was `dat`, which only worked through partial
# argument matching); config = TRUE is kept so the fit can be passed to
# inla.posterior.sample() later
f2 <- inla(formula, family = "gaussian", data = female,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [20]:
# posterior summaries of the fixed effects of f2
f2[["summary.fixed"]]

Unnamed: 0,mean,sd,0.025quant,0.5quant,0.975quant,mode,kld
(Intercept),84.88548688,1.36040151,82.1501273,84.88543764,87.62104211,84.88559136,1.125225e-09
z_relative_mob,0.02366167,0.07014106,-0.1200676,0.02365424,0.16757586,0.02365964,4.04491e-11
z_gini,0.13972977,0.15729051,-0.18042233,0.13971617,0.4602494,0.13973556,1.228536e-07
log_population,-0.11871743,0.03805566,-0.19343562,-0.11872131,-0.044047,-0.11872567,1.826676e-14
log_income,-0.36578137,0.25882401,-0.87403603,-0.3657802,0.14199389,-0.36575663,1.905895e-14
log_crime_rate,0.08614786,0.03184942,0.02361332,0.08614736,0.14862782,0.08614904,1.710122e-15
log_poverty,-0.16525883,0.13358486,-0.42757345,-0.16526053,0.09682111,-0.16525297,1.84701e-14
log_mig_inflow,-0.03342343,0.0947092,-0.21939552,-0.03342597,0.15238999,-0.033423,1.778028e-14
log_mig_outflow,-0.16960561,0.11672308,-0.39879098,-0.16961335,0.0594101,-0.16961863,1.897156e-14
log_foreign,-0.01487367,0.05359919,-0.12014283,-0.01486783,0.09026569,-0.01485125,1.758651e-14


In [21]:
# brinla summary of the hyperparameters of f2 (output rows are labelled "SD for ...")
bri.hyperpar.summary(f2)

Unnamed: 0,mean,sd,q0.025,q0.5,q0.975,mode
SD for the Gaussian observations,1.15838217,0.012718,1.13546826,1.15754227,1.1851819,1.15466139
SD for state,0.44179551,0.05816655,0.33541641,0.43926855,0.5632949,0.43560952
SD for cty,0.43880666,0.02351718,0.39252374,0.43897415,0.4846981,0.44062415
SD for income_qr,2.51904917,0.93382775,1.19160663,2.34743031,4.8142394,2.04738595
SD for q_mob,0.08668094,0.04969179,0.02367787,0.07578752,0.2132839,0.05557902
SD for q_gini,0.24919136,0.14174106,0.08774491,0.211705,0.6252977,0.15981182


In [32]:
# simulate the mobility contrast from the adjusted female model, reduce the draws
# to per-quartile first differences and save them
sim_mob_female <- simulate_predictions(
    model = f2, data = relative_mob_pred, nsim = 2000,
    contrast = 'z_relative_mob', random = 'q_mob'
)

sim_mob_female_m2 <- first_difference(
    simulated_data = sim_mob_female, value_variable = 'value',
    constrast_variable = 'z_relative_mob', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_mob_female_m2, file = '../data/sim_mob_female_m2.csv')

In [33]:
# simulate the gini contrast from the adjusted female model, reduce the draws
# to per-quartile first differences and save them
sim_gini_female <- simulate_predictions(
    model = f2, data = gini_pred, nsim = 2000,
    contrast = 'z_gini', random = 'q_gini'
)

sim_gini_female_m2 <- first_difference(
    simulated_data = sim_gini_female, value_variable = 'value',
    constrast_variable = 'z_gini', simulation_index = 'variable',
    group_variable = 'q'
)
fwrite(sim_gini_female_m2, file = '../data/sim_gini_female_m2.csv')