# Create models and assess predictions

In [1]:
#library(rstanarm)
library(INLA)
library(data.table)

Loading required package: sp
Loading required package: Matrix
This is INLA_17.05.31 built 2017-05-31 13:53:28 UTC.
See www.r-inla.org/contact-us for how to get help.


In [10]:
# Load the input CSV with data.table's fread, drop its `V1` column,
# and show how many rows were read.
df <- fread('../data/le_cov.csv')
df[, V1 := NULL]
nrow(df)

In [13]:
# Inspect the column names of the table read above.
names(df)

In [384]:
# function to log center or standardize
#' Add log-centered or z-standardized copies of columns to a data.table.
#'
#' For every name in `vars` a new column called `<type>_<name>` is added to
#' `data` with data.table's `:=`, i.e. the table passed in is modified.
#'
#' @param data A data.table holding the columns named in `vars`.
#' @param vars Character vector of column names to transform.
#' @param type 'log': values <= 0 are replaced by 0.01, then log() is taken
#'   and the mean of the logged values (NAs ignored) is subtracted.
#'   'z': (x - mean(x)) / sd(x), NAs ignored in mean and sd.
#' @return `data`, including the newly added columns.
log_center <- function(data, vars, type = 'log') {
    # Fail loudly on an unknown transformation instead of silently doing nothing.
    type <- match.arg(type, c('log', 'z'))

    absent <- setdiff(vars, names(data))
    if (length(absent) > 0) {
        stop('columns not found in data: ', paste(absent, collapse = ', '),
             call. = FALSE)
    }

    for (name in vars) {
        x <- data[[name]]
        if (type == 'log') {
            # log() is undefined for non-positive values; floor them at 0.01.
            logged <- log(ifelse(x <= 0, 0.01, x))
            new_values <- logged - mean(logged, na.rm = TRUE)
        } else {
            new_values <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
        }
        new_col <- paste0(type, '_', name)
        data[, (new_col) := new_values]
    }
    return(data)
}

In [386]:
# Add log-centered (`log_<name>`) versions of these columns to `cov`.
# NOTE(review): `cov` is not created in any cell visible in this export
# (only `df` is read) — confirm where it is defined.
vars <- c(
    'pct_black', 'ct_foreign', 'house_value',
    'college', 'population', 'income'
)
log_center(cov, vars, type = 'log')

In [387]:
# Add z-standardized (`z_<name>`) versions of these columns to `cov`.
vars <- c('gini', 'relative_mob', 'absolute_mob')
log_center(cov, vars, type = 'z')

In [388]:
# Rename the columns of `le` to the names used in the rest of the notebook
# and keep only those four columns.
# NOTE(review): `le` is not created in any cell visible in this export —
# confirm where it is defined.
setnames(le,
         old = c('cty', 'gnd', 'hh_inc_q', 'le_raceadj'),
         new = c('county', 'gender', 'income_q', 'le'))
le <- copy(le[, c('county', 'gender', 'income_q', 'le')])

# Key both tables on county, then look up the `cov` columns for each row
# of `le` (data.table X[Y] join on the key).
setkey(cov, county)
setkey(le, county)

dat <- cov[le]

In [389]:
# Rows and columns of the joined table.
dim(dat)

In [390]:
# Integer state identifier: .GRP numbers the `statename` groups 1, 2, ...
dat[, state := .GRP, by = statename]

In [391]:
# Store the income quantile indicator as numeric.
dat[, income_q := as.numeric(income_q)]

# Models

In [392]:
# Baseline: ordinary least squares of `le` on standardized relative mobility.
# Note that the name `m1` is reused for an INLA fit further down.
m1 <- lm(le ~ z_relative_mob, data = dat)
summary(m1)


Call:
lm(formula = le ~ z_relative_mob, data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.3457  -2.0617   0.4507   2.6243  11.3661 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    83.02370    0.03204 2591.00   <2e-16 ***
z_relative_mob -0.49660    0.03924  -12.66   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.572 on 12446 degrees of freedom
  (24 observations deleted due to missingness)
Multiple R-squared:  0.01271,	Adjusted R-squared:  0.01263 
F-statistic: 160.2 on 1 and 12446 DF,  p-value: < 2.2e-16


# INLA (default priors)

In [393]:
#names(dat)

In [394]:
# Create duplicate index columns so that separate f() terms can each use
# their own copy. Of these, the formulas below use `cty`, `q_mob` and
# `q_gini`; the state_* and cty_* copies are not referenced by them.
dat[, `:=`(
    state_mob  = state,
    state_gini = state,
    cty        = county,
    cty_mob    = county,
    cty_gini   = county,
    q_mob      = income_q,
    q_gini     = income_q
)]

In [395]:
# Split the analysis table by gender; models are fit separately on each.
female <- dat[gender == 'F']
male <- dat[gender == 'M']

In [397]:
# Male model with relative mobility.
# Fixed effects: z_relative_mob, z_gini, log_population, log_income.
# iid random intercepts: state, cty, income_q.
# f(q_mob, z_relative_mob) / f(q_gini, z_gini): the second argument of f()
# is a weight, giving iid slopes for that covariate indexed by q_mob / q_gini.
formula <- le ~ z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_relative_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching). config = TRUE is set so posterior samples can be
# drawn from the fit later.
m1 <- inla(formula, family = "gaussian", data = male,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [398]:
# Female model with relative mobility.
# Fixed effects: z_relative_mob, z_gini, log_population, log_income.
# iid random intercepts: state, cty, income_q.
# f(q_mob, z_relative_mob) / f(q_gini, z_gini): the second argument of f()
# is a weight, giving iid slopes for that covariate indexed by q_mob / q_gini.
formula <- le ~ z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_relative_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching).
f1 <- inla(formula, family = "gaussian", data = female,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [399]:
# Posterior summaries for the female relative-mobility model.
summary(f1)


Call:
c("inla(formula = formula, family = \"gaussian\", data = female, verbose = TRUE, ",  "    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, ",  "        cpo = TRUE), control.predictor = list(compute = TRUE))" )

Time used:
 Pre-processing    Running inla Post-processing           Total 
         2.0067         47.1928          1.8047         51.0041 

Fixed effects:
                  mean     sd 0.025quant 0.5quant 0.975quant    mode kld
(Intercept)    85.0879 1.1169    82.8574  85.0879    87.3169 85.0879   0
z_relative_mob -0.2736 0.0382    -0.3486  -0.2736    -0.1987 -0.2736   0
z_gini          0.1466 0.0697     0.0087   0.1466     0.2842  0.1466   0
log_population -0.2156 0.0274    -0.2693  -0.2156    -0.1619 -0.2156   0
log_income      1.0977 0.1306     0.8411   1.0976     1.3540  1.0976   0

Random effects:
Name	  Model
 state   IID model 
cty   IID model 
income_q   IID model 
q_mob   IID model 
q_gini   IID model 

Model hyperparameters:
                     

# Simulate values

In [400]:
# function to simulate predicted values

#' Simulate linear predictions from an INLA fit with group-varying slopes.
#'
#' Draws `nsim` joint posterior samples with inla.posterior.sample() and, for
#' each draw, evaluates the fixed-effect linear predictor on the rows of
#' `data`, after adding the sampled `random` effect to the coefficient of
#' `contrast`.
#'
#' Rows of `data` are assumed to cycle through the G sampled levels of
#' `random` in order (row k belongs to group ((k - 1) %% G) + 1), so
#' nrow(data) must be a multiple of G. The original code hard-coded G = 4 and
#' nrow(data) = 8.
#'
#' The `income_q` random intercept is not added to the prediction (the
#' original left that step commented out); it cancels when predictions within
#' the same group are differenced.
#'
#' @param model A fitted inla object; inla.posterior.sample() needs it to have
#'   been fit with control.compute = list(config = TRUE).
#' @param data Table with one column per fixed-effect covariate of `model`.
#' @param nsim Number of posterior draws.
#' @param contrast Name of the fixed effect whose slope varies by group.
#' @param random Pattern identifying the latent rows of the varying slope.
#' @return Long data.table (from melt) with id columns `q` (group index) and
#'   `<contrast>` (taken from `data`), plus `variable` (draw) and `value`.
simulate_predictions <- function(model, data, nsim = 1000, contrast = 'z_relative_mob',
                                 random = 'q_mob') {

    # simulate posterior distribution
    draws <- inla.posterior.sample(n = nsim, result = model)

    coef_names <- rownames(model$summary.fixed)
    n_rows <- nrow(data)

    # Design matrix, with columns put in the model's coefficient order so the
    # result does not depend on the column order of `data`.
    design <- stats::as.formula(paste0('~ ', paste0(names(data), collapse = ' + ')))
    X <- model.matrix(design, data = data)
    absent <- setdiff(coef_names, colnames(X))
    if (length(absent) > 0) {
        stop('data lacks model covariates: ', paste(absent, collapse = ', '),
             call. = FALSE)
    }
    X <- X[, coef_names, drop = FALSE]

    contrast_col <- match(contrast, coef_names)
    if (is.na(contrast_col)) {
        stop('contrast "', contrast, '" is not a fixed effect of the model',
             call. = FALSE)
    }

    # Latent-field layout; assumed identical across draws of the same model.
    latent_names <- rownames(draws[[1]]$latent)
    slope_rows <- grep(random, latent_names)
    # Fixed effects = every latent row that is not a predictor or random effect.
    # grepl() is used because negative indexing with an empty grep() result
    # would drop everything.
    fixed_rows <- !grepl("^Pred|^state|^cty|^obs|^income_q|q_mob|q_gini", latent_names)

    n_groups <- length(slope_rows)
    if (n_groups == 0 || n_rows %% n_groups != 0) {
        stop('nrow(data) must be a positive multiple of the number of "',
             random, '" levels (', n_groups, ')', call. = FALSE)
    }
    if (sum(fixed_rows) != length(coef_names)) {
        stop('could not identify the fixed effects in the posterior sample',
             call. = FALSE)
    }

    group_index <- rep(seq_len(n_groups), length.out = n_rows)
    contrast_values <- data[[contrast]]

    sims <- matrix(nrow = n_rows, ncol = nsim)
    for (i in seq_len(nsim)) {
        latent <- draws[[i]]$latent
        # Assumes the fixed effects appear in the same order as summary.fixed.
        betas <- latent[fixed_rows]
        group_slopes <- latent[slope_rows][group_index]
        # X %*% beta plus the group-specific slope deviation for the contrast.
        sims[, i] <- as.vector(X %*% betas) + X[, contrast_col] * group_slopes
    }

    out <- data.table(sims)
    out[, q := group_index]
    out[, (contrast) := contrast_values]
    return(melt(out, id.vars = c('q', contrast)))
}

# compute first difference

#' Pairwise between-group differences in the effect of a 0 -> 1 contrast.
#'
#' For each group the effect is value[contrast == 1] - value[contrast == 0]
#' (element-wise, relying on both subsets being in the same row order). For
#' every pair of groups (a, b) with a before b, the difference
#' effect(a) - effect(b) is returned.
#'
#' Groups are taken from the distinct values of `data[[group]]`; with groups
#' 1..4 the pairs and their order are 1-2, 1-3, 1-4, 2-3, 2-4, 3-4, as in the
#' original hard-coded version.
#'
#' @param data Long table with columns `group`, `contrast` and `value`.
#' @param contrast Name of the 0/1 contrast column.
#' @param group Name of the grouping column.
#' @param model Label stored in the `model` column of the result.
#' @return data.table with columns type, contrast ("a-b"), model, values.
first_difference <- function(data, contrast = 'z_gini', group = 'q', model = 'm1') {
    group_values <- data[[group]]
    contrast_values <- data[[contrast]]
    values <- data[['value']]

    group_levels <- sort(unique(group_values))
    if (length(group_levels) < 2) {
        # Nothing to compare: return an empty table with the usual columns.
        return(data.table(type = character(), contrast = character(),
                          model = character(), values = numeric()))
    }

    # Effect of the contrast within each group, computed once per group.
    # which() drops NA comparisons, matching data.table's row filtering.
    effects <- lapply(group_levels, function(g) {
        at_one <- which(group_values == g & contrast_values == 1)
        at_zero <- which(group_values == g & contrast_values == 0)
        values[at_one] - values[at_zero]
    })

    # Each column of `pairs` holds the positions (in group_levels) of one pair.
    pairs <- utils::combn(seq_along(group_levels), 2)
    pieces <- lapply(seq_len(ncol(pairs)), function(k) {
        a <- pairs[1, k]
        b <- pairs[2, k]
        data.table(type = contrast,
                   contrast = paste0(group_levels[a], '-', group_levels[b]),
                   model = model,
                   values = effects[[a]] - effects[[b]])
    })
    return(rbindlist(pieces))
}

# Male

In [401]:
# Male model with relative mobility (same specification as the earlier
# `m1` INLA cell; this refits and overwrites it).
# iid random intercepts for state, cty and income_q; f(q_mob, z_relative_mob)
# and f(q_gini, z_gini) use the covariate as weight, i.e. iid slopes by
# q_mob / q_gini.
formula <- le ~ z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_relative_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching).
m1 <- inla(formula, family = "gaussian", data = male,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [402]:
# Prediction grid: 4 quartiles x 2 values (0 and 1) of z_relative_mob,
# with every other covariate held at 0.
nrep <- 4 * 2
relative_mob_pred_data_m1 <- data.table(
    z_relative_mob = rep(c(0, 1), each = 4),
    z_gini = numeric(nrep),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

# Posterior simulation of predictions for the male relative-mobility model.
sim_relative_mob_m1 <- simulate_predictions(
    model = m1,
    data = relative_mob_pred_data_m1,
    nsim = 1000,
    contrast = 'z_relative_mob',
    random = 'q_mob'
)

In [403]:
# Prediction grid: 4 quartiles x 2 values (0 and 1) of z_gini,
# with every other covariate held at 0.
nrep <- 4 * 2
gini_pred_data_m1 <- data.table(
    z_relative_mob = numeric(nrep),
    z_gini = rep(c(0, 1), each = 4),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_gini_m1 <- simulate_predictions(
    m1, gini_pred_data_m1, 1000,
    contrast = 'z_gini',
    random = 'q_gini'
)

In [404]:
# Pairwise quartile differences in the simulated 0 -> 1 effects, model m1.
gini_diff_m1 <- first_difference(
    sim_gini_m1, contrast = 'z_gini', model = 'm1'
)
relative_mob_diff_m1 <- first_difference(
    sim_relative_mob_m1, contrast = 'z_relative_mob', model = 'm1'
)

In [405]:
# Male model with absolute mobility in place of relative mobility.
# iid random intercepts for state, cty and income_q; f(q_mob, z_absolute_mob)
# and f(q_gini, z_gini) use the covariate as weight, i.e. iid slopes by
# q_mob / q_gini.
formula <- le ~ z_absolute_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_absolute_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching).
m2 <- inla(formula, family = "gaussian", data = male,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [406]:
# Posterior summaries for the male absolute-mobility model.
summary(m2)


Call:
c("inla(formula = formula, family = \"gaussian\", data = male, verbose = TRUE, ",  "    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, ",  "        cpo = TRUE), control.predictor = list(compute = TRUE))" )

Time used:
 Pre-processing    Running inla Post-processing           Total 
         2.0870         44.5215          1.8280         48.4365 

Fixed effects:
                  mean     sd 0.025quant 0.5quant 0.975quant    mode kld
(Intercept)    81.4119 1.7991    77.8490  81.4118    84.9688 81.4117   0
z_absolute_mob  0.3990 0.0697     0.2619   0.3990     0.5361  0.3990   0
z_gini          0.2927 0.0875     0.1195   0.2926     0.4655  0.2926   0
log_population -0.1572 0.0295    -0.2151  -0.1572    -0.0993 -0.1572   0
log_income      1.2481 0.1356     0.9818   1.2481     1.5141  1.2481   0

Random effects:
Name	  Model
 state   IID model 
cty   IID model 
income_q   IID model 
q_mob   IID model 
q_gini   IID model 

Model hyperparameters:
                       

In [407]:
# Prediction grid: 4 quartiles x 2 values (0 and 1) of z_absolute_mob,
# with every other covariate held at 0.
nrep <- 4 * 2
absolute_mob_pred_data_m2 <- data.table(
    z_absolute_mob = rep(c(0, 1), each = 4),
    z_gini = numeric(nrep),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

In [424]:
# Posterior simulation of predictions for the male absolute-mobility model.
sim_absolute_mob_m2 <- simulate_predictions(
    model = m2,
    data = absolute_mob_pred_data_m2,
    nsim = 1000,
    contrast = 'z_absolute_mob',
    random = 'q_mob'
)

In [410]:
# Prediction grid: 4 quartiles x 2 values (0 and 1) of z_gini,
# with every other covariate held at 0.
nrep <- 4 * 2
gini_pred_data_m2 <- data.table(
    z_absolute_mob = numeric(nrep),
    z_gini = rep(c(0, 1), each = 4),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_gini_m2 <- simulate_predictions(
    m2, gini_pred_data_m2, 1000,
    contrast = 'z_gini',
    random = 'q_gini'
)

In [425]:
# Pairwise quartile differences in the simulated 0 -> 1 effects, model m2.
gini_diff_m2 <- first_difference(
    sim_gini_m2, contrast = 'z_gini', model = 'm2'
)
absolute_mob_diff_m2 <- first_difference(
    sim_absolute_mob_m2, contrast = 'z_absolute_mob', model = 'm2'
)

In [412]:
# Peek at the first rows of the Gini differences for m2.
head(gini_diff_m2)

type,contrast,model,values
z_gini,1-2,m2,0.2578644
z_gini,1-2,m2,0.3111568
z_gini,1-2,m2,0.2796426
z_gini,1-2,m2,0.2891941
z_gini,1-2,m2,0.3051559
z_gini,1-2,m2,0.3330343


In [413]:
# Peek at the first rows of the absolute-mobility differences for m2.
head(absolute_mob_diff_m2)

type,contrast,model,values
z_absolute_mob,1-2,m2,0.14038211
z_absolute_mob,1-2,m2,0.09571482
z_absolute_mob,1-2,m2,0.08377865
z_absolute_mob,1-2,m2,0.09543425
z_absolute_mob,1-2,m2,0.03019762
z_absolute_mob,1-2,m2,0.17061946


# Female

In [414]:
# Female model with relative mobility (same specification as the earlier
# `f1` cell; this refits and overwrites it).
# iid random intercepts for state, cty and income_q; f(q_mob, z_relative_mob)
# and f(q_gini, z_gini) use the covariate as weight, i.e. iid slopes by
# q_mob / q_gini.
formula <- le ~ z_relative_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_relative_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching).
f1 <- inla(formula, family = "gaussian", data = female,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [415]:
# Posterior summaries for the female relative-mobility model.
summary(f1)


Call:
c("inla(formula = formula, family = \"gaussian\", data = female, verbose = TRUE, ",  "    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, ",  "        cpo = TRUE), control.predictor = list(compute = TRUE))" )

Time used:
 Pre-processing    Running inla Post-processing           Total 
         2.1600         52.5059          2.6897         57.3556 

Fixed effects:
                  mean     sd 0.025quant 0.5quant 0.975quant    mode kld
(Intercept)    85.0879 1.1169    82.8574  85.0879    87.3169 85.0879   0
z_relative_mob -0.2736 0.0382    -0.3486  -0.2736    -0.1987 -0.2736   0
z_gini          0.1466 0.0697     0.0087   0.1466     0.2842  0.1466   0
log_population -0.2156 0.0274    -0.2693  -0.2156    -0.1619 -0.2156   0
log_income      1.0977 0.1306     0.8411   1.0976     1.3540  1.0976   0

Random effects:
Name	  Model
 state   IID model 
cty   IID model 
income_q   IID model 
q_mob   IID model 
q_gini   IID model 

Model hyperparameters:
                     

In [416]:
# Prediction grids for the female model f1: 4 quartiles x 2 values (0, 1)
# of the contrast variable, with every other covariate held at 0.
nrep <- 4 * 2

# Contrast on relative mobility.
relative_mob_pred_data_f1 <- data.table(
    z_relative_mob = rep(c(0, 1), each = 4),
    z_gini = numeric(nrep),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_relative_mob_f1 <- simulate_predictions(
    model = f1,
    data = relative_mob_pred_data_f1,
    nsim = 1000,
    contrast = 'z_relative_mob',
    random = 'q_mob'
)

# Contrast on Gini.
nrep <- 4 * 2
gini_pred_data_f1 <- data.table(
    z_relative_mob = numeric(nrep),
    z_gini = rep(c(0, 1), each = 4),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_gini_f1 <- simulate_predictions(
    model = f1,
    data = gini_pred_data_f1,
    nsim = 1000,
    contrast = 'z_gini',
    random = 'q_gini'
)

In [417]:
# Pairwise quartile differences in the simulated 0 -> 1 effects, model f1.
gini_diff_f1 <- first_difference(
    sim_gini_f1, contrast = 'z_gini', model = 'f1'
)
relative_mob_diff_f1 <- first_difference(
    sim_relative_mob_f1, contrast = 'z_relative_mob', model = 'f1'
)

In [418]:
# Female model with absolute mobility in place of relative mobility.
# iid random intercepts for state, cty and income_q; f(q_mob, z_absolute_mob)
# and f(q_gini, z_gini) use the covariate as weight, i.e. iid slopes by
# q_mob / q_gini.
formula <- le ~ z_absolute_mob + z_gini + log_population + log_income +
    f(state, model = "iid") +
    f(cty, model = "iid") +
    f(income_q, model = "iid") +
    f(q_mob, z_absolute_mob, model = "iid") +
    f(q_gini, z_gini, model = "iid")

# `data` is spelled out in full (the original `dat =` relied on partial
# argument matching).
f2 <- inla(formula, family = "gaussian", data = female,
           control.predictor = list(compute = TRUE),
           control.compute = list(config = TRUE, dic = TRUE,
                                  waic = TRUE, cpo = TRUE),
           verbose = TRUE)

In [419]:
# Posterior summaries for the female absolute-mobility model.
summary(f2)


Call:
c("inla(formula = formula, family = \"gaussian\", data = female, verbose = TRUE, ",  "    control.compute = list(config = TRUE, dic = TRUE, waic = TRUE, ",  "        cpo = TRUE), control.predictor = list(compute = TRUE))" )

Time used:
 Pre-processing    Running inla Post-processing           Total 
         1.9378         63.6707          1.8629         67.4714 

Fixed effects:
                  mean     sd 0.025quant 0.5quant 0.975quant    mode kld
(Intercept)    85.1031 1.1405    82.8085  85.1030    87.3979 85.1031   0
z_absolute_mob  0.3538 0.0423     0.2707   0.3538     0.4368  0.3538   0
z_gini          0.2097 0.0745     0.0609   0.2097     0.3585  0.2097   0
log_population -0.1493 0.0286    -0.2054  -0.1493    -0.0932 -0.1493   0
log_income      1.0531 0.1317     0.7946   1.0531     1.3115  1.0531   0

Random effects:
Name	  Model
 state   IID model 
cty   IID model 
income_q   IID model 
q_mob   IID model 
q_gini   IID model 

Model hyperparameters:
                     

In [420]:
# Prediction grids for the female model f2: 4 quartiles x 2 values (0, 1)
# of the contrast variable, with every other covariate held at 0.
nrep <- 4 * 2

# Contrast on absolute mobility.
absolute_mob_pred_data_f2 <- data.table(
    z_absolute_mob = rep(c(0, 1), each = 4),
    z_gini = numeric(nrep),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_absolute_mob_f2 <- simulate_predictions(
    model = f2,
    data = absolute_mob_pred_data_f2,
    nsim = 1000,
    contrast = 'z_absolute_mob',
    random = 'q_mob'
)

# Contrast on Gini.
nrep <- 4 * 2
gini_pred_data_f2 <- data.table(
    z_absolute_mob = numeric(nrep),
    z_gini = rep(c(0, 1), each = 4),
    log_population = numeric(nrep),
    log_income = numeric(nrep)
)

sim_gini_f2 <- simulate_predictions(
    model = f2,
    data = gini_pred_data_f2,
    nsim = 1000,
    contrast = 'z_gini',
    random = 'q_gini'
)

In [421]:
# Pairwise quartile differences in the simulated 0 -> 1 effects, model f2.
gini_diff_f2 <- first_difference(
    sim_gini_f2, contrast = 'z_gini', model = 'f2'
)
absolute_mob_diff_f2 <- first_difference(
    sim_absolute_mob_f2, contrast = 'z_absolute_mob', model = 'f2'
)

# Join comparisons

In [426]:
# Stack the first-difference tables of all four models (male m1/m2,
# female f1/f2) and write them to models.csv in the working directory.
final <- rbind(
    gini_diff_m1, relative_mob_diff_m1,
    gini_diff_f1, relative_mob_diff_f1,
    gini_diff_m2, absolute_mob_diff_m2,
    gini_diff_f2, absolute_mob_diff_f2
)
fwrite(final, file = 'models.csv')

In [427]:
# Total number of rows in the combined table.
nrow(final)