# Smoking prevalence verification

In [23]:
library(data.table)
library(xtable)
source("../src/utils.R")

In [40]:
# Load the three CSV outputs of the smoking verification run:
#   m <- mortality file, p <- parameters file, e <- environment file
m <- fread("../output/data/mortality-verification-smoking.csv")
p <- fread("../output/data/parameters-verification-smoking.csv")
e <- fread("../output/data/environment-verification-smoking.csv")

In [41]:
# Parameter(s) whose distinct values define the re-numbered iterations
parameters <- "smoking_rank_slope_exp_coeff"
setorderv(p, cols = parameters)

# Re-number the runs after sorting:
#   niteration = group index of each distinct parameter value
#   nreplicate = running row count within each niteration
dim(p)
p[, niteration := .GRP, by = parameters]
p[, nreplicate := seq_len(.N), by = niteration]

# Lookup table mapping the original indexes to the redefined ones
np <- p[, .SD,
    .SDcols = c("iteration", "replicate", "niteration", "nreplicate", parameters)]

In [42]:
# List the columns available in the mortality table
colnames(m)

In [43]:
# Environment data

# Each array-valued column is passed to extractColumns() together with five
# new column names built as <prefix>1 ... <prefix>5.
# NOTE(review): extractColumns() comes from ../src/utils.R; presumably it
# splits the array column into the named columns -- confirm there.
array_columns <- list(
    list(source = "prop_income_group", prefix = "income"),
    list(source = "le_income_group", prefix = "le"),
    list(source = "smoking_income_group", prefix = "smoking")
)
for (spec in array_columns) {
    vars <- paste0(spec$prefix, 1:5)
    e <- extractColumns(e, spec$source, vars)
}

# Attach the redefined iteration / replicate indexes and sort by parameter value
e <- merge(e, np, by = c("iteration", "replicate"))
setorderv(e, cols = parameters)

In [50]:

# Build the tables holding the smoking distributions

# NHIS 2019 reference distribution; its income column is renamed to
# income_group, the name used for the simulated data below.
nhis <- readRDS("../output/data/smoking_dist_nhis2019.rds")
setnames(nhis, old = "incomeType", new = "income_group")

# Long format: one row per (niteration, nreplicate, smoking<k> column)
s <- melt(
    e,
    id.vars = c("niteration", "nreplicate"),
    measure.vars = patterns("^smoking[0-9]"),
    variable.name = "income_group",
    value.name = "smoking_prop"
)


In [51]:
# Mean population in each of the two re-indexed iterations
e[niteration == 1, mean(population)]
e[niteration == 2, mean(population)]

# niteration == 1: mean smoking proportion per income group, with the
# "smokingK" label reduced to the number K, plus a row coded 9 that holds
# the mean of the `smokers` column.
cf <- s[niteration == 1, .(smoking_prop_cf = mean(smoking_prop)), by = income_group]
cf[, income_group := as.numeric(gsub("[a-z]+", "", income_group))]
cf <- rbind(
    cf,
    data.table(income_group = 9, smoking_prop_cf = e[niteration == 1, mean(smokers)])
)

# niteration == 2: same summary, stored under the *_trt column name
trt <- s[niteration == 2, .(smoking_prop_trt = mean(smoking_prop)), by = income_group]
trt[, income_group := as.numeric(gsub("[a-z]+", "", income_group))]
trt <- rbind(
    trt,
    data.table(income_group = 9, smoking_prop_trt = e[niteration == 2, mean(smokers)])
)


In [52]:
# Full outer join of the NHIS reference and the two simulated summaries
# on income_group
tab <- merge(nhis, cf, by = "income_group", all = TRUE)
tab <- merge(tab, trt, by = "income_group", all = TRUE)
setnames(
    tab,
    old = names(tab),
    new = c("Income quintile", "NHIS 2019", "MIA non-treatment", "MIA treatment")
)

# Emit the table as LaTeX
print(
    xtable(tab, caption = "Proportion smoking by income quintile"),
    table.placement = "htp",
    caption.placement = "top",
    include.rownames = FALSE
)

% latex table generated in R 4.1.0 by xtable 1.8-4 package
% Fri Jan 14 23:05:21 2022
\begin{table}[htp]
\centering
\caption{Proportion smoking by income quintile} 
\begin{tabular}{rrrr}
  \hline
Income quintile & NHIS 2019 & MIA non-treatment & MIA treatment \\ 
  \hline
1.00 & 0.29 & 0.20 & 0.27 \\ 
  2.00 & 0.22 & 0.15 & 0.20 \\ 
  3.00 & 0.16 & 0.09 & 0.13 \\ 
  4.00 & 0.11 & 0.06 & 0.09 \\ 
  5.00 & 0.05 & 0.03 & 0.04 \\ 
  9.00 & 0.17 & 0.10 & 0.14 \\ 
   \hline
\end{tabular}
\end{table}


In [62]:
# Life expectancy (mean age in the mortality table) by smoking status
# and income group
l <- m[, .(le = mean(age)), by = .(smoker, income_group)]
setorderv(l, c("income_group", "smoker"))

# Marginal means, printed for inspection
m[, mean(age), by = income_group]
m[, mean(age), by = smoker]

# Gap in mean age between the two smoker groups; the sign follows the order
# in which the groups first appear in m
diff(m[, .(le = mean(age)), by = smoker][["le"]])

# Exploring some values: change in le5 and le1 between niteration 1 and 2
a <- e[niteration == 1, mean(le5)]
b <- e[niteration == 2, mean(le5)]
b - a

a <- e[niteration == 1, mean(le1)]
b <- e[niteration == 2, mean(le1)]
b - a

income_group,V1
<int>,<dbl>
4,78.04452
5,79.95197
3,76.50718
2,74.67351
1,72.96279


smoker,V1
<lgl>,<dbl>
False,78.1756
True,67.25669


In [208]:
# Rows of `ag` for the agent whose id equals `sid`, restricted to the columns
# needed for the check (both `ag` and `sid` are defined outside this cell)
t <- ag[
    id == sid,
    .(id, age, prob_income_group, prob_transition, time_exposure, final_transition_probs)
]

# Remove the square brackets wrapping the two serialized probability vectors
t[,
    c("prob_income_group", "prob_transition") := lapply(
        .SD, function(x) gsub("\\[|\\]", "", x)
    ),
    .SDcols = c("prob_income_group", "prob_transition")
]

# Split the comma-separated strings into i1..i5 and t1..t5, then make them numeric
t[, paste0("i", 1:5) := tstrsplit(prob_income_group, ",", fixed = TRUE)]
t[, paste0("t", 1:5) := tstrsplit(prob_transition, ",", fixed = TRUE)]
vars <- paste0(rep(c("i", "t"), each = 5), 1:5)
t[, (vars) := lapply(.SD, as.numeric), .SDcols = vars]

In [209]:
# Parse final_transition_probs from the last row of t into a numeric vector:
# strip the brackets, split on commas, convert to numbers
check <- as.numeric(
    tstrsplit(
        gsub("\\[|\\]", "", t[.N, .(final_transition_probs)]),
        ",",
        fixed = TRUE
    )
)
sum(check)
check

In [211]:
# t[, check1 := apply(.SD, 1, mean), .SDcols = c("i2", "t2")]

# Weight given to the i<k> column; with w = 0 only the t<k> column
# contributes to check1
w <- 0.0
for (i in seq_len(5)) {
    t[, check1 := w * get(paste0("i", i)) + (1 - w) * get(paste0("t", i))]
    # Exposure-weighted mean over rows with positive exposure, compared with
    # the i-th parsed final transition probability
    gap <- t[time_exposure > 0, weighted.mean(check1, time_exposure)] - check[i]
    print(paste0("Difference group ", i, ": ", gap))
}

[1] "Difference group 1: 0"
[1] "Difference group 2: 0"
[1] "Difference group 3: 0"
[1] "Difference group 4: 0"
[1] "Difference group 5: 0"


# Income measures

In [212]:
inc <- fread("../models/MobHealthRecycling/output/testing/income-1-1.csv")

# Strip the square brackets around the serialized income vectors
inc[,
    c("kid_income", "parent_income") := lapply(
        .SD, function(x) gsub("\\[|\\]", "", x)
    ),
    .SDcols = c("kid_income", "parent_income")
]

# Expand the comma-separated strings to one row per kid/parent pair
# within each (model_time, county)
sinc <- inc[,
    .(
        kid = as.numeric(unlist(strsplit(kid_income, ","))),
        parent = as.numeric(unlist(strsplit(parent_income, ",")))
    ),
    by = .(model_time, county)
]

# Bring back the mobility measures reported in the file
reported <- inc[, .(model_time, county, rank_slope, rank_absolute, rank_correlation, cohort_size)]
sinc <- merge(sinc, reported, by = c("county", "model_time"))

In [213]:
# Preview the first six rows of the expanded income table
head(sinc, n = 6L)

county,model_time,kid,parent,rank_slope,rank_absolute,rank_correlation,cohort_size
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
19,169,180000,66300,0.3047226,0.4260768,0.3047053,154
19,169,4030,33000,0.3047226,0.4260768,0.3047053,154
19,169,200600,100400,0.3047226,0.4260768,0.3047053,154
19,169,102000,38900,0.3047226,0.4260768,0.3047053,154
19,169,47700,269700,0.3047226,0.4260768,0.3047053,154
19,169,167000,120000,0.3047226,0.4260768,0.3047053,154


In [215]:
# Rank kid and parent incomes within each model_time using perc.rank()
# NOTE(review): perc.rank() is not defined in this view (presumably sourced
# from utils.R); its output scale is assumed, not verified, here.
sinc[,
    `:=`(
        kid_rank = perc.rank(kid),
        parent_rank = perc.rank(parent)
    ),
    by = model_time
]

# Fit lm(kid_income ~ parent_income) and return one mobility measure.
#
# Args:
#   kid_income:    numeric vector used as the response.
#   parent_income: numeric vector used as the predictor.
#   relative:      if TRUE (default) return the fitted slope; if FALSE return
#                  the fitted value of the line at `percentile`.
#   percentile:    predictor value at which the fitted line is evaluated when
#                  relative = FALSE. Defaults to 0.25, the value that was
#                  previously hard-coded, so existing calls are unchanged.
#
# Returns:
#   A length-one numeric that keeps the name given by coef():
#   "parent_income" for the slope, "(Intercept)" for the fitted value.
reg <- function(kid_income, parent_income, relative = TRUE, percentile = 0.25) {
    stopifnot(is.numeric(percentile), length(percentile) == 1L)
    fit <- lm(kid_income ~ parent_income)
    # Stored as `beta` rather than `c` so base::c is not shadowed in this scope
    beta <- coef(fit)
    if (relative) {
        return(beta[2])
    }
    beta[1] + percentile * beta[2]
}

# Recompute the mobility measures from the individual incomes, per model_time:
#   spearman = Spearman correlation of the raw incomes
#   cor_rank = Pearson correlation of the ranks
#   im       = reg() slope on the ranks
#   am       = reg() fitted value on the ranks (relative = FALSE)
sinc[,
    `:=`(
        spearman = cor(kid, parent, method = "spearman"),
        cor_rank = cor(kid_rank, parent_rank),
        im = reg(kid_rank, parent_rank, TRUE),
        am = reg(kid_rank, parent_rank, FALSE)
    ),
    by = model_time
]

# The measures are constant within model_time, so keep the first row of each
sinc[, order := seq_len(.N), by = model_time]
sinc <- sinc[order == 1]


In [216]:
# Distribution of the gaps between the measures reported in the file and
# the ones recomputed above
with(sinc, summary(rank_slope - im))
with(sinc, summary(rank_absolute - am))
with(sinc, summary(rank_correlation - spearman))
with(sinc, summary(rank_correlation - cor_rank))

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.0013498 -0.0009692 -0.0005325 -0.0004825 -0.0002743  0.0013390 

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.0005836 -0.0001081  0.0001567  0.0001576  0.0003648  0.0007455 

                   Min.                 1st Qu.                  Median 
-0.00000000000000033307 -0.00000000000000011102  0.00000000000000000000 
                   Mean                 3rd Qu.                    Max. 
-0.00000000000000001883  0.00000000000000006245  0.00000000000000027756 

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.0008707 -0.0006046 -0.0003101 -0.0001794  0.0001855  0.0011307 

In [217]:
# Correlation between each reported measure and its recomputed counterpart
with(sinc, cor(rank_slope, im))
with(sinc, cor(rank_absolute, am))
with(sinc, cor(rank_correlation, spearman))
with(sinc, cor(rank_correlation, cor_rank))