# Define parameters
## Smoking

In [67]:
library(data.table)
library(survey)
library(xtable)
library(reldist)
source("../src/utils.R")

In [68]:
# Build the smoking-by-income-quintile table for respondents aged 30-50 and
# save it to disk. Also creates `s` (the analysis subset) and `design`
# (the survey design object) that the modelling code further down relies on.

# read NHIS 2019
h = fread("../data/health_survey_adults.csv")
# lower-case every column name and give the weight column a short alias
setnames(h, names(h), tolower(names(h)))
setnames(h, "wtfa_a", "wt")

# define smoking variables
# table(h$smkev_a)
# table(h$smknow_a)
# table(h$srvy_yr)

# smoking starts at 0 for everyone, is set to 1 when smkev_a == 1 and
# smknow_a is 1 or 2, and is blanked (NA) for the codes listed below.
# NOTE(review): codes 7/8/9 are presumably refused / not ascertained /
# don't know -- confirm against the NHIS codebook.
h[, smoking := 0]
h[smkev_a == 1 & smknow_a %in% c(1, 2), smoking := 1]
h[smkev_a %in% c(7, 8, 9), smoking := NA]
# NOTE(review): unlike smkev_a, code 9 of smknow_a is not set to NA here --
# confirm whether that is intentional.
h[smknow_a %in% c(7, 8), smoking := NA]
# table(h$smoking)

# gender
# table(h$sex_a)
# sex = 1 when sex_a == 1, 0 for any other code except 7 (left as NA)
h[sex_a != 7, sex := ifelse(sex_a == 1, 1, 0)]

# age groups
# summary(h$agep_a)
# age_group flags respondents aged 30 to 50 (inclusive)
h[, age_group := ifelse(agep_a >= 30 & agep_a <= 50, 1, 0)]
# income quintiles (1 = lowest, 5 = highest) computed only within the 30-50
# group, using unweighted quantiles of faminctc_a as cut points
h[age_group == 1, income_group := cut(faminctc_a, breaks = quantile(faminctc_a,
    probs = 0:5/5),
    labels = 1:5, right = TRUE, include.lowest = TRUE)]
# factor labels -> numeric codes 1..5
h[, income_group := as.numeric(as.character(income_group))]
# table(h$income_group)

# hist(h[faminctc_a < quantile(h$faminctc_a, 0.33), faminctc_a])
# gini(h$faminctc_a)
# table(h[age_group == 1, income_group ])

# select only respondents between 30 and 50
s = h[age_group == 1]
# dim(s)
setorder(s, income_group )

# overall weighted smoking prevalence in the subset (missing smoking dropped);
# this must be defined here because the "total" row below uses it
total_smoking = s[, weighted.mean(smoking, wt, na.rm = TRUE)]

design = svydesign(ids= ~ hhx, weights = ~wt, data=s)
# weighted smoking prevalence per income quintile
tab = s[, .(smoking_prop = weighted.mean(smoking, wt, na.rm = TRUE)), income_group ][!is.na(income_group )]
# income_group 9 is used as the code for the all-groups total row
total = data.table(income_group   = 9,  smoking_prop = total_smoking)
tab = rbind(tab, total)
setorder(tab, income_group)
tab

saveRDS(tab, "../output/data/smoking_dist_nhis2019.rds")

income_group,smoking_prop
<dbl>,<dbl>
1,0.28693056
2,0.22298422
3,0.15587096
4,0.1086473
5,0.05414104
9,0.16516696


### Logistic model to get baseline income group coefficients


In [69]:

# Survey-weighted quasibinomial regression of smoking on income group, fitted
# on the `design` object. The "-1" removes the intercept, so the model returns
# one coefficient per income group rather than contrasts against a baseline.
m = svyglm(smoking ~ -1 + as.factor(income_group), design = design, family = quasibinomial)
# print the coefficient table
summary(m)


Call:
svyglm(formula = smoking ~ -1 + as.factor(income_group), design = design, 
    family = quasibinomial)

Survey design:
svydesign(ids = ~hhx, weights = ~wt, data = s)

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
as.factor(income_group)1 -0.91034    0.05813  -15.66   <2e-16 ***
as.factor(income_group)2 -1.24836    0.06128  -20.37   <2e-16 ***
as.factor(income_group)3 -1.68928    0.08370  -20.18   <2e-16 ***
as.factor(income_group)4 -2.10463    0.08668  -24.28   <2e-16 ***
as.factor(income_group)5 -2.86050    0.11734  -24.38   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasibinomial family taken to be 0.9945759)

Number of Fisher Scoring iterations: 5


####  Scale coefficients


In [70]:

# original distribution: apply the inverse logit exp(b) / (1 + exp(b)) to
# every model coefficient to get the predicted prevalence per income group
coeff = as.vector(coef(m))

# placeholders that the later scaling steps fill in
eprop = NULL
nprop = NULL

odds = exp(coeff)
prop = odds / (1 + odds)

# rounded prevalences (printed), followed by their unweighted mean
(org_coeff = round(prop, 2))
round(mean(prop), 2)


In [30]:
# rank slope effect: add the same constant to every coefficient and map the
# shifted values back to probabilities with the inverse logit
# NOTE(review): the meaning of 0.12, 0.086 and 0.28 is not documented in this
# file -- confirm their source
shifted = coeff + 0.12/0.086 * 0.28
eprop = exp(shifted) / (1 + exp(shifted))
# too high smoking prevalence
round(eprop, 2)


In [77]:
# adjusted coefficients: multiply each income-group coefficient by its own
# factor (one multiplier per group, in coefficient order)
group_multipliers = c(1.29, 1.24, 1.14, 1.10, 1.09)
ncoeff = coeff * group_multipliers
ncoeff


In [None]:
1.00 & 0.29 & 0.22 & 0.29 \\ 
  2.00 & 0.22 & 0.15 & 0.21 \\ 
  3.00 & 0.16 & 0.11 & 0.16 \\ 
  4.00 & 0.11 & 0.07 & 0.10 \\ 
  5.00 & 0.05 & 0.03 & 0.05 \\ 
  9.00 & 0.17 & 0.11 & 0.16 \\ 

In [78]:
# prevalence implied by the adjusted coefficients once the rank-slope
# constant is added, via the inverse logit
adj_shifted = ncoeff + 0.12/0.086 * 0.28
nprop = exp(adj_shifted) / (1 + exp(adj_shifted))
# rounded prevalences (printed), followed by their unweighted mean
(adj_coeff = round(nprop, 2))
round(mean(nprop), 2)


In [39]:
# comparing prevalences by income group:
# rounded original prevalence minus rounded adjusted prevalence, element-wise
org_coeff - adj_coeff

In [40]:
# prevalence implied by the adjusted coefficients alone: the additive term is
# zeroed out (numerator 0.0), so only the inverse logit is applied.
# Note that this overwrites nprop.
unshifted = ncoeff + 0.0/0.086 * 0.28
nprop = exp(unshifted) / (1 + exp(unshifted))
round(nprop, 2)
round(mean(nprop), 2)


In [79]:
# final coefficients: print the adjusted coefficients, rounded to 5 decimals,
# as a comma-separated list wrapped in braces
coef_list = paste(round(ncoeff, 5), collapse = ",")
cat("{", coef_list, "}", sep = "")

{-1.17434,-1.54797,-1.92578,-2.3151,-3.11795}