# HALI Data Analysis

#### Predictors of outcome (high literacy) and of missingness (of high literacy outcome) from HALI motivating data set.
* table S2

In [30]:
# Display Table S2; the object is assembled in the "Code Detail" section below.
tableS2

Unnamed: 0,Estimate,SE,p-value,Estimate.1,SE.1,p-value.1
Intercept,-3.61,0.38,<0.001,-1.51,0.39,<0.001
Intervention,0.75,0.19,<0.001,0.09,0.15,0.538
Baseline literacy score,0.35,0.02,<0.001,-0.03,0.01,0.105
Age,0.06,0.04,0.094,-0.02,0.04,0.544
Sex (Female),-0.04,0.11,0.733,-0.0,0.13,0.991
HH education,,,0.002,,,0.021
primary,0.13,0.13,,-0.34,0.15,
secondary,0.61,0.22,,-0.17,0.24,
college/degree,1.16,0.38,,0.46,0.32,
SES household,,,0.357,,,0.920


# Code Detail

In [14]:
# install.packages('readstata13')
# install.packages('geeM')
# install.packages('jomo')
# install.packages('knitr')

In [2]:
library(readstata13) # read.dta13(): reads the Stata .dta data file
library(lme4) # glmer(): generalized linear mixed-effects models
library(geeM) # NOTE(review): not used in the cells shown in this notebook
library(jomo) # NOTE(review): not used in the cells shown in this notebook
library(knitr) # NOTE(review): not used in the cells shown in this notebook

### Read in data

The full dataset is stored in `dat`.

In [3]:
# Location of the HALI data files. The directory can be overridden with the
# HALI_DATA_DIR environment variable so the notebook also runs on machines
# other than the original author's; when the variable is unset or empty the
# original hard-coded path is used.
data_dir <- Sys.getenv("HALI_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/yaolanqiu/Documents/HALI/DATA"
}
# The working directory is still switched (as before) and the file is read
# relative to it; setwd() stops with an error if the directory does not exist.
setwd(data_dir)
dat <- read.dta13(
  "HALI_CLASS1_2539_MASTER_EDUC_FU1_FU2_LONG_accounts_withdrawals_BL_AS_COV_AND_OTHER_COV_17.11.2016.dta",
  nonint.factors = TRUE,
  generate.factors = TRUE
)

In [4]:
# Show the dimensions (rows, columns) of the full data set.
dim(dat)

### We focus only on the 9-month follow-up data

The subset containing only the 9-month follow-up visit is stored in `data0`.

In [5]:
# Keep only the rows of the 9-month follow-up visit.
# which() is used so that rows whose `visit` is NA are dropped; indexing with
# the bare logical vector would instead insert an all-NA row for each of them.
data0 <- dat[which(dat$visit == "9-month FU"), ]
dim(data0) # 2539 220

### select the input variables

In [6]:
# Keep the cluster identifier, the model covariates (in the order they enter
# the model formulas) and the literacy score the outcome is derived from.
dat0 <- data0[c(
  "school_id",
  "LIT_grp",
  "BL_gll21_total",
  "age_child",
  "sex",
  "schlevel_comp",
  "BL_ses",
  "gll21_total"
)]
# Reset the row names to 1..n after subsetting.
rownames(dat0) <- NULL
dim(dat0) # 2539 8

### Remove observations with missing covariates

In [7]:
# Keep only the rows in which the cluster id and all six covariates are
# observed. The score gll21_total is deliberately left out of the check, so
# rows with a missing score are retained.
dat1 <- dat0[
  complete.cases(dat0[c(
    "school_id",
    "LIT_grp",
    "BL_gll21_total",
    "age_child",
    "sex",
    "schlevel_comp",
    "BL_ses"
  )]),
]
rownames(dat1) <- NULL
dim(dat1) # 2465 8

## Predictors of outcome (high literacy) 

## Predictors of outcome missingness

Criterion for high literacy: a `gll21_total` score greater than 10.

In [8]:
# High-literacy indicator: 1 when gll21_total exceeds 10, 0 otherwise.
# Rows with a missing score get NA because the comparison itself is NA.
dat1[["y"]] <- ifelse(dat1[["gll21_total"]] > 10, 1, 0)

Missingness indicator for the outcome (`TRUE` when `gll21_total` is missing)

In [21]:
# TRUE for rows whose gll21_total score is missing, FALSE otherwise.
dat1[["missing"]] <- is.na(dat1[["gll21_total"]])

#### model 1
with outcome:
* high literacy

with covariates:
* baseline score
* age 
* sex
* with or without household education level
* SES

In [9]:
# Outcome model including household education (schlevel_comp): logistic
# mixed model for the high-literacy indicator y with a random intercept for
# each school_id.
model1_edu <- glmer(
  y ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp + BL_ses +
    (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)

In [10]:
# Reduced outcome model: same specification as the full outcome model but
# without household education (schlevel_comp).
model1_edu2 <- glmer(
  y ~ LIT_grp + BL_gll21_total + age_child + sex + BL_ses +
    (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)

#### Calculate the overall p value for household education level

In [11]:
# Compare the outcome models with and without household education; the
# second entry of the Pr(>Chisq) column is shown, rounded to three decimals.
edu <- anova(model1_edu, model1_edu2, test = "LRT")
round(edu[["Pr(>Chisq)"]][2], 3)

#### Similarly, calculate the overall p value for SES

In [12]:
# Full outcome model for the SES comparison. Its formula, data and settings
# are the same as those used for model1_edu.
model2_ses <- glmer(
  y ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp + BL_ses +
    (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
# Reduced outcome model without household SES (BL_ses).
model2_ses2 <- glmer(
  y ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp +
    (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
# Comparison of the two models; its Pr(>Chisq) column supplies the overall
# p-value for SES.
ses <- anova(model2_ses, model2_ses2, test = "LRT")

#### Model 3 

* outcome: missing indicator

In [24]:
# Missingness model including household education: logistic mixed model for
# the `missing` indicator with a random intercept for each school_id.
model3_edu <- glmer(
  missing ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp +
    BL_ses + (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
# Reduced missingness model without household education (schlevel_comp).
model3_edu2 <- glmer(
  missing ~ LIT_grp + BL_gll21_total + age_child + sex +
    BL_ses + (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)

Calculate the overall p-value for household education level

In [25]:
# Compare the missingness models with and without household education.
edu2 <- anova(model3_edu, model3_edu2, test = "LRT")

Similarly, calculate the overall p value for SES

In [26]:
# Full missingness model for the SES comparison. Its formula, data and
# settings are the same as those used for model3_edu.
model4_ses <- glmer(
  missing ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp +
    BL_ses + (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)

# Reduced missingness model without household SES (BL_ses).
model4_ses2 <- glmer(
  missing ~ LIT_grp + BL_gll21_total + age_child + sex + schlevel_comp +
    (1 | school_id),
  data = dat1,
  family = binomial(link = "logit"),
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
# Comparison of the two models; its Pr(>Chisq) column supplies the overall
# p-value for SES.
ses2 <- anova(model4_ses, model4_ses2, test = "LRT")

### Build Table S2

In [27]:
# Fixed-effect coefficient tables (estimate, SE, z value, p-value) of the full
# outcome model and the full missingness model, rounded to three decimals.
# The element is addressed by its full name `coefficients` rather than through
# `$` partial matching of the abbreviated `coefficient`.
table1 <- round(summary(model2_ses)$coefficients, 3)
table2 <- round(summary(model4_ses)$coefficients, 3)

In [28]:
# Assemble Table S2 from the coefficient tables (table1: outcome model,
# table2: missingness model) and the model comparisons (edu, ses for the
# outcome; edu2, ses2 for missingness).

# Format p-values uniformly with three decimals; values that round to zero are
# reported as "<0.001". The same rule is applied to coefficient p-values and to
# the overall p-values so the p-value columns are consistent.
format_p <- function(p) {
  p <- round(p, 3)
  ifelse(p < 0.001, "<0.001", formatC(p, digits = 3, format = "f"))
}

# Character version of a coefficient table: the first three columns with two
# decimals, followed by the formatted p-value column.
format_coefs <- function(coefs) {
  cbind(formatC(coefs[, 1:3], digits = 2, format = "f"), format_p(coefs[, 4]))
}

# Row that carries only the overall p-value of a factor, taken from the second
# entry of the Pr(>Chisq) column of a model comparison.
overall_row <- function(lrt) {
  c("", "", "", format_p(lrt[["Pr(>Chisq)"]][2]))
}

table1 <- format_coefs(table1)
table2 <- format_coefs(table2)

# NOTE(review): the row ranges below assume 12 fixed-effect rows ordered as
# rows 1-5 single-coefficient terms, rows 6-8 household education levels and
# rows 9-12 SES levels - confirm against the factor levels in the data.
ass1 <- rbind(
  table1[1:5, ],
  overall_row(edu),
  table1[6:8, ],
  overall_row(ses),
  table1[9:12, ]
)

ass2 <- rbind(
  table2[1:5, ],
  overall_row(edu2),
  table2[6:8, ],
  overall_row(ses2),
  table2[9:12, ]
)

ass <- cbind(
  c(
    "Intercept", "Intervention", "Baseline literacy score",
    "Age", "Sex (Female)", "HH education", "  primary", "  secondary",
    "  college/degree", "SES household", "  Poor", "  Median Poor",
    "  Less Poor", "  Least Poor"
  ),
  ass1,
  ass2
)
rownames(ass) <- NULL
# Keep label, estimate, SE and p-value for each model (drops both z columns).
ass <- ass[, c(1, 2, 3, 5, 6, 7, 9)]
# Level rows of the two factors show no individual p-value; only the overall
# p-value on the factor's header row is reported.
ass[c(7:9, 11:14), c(4, 7)] <- ""
colnames(ass) <- c("", "Estimate", "SE", "p-value", "Estimate", "SE", "p-value")

In [29]:
# Store the assembled matrix under its final name and display it.
tableS2 <- ass
tableS2

Unnamed: 0,Estimate,SE,p-value,Estimate.1,SE.1,p-value.1
Intercept,-3.61,0.38,<0.001,-1.51,0.39,<0.001
Intervention,0.75,0.19,<0.001,0.09,0.15,0.538
Baseline literacy score,0.35,0.02,<0.001,-0.03,0.01,0.105
Age,0.06,0.04,0.094,-0.02,0.04,0.544
Sex (Female),-0.04,0.11,0.733,-0.0,0.13,0.991
HH education,,,0.002,,,0.021
primary,0.13,0.13,,-0.34,0.15,
secondary,0.61,0.22,,-0.17,0.24,
college/degree,1.16,0.38,,0.46,0.32,
SES household,,,0.357,,,0.920
