# HALI Data Analysis

#### Baseline and outcome characteristics of the motivating HALI data set for the n=2465 participants with complete baseline covariates

* table 1A


In [48]:
# Display Table 1a; the data frame is assembled in the "Code Detail" section.
table_1a

Unnamed: 0,Intervention (n=1230),Control (n=1235)
Baseline cluster characteristics,,
Number,51,50
Cluster Size – mean (SD),24.1 (3.3),24.7 (1.9)
Baseline child-level characteristics - % (n),,
Female,47.9% (589),49.5% (611)
Age – mean (sd),7.7 (1.7),7.9 (1.7)
Household head education,,
Did not complete primary education,29.1% (358),34.4% (425)
Primary,55.6% (684),52.7% (651)
Secondary,11.5% (141),10.6% (131)


# Code Detail

In [1]:
# install.packages('readstata13')
# install.packages('geeM')
# install.packages('jomo')
# install.packages('knitr')

In [3]:
library(readstata13) # provides read.dta13() for reading Stata .dta files
library(lme4) # (generalized) linear mixed-effects models
library(geeM) # generalized estimating equations
library(jomo) # multilevel joint-modelling multiple imputation
library(knitr) # report generation and table rendering

### Read in data

The full dataset is stored in `dat`.

In [4]:
# NOTE(review): setwd() with an absolute, user-specific path makes the notebook
# non-portable; consider passing a full path built with file.path() instead.
setwd('/Users/yaolanqiu/Documents/HALI/DATA')
# Read the Stata master file into `dat`. The two factor options control how
# Stata value labels are turned into R factors (see ?read.dta13).
dat <- read.dta13('HALI_CLASS1_2539_MASTER_EDUC_FU1_FU2_LONG_accounts_withdrawals_BL_AS_COV_AND_OTHER_COV_17.11.2016.dta',
                  nonint.factors = TRUE, generate.factors=TRUE)

In [5]:
# Rows and columns of the full data set.
dim(dat)

### We focus only on the 9-month follow-up data

The 9-month follow-up subset is stored in `data0`.

In [6]:
# Keep only the 9-month follow-up visit. which() drops rows whose `visit` is
# NA; a bare logical index would turn each of them into an all-NA row.
data0 <- dat[which(dat$visit == "9-month FU"), ]
dim(data0) # 2539 220

### select the input variables

In [7]:
# Keep the cluster id, the arm indicator, the baseline covariates and the
# 9-month outcome, then reset the row names.
dat0 <- data0[c(
  "school_id", "LIT_grp",
  "BL_gll21_total", "age_child", "sex",
  "schlevel_comp", "BL_ses",
  "gll21_total"
)]
rownames(dat0) <- NULL
dim(dat0) # 2539 8

### Remove participants with missing baseline covariates

In [8]:
# Complete-case filter on the seven design/baseline variables. The outcome
# (gll21_total) is not part of the filter, so it may still be NA in dat1.
dat1 <- dat0[complete.cases(dat0[c(
  "school_id", "LIT_grp", "BL_gll21_total", "age_child",
  "sex", "schlevel_comp", "BL_ses"
)]), ]
rownames(dat1) <- NULL
dim(dat1) # 2465 8

## Build Table 1a

#### Number of students in the intervention and control arms:

In [9]:
# Rows x columns of each arm; LIT_grp contains no NA after the covariate filter.
dim(subset(dat1, LIT_grp == "yes")) # 1230
dim(subset(dat1, LIT_grp == "no")) # 1235

In [10]:
# Accumulators for the three table columns: row label, intervention, control.
col1 <- col2 <- col3 <- NULL

#### Baseline cluster characteristics:

In [11]:
# Section header row: label only, empty cells for both arms.
col1 <- append(col1, "Baseline cluster characteristics")
col2 <- append(col2, "")
col3 <- append(col3, "")

#### Number of clusters:

In [12]:
# Number of distinct school_id values (clusters) in each arm.
length(table(dat1$school_id[dat1$LIT_grp == "yes"])) # intervention 51
length(table(dat1$school_id[dat1$LIT_grp == "no"])) # control 50

In [13]:
# Append the cluster-count row.
col1 <- append(col1, "Number")
col2 <- append(col2, length(table(dat1$school_id[dat1$LIT_grp == "yes"])))
col3 <- append(col3, length(table(dat1$school_id[dat1$LIT_grp == "no"])))

#### Cluster size (mean (SD)):

intervention 
 24.1 (3.3)

In [14]:
# Mean (SD) of rows per school_id, intervention arm.
local({
  sizes <- table(dat1$school_id[dat1$LIT_grp == "yes"])
  paste0(round(mean(sizes), 1), " (", round(sd(sizes), 1), ")")
})

control 
 24.7 (1.9)

In [15]:
# Mean (SD) of rows per school_id, control arm.
local({
  sizes <- table(dat1$school_id[dat1$LIT_grp == "no"])
  paste0(round(mean(sizes), 1), " (", round(sd(sizes), 1), ")")
})

In [16]:
# Append the cluster-size row: mean (SD) of rows per school_id, by arm.
col1 <- append(col1, "Cluster Size – mean (SD)")
col2 <- append(col2, local({
  sizes <- table(dat1$school_id[dat1$LIT_grp == "yes"])
  paste0(round(mean(sizes), 1), " (", round(sd(sizes), 1), ")")
}))
col3 <- append(col3, local({
  sizes <- table(dat1$school_id[dat1$LIT_grp == "no"])
  paste0(round(mean(sizes), 1), " (", round(sd(sizes), 1), ")")
}))

#### Baseline child-level characteristics:

In [17]:
# Section header row: label only, empty cells for both arms.
col1 <- append(col1, "Baseline child-level characteristics - % (n)")
col2 <- append(col2, "")
col3 <- append(col3, "")

 intervention:
 gender - female: 47.9% (589)


In [18]:
# "% (n)" of girls in the intervention arm. sex and LIT_grp contain no NA
# after the covariate filter, so sum()/length() count the same rows as dim().
local({
  is_female <- dat1$sex[dat1$LIT_grp == "yes"] == "Female"
  paste0(100 * round(sum(is_female) / length(is_female), 3),
         "% (", sum(is_female), ")")
})

 control:
 gender - female:  49.5% (611)


In [19]:
# "% (n)" of girls in the control arm. sex and LIT_grp contain no NA after
# the covariate filter, so sum()/length() count the same rows as dim().
local({
  is_female <- dat1$sex[dat1$LIT_grp == "no"] == "Female"
  paste0(100 * round(sum(is_female) / length(is_female), 3),
         "% (", sum(is_female), ")")
})

In [20]:
# Append the "Female" row: share and count of girls in each arm.
col1 <- append(col1, "Female")
col2 <- append(col2, local({
  is_female <- dat1$sex[dat1$LIT_grp == "yes"] == "Female"
  paste0(100 * round(sum(is_female) / length(is_female), 3),
         "% (", sum(is_female), ")")
}))
col3 <- append(col3, local({
  is_female <- dat1$sex[dat1$LIT_grp == "no"] == "Female"
  paste0(100 * round(sum(is_female) / length(is_female), 3),
         "% (", sum(is_female), ")")
}))

#### Age - mean(sd)

intervention

In [21]:
# Mean (SD) of age_child, intervention arm.
local({
  age <- dat1$age_child[dat1$LIT_grp == "yes"]
  paste0(round(mean(age), 1), " (", round(sd(age), 1), ")")
})

control

In [22]:
# Mean (SD) of age_child, control arm.
local({
  age <- dat1$age_child[dat1$LIT_grp == "no"]
  paste0(round(mean(age), 1), " (", round(sd(age), 1), ")")
})

In [23]:
# Append the age row: mean (SD) of age_child in each arm.
col1 <- append(col1, "Age – mean (sd)")
col2 <- append(col2, local({
  age <- dat1$age_child[dat1$LIT_grp == "yes"]
  paste0(round(mean(age), 1), " (", round(sd(age), 1), ")")
}))
col3 <- append(col3, local({
  age <- dat1$age_child[dat1$LIT_grp == "no"]
  paste0(round(mean(age), 1), " (", round(sd(age), 1), ")")
}))

#### Household education:

In [24]:
# Section header row: label only, empty cells for both arms.
col1 <- append(col1, "Household head education")
col2 <- append(col2, "")
col3 <- append(col3, "")

intervention

In [25]:
# "% (n)" for every tabulated schlevel_comp value, intervention arm.
local({
  edu <- dat1$schlevel_comp[dat1$LIT_grp == "yes"]
  counts <- table(edu)
  paste0(100 * round(counts / length(edu), 3), "% (", counts, ")")
})

control

In [26]:
# "% (n)" for every tabulated schlevel_comp value, control arm.
local({
  edu <- dat1$schlevel_comp[dat1$LIT_grp == "no"]
  counts <- table(edu)
  paste0(100 * round(counts / length(edu), 3), "% (", counts, ")")
})

In [27]:
# Append one row per household-head education level.
# The four labels are attached by position, so each arm's table must have
# exactly four entries; fail here, before anything is appended, rather than
# when the columns are finally combined.
# NOTE(review): the label order is assumed to match the level order of
# schlevel_comp -- confirm against levels(dat1$schlevel_comp).
stopifnot(
  length(table(dat1$schlevel_comp[dat1$LIT_grp == "yes"])) == 4L,
  length(table(dat1$schlevel_comp[dat1$LIT_grp == "no"])) == 4L
)
col1 <- c(col1, c("Did not complete primary education",
                  "Primary",
                  "Secondary",
                  "College/degree"))
col2 <- c(col2, local({
  edu <- dat1$schlevel_comp[dat1$LIT_grp == "yes"]
  counts <- table(edu)
  paste0(100 * round(counts / length(edu), 3), "% (", counts, ")")
}))
col3 <- c(col3, local({
  edu <- dat1$schlevel_comp[dat1$LIT_grp == "no"]
  counts <- table(edu)
  paste0(100 * round(counts / length(edu), 3), "% (", counts, ")")
}))

#### SES

In [28]:
# Section header row: label only, empty cells for both arms.
col1 <- append(col1, "Household socioeconomic status (SES)")
col2 <- append(col2, "")
col3 <- append(col3, "")

intervention

In [29]:
# "% (n)" for every tabulated BL_ses value, intervention arm.
local({
  ses <- dat1$BL_ses[dat1$LIT_grp == "yes"]
  counts <- table(ses)
  paste0(100 * round(counts / length(ses), 3), "% (", counts, ")")
})

control

In [30]:
# "% (n)" for every tabulated BL_ses value, control arm.
local({
  ses <- dat1$BL_ses[dat1$LIT_grp == "no"]
  counts <- table(ses)
  paste0(100 * round(counts / length(ses), 3), "% (", counts, ")")
})

In [31]:
# Append one row per SES category.
# The five labels are attached by position, so each arm's table must have
# exactly five entries; fail here, before anything is appended, rather than
# when the columns are finally combined.
# NOTE(review): the label order is assumed to match the level order of
# BL_ses -- confirm against levels(dat1$BL_ses).
stopifnot(
  length(table(dat1$BL_ses[dat1$LIT_grp == "yes"])) == 5L,
  length(table(dat1$BL_ses[dat1$LIT_grp == "no"])) == 5L
)
col1 <- c(col1, c("Poorest",
                  "Poor",
                  "Median poor",
                  "Less poor",
                  "Least poor"))
col2 <- c(col2, local({
  ses <- dat1$BL_ses[dat1$LIT_grp == "yes"]
  counts <- table(ses)
  paste0(100 * round(counts / length(ses), 3), "% (", counts, ")")
}))
col3 <- c(col3, local({
  ses <- dat1$BL_ses[dat1$LIT_grp == "no"]
  counts <- table(ses)
  paste0(100 * round(counts / length(ses), 3), "% (", counts, ")")
}))

#### Baseline literacy – spelling score (0-20) – mean (sd)

intervention

In [32]:
# Mean (SD) of the baseline spelling score, intervention arm.
local({
  score <- dat1$BL_gll21_total[dat1$LIT_grp == "yes"]
  paste0(round(mean(score), 1), " (", round(sd(score), 1), ")")
})

control

In [33]:
# Mean (SD) of the baseline spelling score, control arm.
local({
  score <- dat1$BL_gll21_total[dat1$LIT_grp == "no"]
  paste0(round(mean(score), 1), " (", round(sd(score), 1), ")")
})

In [34]:
# Append the baseline spelling-score row: mean (SD) in each arm.
col1 <- append(col1, "Baseline literacy – spelling score (0-20) – mean (sd)")
col2 <- append(col2, local({
  score <- dat1$BL_gll21_total[dat1$LIT_grp == "yes"]
  paste0(round(mean(score), 1), " (", round(sd(score), 1), ")")
}))
col3 <- append(col3, local({
  score <- dat1$BL_gll21_total[dat1$LIT_grp == "no"]
  paste0(round(mean(score), 1), " (", round(sd(score), 1), ")")
}))

#### Outcome at 9-month follow-up 

In [35]:
# Section header row; the arm cells hold a single space here, not "".
col1 <- append(col1, "Outcome at 9-month follow-up ")
col2 <- append(col2, " ")
col3 <- append(col3, " ")

intervention

In [36]:
# "% (n)" with 9-month spelling score > 10, intervention arm. The denominator
# is every row in the arm, including rows whose outcome is missing.
local({
  score <- dat1$gll21_total[dat1$LIT_grp == "yes"]
  n_high <- sum(score > 10, na.rm = TRUE)
  paste0(100 * round(n_high / length(score), 3), "% (", n_high, ")")
})

control

In [37]:
# "% (n)" with 9-month spelling score > 10, control arm. The denominator is
# every row in the arm, including rows whose outcome is missing.
local({
  score <- dat1$gll21_total[dat1$LIT_grp == "no"]
  n_high <- sum(score > 10, na.rm = TRUE)
  paste0(100 * round(n_high / length(score), 3), "% (", n_high, ")")
})

In [38]:
# Append the high-literacy row. Percentages use every row in the arm as the
# denominator, including rows whose outcome is missing.
col1 <- append(col1, "High literacy (spelling score > 10)")
col2 <- append(col2, local({
  score <- dat1$gll21_total[dat1$LIT_grp == "yes"]
  n_high <- sum(score > 10, na.rm = TRUE)
  paste0(100 * round(n_high / length(score), 3), "% (", n_high, ")")
}))
col3 <- append(col3, local({
  score <- dat1$gll21_total[dat1$LIT_grp == "no"]
  n_high <- sum(score > 10, na.rm = TRUE)
  paste0(100 * round(n_high / length(score), 3), "% (", n_high, ")")
}))


#### Missing outcome

intervention

In [39]:
# "% (n)" of intervention-arm rows whose 9-month outcome is NA.
local({
  outcome_missing <- is.na(dat1$gll21_total[dat1$LIT_grp == "yes"])
  paste0(100 * round(sum(outcome_missing) / length(outcome_missing), 3),
         "% (", sum(outcome_missing), ")")
})

control

In [40]:
# "% (n)" of control-arm rows whose 9-month outcome is NA.
local({
  outcome_missing <- is.na(dat1$gll21_total[dat1$LIT_grp == "no"])
  paste0(100 * round(sum(outcome_missing) / length(outcome_missing), 3),
         "% (", sum(outcome_missing), ")")
})

In [42]:
# Append the missing-outcome row: share and count of NA outcomes in each arm.
col1 <- append(col1, "Missing outcome")
col2 <- append(col2, local({
  outcome_missing <- is.na(dat1$gll21_total[dat1$LIT_grp == "yes"])
  paste0(100 * round(sum(outcome_missing) / length(outcome_missing), 3),
         "% (", sum(outcome_missing), ")")
}))
col3 <- append(col3, local({
  outcome_missing <- is.na(dat1$gll21_total[dat1$LIT_grp == "no"])
  paste0(100 * round(sum(outcome_missing) / length(outcome_missing), 3),
         "% (", sum(outcome_missing), ")")
}))

### Combine the results into the table:

In [43]:
# Assemble Table 1a from the three accumulated columns. The arm sizes in the
# headers are computed from dat1 so they cannot drift from the data if the
# inclusion filter changes (currently 1230 and 1235).
table_1a <- data.frame(col1, col2, col3)
colnames(table_1a) <- c(
  "",
  paste0("Intervention\n(n=", sum(dat1$LIT_grp == "yes"), ")"),
  paste0("Control\n(n=", sum(dat1$LIT_grp == "no"), ")")
)
table_1a

Unnamed: 0,Intervention (n=1230),Control (n=1235)
Baseline cluster characteristics,,
Number,51,50
Cluster Size – mean (SD),24.1 (3.3),24.7 (1.9)
Baseline child-level characteristics - % (n),,
Female,47.9% (589),49.5% (611)
Age – mean (sd),7.7 (1.7),7.9 (1.7)
Household head education,,
Did not complete primary education,29.1% (358),34.4% (425)
Primary,55.6% (684),52.7% (651)
Secondary,11.5% (141),10.6% (131)
