# Code_Imputation.R
# 98 lines (77 loc) · 3.07 KB
# (Web-page residue from the repository viewer — notification/sign-in banner
# and path separators — converted to comments so the file parses as R.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# R script replication file
# Multiple Imputation procedure
# July 2021
# Packages
library(tidyverse)
library(psych)
library(foreign)
library(MASS)
library(Hmisc)
library(reshape2)
library(devtools)
# Read the survey workbook (expected at ~/Data2.xlsx) into MyData
library(readxl)
MyData <- readxl::read_excel(path = "~/Data2.xlsx")
## Missing data procedures ##
library(mice)
library(VIM)
# Assemble the study dataset: keep only the survey items that are passed to
# the imputation step, listed one questionnaire block per line
final.dat <- dplyr::select(
  MyData,
  QB1_2, QB1_3, QB1_6,
  QB2_1, QB2_4, QB2_5, QB2_7, QB2_8,
  QB3_1, QB3_2, QB3_4, QB3_7, QB3_8,
  QB4_5, QB4_6,
  QC3_2, QC3_10,
  QC4_1, QC4_2, QC4_3, QC4_4, QC4_5
)
# Degree of missingness: percentage (0-100) of the entries of `x` that are NA
pMiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values for every column of the study dataset
vapply(final.dat, pMiss, numeric(1))

# Visual tools for missing data: aggregated missingness plot from VIM,
# with variables sorted by their amount of missingness
aggr_plot <- aggr(
  final.dat,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(final.dat),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
# Multiple imputation with mice(): 5 imputed datasets, 50 iterations,
# method "pmm"; the seed is fixed so the run is reproducible
tempData <- mice(
  final.dat,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500
)
summary(tempData)

# Density plot of the mids object, to compare imputed and original data
densityplot(tempData)

# Completed dataset built from imputation #1 (the `action` argument selects
# which of the m imputations fills in the missing values)
comp.dat1 <- complete(tempData, action = 1)
# Covariates joined to the imputed data. They are taken straight from the raw
# data and never pass through mice(), so any missing values in them remain.
# job_b is selected as well: the regression models fitted on data_all use it
# as a predictor, and lm() cannot find it unless it is a column of data_all.
d.covars <- MyData %>%
  dplyr::select(exp., education, age_cat, gender, job_b)

# Final full dataset (based on imputed dataset #1): imputed items and
# covariates bound column-wise. Both derive from MyData, so rows correspond.
data_all <- cbind(comp.dat1, d.covars)
## Regression model 3, tested on imputed dataset #1
library(modelsummary)
m3 <- lm(
  QC4_4 ~ QB2_1 + QB3_2 + QB2_7 + QB2_8 + QB3_8 + QC3_2 + QB4_6 +
    age_cat + job_b + gender,
  data = data_all
)
summary(m3)
## Build completed datasets for the remaining imputations
## (comp.dat1 is already available from imputation #1)
comp.dat2 <- complete(tempData, 2)
comp.dat3 <- complete(tempData, 3)
comp.dat4 <- complete(tempData, 4)
comp.dat5 <- complete(tempData, 5)

# Model 3 specification, shared by all five fits
m3_formula <- QC4_4 ~ QB2_1 + QB3_2 + QB2_7 + QB2_8 +
  QB3_8 + QC3_2 + QB4_6 +
  age_cat + job_b + gender

# Completed datasets keyed by the labels used in the comparison plot
imputed_sets <- list(
  "Dataset #1" = comp.dat1,
  "Dataset #2" = comp.dat2,
  "Dataset #3" = comp.dat3,
  "Dataset #4" = comp.dat4,
  "Dataset #5" = comp.dat5
)

# Run model 3 on each imputed dataset (covariates bound on column-wise) and
# keep the fits in a named list. Each entry is fitted on its own dataset, so
# the list holds five distinct models rather than one model stored five times
# when the script is run top to bottom.
tab1 <- lapply(imputed_sets, function(comp_dat) {
  lm(m3_formula, data = cbind(comp_dat, d.covars))
})

# data_all and m3 keep referring to imputed dataset #1, as elsewhere in the
# script
data_all <- cbind(comp.dat1, d.covars)
m3 <- tab1[["Dataset #1"]]
summary(m3)

# NOTE(review): tab1 is only used to compare the fits side by side; if pooled
# estimates across imputations are wanted, mice::pool() is the usual route —
# confirm with the analysis plan.
### Compare imputed datasets: coefficient plot of the fits stored in tab1

# Dotted vertical reference line at zero, passed to modelplot() as background
b <- list(geom_vline(xintercept = 0, linetype = "dotted"))

# Display labels for the model terms (term name -> label shown on the plot)
c_lab <- c(
  "QB2_1" = "Trust:Fed.Govt",
  "QB3_2" = "Regulation",
  "QB2_7" = "Private Ins.",
  "QB2_8" = "Health Pros",
  "QB3_8" = "Tech Good",
  "QC3_2" = "RPM Use",
  "QB4_6" = "Net Benefits",
  "age_cat" = "Age",
  "job_b" = "Govt Job",
  "gender" = "Gender"
)

# Build the plot (intercept omitted), add axis labels and title, then draw it
coef.compare <- modelplot(
  tab1,
  coef_omit = "Interc",
  background = b,
  coef_map = c_lab
) +
  labs(
    x = "Coefficients",
    y = "Survey Items",
    title = "Model 3: Government Regulation"
  )
coef.compare