-
Notifications
You must be signed in to change notification settings - Fork 13
/
survey_cleaning.R
141 lines (117 loc) · 5.09 KB
/
survey_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# INFO -------------------------------------------------------------------------
# File name: 05_survey_cleaning.R
# Creation date: 06/15/2022
# Last modified date: 07/26/2022
# Author: Erick F Molina
# Modifications: None
# Inputs:
# Outputs:
#
#
# Purpose: Prepare the data to be used for the survey report
#
# 0. LOAD PACKAGES AND FUNCTIONS -----------------------------------------------
# Packages
library(here)
library(haven)
library(tidyr)
library(dplyr)
library(magrittr)
library(data.table)
library(readxl)
library(lubridate)
library(assertthat)
library(readr)
# 1. SURVEY --------------------------------------------------------------------
# Read in data
survey <- read_dta(here("dtafiles", "data_collection", "fta_mexico_prepped.dta"))
survey_code <- read_excel(here("xlsx", "20220824_pos_survey.xlsx")) %>% as_tibble()
# Add case (A, B, C) identifier
survey <- survey %>%
mutate(section = ifelse(has_pos == 1,"A",
ifelse(has_had_pos == 1 & has_pos == 0, "C",
ifelse(has_pos == 0 & has_had_pos == 0, "B", NA))
)
)
# Basic cleaning
surveyraw <- survey
survey <- surveyraw %>%
as_tibble() %>%
mutate(submission_date = ymd(surveydate)) %>%
filter(submission_date >= ymd(20220620)) %>% # Start of survey
filter(!is.na(successful))
# Check for duplicate keys
assert_that(survey %>% nrow() == length(unique(survey$key)))
# Create long version on survey results and apply some caps
excluded_columns <- length(survey) - 21
survey_long <- survey %>%
select(text_audit, key, municipality, zip_code, 33:all_of(excluded_columns)) %>%
mutate_all(as.character) %>%
pivot_longer(cols = 13:636) %>% # Length of survey_long
left_join(survey_code %>% select(name, label), "name") %>%
mutate(TA_filename = gsub("TA_|.csv", "", basename(text_audit))) %>%
select(!text_audit) %>%
rename("field_name" = "name")
# Convert -666, -777, -888 to -6, -7, and -8 respectively
survey_long %<>%
mutate(value = if_else(value == "-666", "-10", value),
value = if_else(value == "-777", "-7", value),
value = if_else(value == "-888", "-8", value)
)
# 2. TA FILES ------------------------------------------------------------------
# Look if a corrupted file exists and erase it, before listing files
corrupt_file <- "TA_a4039a66-d50a-44a3-bfc5-77d3827d889c.csv"
if (file.exists(here("rawdata", "survey", "media", corrupt_file))) {
file.remove(here("rawdata", "survey", "media", corrupt_file))
} else {
print("This file does not exist, there is no problem")
}
# Read in data
files <- list.files(here("rawdata", "survey", "media"), full.names = T, pattern = "^TA")
TA_files <- lapply(files, fread)
# Clean names
names(TA_files) <- gsub("TA_|.csv", "", basename(files)) # make uuid name of each TA file
# Bind files
TA_files <- rbindlist(TA_files, id = "TA_filename")
TA_files %<>%
rename(c("field_name" = `Field name`,
"duration" = "Total duration (seconds)",
"first_appeared" = "First appeared (seconds into survey)")) %>%
mutate(field_name = basename(field_name)) %>%
mutate(key = paste0("uuid:", TA_filename)) %>%
left_join(surveyraw %>% select(surveyor_name, section, successful, key), by = "key")
# Save all TA file data (before we apply filter)
TA_files %>% write_csv(here("proc", "TA_results_nofilter_checked.csv"))
# Get only the data for which we have survey results
TA_files %<>%
filter(TA_filename %in% gsub("uuid:", "", survey$key))
# Sanity check:
assert_that(length(unique(TA_files$TA_filename)) == nrow(survey))
# 3. LONG-SURVEY FORMAT ---------------------------------------------------------
# Join in TA file identifying info to long survey
survey_long %<>%
left_join(TA_files, by = c("field_name", "TA_filename", "key")) %>%
group_by(TA_filename) %>%
fill(key, surveyor_name, section, successful, .direction = "down") %>%
fill(key, surveyor_name, section, successful, .direction = "up")
# Quick checks
dat <- survey_long %>% group_by(TA_filename) %>% summarise(key = length(unique(key)), surveyor_name = length(unique(surveyor_name)))
assert_that(all(dat$key == 1) == T)
assert_that(all(dat$surveyor_name == 1) == T)
assert_that(survey_long %>% filter(is.na(surveyor_name)) %>% nrow() == 0)
# We want to recode missing values. If NA, keep as NA. If -666, code as Other. If -777, code as "Don't know". If -888, "Refuse to respond".
survey_long %<>%
mutate(value_missing = case_when(
value == "-10" ~ "other",
value == "-7" ~ "don't know",
value == "-8" ~ "refuse to respond",
is.na(value) ~ "NA",
value != "-10" & value != "-7" & value != "-8" & !is.na(value) ~ value
)
) %>%
mutate(value_missing = ifelse(value_missing != "NA", value_missing, NA)) %>%
select(key, field_name, value, value_missing, label, duration, first_appeared, TA_filename, surveyor_name)
# Save
survey %>% write_csv(here("proc", "survey_results_checked.csv"))
survey_long %>% write_csv(here("proc", "survey_long_results_checked.csv"))
TA_files %>% write_csv(here("proc", "TA_results_checked.csv"))