-
Notifications
You must be signed in to change notification settings - Fork 3
/
convertDate.R
141 lines (139 loc) · 5.5 KB
/
convertDate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#' Converts date columns formatted as characters to be of type datetime
#'
## Copyright(c) 2017-2020 R. Mark Sharp
## This file is part of nprcgenekeepr
#' Part of Pedigree Curation
#'
## ## rmsutilityr get_and_or_list
## ## rmsutilityr is_valid_date_str
#'
#' @return A dataframe with an updated table with date columns converted from
#' \code{character} data type to \code{Date} data type. Values that do not
#' conform to the format %Y%m%d are set to NA. NA values are left as NA.
#'
#' @examples
#' \donttest{
#' library(lubridate)
#' set_seed(10)
#' someBirthDates <- paste0(sample(seq(0, 15, by = 3), 10,
#' replace = TRUE) + 2000, "-",
#' sample(1:12, 10, replace = TRUE), "-",
#' sample(1:28, 10, replace = TRUE))
#' someBadBirthDates <- paste0(sample(1:12, 10, replace = TRUE), "-",
#' sample(1:28, 10, replace = TRUE), "-",
#' sample(seq(0, 15, by = 3), 10,
#' replace = TRUE) + 2000)
#' someDeathDates <- sample(someBirthDates, length(someBirthDates),
#' replace = FALSE)
#' someDepartureDates <- sample(someBirthDates, length(someBirthDates),
#' replace = FALSE)
#' ped1 <- data.frame(birth = someBadBirthDates, death = someDeathDates,
#' departure = someDepartureDates)
#' someDates <- ymd(someBirthDates)
#' ped2 <- data.frame(birth = someDates, death = someDeathDates,
#' departure = someDepartureDates)
#' ped3 <- data.frame(birth = someBirthDates, death = someDeathDates,
#' departure = someDepartureDates)
#' someNADeathDates <- someDeathDates
#' someNADeathDates[c(1, 3, 5)] <- ""
#' someNABirthDates <- someDates
#' someNABirthDates[c(2, 4, 6)] <- NA
#' ped4 <- data.frame(birth = someNABirthDates, death = someNADeathDates,
#' departure = someDepartureDates)
#'
#' ## convertDate identifies bad dates
#' result = tryCatch({
#' convertDate(ped1)
#' }, warning = function(w) {
#' print("Warning in date")
#' }, error = function(e) {
#' print("Error in date")
#' })
#'
#' ## convertDate with error flag returns error list and not an error
#' convertDate(ped1, reportErrors = TRUE)
#'
#' ## convertDate recognizes good dates
#' all(is.Date(convertDate(ped2)$birth))
#' all(is.Date(convertDate(ped3)$birth))
#'
#' ## convertDate handles NA and empty character string values correctly
#' convertDate(ped4)
#' }
#'
#' @param ped a dataframe of pedigree information that may contain birth,
#' death, departure, or exit dates. The fields are optional, but will be used
#' if present.(optional fields: birth, death, departure, and exit).
#' @param time.origin date object used by \code{as.Date} to set \code{origin}.
#' @param reportErrors logical value if TRUE will scan the entire file and
#' make a list of all errors found. The errors will be returned in a
#' list of list where each sublist is a type of error found.
#' @importFrom stringi stri_trim_both stri_c
#' @export
convertDate <- function(ped, time.origin = as.Date("1970-01-01"),
reportErrors = FALSE) {
## Ignore records added because of unknown parents
if (any("recordStatus" %in% names(ped))) {
addedPed <- ped[ped$recordStatus == "added", ]
ped <- ped[ped$recordStatus == "original", ]
if (nrow(ped) == 0)
return(rbind(ped, addedPed))
}
headers <- tolower(names(ped))
headers <- headers[headers %in% getDateColNames()]
format <- "%Y-%m-%d"
invalid_date_rows <- NULL
for (header in headers) {
dates <- ped[[header]]
if (any(class(dates) %in% c("factor","logical", "integer"))) {
dates <- as.character(dates)
}
if (class(dates) == "Date") {
dates <- removeEarlyDates(dates, 1000)
originalNAs <- is.na(dates)
dates <- dates[!originalNAs]
} else if (class(dates) == "character") {
dates[stri_trim_both(dates) == ""] <- NA
ped[[header]] <- dates
originalNAs <- is.na(dates)
dates <- dates[!originalNAs]
if (length(dates) > 0) {
dates <- insertSeparators(dates)
dates <- as.Date(dates, format = format, origin = time.origin,
optional = TRUE)
dates <- removeEarlyDates(dates, 1000)
}
} else {
stop(stri_c("class(dates) is not 'character', 'factor', 'integer', or ",
"'Date' it is == ", class(dates)))
}
if (any(is.na(dates))) {
goodAndBadDates <- ifelse(is.na(dates), "bad", "good")
originalDates <- as.character(ped[[header]])
originalDates[originalNAs] <- "good"
originalDates[!originalNAs] <- goodAndBadDates
if (reportErrors) {
invalid_date_rows <- c(invalid_date_rows,
seq_along(originalDates)[originalDates == "bad"])
next
}
rowNums <- get_and_or_list(
seq_along(originalDates)[originalDates == "bad"], "and")
stop(paste0("Column '", header, "' has invalid dates on row(s) ",
rowNums, "."))
}
ped[!originalNAs, header] <- dates
ped[originalNAs, header] <- NA # For those NAs from dates <= 1000 CE
ped[[header]] <- as.Date(as.integer(ped[[header]]), origin = time.origin)
}
if (reportErrors) {
if (!is.null(invalid_date_rows))
invalid_date_rows <- as.character(sort(invalid_date_rows))
return(invalid_date_rows)
} else {
## Add back records of unknown parents
if (any("recordStatus" %in% names(ped)))
ped <- rbind(ped, addedPed)
return(ped)
}
}