/
qcStudbook.R
271 lines (263 loc) · 10.9 KB
/
qcStudbook.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#' Quality Control for the Studbook or pedigree
#'
## Copyright(c) 2017-2020 R. Mark Sharp
## This file is part of nprcgenekeepr
#' Main pedigree curation function that performs basic quality control on
#' pedigree information
#'
#' @return A data.frame with standardized and quality controlled pedigree
#' information.
#'
#' @examples
#' \donttest{
#' examplePedigree <- nprcgenekeepr::examplePedigree
#' ped <- qcStudbook(examplePedigree, minParentAge = 2, reportChanges = FALSE,
#' reportErrors = FALSE)
#' names(ped)
#' }
#'
#' @param sb A dataframe containing a table of pedigree and demographic
#' information.
#'
#' The function recognizes the following columns (optional columns
#' will be used if present, but are not required):
#'
#' \itemize{
#' \item{id} {--- Character vector with Unique identifier for all individuals}
#' \item{sire} {--- Character vector with unique identifier for the father of
#' the current id}
#' \item{dam} {--- Character vector with unique identifier for the mother of
#' the current id}
#' \item{sex} {--- Factor {levels: "M", "F", "U"} Sex specifier for an
#' individual}
#' \item{birth} {--- Date or \code{NA} (optional) with the individual's birth
#' date}
#' \item{departure} {--- Date or \code{NA} (optional) an individual was sold
#' or shipped from the colony}
#' \item{death} {--- date or \code{NA} (optional)
#' Date of death, if applicable}
#' \item{status} {--- Factor {levels: ALIVE, DEAD, SHIPPED} (optional)
#' Status of an individual}
#' \item{origin} {--- Character or \code{NA} (optional)
#' Facility an individual originated from, if other than ONPRC}
#' \item{ancestry} {--- Character or \code{NA} (optional)
#' Geographic population to which the individual belongs}
#' \item{spf} {--- Character or \code{NA} (optional)
#' Specific pathogen-free status of an individual}
#' \item{vasxOvx} {--- Character or \code{NA} (optional)
#' Indicator of the vasectomy/ovariectomy status of an animal; \code{NA} if
#' animal is intact, assume all other values indicate surgical alteration}
#' \item{condition} {--- Character or \code{NA} (optional)
#' Indicator of the restricted status of an animal. "Nonrestricted" animals
#' are generally assumed to be naive.}
#' }
#' @param minParentAge numeric values to set the minimum age in years for
#' an animal to have an offspring. Defaults to 2 years. The check is not
#' performed for animals with missing birth dates.
#' @param reportChanges logical value that if \code{TRUE}, the \code{errorLst}
#' contains the list of changes made to the column names. Default is
#' \code{FALSE}.
#' @param reportErrors logical value if \code{TRUE} will scan the entire file
#' and report back changes made to input and errors in a
#' list of list where each sublist is a type of change or error found.
#' Changes will include column names, case of categorical values (male,
#' female, unknown), etc.
#' Errors will include missing columns, invalid date rows, male dams,
#' female sires, and records with one or more parents below minimum age
#' of parents.
#'
#' The following changes are made to the cols.
#'
#' \itemize{
#' \item {Column cols are converted to all lower case}
#' \item {Periods (".") within column cols are collapsed to no space ""}
#' \item {\code{egoid} is converted to \code{id}}
#' \item {\code{sireid} is convert to \code{sire}}
#' \item {\code{damid} is converted to \code{dam}}}
#'
#' If the dataframe (\code{sb} does not contain the five required columns
#' (\code{id}, \code{sire}, \code{dam}, \code{sex}), and
#' \code{birth} the function throws an error by calling \code{stop()}.
#'
#' If the \code{id} field has the string \emph{UNKNOWN} (any case) or both
#' the fields \code{sire} or \code{dam} have \code{NA} or \emph{UNKNOWN}
#' (any case), the record is removed.
#' If either of the fields \code{sire} or \code{dam} have the
#' string \emph{UNKNOWN} (any case), they are replaced with a unique identifier
#' with the form \code{Unnnn}, where \code{nnnn} represents one of a series
#' of sequential integers representing the number of missing sires and
#' dams right justified in a pattern of \code{0000}. See \code{addUIds}
#' function.
#'
#' The function \code{addParents} is used to add records for parents missing
#' their own record in the pedigree.
#'
#' The function \code{convertSexCodes} is used with \code{ignoreHerm == TRUE}
#' to convert sex codes according to the following factors of standardized
#' codes:
#'
#' \itemize{
#' \item{F} {-- replacing "FEMALE" or "2"}
#' \item{M} {-- replacing "MALE" or "1"}
#' \item{H} {-- replacing "HERMAPHRODITE" or "4", if ignore.herm == FALSE}
#' \item{U} {-- replacing "HERMAPHRODITE" or "4", if ignore.herm == TRUE}
#' \item{U} {-- replacing "UNKNOWN" or "3"}}
#'
#' The function \code{correctParentSex} is used to ensure no parent is both
#' a sire and a dam. If this error is detected, the function throws an error
#' and halts the program.
#'
#' The function \code{convertStatusCodes} converts status indicators to the
#' following factors of standardized codes. Case of the original status value
#' is ignored.
#'
#' \itemize{
#' \item{"ALIVE"} {--- replacing "alive", "A" and "1"}
#' \item {"DECEASED"} {--- replacing "deceased", "DEAD", "D", "2"}
#' \item {"SHIPPED"} {--- replacing "shipped", "sold", "sale", "s", "3"}
#' \item{"UNKNOWN"} {--- replacing is.na(status)}
#' \item {"UNKNOWN"} {--- replacing "unknown", "U", "4"}}
#'
#' The function \code{convertAncestry} coverts ancestry indicators using
#' regular expressions such that the following conversions are made from
#' character strings that match selected substrings to the following factors.
#'
#' \itemize{
#' \item{"INDIAN"} {--- replacing "ind" and not "chin"}
#' \item{"CHINESE"} {--- replacing "chin" and not "ind"}
#' \item{"HYBRID"} {--- replacing "hyb" or "chin" and "ind"}
#' \item{"JAPANESE"} {--- replacing "jap"}
#' \item{"UNKNOWN"} {--- replacing \code{NA}}
#' \item{"OTHER"} {--- replacing not matching any of the above}}
#'
#' The function \code{convertDate} converts character representations of
#' dates in the columns \code{birth}, \code{death}, \code{departure}, and
#' \code{exit} to dates using the \code{as.Date} function.
#'
#' The function \code{setExit} uses heuristics and the columns \code{death}
#' and \code{departure} to set \code{exit} if it is not already defined.
#'
#' The function \code{calcAge} uses the \code{birth} and the \code{exit}
#' columns to define the \code{age} column. The numerical values is rounded
#' to the nearest 0.1 of a year. If \code{exit} is not defined, the
#' current system date (\code{Sys.Date()}) is used.
#'
#' The function \code{findGeneration} is used to define the generation number
#' for each animal in the pedigree.
#'
#' The function \code{removeDuplicates} checks for any duplicated records and
#' removes the duplicates. I also throws an error and stops the program if an
#' ID appears in more
#' than one record where one or more of the other columns have a difference.
#'
#' Columns that cannot be used subsequently are removed and the rows are
#' ordered by generation number and then ID.
#'
#' Finally the columns \code{id} \code{sire}, and \code{dam} are coerce to
#' character.
#'
#' @importFrom lubridate is.Date
#' @importFrom utils write.csv
## ## rmsutilityr str_detect_fixed_all
#' @export
qcStudbook <- function(sb, minParentAge = 2, reportChanges = FALSE,
reportErrors = FALSE) {
newColumns <- fixColumnNames(names(sb), getEmptyErrorLst())
cols <- newColumns$newColNames
errorLst <- newColumns$errorLst
if (reportChanges == FALSE) # remove changed columns
errorLst$changedCols <- getEmptyErrorLst()$changedCols
missingColumns <- checkRequiredCols(cols, reportErrors)
if (reportErrors & !is.null(missingColumns)) {
errorLst$missingColumns <- missingColumns
return(errorLst)
}
names(sb) <- cols
sb <- toCharacter(sb, headers = c("id", "sire", "dam"))
sb <- unknown2NA(sb)
sb <- addUIds(sb)
sb <- addParents(sb) # add parent record for parents that don't have
#their own line entry
# Add and standardize needed fields
sb$sex <- convertSexCodes(sb$sex)
if (reportErrors) {
testVal <- correctParentSex(sb$id, sb$sire, sb$dam, sb$sex,
sb$recordStatus, reportErrors)
if (is.null(testVal$femaleSires) & is.null(testVal$maleDams)
& is.null(testVal$sireAndDam)) {
sb$sex <- correctParentSex(sb$id, sb$sire, sb$dam, sb$sex,
sb$recordStatus, reportErrors = FALSE)
} else {
errorLst$femaleSires <- testVal$femaleSires
errorLst$maleDams <- testVal$maleDams
errorLst$sireAndDam <- testVal$sireAndDam
}
} else {
sb$sex <- correctParentSex(sb$id, sb$sire, sb$dam, sb$sex, sb$recordStatus)
}
if (any("status" %in% cols)) {
sb$status <- convertStatusCodes(sb$status)
}
if (any("ancestry" %in% cols)) {
sb$ancestry <- convertAncestry(sb$ancestry)
}
if (any("fromCenter" %in% cols)) {
sb$fromCenter <- convertFromCenter(sb$fromCenter)
}
# converting date column entries from strings and integers to date
if (reportErrors) {
sbAndErrors <- getDateErrorsAndConvertDatesInPed(sb, errorLst)
sb <- sbAndErrors$sb
errorLst <- sbAndErrors$errorLst
} else {
sb <- convertDate(sb, time.origin = as.Date("1970-01-01"))
sb <- setExit(sb, time.origin = as.Date("1970-01-01"))
}
# ensure parents are older than offspring
suspiciousParents <- checkParentAge(sb, minParentAge, reportErrors)
if (reportErrors) {
if (!is.null(suspiciousParents)) {
if (nrow(suspiciousParents) > 0)
errorLst$suspiciousParents <- suspiciousParents
}
} else {
if (nrow(suspiciousParents) > 0) {
fileName <- paste0(tempdir(), "/lowParentAge.csv")
write.csv(suspiciousParents,
file = fileName, row.names = FALSE)
stop(paste0("Parents with low age at birth of offspring are listed in ",
fileName, ".\n"))
}
}
# setting age:
# uses current date as the end point if no exit date is available
if (any(("birth" %in% cols)) && !any(("age" %in% cols))) {
if (all(is.Date(sb$birth)))
sb["age"] <- calcAge(sb$birth, sb$exit)
}
# Adding generation numbers
sb["gen"] <- findGeneration(sb$id, sb$sire, sb$dam)
# Cleaning-up the data.frame
# Filtering unnecessary columns and ordering the data
if (reportErrors) {
testVal <- removeDuplicates(sb, reportErrors = reportErrors)
if (!is.null(testVal)) {
errorLst$duplicateIds <- testVal
} ## else do not update sb, because it will fail
} else {
sb <- sb <- removeDuplicates(sb)
}
sb <- fixGenotypeCols(sb)
cols <- intersect(getPossibleCols(), colnames(sb))
novelCols <- colnames(sb)[!colnames(sb) %in% cols]
sb <- sb[, c(cols, novelCols)]
sb <- sb[with(sb, order(gen, id)), ]
rownames(sb) <- seq(length.out = nrow(sb))
# Ensuring the IDs are stored as characters
sb <- toCharacter(sb, headers = c("id", "sire", "dam"))
if (reportErrors) {
return(checkChangedColAndErrorLst(errorLst))
} else {
return(sb)
}
}