Skip to content
Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
211 lines (177 sloc) 6.45 KB
#' Visualise a data.frame to display missingness.
#'
#' `vis_miss` provides an at-a-glance ggplot of the missingness inside a
#' dataframe, colouring cells according to missingness, where dark grey
#' indicates a missing cell and light grey indicates a present cell. As it
#' returns a ggplot object, it is very easy to customize and change labels.
#'
#' @param x a data.frame
#'
#' @param cluster logical. TRUE specifies that you want to use hierarchical
#'   clustering (mcquitty method) to arrange rows according to missingness.
#'   FALSE specifies that you want to leave it as is. Default value is FALSE.
#'   Data with fewer than two rows is always left in its original order, as
#'   there is nothing to cluster.
#'
#' @param sort_miss logical. TRUE arranges the columns in order of missingness.
#'   Default value is FALSE.
#'
#' @param show_perc logical. TRUE adds the \% of missing/complete data
#'   in the whole dataset into the legend. Default value is TRUE.
#'
#' @param show_perc_col logical. TRUE adds the \% missing data in a given
#'   column into the x axis. Can be disabled with FALSE. Default value is TRUE.
#'
#' @param warn_large_data logical - should an error be raised when the data
#'   has more cells than `large_data_size`? Default is TRUE, see note for
#'   more details.
#'
#' @param large_data_size integer, the largest number of cells
#'   (`ncol(x) * nrow(x)`) that is plotted while `warn_large_data` is TRUE.
#'   Default is 900000, this can be changed. See note for more details.
#'
#' @return `ggplot2` object displaying the position of missing values in the
#'   dataframe, and the percentage of values missing and present.
#'
#' @seealso [vis_dat()] [vis_guess()] [vis_expect()] [vis_cor()] [vis_compare()]
#'
#' @note Some datasets might be too large to plot, sometimes creating a blank
#'   plot - if this happens, I would recommend downsampling the data, either
#'   looking at the first 1,000 rows or by taking a random sample. This means
#'   that you won't get the same "look" at the data, but it is better than
#'   a blank plot! See example code for suggestions on doing this.
#'
#' @examples
#'
#' vis_miss(airquality)
#'
#' \dontrun{
#' vis_miss(airquality, cluster = TRUE)
#'
#' vis_miss(airquality, sort_miss = TRUE)
#'
#' # if you have a large dataset, you might want to try downsampling:
#' library(nycflights13)
#' library(dplyr)
#' flights %>%
#'   sample_n(1000) %>%
#'   vis_miss()
#'
#' flights %>%
#'   slice(1:1000) %>%
#'   vis_miss()
#'
#' }
#'
#' @export
vis_miss <- function(x,
                     cluster = FALSE,
                     sort_miss = FALSE,
                     show_perc = TRUE,
                     show_perc_col = TRUE,
                     large_data_size = 900000,
                     warn_large_data = TRUE) {

  # throw error if x not data.frame
  test_if_dataframe(x)

  # refuse to plot data with too many cells, unless the caller opted out
  if (ncol(x) * nrow(x) > large_data_size && warn_large_data) {
    stop(
      "Data exceeds recommended size for visualisation, please consider\n",
      "downsampling your data, or set argument 'warn_large_data' to FALSE."
    )
  }

  # make a TRUE/FALSE matrix of the data.
  # This tells us whether it is missing (TRUE) or not (FALSE)
  x_na <- is.na(x)

  # switch for creating the missing clustering.
  # hclust() needs at least two observations, so data with fewer rows simply
  # keeps its original order instead of failing.
  if (cluster && nrow(x) > 1) {
    # this retrieves a row order of the clustered missingness
    row_order_index <-
      stats::dist(x_na * 1) %>%
      stats::hclust(method = "mcquitty") %>%
      stats::as.dendrogram() %>%
      stats::order.dendrogram()
  } else {
    row_order_index <- seq_len(nrow(x))
  }

  if (sort_miss) {
    # arrange by the columns with the highest missingness
    # code inspired from https://r-forge.r-project.org/scm/viewvc.php/ ...
    # pkg/R/missing.pattern.plot.R?view=markup&root=mi-dev
    # get the order of columns with highest missingness
    na_sort <- order(colSums(x_na), decreasing = TRUE)
    # get the names of those columns
    col_order_index <- names(x)[na_sort]
  } else {
    col_order_index <- names(x)
  }

  # Arrange the missingness matrix by the row order index.
  # `drop = FALSE` keeps the result a matrix (with its column names) even when
  # there is only one column or only one row; without it the subset collapses
  # to a vector and the column names no longer match `col_order_index`.
  # Related issue: https://github.com/ropensci/visdat/issues/72
  dat_pre_vis <- as.data.frame(x_na[row_order_index, , drop = FALSE])

  # gather the variables together for plotting
  # here we now have a column of the row number (row),
  # then the variable(variables),
  # then the contents of that variable (value)
  d <- dat_pre_vis %>%
    vis_gather_() %>%
    # add info for plotly mouseover
    dplyr::mutate(value = vis_extract_value_(dat_pre_vis))

  # calculate the overall % missingness to display in legend -----------------
  if (show_perc) {
    guide_labels <- miss_guide_label(x)
    p_miss_lab <- guide_labels$p_miss_lab
    p_pres_lab <- guide_labels$p_pres_lab
  } else {
    p_miss_lab <- "Missing"
    p_pres_lab <- "Present"
  }

  # then we plot it
  vis_miss_plot <- vis_create_(d) +
    ggplot2::scale_fill_manual(name = "",
                               values = c("grey80",
                                          "grey20"),
                               labels = c(p_pres_lab,
                                          p_miss_lab)) +
    ggplot2::guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
    ggplot2::theme(legend.position = "bottom") +
    # fix up the location of the text
    ggplot2::theme(axis.text.x = ggplot2::element_text(hjust = 0))

  # Put the column axis on top and fix the column order through `limits`.
  # Thanks to
  # http://www.markhneedham.com/blog/2015/02/27/rggplot-controlling-x-axis-order/
  # For the tip on using scale_x_discrete
  if (show_perc_col) {
    # label each column with its % missing
    vis_miss_plot +
      ggplot2::scale_x_discrete(position = "top",
                                limits = col_order_index,
                                labels = label_col_missing_pct(
                                  x,
                                  col_order_index))
  } else {
    vis_miss_plot +
      ggplot2::scale_x_discrete(position = "top",
                                limits = col_order_index)
  }

} # end of function
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.