man/kNN.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/kNNFaster.R
\name{kNN}
\alias{kNN}
\alias{kNN.data.table}
\alias{kNN.data.frame}
\alias{kNN.survey.design}
\alias{kNN.default}
\title{k-Nearest Neighbour Imputation}
\usage{
kNN(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE
)

\method{kNN}{data.table}(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE
)

\method{kNN}{data.frame}(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE
)

\method{kNN}{survey.design}(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE
)

\method{kNN}{default}(
  data,
  variable = colnames(data),
  metric = NULL,
  k = 5,
  dist_var = colnames(data),
  weights = NULL,
  numFun = median,
  catFun = maxCat,
  makeNA = NULL,
  NAcond = NULL,
  impNA = TRUE,
  donorcond = NULL,
  mixed = vector(),
  mixed.constant = NULL,
  trace = FALSE,
  imp_var = TRUE,
  imp_suffix = "imp",
  addRF = FALSE,
  onlyRF = FALSE,
  addRandom = FALSE,
  useImputedDist = TRUE,
  weightDist = FALSE
)
}
\arguments{
\item{data}{data.frame or matrix}

\item{variable}{variables where missing values should be imputed}

\item{metric}{metric to be used for calculating the distances between observations}

\item{k}{number of Nearest Neighbours used}

\item{dist_var}{names of variables to be used for distance calculation}

\item{weights}{weights for the variables for distance calculation.
If \code{weights = "auto"}, weights will be selected based on variable importance from random forest regression, using function \code{\link[ranger]{ranger}}.
Weights are calculated for each variable separately.}

\item{numFun}{function for aggregating the k Nearest Neighbours in the case
of a numerical variable}

\item{catFun}{function for aggregating the k Nearest Neighbours in the case
of a categorical variable}

\item{makeNA}{list of length equal to the number of variables, with values that should be converted to NA for each variable}

\item{NAcond}{list of length equal to the number of variables, with a condition for imputing an NA}

\item{impNA}{TRUE/FALSE whether NA should be imputed}

\item{donorcond}{condition for the donors, e.g. \code{list(">5")}; must be NULL or a list of the same length as \code{variable}}

\item{mixed}{names of mixed variables}

\item{mixed.constant}{vector with length equal to the number of
semi-continuous variables specifying the point of the semi-continuous
distribution with non-zero probability}

\item{trace}{TRUE/FALSE if additional information about the imputation
process should be printed}

\item{imp_var}{TRUE/FALSE if a TRUE/FALSE variable for each imputed
variable should be created, showing the imputation status}

\item{imp_suffix}{suffix for the TRUE/FALSE variables showing the imputation
status}

\item{addRF}{TRUE/FALSE if each variable should be modelled using random forest regression (\code{\link[ranger]{ranger}}) and used as an additional distance variable.}

\item{onlyRF}{TRUE/FALSE; if TRUE, only the additional distance variables created from random forest regression will be used as distance variables.}

\item{addRandom}{TRUE/FALSE if an additional random variable should be added
for distance calculation}

\item{useImputedDist}{TRUE/FALSE if an imputed value should be used for distance calculation for imputing another variable.
Be aware that this results in a dependency on the ordering of the variables.}

\item{weightDist}{TRUE/FALSE if the distances of the k nearest neighbours should be used as weights in the
aggregation step}
}
\value{
the imputed data set.
}
\description{
k-Nearest Neighbour Imputation based on a variation of the Gower Distance
for numerical, categorical, ordered and semi-continuous variables.
}
\examples{

data(sleep)
kNN(sleep)
library(laeken)
kNN(sleep, numFun = weightedMean, weightDist=TRUE)

}
\references{
A. Kowarik, M. Templ (2016) Imputation with
R package VIM.  \emph{Journal of
Statistical Software}, 74(7), 1-16.
}
\author{
Alexander Kowarik, Statistik Austria
}
\keyword{manip}