R/LabelEncoder.R

#' Label Encoder
#' @description Encodes and decodes categorical variables into integer values and vice versa.
#' This is a commonly performed task in data preparation during model training, because all machine learning models require
#' the data to be encoded into numerical format. It takes a vector of character or factor values and encodes them into numeric.
#' @format \code{\link{R6Class}} object.
#' @section Usage:
#' For usage details see \bold{Methods, Arguments and Examples} sections.
#' \preformatted{
#' lbl = LabelEncoder$new()
#' lbl$fit(x)
#' lbl$fit_transform(x)
#' lbl$transform(x)
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{$new()}}{Initialise the instance of the encoder}
#'   \item{\code{$fit()}}{creates a memory of encodings but doesn't return anything}
#'   \item{\code{$transform()}}{based on encodings learned in \code{fit} method is applies the transformation}
#'   \item{\code{$fit_transform()}}{encodes the data and keep a memory of encodings simultaneously}
#'   \item{\code{$inverse_transform()}}{encodes the data and keep a memory of encodings simultaneously}
#' }
#' @section Arguments:
#' \describe{
#'  \item{data}{a vector or list containing the character / factor values}
#' }
#' @export
#' @examples
#' data_ex <- data.frame(Score = c(10,20,30,4), Name=c('Ao','Bo','Bo','Co'))
#' lbl <- LabelEncoder$new()
#' data_ex$Name <- lbl$fit_transform(data_ex$Name)
#' decode_names <- lbl$inverse_transform(data_ex$Name)

LabelEncoder <- R6Class("LabelEncoder", public = list(

    #' @field input_data internal use
    input_data = NA,
    #' @field encodings internal use
    encodings = NA,
    #' @field decodings internal use
    decodings = NA,
    #' @field fit_model internal use
    fit_model = FALSE,

    #' @details
    #' Fits the labelencoder model on given data
    #'
    #' @param data_col a vector containing non-null values
    #' @return NULL, calculates the encoding and save in memory
    #'
    #' @examples
    #' data_ex <- data.frame(Score = c(10,20,30,4), Name=c('Ao','Bo','Bo','Co'))
    #' lbl <- LabelEncoder$new()
    #' lbl$fit(data_ex$Name)
    #' data_ex$Name <- lbl$fit_transform(data_ex$Name)
    #' decode_names <- lbl$inverse_transform(data_ex$Name)

    fit = function(data_col){

        self$input_data <- private$check_data(data_col)
        self$encodings <- private$encoder(self$input_data)
        self$decodings <- private$decoder(self$encodings)
        self$fit_model <- TRUE

    },

    #' @details
    #' Fits and returns the encoding
    #'
    #' @param data_col a vector containing non-null values
    #' @return encoding values for the given input data
    #'
    #' @examples
    #' data_ex <- data.frame(Score = c(10,20,30,4), Name=c('Ao','Bo','Bo','Co'))
    #' lbl <- LabelEncoder$new()
    #' lbl$fit(data_ex$Name)
    #' data_ex$Name <- lbl$fit_transform(data_ex$Name)

    fit_transform = function(data_col){

        data_col <- private$check_data(data_col)
        self$fit(data_col)

        vals <- private$mapper(data_col,
                               self$encodings,
                               convert_type = NULL,
                               output_type = numeric(1))
        return(vals)

    },

    #' @details
    #' Returns the encodings from the fitted model
    #'
    #' @param data_col a vector containing non-null values
    #' @return encoding values for the given input data
    #'
    #' @examples
    #' data_ex <- data.frame(Score = c(10,20,30,4), Name=c('Ao','Bo','Bo','Co'))
    #' lbl <- LabelEncoder$new()
    #' lbl$fit(data_ex$Name)
    #' data_ex$Name <- lbl$transform(data_ex$Name)

    transform = function(data_col){

        if (!(isTRUE(self$fit_model)))
            stop("Please run fit before using transform.")

        data_col <- private$check_data(data_col)

        # all values in the new vector should be in encodings
        if (!(all(data_col %in% names(self$encodings)))) {
            message(strwrap("There are new values in this vector which weren't
                 available during fit. Replacing those values with 'NA'"))

            # replace new values with 'NA'
            val_index <- which(!(data_col %in% names(self$encodings)))
            data_col[val_index] <- "NA"

        }

        vals <- private$mapper(data_col,
                               self$encodings,
                               convert_type = NULL,
                               output_type = numeric(1))
        return(vals)
    },

    #' @details
    #' Gives back the original values from a encoded values
    #'
    #' @param coded_col a vector containing label encoded values
    #' @return original values from the label encoded data
    #'
    #' @examples
    #' data_ex <- data.frame(Score = c(10,20,30,4), Name=c('Ao','Bo','Bo','Co'))
    #' lbl <- LabelEncoder$new()
    #' lbl$fit(data_ex$Name)
    #' data_ex$Name <- lbl$fit_transform(data_ex$Name)
    #' decode_names <- lbl$inverse_transform(data_ex$Name)

    inverse_transform = function(coded_col){

        #check if all values exist in decode
        if (!(all(coded_col %in% self$encodings))) {
            stop(strwrap("There are new values in this data which weren't
                    available during fit. Please fix the issue."))
        }

        # write here
        vals <- private$mapper(coded_col,
                               self$decodings,
                               convert_type = as.character,
                               output_type = character(1))
        return(vals)

    }

),
    private = list(

    encoder = function(data_col){

        # this is to handle if the input value is factor, it should be character
        data_col <- as.character(data_col)

        all_values <- unique(data_col)
        maps <- list()

        # because the encoding should start with zero (i - 1)
        for (i in seq(all_values))  maps[all_values[i]] <- i - 1
        return(maps)

    },

    decoder = function(en_output){

        # reverse the encoded list - make values as names and vice versa
        f <- as.list(names(en_output))
        names(f) <- en_output
        return(f)

    },

    mapper = function(input_vec, coded_list, convert_type ,output_type){

        if (is.null(convert_type)) {
            # this is to avoid using convert_type argument for
            # transform, fit_transform
            return(vapply(input_vec,
                           function(x) coded_list[[x]],
                           FUN.VALUE = output_type,
                           USE.NAMES = F))
        }

        return(vapply(input_vec,
                      function(x) coded_list[[convert_type(x)]],
                      FUN.VALUE = output_type,
                      USE.NAMES = F))
    },


    check_data = function(data_col){

        # fix data issues here
        if (any(is.na(data_col))) {
            message("The data contains NA values. Imputing NA with 'NA' ")
            data_col <- replace(data_col, which(is.na(data_col)), "NA")
        }

        if (any((data_col == ""))) {
            message("The data contains blank values. Imputing them with 'NA' ")
            data_col <- replace(data_col, which(data_col == ""), "NA")
        }

        if (any((data_col == " "))) {
            message("The data contains blank values. Imputing them with 'NA' ")
            data_col <- replace(data_col, which(data_col == " "), "NA")
        }

        return(data_col)

    }
    )
)