In [247]:
suppressPackageStartupMessages({
    library(data.table)
    library(tidyverse)
    library(tidymodels)
})

In [277]:
sparse_data <- function(df,sparse_percentage = 0.2){
    
    df_new <- copy(df) 
    setDT(df_new)
    categorical_names <- df_new %>% purrr::discard(is.numeric) %>% colnames

    for(i in categorical_names){
        features <- df_new[,unique(.SD),.SDcols = i] %>% pull
        
        for(j in features){
            
            condition_format <- 'df_new[,length(%s[%s == "%s"])/.N < %s]'
            condition_command <- sprintf(condition_format,i,i,j,sparse_percentage)
        
            condition <- eval(parse(text = condition_command))
            
            if(condition){
                sparse_format <- 'df_new[%s == "%s", %s := "other"]'
                sparse_command <- sprintf(sparse_format,i,j,i)
                eval(parse(text = sparse_command))
            } 
        }
    }
    df_new
}

In [265]:
get_scaling_factors <- function(df){
    setDT(df)
    factors <- lapply(df %>% keep(is.numeric),function(x) list(mean = mean(x),sd = sd(x)))
    factors
}      

In [266]:
scale_data <- function(df,scaling_factors,reverse = FALSE){
    
    data <- copy(df)
    setDT(data)
    for(i in names(scaling_factors)){
        factors <- scaling_factors[[i]]
        if(reverse){
            data[,(i) := lapply(.SD,function(x) (x*factors[['sd']] + factors[['mean']])),.SDcols = i]
        }else{
            data[,(i) := lapply(.SD,function(x) (x - factors[['mean']]) / factors[['sd']]),.SDcols = i]
        }  
    }
    out <- list(scaling_factors = scaling_factors,data = data)
    return(out)
} 

In [267]:
dummy_data <- function(scaled_data){
    df <- copy(scaled_data[['data']])
    df_dummied <- recipe(df) %>% step_dummy(all_nominal()) %>% prep %>% bake(new_data = NULL)
    scaled_data[['data']] <- df_dummied
    
    scaled_data
}

In [283]:
sparsed <- sparse_data(ISLR::Default)
scl <- get_scaling_factors(sparsed)
scld <- scale_data(ISLR::Default,scl)
dummied <- dummy_data(scld)

In [288]:
df[default %in% c('koko')]

default,student,balance,income
<fct>,<fct>,<dbl>,<dbl>


In [258]:
dummy_data(scld)

balance,income,default_Yes,student_Yes
<dbl>,<dbl>,<dbl>,<dbl>
-0.21882388,0.81314661,0,0
-0.03761405,-1.60541545,0,1
0.49238557,-0.13120568,0,0
-0.63286086,0.16402273,0,0
-0.10278574,0.37089658,0,0
0.17409766,-1.95142286,0,1
-0.02038712,-0.64572153,0,0
-0.05521305,-1.19344385,0,1
0.67329518,0.29629259,0,0
-1.72699815,-0.31804965,0,0


In [213]:
a <- sparse_data(ISLR::Default,sparse_percentage =0.4)

In [219]:
df <- copy(ISLR::Default)

In [236]:
dt <- df %>% sparse_data(sparse_percentage = .4) 

In [237]:
scl_fct <- dt %>% get_scaling_factors

In [238]:
scl_dt <- scale_data(dt,scaling_factors = scl_fct)

In [239]:
scl_dt

default,student,balance,income
<fct>,<fct>,<dbl>,<dbl>
No,No,-0.21882388,0.81314661
No,other,-0.03761405,-1.60541545
No,No,0.49238557,-0.13120568
No,No,-0.63286086,0.16402273
No,No,-0.10278574,0.37089658
No,other,0.17409766,-1.95142286
No,No,-0.02038712,-0.64572153
No,other,-0.05521305,-1.19344385
No,No,0.67329518,0.29629259
No,No,-1.72699815,-0.31804965
