In [1]:
suppressPackageStartupMessages({
    library(data.table)
    library(tidyverse)
    library(tidymodels)
    library(treesnip)
    library(caret)
    library(keras)
    library(catboost)
    library(tensorflow)
    library(collapse)
})

"package 'dials' was built under R version 4.2.1"
"package 'recipes' was built under R version 4.2.1"
"package 'rsample' was built under R version 4.2.1"
"package 'tune' was built under R version 4.2.1"
"package 'workflows' was built under R version 4.2.1"
"package 'caret' was built under R version 4.2.1"


In [2]:
# Read the raw salary data from the working directory as a data.table.
df <- fread('salary.csv')

# Drop every '-' from the column names.
setnames(df, gsub('-', '', names(df), fixed = TRUE))

In [3]:
# Central run configuration shared by the preprocessing, tuning and
# cross-validation functions in this file.
config <- list(
    # Name of the target column (removed from the features in make_frame).
    label = "salary",
    # Whether cross_validate() calls tuner() before fitting.
    tune = TRUE,
    # Passed as `iter` to tune_bayes().
    tune_iter = 3,
    # Seed used by create_folds() and cross_validate().
    seed = 1453,
    # Share threshold handed to sparse_data(); rarer levels become "other".
    sparse_percentage = 0,
    # Number of folds (`k`) for caret::createFolds().
    folds = 3,
    # CatBoost parameters: learning_rate, early_stopping_rounds, iterations,
    # loss_function, custom_loss, eval_metric, auto_class_weights,
    # use_best_model, task_type and devices respectively.
    learning_rate = 0.05,
    early_stopping_rounds = 100,
    iter = 5000,
    objective = "Logloss",
    loss_fun = "F1:use_weights=False",
    eval_metric = "F1:use_weights=False",
    auto_class_weights = "SqrtBalanced",
    use_best = FALSE,
    task = "CPU",
    device = "0:1",
    # Separator lines shown between the per-fold progress messages.
    slicer_1 = "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*",
    slicer_2 = "_____________________________________________"
)

In [472]:
#df <- fread('salary.csv')

In [473]:
#colnames(df) <- gsub('-','',colnames(df))

In [474]:
#df %>% select(salary) %>% filter(salary == '>50K')

In [4]:
# Collapse rare levels of every non-numeric column into the level "other".
#
# df                : data.frame / data.table; it is copied, never modified.
# sparse_percentage : share threshold in [0, 1]; a level whose relative
#                     frequency (count / number of rows) is strictly below
#                     this value is replaced by "other". With 0 nothing changes.
#
# Returns a data.table copy of `df` with the rare levels replaced.
#
# The frequencies are computed with table() instead of building and evaluating
# code strings, so values containing quotes or other special characters are
# handled, and NA entries neither count towards a level nor get replaced.
sparse_data <- function(df,sparse_percentage){

    df_new <- copy(df)
    setDT(df_new)

    n_rows <- nrow(df_new)
    categorical_names <- names(df_new)[!vapply(df_new,is.numeric,logical(1))]

    for(col in categorical_names){
        values <- as.character(df_new[[col]])

        # table() ignores NA, so missing values are never treated as a level.
        counts <- table(values)
        rare_levels <- names(counts)[as.vector(counts) / n_rows < sparse_percentage]
        rare_rows <- which(values %in% rare_levels)

        if(length(rare_rows) == 0){
            next
        }

        # "other" can only be stored in a character column, so factor/logical
        # columns are converted before the replacement.
        if(!is.character(df_new[[col]])){
            set(df_new,j = col,value = values)
        }
        set(df_new,i = rare_rows,j = col,value = 'other')
    }
    df_new
}

In [5]:
# Compute the observed range of every numeric column.
#
# df : data.frame / data.table. It is only read; the previous version called
#      setDT(df), which converted the caller's object by reference.
#
# Returns a named list with one entry per numeric column, each of the form
# list(min = <minimum>, max = <maximum>) computed with NA values removed.
get_scaling_factors <- function(df){
    numeric_cols <- names(df)[vapply(df,is.numeric,logical(1))]

    factors <- lapply(numeric_cols,function(col){
        values <- df[[col]]
        list(min = min(values,na.rm = TRUE),max = max(values,na.rm = TRUE))
    })
    names(factors) <- numeric_cols
    factors
}

In [6]:
# Bundle a copy of the data with its scaling factors and categorical columns.
#
# df              : data.frame / data.table (copied, not modified).
# scaling_factors : list as produced by get_scaling_factors(); passed through.
# reverse         : currently without effect (see the note below).
#
# Returns list(scaling_factors, data, cat_cols) where `data` is a data.table
# copy of `df` and `cat_cols` holds the names of the non-numeric columns.
#
# NOTE: the min-max scaling itself is switched off. The disabled code rescaled
# each numeric column to (x - min) / (max - min), replaced NA by -1, and
# inverted the scaling when reverse = TRUE. At present the values are returned
# unchanged.
scale_data <- function(df,scaling_factors,reverse = FALSE){

    data <- copy(df)
    setDT(data)

    is_categorical <- !vapply(df,is.numeric,logical(1))

    list(
        scaling_factors = scaling_factors,
        data = data,
        cat_cols = names(df)[is_categorical]
    )
}

In [7]:
# Replace categorical values that were not seen during training by NA.
#
# scaled_data              : list whose `data` element is a data.table.
# categorical_cols         : names of the categorical columns to clean.
# distinct_values_on_train : named list; element `col` holds the values that
#                            are allowed in column `col`.
#
# Returns `scaled_data` with a cleaned copy of `data`; the input table is not
# modified. Stops if a requested column is missing from the data.
#
# The membership test is done directly with %in% rather than by pasting the
# values into a code string, so values containing quotes are handled.
clean_test_set <- function(scaled_data,categorical_cols,distinct_values_on_train){
    df <- copy(scaled_data[['data']])

    for(col in categorical_cols){
        if(!col %in% names(df)){
            stop(sprintf('Column "%s" is missing from the data.',col),call. = FALSE)
        }

        known_values <- distinct_values_on_train[[col]]
        unseen_rows <- which(!(df[[col]] %in% known_values))

        if(length(unseen_rows) > 0){
            set(df,i = unseen_rows,j = col,value = NA)
        }
    }

    scaled_data[['data']] <- df

    scaled_data
}

In [8]:
# Record the observed levels of every categorical column and convert those
# columns to factors with a trailing 'Missing' level.
#
# scaled_data : list with a `data` table and `cat_cols` (categorical names).
#
# Returns `scaled_data` with
#   cat_distincts : named list of the levels per column (observed values in
#                   order of first appearance, plus 'Missing'),
#   data          : copy of the data whose categorical columns are factors
#                   using exactly those levels.
#
# NA is not stored as a level and 'Missing' is only appended when it is not
# already an observed value; appending it unconditionally made factor() fail
# with a duplicated-level error for such columns.
get_distincts <- function(scaled_data){
    df <- copy(scaled_data[['data']])

    catcols <- scaled_data[['cat_cols']]

    distincts <- list()
    for(col in catcols){
        values <- as.character(df[[col]])
        observed <- unique(values[!is.na(values)])
        col_levels <- union(observed,'Missing')

        distincts[[col]] <- col_levels
        set(df,j = col,value = factor(values,levels = col_levels))
    }

    scaled_data[['cat_distincts']] <- distincts

    scaled_data[['data']] <- df

    scaled_data
}

In [10]:
# Re-encode the categorical columns as factors that use the training levels.
#
# scaled_data              : list whose `data` element is a data.table.
# categorical_cols         : names of the columns to convert.
# distinct_values_on_train : named list of factor levels per column.
#
# Returns `scaled_data` with a converted copy of `data`; values that are not
# among the supplied levels become NA, as usual for factor().
fetch_test_levels_to_train <- function(scaled_data,categorical_cols,distinct_values_on_train){
    converted <- copy(scaled_data[['data']])

    for(col in categorical_cols){
        train_levels <- distinct_values_on_train[[col]]
        set(converted,j = col,value = factor(converted[[col]],levels = train_levels))
    }

    scaled_data[['data']] <- converted
    scaled_data
}

In [11]:
# Run the data through a recipe that applies step_string2factor() to all
# nominal columns and store the baked result.
#
# scaled_data : list whose `data` element is the table to process.
#
# Returns `scaled_data` with `data` replaced by the baked output.
# Despite the name, no dummy (one-hot) columns are created: the step_dummy()
# variant that used to be here is disabled.
dummy_data <- function(scaled_data){
    df <- copy(scaled_data[['data']])

    # Same call sequence as before: prep, add the step, prep again, bake.
    rec <- recipe(df)
    rec <- prep(rec)
    rec <- step_string2factor(rec,all_nominal())
    rec <- prep(rec)

    scaled_data[['data']] <- bake(rec,new_data = NULL)
    scaled_data
}

In [12]:
# Run the full preprocessing chain on a training or a test table.
#
# df          : raw data.frame / data.table (copied, not modified).
# test        : FALSE for training data, TRUE for data to be predicted.
# train_frame : result of a previous make_frame(test = FALSE) call; its
#               scaling_factors, cat_cols and cat_distincts are reused when
#               test = TRUE.
# config      : configuration list; `label` and `sparse_percentage` are read.
#
# Training mode removes the label column, collapses rare levels, records the
# distinct categorical levels and returns a list with data, scaling_factors,
# cat_cols, cat_distincts, label_to_keras and label.
# Test mode expects `df` without the label column, sets unseen categorical
# values to NA and aligns the factor levels with the training frame.
#
# Progress is reported through message(). The label is now looked up with
# config[['label']]; the former `config$labe` only worked through partial
# matching of `$`.
make_frame <- function(df,test = FALSE,train_frame = NULL,config){
    dt <- copy(df)
    setDT(dt)

    if(!test){
        label_col <- config[['label']]
        if(!is.character(label_col) || length(label_col) != 1 || !label_col %in% names(dt)){
            stop('`config$label` must name a column of the training data.',call. = FALSE)
        }
        target_ <- dt[[label_col]]
        dt[,(label_col) := NULL]
    }

    sparsed <- sparse_data(dt,sparse_percentage = config$sparse_percentage)
    gc()
    base::message('Data sparsed.')

    # Test data reuses the factors that were computed on the training data.
    if(test){
        scl <- train_frame[['scaling_factors']]
    }else{
        scl <- get_scaling_factors(sparsed)
    }
    gc()
    base::message('Scaling factors calculated.')
    base::message('Missing values labeled.')
    scld <- scale_data(sparsed,scl)
    gc()
    base::message('Data scaled.')

    if(test){
        scld <- clean_test_set(scld,train_frame[['cat_cols']],train_frame[['cat_distincts']])
        base::message('Unseen values removed from test set.')
        scld <- fetch_test_levels_to_train(scld,train_frame[['cat_cols']],train_frame[['cat_distincts']])
        base::message('Train & test set nominal levels fetched.')
        distincted <- scld
    }else{
        distincted <- get_distincts(scld)
        gc()
        base::message('Got distinct values for nominals.')
    }

    dummied <- dummy_data(distincted)
    gc()
    base::message('Data dummied.')

    if(!test){
        # Both entries hold the raw (unencoded) target values.
        dummied[['label_to_keras']] <- target_
        dummied[['label']] <- target_
    }
    setDT(dummied[['data']])
    base::message('All done !')
    dummied
}

In [13]:
# Build the cross-validation folds for a prepared training frame.
#
# train_data : list with a `label` element (the outcome vector).
# config     : configuration list; `seed` and `folds` are read.
#
# Returns the result of caret::createFolds() with k = config$folds.
# Side effect: re-seeds the global random number generator with config$seed.
create_folds <- function(train_data,config){
    outcome <- train_data$label
    fold_count <- config$folds

    set.seed(config$seed)
    caret::createFolds(y = outcome,k = fold_count)
}

In [14]:
# Find the probability cut-off that maximises F1 on one validation fold.
#
# y_probs : predicted probabilities of class 1.
# y_test  : true labels coded as 0 / 1.
#
# Cut-offs from 0.10 to 0.90 in steps of 0.01 are tried; class 1 is the event
# (event_level = 'second'). Cut-offs at which nothing is predicted as class 1
# are skipped, and cut-offs with an undefined F1 are ignored.
#
# Returns list(threshold = <lowest cut-off with the best F1>, score = <F1>).
# When no cut-off yields a defined F1, both entries are NA and a warning is
# raised; the previous version stopped with an error on the empty result
# table in that situation.
fold_threshold <- function(y_probs,y_test){
    candidates <- seq(from = 0.1,to = 0.9,by = 0.01)
    truth <- factor(y_test,levels = 0:1)

    scores <- vapply(candidates,function(cutoff){
        predicted <- +(y_probs >= cutoff)
        if(sum(predicted) == 0){
            return(NA_real_)
        }
        f_meas_vec(truth = truth,estimate = factor(predicted,levels = 0:1),event_level = 'second')
    },numeric(1))

    # which.max() skips NA and returns the first maximum.
    best <- which.max(scores)
    if(length(best) == 0){
        warning('No threshold produced a defined F1 score.',call. = FALSE)
        return(list(threshold = NA_real_,score = NA_real_))
    }

    list(threshold = candidates[best],score = scores[best])
}

In [112]:
# Bayesian hyper-parameter search (xgboost through parsnip) on the given folds.
#
# folds       : list of held-out row indices, one element per fold.
# train_frame : feature table.
# label       : raw outcome vector; encoded to 0 / 1 with enc().
# config      : configuration list; `tune_iter` is read, and an optional
#               `scale_pos_weight` entry overrides the computed class ratio.
#
# Returns list(lr = learn_rate, td = tree_depth, mn = min_n) of the best
# candidate according to F1.
#
# The class ratio (#class 0 / #class 1) is now computed from the `response`
# column and passed as scale_pos_weight. Previously it was computed from a
# non-existent `label` column, which gave NaN, and then ignored in favour of a
# hard-coded 99.
# NOTE(review): which outcome level xgboost treats as the positive class
# depends on how parsnip encodes the factor - confirm the weighting direction.
tuner <- function(folds,train_frame,label,config){
    fold_nums <- seq_along(folds)

    single_tbl <- train_frame %>% mutate(response = factor(enc(label),levels = 0:1))

    # Fold i is the assessment set; all other folds form the analysis set.
    fold_list <- lapply(fold_nums,function(i){
        list(assessment = unlist(folds[i]),analysis = unlist(folds[setdiff(fold_nums,i)]))
    })

    splits <- purrr::map(fold_list, make_splits, data = single_tbl)

    rsmpl <- manual_rset(splits, paste0("Fold", fold_nums))

    n_negative <- sum(single_tbl[['response']] == 0,na.rm = TRUE)
    n_positive <- sum(single_tbl[['response']] == 1,na.rm = TRUE)
    # Fall back to a neutral weight when class 1 is absent.
    scl <- if(n_positive > 0) n_negative / n_positive else 1
    if(!is.null(config[['scale_pos_weight']])){
        scl <- config[['scale_pos_weight']]
    }

    modelspec <- boost_tree(learn_rate = tune(),tree_depth = tune(),min_n = tune()) %>%
    set_mode('classification') %>%
    set_engine('xgboost',scale_pos_weight = scl)

    ctrl <- control_bayes(verbose = TRUE,event_level = 'second')

    rcp_format <- recipe(response ~ .,data = single_tbl) %>% step_dummy(all_nominal_predictors())

    metric <- yardstick::metric_set(f_meas)

    tuned <- tune_bayes(object = modelspec,preprocessor = rcp_format,iter = config$tune_iter,resamples = rsmpl,metrics = metric,control = ctrl,initial = 5)

    best_param <- tuned %>% select_best(metric = 'f_meas')

    list(
        lr = best_param[['learn_rate']],
        td = best_param[['tree_depth']],
        mn = best_param[['min_n']]
    )
}

In [119]:
# Fit one CatBoost model per fold, tune the decision threshold on the held-out
# fold and predict the target data with every model.
#
# train_data         : list from make_frame(test = FALSE); `data` and `label`
#                      are read.
# target_data        : list from make_frame(test = TRUE); `data` is predicted.
# folds              : list of held-out row indices (see create_folds()).
# config             : configuration list holding the CatBoost settings, the
#                      seed, the `tune` switch and the separator strings.
# label_encoder      : function mapping raw labels to 0 / 1.
# label_back_encoder : function mapping 0 / 1 back to raw labels.
#
# Returns list(models, preds, fi):
#   models : fitted CatBoost model per fold,
#   preds  : per fold, the class predictions for `target_data` (raw labels),
#   fi     : ggplot bar chart of the mean feature importance over the folds.
# Progress, fold scores and timings are shown with IRdisplay::display().
cross_validate <- function(train_data,target_data,folds,config,label_encoder,label_back_encoder){

    set.seed(config$seed)

    folds_ <- seq_along(folds)
    n_folds <- length(folds)

    errors <- rep(NA_real_,n_folds)
    preds <- vector('list',n_folds)
    models <- vector('list',n_folds)
    fi_rows <- vector('list',n_folds)

    # Parameters shared by the tuned and the untuned run.
    prm <- list(learning_rate = config$learning_rate,iterations = config$iter,
                loss_function = config$objective,auto_class_weights = config$auto_class_weights,
                verbose = 0,custom_loss = config$loss_fun,eval_metric = config$eval_metric,
                use_best_model = config$use_best,task_type = config$task,devices = config$device,
                early_stopping_rounds = config$early_stopping_rounds)

    if(isTRUE(config$tune)){
        base::message('Bayesian tuning started..')

        new_params <- tuner(folds = folds,train_frame = train_data$data,label = train_data$label,config = config)

        # The tuned values replace / extend the defaults from the config.
        prm[['learning_rate']] <- new_params$lr
        prm[['depth']] <- new_params$td
        prm[['min_data_in_leaf']] <- new_params$mn
    }

    IRdisplay::display(config$slicer_1)
    for(i in folds_){

        train_rows <- unlist(folds[setdiff(folds_,i)])
        test_rows <- unlist(folds[i])

        train_x <- train_data$data[train_rows,]
        test_x <- train_data$data[test_rows,]

        train_y <- train_data$label[train_rows] %>% label_encoder
        test_y <- train_data$label[test_rows] %>% label_encoder

        train_frame <- catboost::catboost.load_pool(data = train_x,label = train_y)
        test_frame <- catboost::catboost.load_pool(data = test_x,label = test_y)

        start_it <- Sys.time()

        cat_model <- catboost::catboost.train(learn_pool = train_frame,test_pool = test_frame,params = prm)

        prediction_to_threshold <- catboost.predict(cat_model,test_frame,prediction_type='Probability')

        # Pick the cut-off that maximises F1 on the held-out fold.
        thresholded <- fold_threshold(y_probs = prediction_to_threshold,y_test = test_y)

        msg_score <- sprintf('Fold %s F1 : %s',i,round(thresholded$score,4))
        msg_threshold <- sprintf('Best threshold for Fold %s : %s',i,thresholded$threshold)

        prediction <- catboost.predict(cat_model,catboost.load_pool(target_data$data),prediction_type='Probability')

        prediction_class <- +(prediction >= thresholded$threshold) %>% label_back_encoder

        preds[[i]] <- prediction_class
        models[[i]] <- cat_model
        errors[i] <- thresholded$score

        # One row per fold, one column per feature.
        fi_rows[[i]] <- cat_model %>% .$feature_importances %>% t %>% as.data.table

        finished_it <- Sys.time()

        process <- round(as.numeric(difftime(finished_it,start_it,units = 'min')),4)

        msg_process <- sprintf('Elapsed time for Fold %s : %s minutes',i,process)

        IRdisplay::display(msg_score)
        IRdisplay::display(msg_threshold)

        IRdisplay::display(config$slicer_2)

        IRdisplay::display(msg_process)

        IRdisplay::display(config$slicer_1)
    }
    IRdisplay::display(sprintf('CV Mean F1 : %s',round(mean(errors,na.rm = TRUE),4)))

    feature_importances <- rbindlist(fi_rows)
    fi_means <- colMeans(feature_importances)

    feature_importance_plot <- data.table(rn = names(fi_means),V1 = unname(fi_means)) %>%
    ggplot(aes(rn,V1))+
    geom_bar(stat = 'identity')+
    xlab('Variable')+
    ylab('Importance')+
    ggtitle('Variable Importance Plot')

    list(models = models,preds = preds,fi = feature_importance_plot)
}

In [114]:
# Encode the salary classes as numbers: '<=50K' -> 0, '>50K' -> 1.
# Any other value (including NA) becomes NA.
enc <- function(x){
    salary_classes <- c('<=50K','>50K')
    codes <- c(0,1)
    codes[match(x,salary_classes)]
}

# Decode numeric class codes back to salary classes: 0 -> '<=50K', 1 -> '>50K'.
# Any other value (including NA) becomes NA.
encback <- function(x){
    salary_classes <- c('<=50K','>50K')
    salary_classes[match(x,c(0,1))]
}

In [115]:
# Disabled scratch code (never runs because of if(FALSE)): builds a synthetic
# table `data_sl` with 30 random columns and a binary `default` column.
if(FALSE){

    data_sl <- data.table()

    # One column per name: 10 upper-case and 20 lower-case letters.
    for(i in c(LETTERS[1:10],letters[1:20])){
        # charmi decides whether the column is character (TRUE) or numeric (FALSE).
        charmi <- sample(c(TRUE,FALSE),size = 1)
        # ortsec / sdsec are the mean and standard deviation used for numeric columns.
        ortsec <- sample(200:500,size = 1)
        sdsec <- sample(100:1000,size = 1)
        if(charmi){
            # Random letters, with NA as one of the possible values.
            data_sl[,(i) := sample(c(LETTERS,NA),size = 1.7e4,replace = T)]
        }else{
            # Random draws from a normal sample, with NA as one of the possible values.
            data_sl[,(i) := sample(c(rnorm(n = 2e6,mean = ortsec,sd = sdsec),NA),size = 1.7e4,replace = T)]
        }
    }

    # Imbalanced binary target: 93% 'No', 7% 'Yes'.
    data_sl[,default := sample(c('No','Yes'),size = 1.7e4,prob = c(.93,.07),replace = T)]
}

In [116]:
# Reproducible 80/20 split of the raw data on the label column.
set.seed(571)
traindex <- as.numeric(
    createDataPartition(y = df[[config$label]],p = 0.8,times = 1,list = FALSE)
)

# Rows selected by the partition form the training set, the rest the test set.
train_set <- df[traindex,]
test_set <- df[-traindex,]

In [117]:
# Preprocess the training data, then the test data with the training frame as
# reference. The label column is dropped from the test features by the name
# stored in config$label instead of a hard-coded `salary`.
train_ok <- make_frame(train_set,config = config)
test_ok <- make_frame(
    test_set[,setdiff(names(test_set),config$label),with = FALSE],
    test = TRUE,
    train_frame = train_ok,
    config = config
)

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Got distinct values for nominals.

Data dummied.

All done !

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Unseen values removed from test set.

Train & test set nominal levels fetched.

Data dummied.

All done !



In [None]:
# Create the folds on the prepared training frame and run the cross-validated
# CatBoost fit, predicting the prepared test frame with every fold model.
fld <- create_folds(train_ok,config = config)
cved <- cross_validate(
    train_ok,
    test_ok,
    folds = fld,
    config = config,
    label_encoder = enc,
    label_back_encoder = encback
)

Bayesian tuning started..



❯  Generating a set of 5 initial parameter results

! Fold1: internal:
  While computing binary `precision()`, no predicted events were detecte...
  Precision is undefined in this case, and `NA` will be returned.
  Note that 0 true event(s) actually occured for the problematic event l...
  Recall is undefined in this case, and `NA` will be returned.
  Note that 0 predicted event(s) actually occured for the problematic ev...

! Fold2: internal:
  While computing binary `precision()`, no predicted events were detecte...
  Precision is undefined in this case, and `NA` will be returned.
  Note that 0 true event(s) actually occured for the problematic event l...
  Recall is undefined in this case, and `NA` will be returned.
  Note that 0 predicted event(s) actually occured for the problematic ev...

! Fold3: internal:
  While computing binary `precision()`, no predicted events were detect

ERROR: Error in `check_gp_failure()`:
! Gaussian process model was not fit.


✖ Optimization stopped prematurely; returning current results.



In [413]:
# Collect the per-fold class predictions, one column per fold.
# Stored as `fold_preds` so that the raw data in `df` is not overwritten.
fold_preds <- as.data.frame(cved$preds)

In [414]:
colnames(fold_preds) <- paste0('Fold',seq_along(cved$preds))

In [415]:
# Majority vote per row over all fold columns. The votes are passed to fmode()
# as a single vector; fmode(Fold1,Fold2,Fold3) would have used the second and
# third fold as its grouping and weight arguments.
vote_matrix <- as.matrix(fold_preds)
kk <- as.data.table(fold_preds)
kk[,pred := vapply(seq_len(nrow(vote_matrix)),function(r) fmode(vote_matrix[r,]),character(1))]

In [416]:
# True classes of the test set.
obs <- as.factor(test_set[[config$label]])

In [417]:
# Predicted classes, forced onto the same levels as the truth so the metric
# also works when only one class was predicted.
prd <- factor(kk[['pred']],levels = levels(obs))

In [418]:
# F1 with the second factor level as the event.
f_meas_vec(truth = obs,estimate = prd,event_level = 'second')

In [35]:
# Split the rows of `df` by the partition index: rows outside `traindex` go to
# `tst`, rows inside it to `trn`.
tst <- df[-traindex,]
trn <- df[traindex,]

In [39]:
# Fold numbers 1..k; seq_along() also yields an empty sequence for an empty
# fold list, where 1:length() would give c(1, 0).
folds <- seq_along(fld)

In [218]:
# CatBoost parameters for the manual run below: the untuned settings from
# `config` plus the iteration-based overfitting detector (od_type = 'Iter').
prm <- list(
    learning_rate = config$learning_rate,
    iterations = config$iter,
    loss_function = config$objective,
    auto_class_weights = config$auto_class_weights,
    verbose = 0,
    custom_loss = config$loss_fun,
    eval_metric = config$eval_metric,
    use_best_model = config$use_best,
    task_type = config$task,
    devices = config$device,
    early_stopping_rounds = config$early_stopping_rounds,
    od_type = 'Iter'
)

In [219]:
# Print the fold numbers for inspection.
folds

In [220]:
# Manual run on the raw (unprocessed) columns: fit CatBoost on the first fold
# only, tune the threshold on the held-out fold and predict `tst`.
preds <- list()
errors <- c()
IRdisplay::display(config$slicer_1)
for(i in folds[1]){
    train_indice <- setdiff(folds,i)
    test_indice <- i

    train_rows <- trn[unlist(fld[train_indice]),]
    test_rows <- trn[unlist(fld[test_indice]),]

    # Features: everything except the label, non-numeric columns as factors.
    train_x <- mutate_if(select(train_rows,-salary),function(x) !is.numeric(x),as.factor)
    test_x <- mutate_if(select(test_rows,-salary),function(x) !is.numeric(x),as.factor)

    # Labels encoded as integer 0 / 1.
    train_y <- as.integer(enc(pull(select(train_rows,salary))))
    test_y <- as.integer(enc(pull(select(test_rows,salary))))

    train_frame <- catboost::catboost.load_pool(data = train_x,label = train_y)
    test_frame <- catboost::catboost.load_pool(data = test_x,label = test_y)

    cat_model <- catboost::catboost.train(learn_pool = train_frame,test_pool = test_frame,params = prm)

    prediction_to_threshold <- catboost.predict(cat_model,test_frame,prediction_type='Probability')

    thresholded <- fold_threshold(y_probs = prediction_to_threshold,y_test = test_y)

    msgformat_score <- 'Fold %s F1 : %s'
    msgformat_threshold <- 'Best threshold for Fold %s : %s'

    msg_score <- sprintf(msgformat_score,i,round(thresholded$score,4))
    msg_threshold <- sprintf(msgformat_threshold,i,thresholded$threshold)

    target_data <- mutate_if(select(tst,-salary),function(x) !is.numeric(x),as.factor)

    prediction <- catboost.predict(cat_model,catboost.load_pool(target_data),prediction_type='Probability')

    # Apply the fold's best threshold and map 0 / 1 back to the salary classes.
    prediction_class <- encback(+(prediction >= thresholded$threshold))

    preds[[i]] <- prediction_class

    errors[i] <- thresholded$score

    IRdisplay::display(msg_score)

    IRdisplay::display(msg_threshold)

    IRdisplay::display(config$slicer_2)

    IRdisplay::display(config$slicer_1)

}


msgformat2 <- 'CV Mean F1 : %s'
IRdisplay::display(sprintf(msgformat2,round(mean(errors,na.rm = TRUE),4)))

In [67]:
# Collect the per-fold class predictions of the manual run, one column per fold.
prd <- as.data.frame(preds)

In [69]:
# Name the columns after the folds that were actually predicted; the fixed
# 1:3 failed whenever fewer than three folds had been run.
colnames(prd) <- paste0('Fold',seq_along(preds))

In [70]:
# Majority vote per row over all fold columns. The votes are passed to fmode()
# as a single vector; fmode(Fold1,Fold2,Fold3) would have used the second and
# third fold as its grouping and weight arguments.
vote_matrix <- as.matrix(prd)
kk <- as.data.table(prd)
kk[,pred := vapply(seq_len(nrow(vote_matrix)),function(r) fmode(vote_matrix[r,]),character(1))]

In [72]:
# True classes of the test set.
obs <- as.factor(test_set[[config$label]])

In [77]:
# Predicted classes, forced onto the same levels as the truth so the metric
# also works when only one class was predicted.
prd2 <- factor(kk[['pred']],levels = levels(obs))

In [78]:
# F1 with the second factor level as the event.
f_meas_vec(truth = obs,estimate = prd2,event_level = 'second')