In [1]:
suppressPackageStartupMessages({
    library(data.table)
    library(tidyverse)
    library(tidymodels)
    library(treesnip)
    library(caret)
    library(keras)
    library(tensorflow)
    library(collapse)
})

"package 'caret' was built under R version 4.2.1"


In [2]:
df <- fread('salary.csv')

In [3]:
colnames(df) <- gsub('-','',colnames(df))

In [4]:
#df %>% select(salary) %>% filter(salary == '>50K')

In [5]:
label_encode <- function(x){
    case_when(x == '<=50K' ~ 0,
              x == '>50K' ~ 1)
}

In [6]:
sparse_data <- function(df,sparse_percentage = 0.02){
    
    df_new <- copy(df) 
    setDT(df_new)
    categorical_names <- df_new %>% purrr::discard(is.numeric) %>% colnames

    for(i in categorical_names){
        features <- df_new[,unique(.SD),.SDcols = i] %>% pull
        
        for(j in features){
            
            condition_format <- 'df_new[,length(%s[%s == "%s"])/.N < %s]'
            condition_command <- sprintf(condition_format,i,i,j,sparse_percentage)
        
            condition <- eval(parse(text = condition_command))
            
            if(condition){
                sparse_format <- 'df_new[%s == "%s", %s := "other"]'
                sparse_command <- sprintf(sparse_format,i,j,i)
                eval(parse(text = sparse_command))
            } 
        }
    }
    df_new
}

In [7]:
get_scaling_factors <- function(df){
    setDT(df)
    factors <- lapply(df %>% keep(is.numeric),function(x) list(min = min(x,na.rm = T),max = max(x,na.rm = T)))
    factors
}      

In [8]:
scale_data <- function(df,scaling_factors,reverse = FALSE){
    
    data <- copy(df)
    setDT(data)
    for(i in names(scaling_factors)){
        factors <- scaling_factors[[i]]
        if(reverse){
            data[,(i) := lapply(.SD,function(x) (x*(factors[['max']] - factors[['min']]) + factors[['min']])),.SDcols = i]
        }else{
            data[,(i) := lapply(.SD,function(x) (x - factors[['min']]) / (factors[['max']] - factors[['min']])),.SDcols = i]
            data[,(i) := lapply(.SD,function(x) fifelse(is.na(x),-1,x)),.SDcols = i]
        }  
    }
                                
    catcols <- df %>% purrr::discard(is.numeric) %>% colnames
    out <- list(scaling_factors = scaling_factors,data = data,cat_cols = catcols)
    return(out)
} 

In [9]:
clean_test_set <- function(scaled_data,categorical_cols,distinct_values_on_train){
    df <- copy(scaled_data[['data']])
    
    make_paste <- function(x){
    wrapped <- sapply(x,function(x) paste0('"',x,'"'))
    paste0(wrapped,collapse = ',')
    }       
   
    
    for(i in categorical_cols){
        distincts <- distinct_values_on_train[[i]]
        distincts <- make_paste(distincts)
        command_format <- "df[! %s %%in%% c(%s), %s := 'Missing']"
        command <- sprintf(command_format,i,distincts,i)
        eval(parse(text = command))
    }
                      
    scaled_data[['data']] <- df
                      
    scaled_data
}

In [10]:
get_distincts <- function(scaled_data){
    df <- copy(scaled_data[['data']])
    
    catcols <- scaled_data[['cat_cols']]
    
    distincts <- list()
    for(i in catcols){
        distinct_values <- df[,unique(.SD),.SDcols = i] %>% pull %>% as.character
        distincts[[i]] <- c(distinct_values,'Missing')
        df[,(i) := lapply(.SD,function(x) as.character(x)),.SDcols = i]
        df[,(i) := lapply(.SD,function(x) ifelse(is.na(x),'Missing',x)),.SDcols = i]
        df[,(i) := lapply(.SD,function(x) factor(x,levels = distincts[[i]])),.SDcols = i]
    }
    
    

    scaled_data[['cat_distincts']] <- distincts
    
    scaled_data[['data']] <- df
                          
    scaled_data
}

In [11]:
fetch_test_levels_to_train <- function(scaled_data,categorical_cols,distinct_values_on_train){
    df <- copy(scaled_data[['data']])
    
    for(i in categorical_cols){
        distincts <- distinct_values_on_train[[i]]
        df[,(i) := lapply(.SD,function(x) factor(x,levels = distincts)),.SDcols = i]
    }
   
    scaled_data[['data']] <- df
    scaled_data
}

In [12]:
dummy_data <- function(scaled_data){
    df <- copy(scaled_data[['data']])
    df_dummied <- recipe(df) %>% step_dummy(all_nominal()) %>% prep %>% bake(new_data = NULL)
    scaled_data[['data']] <- df_dummied
    scaled_data
}

In [13]:
make_frame <- function(df,label,test = FALSE,train_frame = NULL,sparse_perc = 0.03){
    dt <- copy(df)
    setDT(dt)
    if(!test){
        target_ <- dt[[label]]
        
        dt[,(label) := NULL]
    }
    sparsed <- sparse_data(dt,sparse_percentage = sparse_perc)
    gc()
    base::message('Data sparsed.')
    
    if(!test){
        scl <- get_scaling_factors(sparsed)
    }else{
        scl <- train_frame[['scaling_factors']]
    }
    gc()
    base::message('Scaling factors calculated.')
    base::message('Missing values labeled.')
    scld <- scale_data(sparsed,scl)
    gc()
    base::message('Data scaled.')
    
    if(test){
        scld <- clean_test_set(scld,train_frame[['cat_cols']],train_frame[['cat_distincts']])
        base::message('Unseen values removed from test set.')
        scld <- fetch_test_levels_to_train(scld,train_frame[['cat_cols']],train_frame[['cat_distincts']])
        base::message('Train & test set nominal levels fetched.')
    }
    
    if(!test){
    distincted <- get_distincts(scld)
    gc()
    base::message('Got distinct values for nominals.')
    }else{
        distincted <- scld
    }
    
    dummied <- dummy_data(distincted)
    gc()
    base::message('Data dummied.')
    
    if(!test){
        dummied[['label_to_keras']] <- target_# %>% label_encode %>% keras::to_categorical()
        dummied[['label']] <- target_ #%>% label_encode
    }
    
    dummied[['data']] <- dummied[['data']] %>% as.matrix
    base::message('All done !')
    dummied
}

In [14]:
data_sl <- data.table()

for(i in c(LETTERS[1:10],letters[1:20])){
    charmi <- sample(c(TRUE,FALSE),size = 1)
    ortsec <- sample(200:500,size = 1)
    sdsec <- sample(100:1000,size = 1)
    if(charmi){
        data_sl[,(i) := sample(c(LETTERS,NA),size = 2e5,replace = T)]
    }else{
        data_sl[,(i) := sample(c(rnorm(n = 2e2,mean = ortsec,sd = sdsec),NA),size = 2e5,replace = T)]
    }
}

data_sl[,default := sample(c('No','Yes'),size = 2e5,prob = c(.93,.07),replace = T)]

In [15]:
set.seed(571)
#dk <- copy(iris)
traindex <- createDataPartition(df[['salary']],p = .8,times = 1,list = FALSE) %>% as.numeric
train_set <- df[traindex,]
test_set <- df[-traindex,]
#train_set <- dk[traindex,]
#test_set <- dk[-traindex,] %>% select(-default) %>% filter(student != 'No')

In [16]:
train_set %>% select(salary) %>% table %>% prop.table

salary
    <=50K      >50K 
0.7591846 0.2408154 

In [17]:
test_set %>% select(salary) %>% table %>% prop.table

salary
    <=50K      >50K 
0.7592138 0.2407862 

In [18]:
train_ok <- train_set %>% make_frame(label = 'salary')
test_ok <- test_set %>% select(-salary) %>% make_frame(label = 'salary',test = T,train_frame = train_ok)

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Got distinct values for nominals.

Data dummied.

All done !

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Unseen values removed from test set.

Train & test set nominal levels fetched.

Data dummied.

All done !



In [19]:
f1 <- function(y_true, y_pred){
    y_pred = k_round(y_pred)
    
    tp = k_sum(k_cast(y_true*y_pred, 'float'), axis=1)
    tn = k_sum(k_cast((1-y_true)*(1-y_pred), 'float'), axis=1)
    fp = k_sum(k_cast((1-y_true)*y_pred, 'float'), axis=1)
    fn = k_sum(k_cast(y_true*(1-y_pred), 'float'), axis=1)

    p = tp / (tp + fp + k_epsilon())
    r = tp / (tp + fn + k_epsilon())

    f1 = 2*p*r / (p+r+k_epsilon())
    f1 = tf$where(tf$math$is_nan(f1), tf$zeros_like(f1), f1)
    return(k_mean(f1,axis = 1))
}

f1_loss <- function(y_true, y_pred){
    #y_pred = k_round(y_pred)
    tp = k_sum(k_cast(y_true*y_pred, 'float'), axis=1)
    tn = k_sum(k_cast((1-y_true)*(1-y_pred), 'float'), axis=1)
    fp = k_sum(k_cast((1-y_true)*y_pred, 'float'), axis=1)
    fn = k_sum(k_cast(y_true*(1-y_pred), 'float'), axis=1)

    p = tp / (tp + fp + k_epsilon())
    r = tp / (tp + fn + k_epsilon())

    f1 = 2*p*r / (p+r+k_epsilon())
    f1 = tf$where(tf$math$is_nan(f1), tf$zeros_like(f1), f1)
    return(1 - k_mean(f1,axis = 1))
}

In [20]:
folds_ <- caret::createFolds(train_ok[['label']],k = 5)

folds <- 1:length(folds_)

In [21]:
input_neurons <- ncol(train_ok[['data']][folds_[[1]],]) 
output_neurons <- 2

nofsample <- nrow(train_ok[['data']][folds_[[1]],])

alpha <- 2

In [22]:
numofneuron <- nofsample / (alpha * (input_neurons + output_neurons))

numofneuron <- numofneuron - output_neurons

In [23]:
numofneuron

In [24]:
layer_dist1 <- base::seq(from = numofneuron,to = output_neurons,length.out = 5)
layer_dist <- layer_dist1[-1]/base::sum(layer_dist1[-1])*numofneuron
layer_disto <- layer_dist %>% ceiling %>% .[1:3]

layer_disto

In [25]:
seed <- 1923
preds_keras <- list()
errors <- c()

for(i in folds){
    train_indices <- setdiff(folds,i) 
    test_indices <- i
    
    train_x <- train_ok[['data']][unlist(folds_[train_indices]),] %>% na.omit
    test_x <- train_ok[['data']][unlist(folds_[test_indices]),] %>% na.omit
    
    target_data <- test_ok[['data']]
    
    train_y <- train_ok[['label']][unlist(folds_[train_indices])]
    test_y <- train_ok[['label']][unlist(folds_[test_indices])]
    
    train_y2 <- train_y %>% label_encode %>% as.matrix %>% to_categorical
    test_y2 <- test_y %>% label_encode %>% as.matrix %>% to_categorical
    
    tf$random$set_seed(seed = seed)

    reticulate::py_set_seed(seed = seed)

    model <- keras_model_sequential()
    

    model %>%
    layer_dense(units = ceiling(numofneuron), activation = 'relu', input_shape = c(input_neurons)) %>%
    #layer_dense(units = layer_disto[7], activation = 'relu') %>%
    layer_dense(units = c(output_neurons),activation = 'softmax')
    
    model %>% compile(
      loss = f1_loss,#f1_loss,
      optimizer =optimizer_rmsprop(learning_rate = 0.01),
      metrics = c(f1)
    )
    
    wgh <- sum(test_y == '<=50K')/sum(test_y == '>50K')
    
    history <- model %>% fit(
      train_x, train_y2, 
      epochs = 1000, batch_size = 1024, 
      validation_data = list(test_x,test_y2),
      callbacks = list(callback_early_stopping(
                        monitor = "val_loss",
                        patience = 100,
                        verbose = 1,
                        mode = "auto",
                        restore_best_weights = TRUE
                        )
    ),
    class_weight = list('0' = sum(test_y == '>50K'),'1' = sum(test_y == '<=50K'))
    )
    
    preds <- model %>% predict(test_x)  %>% .[,2]
    
    # fold thresholding 
    thresholds <- data.table()
    for(j in seq(from = 0.01,to = 0.9,by = 0.01)){
        obs <- test_y %>% label_encode
        prd <- +(preds >= j)
        if(sum(prd) == 0){
            next
        }
        f1_clc <- f_meas_vec(truth = factor(obs,levels = 0:1),estimate = factor(prd,levels = 0:1),event_level = 'second')
        fold_row <- data.table(threshold = j,f1 = f1_clc)
        thresholds <- rbindlist(list(thresholds,fold_row))
    }
    
    best_threshold <- thresholds %>% filter(f1 == max(f1)) %>% select(threshold) %>% pull %>% .[1]
    #best_threshold <- 0.5
    error <- thresholds %>% filter(f1 == max(f1)) %>% select(f1) %>% pull %>% .[1]
    #error <- accuracy_vec(truth = factor(obs,levels = 0:2),estimate = factor(preds,levels = 0:2))#,event_level = 'second')
    IRdisplay::display('**************----------**************')
    IRdisplay::display(sprintf('Fold %s F1 : %s',i,round(error,2)))
    errors[i] <- error
    IRdisplay::display(sprintf('Best threshold for Fold %s is : %s',i,best_threshold))
    
    
    
    preds_target <- model %>% predict(target_data) %>% .[,2]
    
    preds_target <- +(preds_target >= best_threshold)
    
    preds_keras[[i]] <- preds_target
    
    rm(model)
    gc()
}
IRdisplay::display('**************----------**************')
IRdisplay::display(sprintf('CV Mean F1 : %s',mean(errors,na.rm = T)))

Loaded Tensorflow version 2.10.0-dev20220614



In [26]:
krs <- preds_keras %>% as.data.table

In [27]:
colnames(krs) <- paste0('Fold',1:length(folds))

In [28]:
pmin(0,1)

In [29]:
krs <- krs %>% rowwise %>% mutate(prd = pmin(fmode(c(Fold1,Fold2,Fold3,Fold4,Fold5)),1L)) %>% as.data.table

In [30]:
#krs[,prd := +(prd >= 0.5)]

In [31]:
df[-traindex,] %>% select(salary) %>% table %>% prop.table

salary
    <=50K      >50K 
0.7592138 0.2407862 

In [32]:
krs %>% select(prd) %>% table %>% prop.table

prd
        0         1 
0.7383292 0.2616708 

In [33]:
test_set%>% select(salary) %>%
transmute(obs = label_encode(salary)) %>%
bind_cols(krs %>% select(prd)) %>%
f_meas(truth = factor(obs,levels = 0:1),estimate = factor(prd,levels = 0:1),event_level = 'second')

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
f_meas,binary,0.690709


In [36]:
errors <- c()
preds_cb <- list()
set.seed(1)

rcp <- recipe(salary ~ .,data = train_set) %>% step_string2factor(all_nominal_predictors())


spc <- boost_tree(learn_rate = 0.01) %>%
set_mode('classification') %>%
#set_engine('xgboost',scale_pos_weight = sqrt(ww)) #%>%
set_engine('catboost',auto_class_weights = 'SqrtBalanced',custom_loss = 'F1:use_weights=False',eval_metric = 'F1:use_weights=False') 


wf <- workflow() %>% add_recipe(rcp) %>% add_model(spc)

rsmpl <- vfold_cv(train_set,strata = salary,v = 5)

ctrl <- control_bayes(verbose = T,event_level = 'second')

metrk <- yardstick::metric_set(f_meas)

#tuned <- tune_bayes(wf,resamples = rsmpl,iter = 5,metrics = metrk,control = ctrl)

#best_param <- tuned %>% select_best('f_meas')

#wf <- wf %>% finalize_workflow(best_param)

for(i in folds){
    train_indices <- setdiff(folds,i) 
    test_indices <- i
    
    #train_x <- train_ok[['data']][unlist(folds_[train_indices]),] %>% na.omit
    #test_x <- train_ok[['data']][unlist(folds_[test_indices]),] %>% na.omit
    train_x <- train_set[unlist(folds_[train_indices]),] %>% select(-salary)
    test_x <- train_set[unlist(folds_[test_indices]),] %>% select(-salary)
    target_data <- test_set %>% as.data.table 
    
    train_y <- train_set[unlist(folds_[train_indices])] %>% select(salary) %>% pull
    test_y <- train_set[unlist(folds_[test_indices])] %>% select(salary) %>% pull
    #train_y <- train_ok[['label']][unlist(folds_[train_indices])]
    #test_y <- train_ok[['label']][unlist(folds_[test_indices])]
    
    train_df <- train_x %>% as.data.table %>% mutate(salary = train_y)
    test_df <- test_x %>% as.data.table %>% mutate(salary = test_y)
    
    
    ww <- train_df %>% summarise(length(salary[salary == '<=50K'])/length(salary[salary == '>50K'])) %>% pull
       
    mod <- wf %>% fit(train_df)
    
    prds <- mod %>% predict(test_df,type='prob') %>% .[,2] %>% pull
    
    thresholds <- data.table()
    for(j in seq(from = 0.03,to = 0.7,by = 0.01)){
        obs <- test_df %>% select(salary) %>% pull %>% label_encode
        prd <- +(prds >= j)
        if(sum(prd) == 0){
            next
        }
        f1_clc <- f_meas_vec(truth = factor(obs,levels = 0:1),estimate = factor(prd,levels = 0:1),event_level = 'second')
        fold_row <- data.table(threshold = j,f1 = f1_clc)
        thresholds <- rbindlist(list(thresholds,fold_row))
    }
    
    best_threshold <- thresholds %>% filter(f1 == max(f1)) %>% select(threshold) %>% pull %>% .[1]
    error <- thresholds %>% filter(f1 == max(f1)) %>% select(f1) %>% pull %>% .[1]
    
    #error <- prds %>% bind_cols(ts %>% select(salary)) %>%
    #f_meas(truth = .pred_class,estimate = factor(salary,levels = c('<=50K','>50K')),event_level = 'second') %>%
    #select(.estimate) %>%
    #pull 
    
    IRdisplay::display('**************----------**************')
    IRdisplay::display(sprintf('Fold %s F1 : %s',i,round(error,2)))
    errors[i] <- error
    IRdisplay::display(sprintf('Best threshold for Fold %s is : %s',i,best_threshold))
    
    prds_targ <- mod %>% predict(target_data,type = 'prob') %>% .[,2] %>% pull
    
    prds_targ <- +(prds_targ >= best_threshold)
    
    preds_cb[[i]] <- prds_targ
}


IRdisplay::display('**************----------**************')
IRdisplay::display(sprintf('CV Mean F1 : %s',mean(errors,na.rm = T)))

Parameter 'cat_features' is meaningless because column types are taken from data.frame.
Please, convert categorical columns to factors manually.
Parameter 'cat_features' is meaningless because column types are taken from data.frame.
Please, convert categorical columns to factors manually.
Parameter 'cat_features' is meaningless because column types are taken from data.frame.
Please, convert categorical columns to factors manually.
Parameter 'cat_features' is meaningless because column types are taken from data.frame.
Please, convert categorical columns to factors manually.
Parameter 'cat_features' is meaningless because column types are taken from data.frame.
Please, convert categorical columns to factors manually.


In [37]:
cb <- preds_cb %>% as.data.table

In [38]:
colnames(cb) <- paste0('Fold',1:length(folds))

In [39]:
cb <- cb %>% rowwise %>% mutate(prd = fmode(c(Fold1,Fold2,Fold3,Fold4,Fold5))) %>% as.data.table

In [88]:
test_set %>%
select(salary) %>%
bind_cols(cb %>% select(prd)) %>%
f_meas(truth = factor(ifelse(salary == '<=50K',0,1),levels= 0:1),estimate = factor(prd,levels = 0:1),event_level = 'second')

ERROR: Error in select(., salary): 'test_set' nesnesi bulunamadı


In [None]:
?scale_pos_weight()

In [32]:
library(catboost)

In [37]:
library(xgboost)


Attaching package: 'xgboost'


The following object is masked from 'package:dplyr':

    slice




In [44]:
mod %>% extract_fit_engine -> md

In [93]:
x <- iris[,-5] 
y <- iris[,5] %>% as.numeric - 1

In [94]:
dt <- xgb.DMatrix(data = as.matrix(x),label = y)

In [95]:
kk <- catboost::catboost.load_pool(data = x,label = y)

In [97]:
#mod <- xgb.train(data = dt,nrounds = 2,num_class = 3,objective = 'multi:softprob')

In [130]:
prm <- list(learning_rate = 0.01,iterations = 200,verbose = 1,loss_function = 'MultiClass',eval_metric = 'TotalF1')

In [131]:
mod_cb <- catboost.train(learn_pool = kk,test_pool = kk,params = prm)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.0:	learn: 0.9666366	test: 0.9666366	best: 0.9666366 (0)	total: 707us	remaining: 141ms
1:	learn: 0.9733227	test: 0.9733227	best: 0.9733227 (1)	total: 1.24ms	remaining: 123ms
2:	learn: 0.9666633	test: 0.9666633	best: 0.9733227 (1)	total: 1.92ms	remaining: 126ms
3:	learn: 0.9599840	test: 0.9599840	best: 0.9733227 (1)	total: 2.93ms	remaining: 143ms
4:	learn: 0.9666633	test: 0.9666633	best: 0.9733227 (1)	total: 3.54ms	remaining: 138ms
5:	learn: 0.9666633	test: 0.9666633	best: 0.9733227 (1)	total: 4.17ms	remaining: 135ms
6:	learn: 0.9533287	test: 0.9533287	best: 0.9733227 (1)	total: 4.82ms	remaining: 133ms
7:	learn: 0.9533287	test: 0.9533287	best: 0.9733227 (1)	total: 5.42ms	remaining: 130ms
8:	learn: 0.9533287	test: 0.9533287	best: 0.9733227 (1)	total: 6.03ms	remaining: 128ms
9:	learn: 0.9533287	test: 0.9533287	best: 0.9733227 (1)	total: 6.69ms	remaining: 127ms
10:	learn: 0.9533287	test: 0.95332

In [133]:
source('https://github.com/sametsoekel/nonsystematic_workspace/blob/master/space/cb_pipeline0816.r')

ERROR: Error in source("https://github.com/sametsoekel/nonsystematic_workspace/blob/master/space/cb_pipeline0816.r"): https://github.com/sametsoekel/nonsystematic_workspace/blob/master/space/cb_pipeline0816.r:8:1: beklenmeyen durum, '<'
7: 
8: <
   ^


In [84]:
attr(prds,'dim') <- c(150,3)

In [40]:
#mod <- catboost.train(kk,params = prm)