In [222]:
suppressPackageStartupMessages({
    library(data.table)
    library(tidyverse)
    library(tidymodels)
    library(treesnip)
    library(caret)
    library(keras)
    library(tensorflow)
    library(collapse)
})

In [223]:
# Scratch vector: 20 draws (with replacement) from {0, 1}. Unseeded, so the
# values differ between runs.
a <- sample(0:1, size = 20, replace = TRUE)

In [224]:
#' Encode "No"/"Yes" labels as 0/1.
#'
#' @param x Vector of labels (character or factor).
#' @return Unnamed numeric vector the same length as `x`: 0 for "No", 1 for
#'   "Yes", and NA for anything else (including NA).
label_encode <- function(x) {
    codes <- c(No = 0, Yes = 1)
    # Looking up by name gives NA for labels that are not "No" or "Yes".
    unname(codes[as.character(x)])
}

In [239]:
#' Collapse rare levels of every non-numeric column into "other".
#'
#' Works on a copy, so the caller's `df` is not modified.
#'
#' @param df A data.frame / data.table.
#' @param sparse_percentage Levels whose share of all rows (NA rows count in
#'   the denominator) is strictly below this value are replaced by "other".
#' @return A data.table copy of `df` with the rare levels replaced.
sparse_data <- function(df, sparse_percentage = 0.02) {

    df_new <- copy(df)
    setDT(df_new)
    n_rows <- nrow(df_new)
    categorical_names <- names(df_new)[!vapply(df_new, is.numeric, logical(1))]

    for (col in categorical_names) {
        values <- as.character(df_new[[col]])

        # table() ignores NA, so missing rows no longer inflate every level's
        # count (the old `x[x == "level"]` subsetting kept the NA rows too).
        shares <- table(values) / n_rows
        rare_levels <- names(shares)[as.vector(shares) < sparse_percentage]

        rare_rows <- which(values %in% rare_levels)
        if (length(rare_rows) > 0) {
            # Assign by reference instead of eval(parse(text = ...)), so level
            # names containing quotes or code cannot break or inject anything.
            set(df_new, i = rare_rows, j = col, value = "other")
        }
    }
    df_new
}

In [240]:
#' Compute min/max scaling factors for every numeric column.
#'
#' Unlike the previous version this does not call `setDT()` on `df`, so the
#' caller's object is never converted by reference.
#'
#' @param df A data.frame / data.table.
#' @return A named list with one entry per numeric column; each entry is
#'   `list(min = , max = )`, computed with NA values removed.
get_scaling_factors <- function(df) {
    columns <- as.list(df)
    numeric_columns <- columns[vapply(columns, is.numeric, logical(1))]
    lapply(numeric_columns, function(x) {
        list(min = min(x, na.rm = TRUE), max = max(x, na.rm = TRUE))
    })
}

In [241]:
#' Min-max scale (or un-scale) the columns named in `scaling_factors`.
#'
#' Forward mode maps each column to `(x - min) / (max - min)` and then labels
#' missing values as -1. Reverse mode applies `x * (max - min) + min` and does
#' not turn -1 back into NA.
#'
#' @param df A data.frame / data.table; it is copied, not modified.
#' @param scaling_factors Named list of `list(min = , max = )`, one entry per
#'   column to scale (the shape returned by `get_scaling_factors()`).
#' @param reverse If TRUE, undo the scaling instead of applying it.
#' @return A list with `scaling_factors` (as passed in), `data` (the
#'   transformed data.table) and `cat_cols` (names of the non-numeric columns
#'   of `df`).
scale_data <- function(df, scaling_factors, reverse = FALSE) {

    data <- copy(df)
    setDT(data)

    for (col in names(scaling_factors)) {
        if (!col %in% names(data)) {
            stop("Column '", col, "' from `scaling_factors` is not in `df`.",
                 call. = FALSE)
        }
        lower <- scaling_factors[[col]][['min']]
        upper <- scaling_factors[[col]][['max']]
        span <- upper - lower
        values <- data[[col]]

        if (reverse) {
            scaled <- values * span + lower
        } else {
            # A constant column has span 0; dividing by it gave NaN, which was
            # then relabelled as -1 (the missing marker). Divide by 1 instead,
            # so constant values scale to 0.
            divisor <- if (isTRUE(span == 0)) 1 else span
            scaled <- (values - lower) / divisor
            scaled <- fifelse(is.na(scaled), -1, scaled)
        }
        set(data, j = col, value = scaled)
    }

    catcols <- df %>% purrr::discard(is.numeric) %>% colnames
    list(scaling_factors = scaling_factors, data = data, cat_cols = catcols)
}

In [242]:
#' Blank out categorical values that are not in the supplied level lists.
#'
#' For every column in `categorical_cols`, values of `scaled_data[['data']]`
#' that are not in `distinct_values_on_train[[column]]` are set to NA.
#'
#' @param scaled_data List with a data.table in its `data` element.
#' @param categorical_cols Character vector of column names to clean.
#' @param distinct_values_on_train Named list of allowed values per column.
#' @return `scaled_data` with its `data` element replaced by the cleaned copy.
clean_test_set <- function(scaled_data, categorical_cols, distinct_values_on_train) {
    df <- copy(scaled_data[['data']])

    for (col in categorical_cols) {
        if (!col %in% names(df)) {
            stop("Column '", col, "' is not present in the data.", call. = FALSE)
        }
        known_values <- as.character(distinct_values_on_train[[col]])

        # Compare values directly instead of pasting them into code for
        # eval(parse()): values containing quotes can no longer break the
        # command or inject code.
        unseen_rows <- which(!(as.character(df[[col]]) %in% known_values))
        if (length(unseen_rows) > 0) {
            set(df, i = unseen_rows, j = col, value = NA)
        }
    }

    scaled_data[['data']] <- df

    scaled_data
}

In [289]:
#' Turn the categorical columns into factors with an explicit "Missing" level.
#'
#' For every column listed in `scaled_data[['cat_cols']]` the distinct values
#' (in order of first appearance) plus "Missing" become the factor levels, and
#' NA values are replaced by "Missing".
#'
#' @param scaled_data List with `data` (a data.table) and `cat_cols`
#'   (character vector of categorical column names).
#' @return `scaled_data` with `data` replaced by the converted copy and a new
#'   `cat_distincts` element: a named list of the levels used per column.
get_distincts <- function(scaled_data) {
    df <- copy(scaled_data[['data']])

    catcols <- scaled_data[['cat_cols']]

    distincts <- list()
    for (col in catcols) {
        values <- as.character(df[[col]])

        # unique() keeps "Missing" from appearing twice when the data already
        # contains that literal (duplicated levels make factor() fail); NA is
        # dropped because it is represented by the "Missing" level.
        col_levels <- unique(c(values[!is.na(values)], 'Missing'))
        distincts[[col]] <- col_levels

        values[is.na(values)] <- 'Missing'
        set(df, j = col, value = factor(values, levels = col_levels))
    }

    scaled_data[['cat_distincts']] <- distincts

    scaled_data[['data']] <- df

    scaled_data
}

In [315]:
#' Placeholder with no implementation yet.
#'
#' All three arguments are accepted and ignored.
#'
#' @return NULL.
fetch_test_levels_to_train <- function(scaled_data, categorical_cols, distinct_values_on_train) {
    NULL
}

In [317]:
# Rebind the scratch name `a` to the function object itself (no call is made).
a <- fetch_test_levels_to_train

In [318]:
# Print whatever `a` currently holds.
a

In [290]:
#' Dummy-encode every nominal column of `scaled_data[['data']]`.
#'
#' A recipe is built on a copy of the data, `step_dummy(all_nominal())` is
#' added, and the recipe is prepped and baked on that same data.
#'
#' @param scaled_data List with a `data` element.
#' @return `scaled_data` with `data` replaced by the baked result.
dummy_data <- function(scaled_data) {
    frame <- copy(scaled_data[['data']])
    dummy_recipe <- step_dummy(recipe(frame), all_nominal())
    fitted_recipe <- prep(dummy_recipe)
    scaled_data[['data']] <- bake(fitted_recipe, new_data = NULL)
    scaled_data
}

In [291]:
#' Run the full preprocessing pipeline and return a model-ready frame.
#'
#' Steps: drop the label (training only), collapse rare levels, min-max scale
#' numerics (missing values become -1), convert categoricals to factors with a
#' "Missing" level, dummy-encode, and convert the result to a matrix.
#'
#' In test mode the scaling factors and the factor levels are taken from
#' `train_frame`, so the dummy columns line up with the training matrix.
#' Before this fix the test frame derived its levels from its own order of
#' appearance, which could change or flip the dummy columns.
#'
#' @param df Input data.frame / data.table; it is copied, not modified.
#' @param label Name of the target column (only used when `test = FALSE`).
#' @param test TRUE when `df` is a test set to be processed with the
#'   parameters stored in `train_frame`.
#' @param train_frame Result of a previous `make_frame(test = FALSE)` call;
#'   required when `test = TRUE`.
#' @param sparse_perc Share threshold passed on to `sparse_data()`.
#' @return A list with `scaling_factors`, `data` (matrix), `cat_cols`,
#'   `cat_distincts` and, for training frames, `label` and `label_to_keras`
#'   (both hold the raw target vector).
make_frame <- function(df, label, test = FALSE, train_frame = NULL, sparse_perc = 0.03) {
    if (test && is.null(train_frame)) {
        # Without the training parameters nothing would be scaled and the
        # result would silently be wrong.
        stop("`train_frame` must be supplied when `test = TRUE`.", call. = FALSE)
    }

    dt <- copy(df)
    setDT(dt)
    if (!test) {
        target_ <- dt[[label]]

        dt[, (label) := NULL]
    }
    sparsed <- sparse_data(dt, sparse_percentage = sparse_perc)
    gc()
    base::message('Data sparsed.')

    if (!test) {
        scl <- get_scaling_factors(sparsed)
    } else {
        scl <- train_frame[['scaling_factors']]
    }
    gc()
    base::message('Scaling factors calculated.')
    base::message('Missing values labeled.')
    scld <- scale_data(sparsed, scl)
    gc()
    base::message('Data scaled.')

    if (test) {
        scld <- clean_test_set(scld, train_frame[['cat_cols']], train_frame[['cat_distincts']])
    }

    distincted <- get_distincts(scld)

    if (test) {
        # Re-level the test factors with the training levels. After
        # clean_test_set() every value is either a training level or
        # "Missing", so nothing is lost by re-levelling.
        train_levels <- train_frame[['cat_distincts']]
        aligned <- distincted[['data']]
        for (col in intersect(names(train_levels), names(aligned))) {
            col_levels <- unique(train_levels[[col]])
            col_levels <- col_levels[!is.na(col_levels)]
            set(aligned, j = col,
                value = factor(as.character(aligned[[col]]), levels = col_levels))
        }
        distincted[['data']] <- aligned
        distincted[['cat_distincts']] <- train_levels
    }
    gc()
    base::message('Got distinct values for nominals.')

    dummied <- dummy_data(distincted)
    gc()
    base::message('Data dummied.')

    if (!test) {
        # Both entries keep the raw target; encoding is left to the caller.
        dummied[['label_to_keras']] <- target_
        dummied[['label']] <- target_
    }

    dummied[['data']] <- as.matrix(dummied[['data']])
    base::message('All done !')
    dummied
}

In [292]:
# Build a synthetic 200,000-row table with 30 columns (A-J, a-t). Each column
# is randomly either categorical (letters plus NA) or numeric (values
# resampled from 200 normal draws plus NA). Unseeded, so it differs per run.
data_sl <- data.table()

for (i in c(LETTERS[1:10], letters[1:20])) {
    charmi <- sample(c(TRUE, FALSE), size = 1)    # TRUE -> categorical column
    ortsec <- sample(200:500, size = 1)           # mean for the normal draws
    sdsec <- sample(100:1000, size = 1)           # sd for the normal draws
    if (charmi) {
        data_sl[, (i) := sample(c(LETTERS, NA), size = 2e5, replace = TRUE)]
    } else {
        data_sl[, (i) := sample(c(rnorm(n = 2e2, mean = ortsec, sd = sdsec), NA), size = 2e5, replace = TRUE)]
    }
}

# Imbalanced binary target: about 93% 'No', 7% 'Yes'.
data_sl[, default := sample(c('No', 'Yes'), size = 2e5, prob = c(.93, .07), replace = TRUE)]

In [293]:
# Reproducible 66/34 split of ISLR::Default; the test set drops the target.
set.seed(12)
dk <- ISLR::Default
# seq_len() instead of 1:nrow(): it stays empty for a zero-row table.
traindex <- sample(seq_len(nrow(dk)), nrow(dk) * 0.66)
train_set <- dk[traindex, ]
test_set <- dk[-traindex, ] %>% select(-default)

In [294]:
# Run the first pipeline stages by hand on the training set: collapse rare
# levels, then compute the min/max factors from the result.
sprsd <- sparse_data(train_set)
scl <- get_scaling_factors(sprsd)

# Apply the scaling; the result is a list (scaling_factors, data, cat_cols).
scld <- scale_data(sprsd,scaling_factors = scl)

In [295]:
# Preprocess the training set, then the test set with the training parameters.
train_ok <- train_set %>% make_frame(label = 'default')
test_ok <- test_set %>% make_frame(label = 'default', test = TRUE, train_frame = train_ok)

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Got distinct values for nominals.

Data dummied.

All done !

Data sparsed.

Scaling factors calculated.

Missing values labeled.

Data scaled.

Got distinct values for nominals.

Data dummied.

All done !



In [299]:
#' F1 metric for keras, computed on rounded predictions.
#'
#' Predictions are rounded with `k_round()`, then true positives, false
#' positives and false negatives are summed with `k_sum(axis = 1)`.
#' NOTE(review): with one-hot targets this presumably sums over the batch and
#' yields one F1 per class, averaged at the end - confirm the axis convention
#' of the keras R backend.
#'
#' @param y_true Tensor of true labels.
#' @param y_pred Tensor of predictions.
#' @return Tensor: mean of the F1 values, with NaN entries replaced by 0.
f1 <- function(y_true, y_pred) {
    y_pred <- k_round(y_pred)

    tp <- k_sum(k_cast(y_true * y_pred, 'float'), axis = 1)
    fp <- k_sum(k_cast((1 - y_true) * y_pred, 'float'), axis = 1)
    fn <- k_sum(k_cast(y_true * (1 - y_pred), 'float'), axis = 1)

    # k_epsilon() keeps the denominators away from zero.
    p <- tp / (tp + fp + k_epsilon())
    r <- tp / (tp + fn + k_epsilon())

    score <- 2 * p * r / (p + r + k_epsilon())
    score <- tf$where(tf$math$is_nan(score), tf$zeros_like(score), score)
    k_mean(score, axis = 1)
}

#' F1-based loss for keras: 1 minus the mean F1.
#'
#' Same computation as an F1 score, but the predictions are deliberately not
#' rounded, so the raw predicted values enter the sums directly.
#' NOTE(review): `k_sum(axis = 1)` presumably sums over the batch - confirm the
#' axis convention of the keras R backend.
#'
#' @param y_true Tensor of true labels.
#' @param y_pred Tensor of predictions.
#' @return Tensor: `1 - mean(F1)`, with NaN F1 entries replaced by 0 first.
f1_loss <- function(y_true, y_pred) {
    tp <- k_sum(k_cast(y_true * y_pred, 'float'), axis = 1)
    fp <- k_sum(k_cast((1 - y_true) * y_pred, 'float'), axis = 1)
    fn <- k_sum(k_cast(y_true * (1 - y_pred), 'float'), axis = 1)

    # k_epsilon() keeps the denominators away from zero.
    p <- tp / (tp + fp + k_epsilon())
    r <- tp / (tp + fn + k_epsilon())

    score <- 2 * p * r / (p + r + k_epsilon())
    score <- tf$where(tf$math$is_nan(score), tf$zeros_like(score), score)
    1 - k_mean(score, axis = 1)
}

In [300]:
# Five folds of row indices, built by caret from the training labels.
folds_ <- caret::createFolds(train_ok[['label']], k = 5)

# Fold numbers to iterate over; seq_along() stays empty for an empty list.
folds <- seq_along(folds_)

In [301]:
# Inputs for the layer-size heuristic used further down.
# The column count does not depend on the fold, so read it from the full
# matrix; subsetting rows first drops to a vector for a one-row fold, which
# made ncol()/nrow() return NULL.
input_neurons <- ncol(train_ok[['data']])
output_neurons <- 2

# Number of rows in the first fold.
nofsample <- length(folds_[[1]])

alpha <- 4

In [302]:
# Hidden-neuron budget: samples / (alpha * (inputs + outputs)), minus the
# output neurons.
numofneuron <- (nofsample / (alpha * (input_neurons + output_neurons))) - output_neurons

In [303]:
# Five evenly spaced widths from the neuron budget down to the output size.
layer_dist1 <- base::seq(from = numofneuron, to = output_neurons, length.out = 5)
# Drop the first width and rescale the rest so they sum to the budget.
layer_dist <- (layer_dist1[-1] / base::sum(layer_dist1[-1])) * numofneuron
# Round up and keep the first three values as hidden-layer sizes.
layer_disto <- ceiling(layer_dist)[1:3]

layer_disto

In [304]:
# Cross-validate a small dense network. For every fold: train on the other
# folds, pick the probability threshold that maximises F1 on the held-out
# fold, then predict the external test matrix with that threshold.
seed <- 1453
# Preallocate the results instead of growing them inside the loop.
preds_keras <- vector("list", length(folds))
errors <- rep(NA_real_, length(folds))

# The external test matrix is the same for every fold.
target_data <- test_ok[['data']]

for (i in folds) {
    train_indices <- setdiff(folds, i)
    test_indices <- i

    train_rows <- unlist(folds_[train_indices])
    test_rows <- unlist(folds_[test_indices])

    # Drop incomplete rows from features AND labels together. The previous
    # na.omit() on the feature matrix alone could leave labels misaligned.
    train_rows <- train_rows[complete.cases(train_ok[['data']][train_rows, , drop = FALSE])]
    test_rows <- test_rows[complete.cases(train_ok[['data']][test_rows, , drop = FALSE])]

    train_x <- train_ok[['data']][train_rows, , drop = FALSE]
    test_x <- train_ok[['data']][test_rows, , drop = FALSE]

    train_y <- train_ok[['label']][train_rows]
    test_y <- train_ok[['label']][test_rows]

    # num_classes = 2 keeps two columns even if a fold holds a single class.
    train_y2 <- to_categorical(as.matrix(label_encode(train_y)), num_classes = 2)
    test_y2 <- to_categorical(as.matrix(label_encode(test_y)), num_classes = 2)

    tf$random$set_seed(seed = seed)

    reticulate::py_set_seed(seed = seed)

    model <- keras_model_sequential()

    model %>%
        layer_dense(units = layer_disto[1], activation = 'relu', input_shape = c(input_neurons)) %>%
        layer_dense(units = layer_disto[2], activation = 'relu') %>%
        layer_dense(units = layer_disto[3], activation = 'relu') %>%
        layer_dense(units = 2, activation = 'softmax')

    model %>% compile(
        loss = f1_loss,
        optimizer = optimizer_rmsprop(learning_rate = 0.001),
        metrics = c(f1)
    )

    # NOTE(review): the class weight is derived from the held-out fold's
    # labels, not the training labels - confirm this is intended.
    wgh <- sqrt(sum(test_y == 'Yes') / sum(test_y == 'No'))

    history <- model %>% fit(
        train_x, train_y2,
        epochs = 200, batch_size = 128,
        validation_data = list(test_x, test_y2),
        callbacks = list(callback_early_stopping(
            monitor = "val_loss",
            patience = 10,
            verbose = 1,
            mode = "auto",
            restore_best_weights = TRUE
        )),
        class_weight = list('0' = wgh^2, '1' = 1)
    )

    # Predicted probability of the second class ('Yes').
    preds <- model %>% predict(test_x) %>% .[, 2]

    # fold thresholding
    obs <- label_encode(test_y)
    threshold_rows <- lapply(seq(from = 0.03, to = 0.7, by = 0.01), function(j) {
        prd <- +(preds >= j)
        # Skip thresholds that predict no positives at all.
        if (sum(prd) == 0) {
            return(NULL)
        }
        f1_clc <- f_meas_vec(truth = factor(obs, levels = 0:1),
                             estimate = factor(prd, levels = 0:1),
                             event_level = 'second')
        data.table(threshold = j, f1 = f1_clc)
    })
    # rbindlist() skips the NULL entries; bind once instead of per iteration.
    thresholds <- rbindlist(threshold_rows)

    # First threshold with the highest F1; which.max() ignores NA scores.
    best_row <- if (nrow(thresholds) > 0) which.max(thresholds[['f1']]) else integer(0)
    if (length(best_row) == 0) {
        # No usable threshold: fall back instead of failing on an empty table.
        warning(sprintf('Fold %s: no threshold produced a valid F1; using 0.5.', i),
                call. = FALSE)
        best_threshold <- 0.5
        error <- NA_real_
    } else {
        best_threshold <- thresholds[['threshold']][best_row]
        error <- thresholds[['f1']][best_row]
    }

    IRdisplay::display('**************----------**************')
    IRdisplay::display(sprintf('Fold %s F1 : %s', i, round(error, 2)))
    errors[i] <- error
    IRdisplay::display(sprintf('Best threshold for Fold %s is : %s', i, best_threshold))

    preds_target <- model %>% predict(target_data) %>% .[, 2]

    preds_target <- +(preds_target >= best_threshold)

    preds_keras[[i]] <- preds_target
}
IRdisplay::display('**************----------**************')
IRdisplay::display(sprintf('CV Mean F1 : %s', mean(errors, na.rm = TRUE)))

In [313]:
# Show the factor levels of the Species column.
levels(iris[['Species']])

In [305]:
# One column per fold, one row per test observation.
krs <- as.data.table(preds_keras)

In [306]:
# Name the columns Fold1..FoldK by reference. The count comes from the table
# itself, so it cannot disagree with the number of prediction columns.
setnames(krs, paste0('Fold', seq_along(krs)))

In [307]:
# Majority vote per row: the mode across all Fold* columns. Selecting by
# prefix means this keeps working when the number of folds changes.
krs <- krs %>%
    rowwise %>%
    mutate(prd = fmode(c_across(starts_with('Fold')))) %>%
    as.data.table

In [308]:
#krs[,prd := +(prd >= 0.5)]

In [309]:
# Class proportions of the target in the rows held out from training.
prop.table(table(select(dk[-traindex, ], default)))

default
        No        Yes 
0.96352941 0.03647059 

In [310]:
# F1 of the voted predictions on the held-out rows ('Yes' is the event).
# The factor conversion happens in the data frame first, so `truth` and
# `estimate` are plain column names, which is what f_meas() selects on.
dk[-traindex, ] %>%
    select(default) %>%
    transmute(obs = factor(+(default == 'Yes'), levels = 0:1)) %>%
    bind_cols(krs %>% select(prd)) %>%
    mutate(prd = factor(prd, levels = 0:1)) %>%
    f_meas(truth = obs, estimate = prd, event_level = 'second')

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
f_meas,binary,0.5171103


In [190]:
# Small reproducible toy table: 100 normal draws plus a random A/B/C category.
set.seed(1)
koko <- data.table(a = rnorm(n = 1e2, mean = 20, sd = 3), cat = sample(LETTERS[1:3], size = 1e2, replace = TRUE))

In [26]:
# Convert `cat` to a factor with levels A-D (by reference).
set(koko, j = 'cat', value = factor(koko[['cat']], levels = LETTERS[1:4]))

In [216]:
# Independent data.table copy of iris.
iris2 <- as.data.table(iris)

In [218]:
# Blank out the 'virginica' rows of Species (by reference).
set(iris2, i = which(iris2[['Species']] == 'virginica'), j = 'Species', value = NA)

In [220]:
# Print the preprocessed training frame.
train_ok

balance,income,student_Yes
0.09425117,0.2290072,1
0.00000000,0.6288760,0
0.00000000,0.2590000,1
0.44512797,0.1513121,1
0.09497977,0.5182950,0
0.16931352,0.7161124,0
0.55200908,0.7959084,0
0.00000000,0.8434516,0
0.37409301,0.6151015,0
0.14808346,0.5959889,0
