In [None]:
# Use pacman to install (when missing) and attach the packages needed for the analysis.
# pROC provides roc() and coords(), which ROC_function relies on further down.
install.packages('pacman')
library(pacman)
p_load(tidyverse,rio,tidymodels, keras, ggplot2, themis, yardstick, tensorflow, pROC)


rio installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘shape’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘Rcpp’, ‘diagram’, ‘lava’, ‘listenv’, ‘parallelly’, ‘prodlim’, ‘future’, ‘warp’, ‘iterators’, ‘lhs’, ‘DiceDesign’, ‘patchwork’, ‘globals’, ‘clock’, ‘gower’, ‘ipred’, ‘timeDate’, ‘furrr’, ‘slider’, ‘foreach’, ‘GPfit’, ‘modelenv’, ‘dials’, ‘hardhat’, ‘infer’, ‘modeldata’, ‘parsnip’, ‘recipes’, ‘rsample’, ‘tune’, ‘workflows’, ‘workflowsets’, ‘yardstick’



tidymodels installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘RcppTOML’, ‘here’, ‘png’, ‘config’, ‘tfautograph’, ‘reticulate’, ‘tensorflow’, ‘tfruns’, ‘zeallot’



keras installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘RANN’, ‘ROSE’



themis installed



In [None]:
# Load the data and drop the columns that are not predictors -------------------

train <- readRDS("train_final.rds")
train_sin_bog <- readRDS("train_final_sin_bog.rds")
test <- readRDS("test_final.rds")

# Lasso-based selection: coefficient table exported to CSV
coefs_lasso <- read_csv("coefs_lasso.csv")

# Names of the terms whose Lasso estimate is exactly zero; the recipe removes
# them later through step_rm(any_of(remove_lasso)).
remove_lasso <- coefs_lasso %>%
  filter(estimate == 0) %>%
  pull(term)

# Working data set: the training data without Bogotá, minus the listed
# non-predictor columns.
train_clas <- select(
  train_sin_bog,
  -c(Lp, Ingpcug, lIngpcug, train, Clase)
)

# Validation split for the training data without Bogotá.
# (Pobre is kept numeric here; a factor recoding with labels "si"/"no" was
# considered in the original but left disabled.)

set.seed(123)

# 15% of the rows are held out as the validation set used during fit().
validation_set <- sample_frac(train_clas, 0.15)

# Everything not in validation_set, matched on `id`.
validation_pre_train <- anti_join(train_clas, validation_set, by = "id")

# 15% of the remainder is held out for the final evaluate() call.
validation_test <- sample_frac(validation_pre_train, 0.15)

# The rest is the actual training data; `id` is dropped only here, so
# validation_set and validation_test still carry it.
validation_train <- select(
  anti_join(validation_pre_train, validation_test, by = "id"),
  -id
)

# Modelo ------------------------------------------------------------------

# Recipe (prep) -----------------------------------------------------------
# Preprocessing specification for the neural network inputs, declared on
# validation_train with Pobre as the outcome. Steps run in the order written,
# so later selectors only see the columns produced by earlier steps.

rec_prep_networks <-
  recipe(Pobre ~ ., data = validation_train) %>%
  # Dummy-encode Dominio (the outcome is explicitly excluded).
  step_dummy(all_of(c("Dominio")), -all_outcomes()) %>%
  # Interact each listed variable with every column whose name starts with
  # "Depto". NOTE(review): assumes those Depto* columns are already numeric at
  # this point - confirm against the input data.
  step_interact(terms = ~ P5130:starts_with("Depto")) %>%
  step_interact(terms = ~ Educ_avg:starts_with("Depto")) %>%
  step_interact(terms = ~ tasa_ocupados:starts_with("Depto")) %>%
  step_interact(terms = ~ tasa_inactivos:starts_with("Depto")) %>%
  step_interact(terms = ~ P5000:starts_with("Depto")) %>%
  step_interact(terms = ~ edad_pet:starts_with("Depto")) %>%
  # Drop the terms listed in remove_lasso; any_of() tolerates names that are
  # not present in the data.
  step_rm(any_of(remove_lasso)) %>%
  # Remove zero-variance predictors, then normalize the numeric predictors.
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  # Replace the numeric predictors with principal components; threshold = 0.8
  # is the fraction of total variance the retained components should cover
  # (per the recipes documentation for step_pca).
  step_pca(all_numeric_predictors(), threshold = 0.8)


# Train (prep) the recipe a single time and reuse it for every data set.
# The original re-ran prep(rec_prep_networks) before each bake(), repeating the
# same estimation (normalization, PCA, ...) on validation_train every time.
rec_networks_trained <- prep(rec_prep_networks)

# Bake `new_data` with the trained recipe and expand any remaining categorical
# columns into dummy variables (no intercept column).
#
# new_data:     data frame to preprocess.
# drop_outcome: when TRUE, remove the Pobre column before building the matrix.
# Returns a numeric design matrix with one row per row of `new_data`.
bake_to_matrix <- function(new_data, drop_outcome = TRUE) {
  baked <- bake(rec_networks_trained, new_data = new_data)
  if (drop_outcome) {
    baked <- select(baked, -Pobre)
  }

  design <- model.matrix(~ . - 1, data = baked)

  # With the default na.action, model.matrix() silently drops rows containing
  # NA; that would misalign the matrix with the outcome vectors and test ids.
  if (nrow(design) != nrow(new_data)) {
    stop(
      "model.matrix() returned ", nrow(design), " rows for ", nrow(new_data),
      " input rows; check for missing values after preprocessing.",
      call. = FALSE
    )
  }

  design
}

# Design matrices for training, test, validation (during fit) and final
# evaluation. The test matrix keeps every baked column (Pobre is not removed).
x_val <- bake_to_matrix(validation_train)
x_test <- bake_to_matrix(test, drop_outcome = FALSE)
x_val_set <- bake_to_matrix(validation_set)
x_val_test <- bake_to_matrix(validation_test)

# Outcome vectors matching the matrices above
y_val <- validation_train$Pobre
y_val_set <- validation_set$Pobre
y_val_test <- validation_test$Pobre

# Entrenamiento -----------------------------------------------------------

# Metrics tracked during training and evaluation. The names given here are the
# ones later used to look up values in the evaluate() result.
METRICS <- list(
  metric_binary_accuracy(name = "accuracy"),
  metric_precision(name = "precision"),
  metric_recall(name = "recall"),
  metric_auc(name = "auc")
)

# Training hyper-parameters
BATCH_SIZE <- 3000
EPOCHS <- 30


[1mRows: [22m[34m238[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): term
[32mdbl[39m (2): estimate, penalty

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [None]:
# Model architecture: two ReLU hidden layers of 150 units, each followed by
# 10% dropout, and a single sigmoid output unit for the binary outcome.
# Only the first layer declares the input width (the number of columns of
# x_val); the original repeated input_shape on the second dense layer, where
# the input size is already determined by the previous layer's output.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 150,
    activation = "relu",
    input_shape = ncol(x_val),
    kernel_initializer = initializer_random_uniform()
  ) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(
    units = 150,
    activation = "relu",
    kernel_initializer = initializer_random_uniform()
  ) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 1, activation = "sigmoid")

# Compile with Adam (learning rate 1e-3), binary cross-entropy loss and the
# metrics collected in METRICS.
model %>% compile(
  optimizer = optimizer_adam(learning_rate = 1e-3),
  loss = "binary_crossentropy",
  metrics = METRICS
)



In [None]:
# Training on the full (imbalanced) training matrix.
#
# `seed` is not an argument of keras' fit(); in the keras R package it is
# swallowed by `...` and has no effect, so the original `seed = 12` did not make
# the run reproducible. Seed the R, Python, NumPy and TensorFlow generators
# explicitly instead. disable_gpu = FALSE keeps the hardware setup unchanged.
# Note: this also resets R's own RNG state, so later R-side random draws differ
# from an unseeded run (but are now repeatable).
tensorflow::set_random_seed(12, disable_gpu = FALSE)

historia_modelo_basico <- model %>% fit(
  x = x_val,
  y = as.matrix(y_val),
  batch_size = BATCH_SIZE,
  # Use the shared constant rather than a second hard-coded 30.
  epochs = EPOCHS,
  validation_data = list(x_val_set, as.matrix(y_val_set)),
  verbose = 0
)

In [None]:
# Evaluate the fitted network on the held-out validation_test split and show
# the loss together with the compiled metrics.
results <- evaluate(model, x_val_test, y_val_test, verbose = 0)
results

In [None]:
# Compute the F1 score (harmonic mean of precision and recall) from the values
# returned by evaluate().
# The metrics are looked up by name through as.list(), which works whether
# `results` is a named numeric vector or a named list, and yields a bare number.
# With `results['precision']` the result inherited the misleading element name
# "precision".
f1_score <- with(
  as.list(results),
  2 * precision * recall / (precision + recall)
)
f1_score

##Undersampling!

In [None]:
# Distribution of the outcome in the training split (its mean is the share of
# rows with Pobre == 1).
summary(validation_train[["Pobre"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2075  0.0000  1.0000 

In [None]:
# Down-sampling recipe: Pobre is converted to a factor (levels 0, 1) because
# step_downsample() operates on a factor column; under_ratio = 1.5 is the
# majority-to-minority ratio requested after sampling.
down_rec <- validation_train %>%
  mutate(Pobre = factor(Pobre, levels = c(0, 1))) %>%
  recipe(Pobre ~ ., data = .) %>%
  step_downsample(Pobre, under_ratio = 1.5)

# Materialise the down-sampled training data and turn Pobre back into 0/1
# numbers so it can be fed to keras.
validation_train_under <- down_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  mutate(Pobre = as.numeric(as.character(Pobre)))

In [None]:
# Distribution of the outcome after down-sampling.
summary(validation_train_under[["Pobre"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0     0.0     0.0     0.4     1.0     1.0 

In [None]:
# Preprocess the down-sampled training data with the network recipe, drop the
# outcome, and expand remaining categorical columns into dummy variables
# (no intercept column).
x_val_under <- rec_prep_networks %>%
  prep() %>%
  bake(new_data = validation_train_under) %>%
  select(-Pobre) %>%
  model.matrix(~ . - 1, data = .)

# Outcome vector for the down-sampled training data
y_val_under <- validation_train_under[["Pobre"]]

In [None]:
# Training on the down-sampled data.
#
# `seed` is not an argument of keras' fit(); in the keras R package it is
# swallowed by `...` and has no effect, so the original `seed = 12` did not make
# the run reproducible. Seed the R, Python, NumPy and TensorFlow generators
# explicitly instead. disable_gpu = FALSE keeps the hardware setup unchanged.
tensorflow::set_random_seed(12, disable_gpu = FALSE)

# NOTE(review): `model` is the same object that was already fitted on the full
# training matrix; no re-initialisation is visible between the two fit() calls,
# so this continues training from those weights. If an independent
# "undersampling only" model is intended, rebuild and recompile it first.
historia_modelo_under <- model %>% fit(
  x = x_val_under,
  y = as.matrix(y_val_under),
  batch_size = BATCH_SIZE,
  epochs = EPOCHS,
  validation_data = list(x_val_set, as.matrix(y_val_set)),
  verbose = 0
)

In [None]:
# Evaluate the network after the down-sampled training run on the held-out
# validation_test split.
results_undersampling <- evaluate(model, x_val_test, y_val_test, verbose = 0)
results_undersampling

In [None]:
# Compute the F1 score (harmonic mean of precision and recall) for the
# down-sampled run. This overwrites the f1_score computed for the first run.
# The metrics are looked up by name through as.list(), which works whether the
# evaluate() result is a named numeric vector or a named list, and yields a bare
# number instead of a value misleadingly named "precision".
f1_score <- with(
  as.list(results_undersampling),
  2 * precision * recall / (precision + recall)
)
f1_score

In [None]:
# In addition to undersampling, the classification threshold can be tuned on
# the ROC curve.
# ROC function  -----------------------------------------------------------

#' Find the classification threshold closest to the top-left corner of the ROC
#' curve.
#'
#' @param predictions_real_df Data frame with a column `real` (observed class)
#'   and a column `predictions` (predicted score).
#' @return The first row of the pROC::coords() result for `x = "best"`; with the
#'   pROC version that produced the recorded output this is a data frame with
#'   columns `threshold`, `specificity` and `sensitivity`.
ROC_function <- function(predictions_real_df) {
  stopifnot(
    all(c("real", "predictions") %in% names(predictions_real_df))
  )

  # ROC curve. The calls are namespace-qualified because pROC is not among the
  # packages attached at the top of the file.
  curva_ROC <- pROC::roc(
    predictions_real_df$real,
    predictions_real_df$predictions
  )

  # Coordinates of the point closest to the top-left corner of the ROC curve
  ROC_thresh <- pROC::coords(
    curva_ROC,
    x = "best",
    best.method = "closest.topleft"
  )

  # "best" can return several tied thresholds; keep a single one so callers
  # comparing against `$threshold` get a scalar rather than a recycled vector.
  head(ROC_thresh, 1L)
}


In [None]:
# Class predictions from a boosting model, thresholded at roc_thresh_boost.
# NOTE(review): boost_final_fit and roc_thresh_boost are not defined anywhere in
# the visible part of this file - this cell looks carried over from another
# analysis and will fail unless those objects exist in the session.
# NOTE(review): the original comment said the prediction is on val_test, but the
# code predicts on `test`.
boost_predictions_class <- predict(boost_final_fit, test, type = "prob")$.pred_si %>%
  # bind_cols() on unnamed inputs yields the auto-generated names ...1 (the
  # probability) and ...2 (the id).
  bind_cols(test$id) %>%
  rename(c("id"="...2")) %>%
  # 1 when the probability reaches the threshold, 0 otherwise.
  mutate(pobre = ifelse(...1>=roc_thresh_boost$threshold, 1, 0)) %>%
  select(-c(...1))

In [None]:
# ROC: pair the network's predicted probabilities on validation_test with the
# observed labels, in the shape ROC_function() expects (`predictions`, `real`).
# Columns are named explicitly instead of relying on bind_cols() name repair
# (`...1`, `...2`). The original pipeline also ended in a bare `fct_relevel`,
# which was applied to the whole data frame rather than to a factor; forcats
# expects a factor or character vector there, and the factor levels are already
# fixed by factor(..., levels = c(0, 1)), so that stage is dropped.
predictions_real_df <- tibble(
  predictions = as.numeric(predict(model, x_val_test)),
  real = factor(y_val_test, levels = c(0, 1))
)

roc_thresh_nn <- ROC_function(predictions_real_df)
roc_thresh_nn

[1m[22mNew names:
[36m•[39m `` -> `...1`
[36m•[39m `` -> `...2`
Setting levels: control = 0, case = 1

Setting direction: controls < cases



threshold,specificity,sensitivity
<dbl>,<dbl>,<dbl>
0.4256816,0.8309064,0.8447266


In [None]:
# Build the submission: classify the test set with the ROC-derived threshold.

test_probs <- as.numeric(predict(model, x_test))

# x_test comes from model.matrix(), which drops rows with missing predictors
# under the default na.action; fail clearly if predictions and ids no longer
# line up instead of pairing them incorrectly.
if (length(test_probs) != length(test$id)) {
  stop(
    "Got ", length(test_probs), " predictions for ", length(test$id),
    " test ids; x_test and test are misaligned.",
    call. = FALSE
  )
}

# Columns are named explicitly instead of relying on bind_cols() name repair
# (`...1`, `...2`). `[1]` guards against a multi-row threshold result.
# Any NA probability stays NA in `pobre`; the original kept a disabled
# replace_na(list(pobre = 0)) for that case.
nn_predictions_class <- tibble(
  id = test$id,
  pobre = ifelse(test_probs >= roc_thresh_nn$threshold[1], 1, 0)
)


[1m[22mNew names:
[36m•[39m `` -> `...1`
[36m•[39m `` -> `...2`


In [None]:
# Write the id / pobre predictions to disk.
nn_predictions_class %>%
  write_csv("classification_neural_networks.csv")