In [None]:
# Usamos la función pacman para llamar las bibliotecas necesarias para el análisis
#install.packages('pacman')
library(pacman)
p_load(tidyverse,rio,tidymodels, keras, ggplot2, themis, yardstick, tensorflow)

In [None]:
# Load data and drop columns that are not predictors --------------------------

# Training sets (full and without Bogotá) and the out-of-sample test set
train <- readRDS("train_final.rds")
train_sin_bog <- readRDS("train_final_sin_bog.rds")
test <- readRDS("test_final.rds")

# Lasso-based variable selection: read the stored coefficient table
coefs_lasso <- read_csv("coefs_lasso.csv")

# Terms whose Lasso estimate is exactly zero are marked for removal
remove_lasso <- coefs_lasso %>%
  filter(estimate == 0) %>%
  pull(term)

# Regression frame: remove these five columns from the no-Bogotá training set
train_reg <- train_sin_bog %>%
  select(-all_of(c("Lp", "Ingpcug", "Pobre", "train", "Clase")))

# Hold-out splits for the training data without Bogotá ------------------------

set.seed(123)

# 15% of the rows are set aside as `validation_set`
validation_set <- sample_frac(train_reg, 0.15)

# Every row whose `id` was not drawn into `validation_set`
validation_pre_train <- train_reg %>%
  filter(!id %in% validation_set$id)

# 15% of the remaining rows become `validation_test`
validation_test <- sample_frac(validation_pre_train, 0.15)

# Whatever is left is the fitting set; `id` is dropped from this frame
validation_train <- validation_pre_train %>%
  filter(!id %in% validation_test$id) %>%
  select(-id)

# Model ---------------------------------------------------------------------

# Recipe (preprocessing specification) --------------------------------------
# Defines, without estimating it yet, the preprocessing applied before the
# network: `lIngpcug` is the outcome and every other column of
# `validation_train` is a predictor. Step order matters: interactions are
# built from the `Dominio` dummies, the Lasso-discarded terms are removed
# afterwards, and PCA runs last on the normalized numeric predictors.

rec_prep_networks <-
  recipe(lIngpcug ~ ., data = validation_train) %>%
  # Dummy-encode `Dominio`
  step_dummy(all_of(c("Dominio")), -all_outcomes()) %>%
  # Interact each of these six variables with every column whose name
  # starts with "Dominio" (i.e. the dummies created above)
  step_interact(terms = ~ P5130:starts_with("Dominio")) %>%
  step_interact(terms = ~ Educ_avg:starts_with("Dominio")) %>%
  step_interact(terms = ~ tasa_ocupados:starts_with("Dominio")) %>%
  step_interact(terms = ~ tasa_inactivos:starts_with("Dominio")) %>%
  step_interact(terms = ~ P5000:starts_with("Dominio")) %>%
  step_interact(terms = ~ edad_pet:starts_with("Dominio")) %>%
  # Drop the terms listed in `remove_lasso`; `any_of()` tolerates names
  # that are not present in the data
  step_rm(any_of(remove_lasso)) %>%
  # Remove zero-variance predictors
  step_zv(all_predictors()) %>%
  # Center and scale the numeric predictors
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  # Replace the numeric predictors with principal components, using a
  # variance threshold of 0.8
  step_pca(all_numeric_predictors(), threshold = 0.8)


# Design matrices -----------------------------------------------------------

# Estimate the recipe once on its training data (`validation_train`) and
# reuse the trained object for every data set, instead of re-running
# `prep()` before each `bake()`.
rec_trained <- prep(rec_prep_networks)

# Apply a trained recipe to `new_data` and return a numeric design matrix.
#
# trained_recipe: a recipe already estimated with `prep()`.
# new_data:       data frame to preprocess.
#
# The outcome `lIngpcug` is dropped when present (`any_of()` makes this a
# no-op for data that does not carry it), and `model.matrix(~ . - 1)`
# converts the baked tibble into a matrix without an intercept column.
bake_to_matrix <- function(trained_recipe, new_data) {
  trained_recipe %>%
    bake(new_data = new_data) %>%
    select(-any_of("lIngpcug")) %>%
    model.matrix(~ . - 1, data = .)
}

# Fitting set
x_val <- bake_to_matrix(rec_trained, validation_train)

# Out-of-sample test set
x_test <- bake_to_matrix(rec_trained, test)

# Validation set passed to `fit()` as `validation_data`
x_val_set <- bake_to_matrix(rec_trained, validation_set)

# Held-out set used for the final evaluation
x_val_test <- bake_to_matrix(rec_trained, validation_test)

# Outcome vectors, taken from the un-baked frames
y_val <- validation_train$lIngpcug
y_val_set <- validation_set$lIngpcug
y_val_test <- validation_test$lIngpcug

# Training hyper-parameters --------------------------------------------------

EPOCHS <- 30
BATCH_SIZE <- 3000

[1mRows: [22m[34m238[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): term
[32mdbl[39m (2): estimate, penalty

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [None]:
# Reproducibility: seed the R, Python/NumPy and TensorFlow generators before
# any layer is created, so that the random-uniform weight initialisation,
# dropout masks and batch shuffling are repeatable across runs.
# `disable_gpu = FALSE` keeps the current device configuration unchanged.
tensorflow::set_random_seed(12, disable_gpu = FALSE)

# Model architecture: two hidden dense layers of 150 ReLU units, each followed
# by 10% dropout, and a single linear output unit for the regression target.
# Only the first layer declares `input_shape` (one input per column of
# `x_val`); later layers infer their input size from the preceding layer.
model <- keras_model_sequential() %>%
  layer_dense(units = 150, activation = 'relu',
              input_shape = ncol(x_val),
              kernel_initializer = initializer_random_uniform()) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 150, activation = 'relu',
              kernel_initializer = initializer_random_uniform()) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 1, activation = 'linear')

# Compile: mean squared error is both the loss and the reported metric,
# optimised with RMSprop at its default settings.
model %>% compile(
  loss = "mean_squared_error",
  optimizer = optimizer_rmsprop(),
  metrics = c("mean_squared_error")
)



In [None]:
# Training
# Fits `model` on the fitting set and monitors the loss on `validation_set`.
# The number of epochs comes from the `EPOCHS` constant defined with
# `BATCH_SIZE`, rather than a second hard-coded literal.
# NOTE: `fit()` has no `seed` parameter (an extra `seed = ...` argument is
# absorbed by `...` and has no effect), so none is passed here. Seeding has
# to be done globally before the model is built.
model_regression <- model %>% fit(
  x = x_val,
  y = as.matrix(y_val),
  batch_size = BATCH_SIZE,
  epochs = EPOCHS,
  validation_data = list(x_val_set, as.matrix(y_val_set)),
  verbose = 0
)

In [None]:
# Evaluation: loss and metric of the fitted model on the held-out
# `validation_test` matrices, computed silently and then displayed
results <- evaluate(model, x_val_test, y_val_test, verbose = 0)
results

In [None]:
# Out-of-sample prediction
# The network predicts the log-scale outcome; `predict()` returns a one-column
# matrix, flattened here to a plain numeric vector.
log_income_hat <- as.numeric(predict(model, x_test))

# Build the submission table with explicit column names instead of relying on
# `bind_cols()` name repair ("...1", "...2", "...3") over unnamed inputs.
# A household is flagged as poor (1) when its predicted income, back on the
# level scale via `exp()`, does not exceed its poverty line `Lp`; otherwise 0.
nn_predictions <- tibble(
  id = test$id,
  pobre = ifelse(exp(log_income_hat) <= test$Lp, 1, 0)
)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[36m•[39m `` -> `...2`
[1m[22mNew names:
[36m•[39m `` -> `...3`


In [None]:
# Quick check of the submission table (column types and share flagged poor)
nn_predictions %>%
  summary()

      id                pobre       
 Length:66168       Min.   :0.0000  
 Class :character   1st Qu.:0.0000  
 Mode  :character   Median :0.0000  
                    Mean   :0.1287  
                    3rd Qu.:0.0000  
                    Max.   :1.0000  

In [None]:
# Save the predictions to disk as a CSV file
nn_predictions %>%
  write_csv("regression_neural_networks.csv")