In [1]:
library(tidyverse)

“replacing previous import by ‘tidyr::spread’ when loading ‘broom’”── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.8.0     ✔ stringr 1.2.0
✔ readr   1.1.1     ✔ forcats 0.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
data_path <- '../data/'
vitals_labs <- read_csv(paste0(data_path, 'vitals_labs_spread_filt.csv'))
demographics <- read_csv(paste0(data_path, 'demographics_cohort.csv'))
action_df <- read_csv(paste0(data_path, 'action_df.csv'))

Parsed with column specification:
cols(
  .default = col_double(),
  subject_id = col_integer(),
  hadm_id = col_integer(),
  icustay_id = col_integer(),
  interval_start_time = col_datetime(format = ""),
  interval_end_time = col_datetime(format = ""),
  exclude = col_logical(),
  BANDS = col_integer()
)
See spec(...) for full column specifications.
“55 parsing failures.
row # A tibble: 5 x 5 col     row col   expected               actual           file                      expected   <int> <chr> <chr>                  <chr>            <chr>                     actual 1  2733 BANDS no trailing characters .5               '../data/vitals_labs_spr… file 2  5229 BANDS no trailing characters .333333333333336 '../data/vitals_labs_spr… row 3  8611 BANDS no trailing characters .5               '../data/vitals_labs_spr… col 4 11437 BANDS no trailing characters .2               '../data/vitals_labs_spr… expected 5 11513 BANDS no trailing characters .5               '../data/vitals_labs_spr…
.

In [3]:
names(vitals_labs)

In [4]:
# Perform LVCF
vitals_labs_lvcf <- vitals_labs %>% group_by(subject_id, hadm_id, icustay_id) %>%
                        arrange(subject_id, hadm_id, icustay_id, interval_start_time) %>%
                        fill(ALBUMIN:WBC)

In [5]:
data_all <- vitals_labs_lvcf %>% full_join(demographics) %>% ungroup()

Joining, by = c("subject_id", "hadm_id", "icustay_id")


In [6]:
# Generate the train/test split
set.seed(10)
ids <- unique(data_all$icustay_id)

train_prop <- 0.8
num_train <- floor(train_prop * length(ids))

train_ids <- sample(ids, num_train)
test_ids <- setdiff(ids, train_ids)

# Proportion validation - of the test data
val_prop <- 0.5 # 0.5 of 0.2 = 0.1 total
num_val <- floor(val_prop * length(test_ids))
val_ids <- sample(test_ids, num_val)
test_ids <- setdiff(test_ids, val_ids)

train_id_df <- data.frame(icustay_id = train_ids)
val_id_df <- data.frame(icustay_id = val_ids)
test_id_df <- data.frame(icustay_id = test_ids)

In [7]:
train_data <- data_all %>% inner_join(train_id_df) %>% ungroup()
val_data <- data_all %>% inner_join(val_id_df) %>% ungroup()
test_data <- data_all %>% inner_join(test_id_df) %>% ungroup()

Joining, by = "icustay_id"
Joining, by = "icustay_id"
Joining, by = "icustay_id"


In [8]:
dim(train_data)
dim(val_data)
dim(test_data)

In [10]:
library(caret)

Loading required package: lattice

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift



In [11]:
feature_names <- setdiff(c(names(vitals_labs_lvcf), names(demographics)),
                         c('subject_id', 
                           'hadm_id', 
                           'icustay_id', 
                           'interval_start_time',
                           'interval_end_time',
                           'exclude'))
train_features <- train_data[, feature_names]
val_features <- val_data[, feature_names]
test_features <- test_data[, feature_names]

In [12]:
# Preprocess the data

preprocessor <- preProcess(train_features, method = c('center', 'scale', 'medianImpute'))
train_proc <- predict(preprocessor, train_features)
val_proc <- predict(preprocessor, val_features)
test_proc <- predict(preprocessor, test_features)

In [14]:
id_vars <- c('subject_id', 
               'hadm_id', 
               'icustay_id', 
               'interval_start_time',
               'interval_end_time')

# train_out <- cbind(train_proc, train_data[, !(names(train_data) %in% names(train_proc))])

# test_out <- cbind(test_proc, test_data[, !(names(test_data) %in% names(test_proc))])

train_out <- cbind(train_data[, !(names(train_data) %in% names(train_proc))], train_proc)
val_out <- cbind(val_data[, !(names(val_data) %in% names(val_proc))], val_proc)
test_out <- cbind(test_data[, !(names(test_data) %in% names(test_proc))], test_proc)


# train_proc[, id_features] <- train_data[, id_vars]
# test_proc[ ,id_features] <- test_data[, id_vars]
# head(train_proc, id_features)

In [15]:
head(train_out)

subject_id,hadm_id,icustay_id,interval_start_time,interval_end_time,exclude,ALBUMIN,ANION GAP,BANDS,BICARBONATE,⋯,vent,sofa,lods,sirs,qsofa,qsofa_sysbp_score,qsofa_gcs_score,qsofa_resprate_score,elixhauser_hospital,blood_culture_positive
4,185777,294638,2191-03-15 12:00:00,2191-03-15 16:00:00,False,-0.05746277,0.7674253,-0.2986458,0.04208446,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774
4,185777,294638,2191-03-15 20:00:00,2191-03-16 00:00:00,False,-0.05746277,0.7674253,-0.2986458,0.04208446,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774
4,185777,294638,2191-03-16 00:00:00,2191-03-16 04:00:00,False,-0.05746277,0.7674253,-0.2986458,0.04208446,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774
4,185777,294638,2191-03-16 04:00:00,2191-03-16 08:00:00,False,-0.48527552,0.2709276,-0.2986458,-0.59964321,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774
4,185777,294638,2191-03-16 08:00:00,2191-03-16 12:00:00,False,-0.48527552,0.2709276,-0.2986458,-0.59964321,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774
4,185777,294638,2191-03-16 12:00:00,2191-03-16 16:00:00,False,-0.48527552,0.2709276,-0.2986458,-0.59964321,⋯,-1.169914,-0.4491586,-1.431576,-1.023693,0.1715143,0.5402321,-0.5779918,0.3918894,1.006947,1.458774


In [23]:
dim(train_out)
dim(val_out)
dim(test_out)

In [24]:
write_csv(train_out, paste0(data_path, 'train_data.csv'))
write_csv(val_out, paste0(data_path, 'val_data.csv'))
write_csv(test_out, paste0(data_path, 'test_data.csv'))