# Read the main data file

In [1]:
library("tidyverse")

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Load the processed trap/date table. No column types are supplied, so readr
# guesses them and prints the specification it used.
wnv_csv_path = "../../data/processed/wnv.trap.date.rev3b.csv"
df = read_csv(wnv_csv_path)
# Preview the first few rows.
df %>% head()

Parsed with column specification:
cols(
  .default = col_double(),
  t_date = col_date(format = ""),
  t_yr = col_integer(),
  t_mo = col_integer(),
  t_day = col_integer(),
  t_qtr = col_integer(),
  t_wk = col_integer(),
  t_day_of_yr = col_integer(),
  t_day_of_wk = col_integer(),
  t_day_of_wk_name = col_character(),
  t_eval_day = col_integer(),
  t_eval_wk = col_integer(),
  part_train = col_logical(),
  part_validate = col_logical(),
  part_test = col_logical(),
  part_partition = col_character(),
  trap_trap_name = col_character(),
  loc_lat_lng_src = col_character(),
  trap_satellite_ind = col_logical(),
  loc_zipcode = col_integer(),
  loc_community = col_character()
  # ... with 33 more columns
)
See spec(...) for full column specifications.


t_date,t_yr,t_mo,t_day,t_qtr,t_wk,t_day_of_yr,t_day_of_wk,t_day_of_wk_name,t_eval_day,⋯,wea_ohare_tavg2_ma60_lag14,wea_ohare_tavg2_ma60_lag21,wea_ohare_tavg2_ma60_lag28,wea_ohare_tavg2_ma60_lag35,wea_ohare_tavg2_ma60_lag42,wea_ohare_tavg2_ma60_lag49,wea_ohare_tavg2_ma60_lag56,wea_ohare_tavg2_ma60_lag63,wea_ohare_tavg2_ma60_lag90,wea_ohare_tavg2_ma60_lag120
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333
2007-05-29,2007,5,29,2,22,149,2,Mon,514,⋯,51.00833,48.725,44.69167,42.1,38.61667,35.55,31.775,27.80833,23.83333,31.73333


In [3]:
# Transposed overview of the table: dimensions, column types and leading values.
df %>% glimpse()

Observations: 13,631
Variables: 117
$ t_date                             <date> 2007-05-29, 2007-05-29, 2007-05...
$ t_yr                               <int> 2007, 2007, 2007, 2007, 2007, 20...
$ t_mo                               <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
$ t_day                              <int> 29, 29, 29, 29, 29, 29, 29, 29, ...
$ t_qtr                              <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
$ t_wk                               <int> 22, 22, 22, 22, 22, 22, 22, 22, ...
$ t_day_of_yr                        <int> 149, 149, 149, 149, 149, 149, 14...
$ t_day_of_wk                        <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
$ t_day_of_wk_name                   <chr> "Mon", "Mon", "Mon", "Mon", "Mon...
$ t_eval_day                         <int> 514, 514, 514, 514, 514, 514, 51...
$ t_eval_wk                          <int> 74, 74, 74, 74, 74, 74, 74, 74, ...
$ part_train                         <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TR...
$ part_validate 

# Data Type Conversions

In [4]:
# Columns that are read in as numbers/strings but are converted to factors here.
factor_vars = c("loc_zipcode", "loc_census_block_group_id","loc_census_tract_id","zone_type")
# Convert all of them in one column-wise step. This replaces the explicit loop
# with its `df[,col][[1]]` extraction and does not leave a stray loop variable
# in the global environment.
df[factor_vars] = lapply(df[factor_vars], as.factor)

In [5]:
library("forcats")

In [6]:
# Reorder the levels of each categorical column by how often they occur
# (most frequent level first). The values themselves are unchanged.
df = df %>%
    mutate(
        loc_zipcode               = fct_infreq(loc_zipcode),
        loc_census_block_group_id = fct_infreq(loc_census_block_group_id),
        loc_census_tract_id       = fct_infreq(loc_census_tract_id),
        zone_type                 = fct_infreq(zone_type)
    )

# Split into train, validation and test datasets

In [7]:
# Number of rows assigned to each partition label.
count(df, part_partition)

part_partition,n
test,1979
train,8222
validate,3430


In [8]:
# Names of every bookkeeping column whose name begins with "part_".
partition_vars = names(select(df, starts_with("part_")))

In [9]:
# Drop the partition bookkeeping columns so they cannot be used as features.
# `partition_vars` is a character vector held outside the data frame, so it is
# wrapped in one_of(): a bare `-partition_vars` is ambiguous (it would refer to a
# column of that name if one existed) and is deprecated in later tidyselect releases.
df_sub = df %>% select(-one_of(partition_vars))

In [10]:
# df_sub no longer carries the partition columns, so the label is taken from
# df. df_sub was built from df by dropping columns only, so rows still line up.
partition_label = df$part_partition
df_train = df_sub[partition_label == "train", ]
df_val   = df_sub[partition_label == "validate", ]
df_test  = df_sub[partition_label == "test", ]

What are the date ranges for each of these splits?

In [11]:
# Earliest and latest date in the training split.
range(df_train[["t_date"]])

In [12]:
# Earliest and latest date in the validation split.
range(df_val[["t_date"]])

In [13]:
# Earliest and latest date in the test split.
range(df_test[["t_date"]])

# Save to file

In [14]:
# Write each split to the working directory as a single serialized R object.
# NOTE: these files are produced by saveRDS(), so they must be read back with
# readRDS() -- load() will not work on them despite the ".RData" extension.
saveRDS(object = df_train, file = "df_train.RData")
saveRDS(object = df_val,   file = "df_val.RData")
saveRDS(object = df_test,  file = "df_test.RData")