In [8]:
# rm(list=ls())
pacman::p_load(data.table, MASS, ggplot2, stringr, matrixStats, doParallel, abind, ncdf4, arrayhelpers)

## Show how many CPU cores this session reports
parallel::detectCores()

## Plot area used by the notebook: 16 wide by 9 high
options(repr.plot.height = 9, repr.plot.width = 16)

In [9]:
## Open a NetCDF file and collect the dimension metadata of one of its
## variables.
##
## Args:
##   file:     Path of the NetCDF file; passed to nc_open().
##   var_name: Optional name of the variable whose dimensions are wanted.
##             NULL (the default) means "the file's only variable" and is
##             therefore only valid when the file holds exactly one variable.
##
## Returns:
##   An unnamed list of length 2:
##     [[1]] the open handle returned by nc_open() -- it is left open so the
##           caller can read from it, and the caller should nc_close() it;
##     [[2]] a list with one element per dimension of the variable, in the
##           variable's own dimension order, holding that dimension's `vals`
##           and named by that dimension's `name`.
##
## Stops with an error (after closing the handle) when the variable cannot be
## determined unambiguously.

ncdf_open_extract_dims <- function(file, var_name = NULL) {

    ## Open the file
    nc <- nc_open(file)

    ## Close the handle before failing so a bad call does not leak it
    fail <- function(...) {
        nc_close(nc)
        stop(..., call. = FALSE)
    }

    var_names <- names(nc$var)

    if (is.null(var_name)) {
        ## Without an explicit choice, only a single-variable file is
        ## unambiguous; indexing `$var` with several names at once would
        ## silently do recursive indexing instead of selecting a variable
        if (length(var_names) != 1L) {
            fail(
                "Expected exactly one variable in '", file, "' but found ",
                length(var_names), " (", paste(var_names, collapse = ", "),
                "); pass `var_name` to choose one."
            )
        }
        var_name <- var_names
    } else if (!(is.character(var_name) && length(var_name) == 1L &&
                 var_name %in% var_names)) {
        fail(
            "`var_name` must be one of: ", paste(var_names, collapse = ", ")
        )
    }

    ## Iterate over the dimension list itself rather than 1:n, so a variable
    ## with no dimensions yields an empty result instead of indexing 1 and 0
    var_dims <- nc$var[[var_name]]$dim

    ## Dimension values, in the order the variable declares them
    dim_vals <- lapply(var_dims, function(d) d$vals)

    ## Name each set of values after its dimension
    names(dim_vals) <- vapply(var_dims, function(d) d$name, character(1))

    list(nc, dim_vals)

}

In [10]:
## Path of the education-scenarios NetCDF (directory part, then file name)
file <- paste0(
    "/ihme/forecasting/data/fbd_scenarios_data/forecast/covariate/education/20170608_GBD2016Final//",
    "20170726_cohort_maternal_scenarios.nc"
)

## Open it and pull the dimension metadata; element 1 is the file handle
example_nc <- ncdf_open_extract_dims(file = file)
example_nc[[1]]

File /ihme/forecasting/data/fbd_scenarios_data/forecast/covariate/education/20170608_GBD2016Final//20170726_cohort_maternal_scenarios.nc (NC_FORMAT_NETCDF4):

     1 variables (excluding dimension variables):
        double __xarray_dataarray_variable__[draw,year_id,sex_id,age_group_id,location_id,scenario]   (Contiguous storage)  
            _FillValue: NaN

     6 dimensions:
        location_id  Size:195
        age_group_id  Size:20
        sex_id  Size:2
        year_id  Size:91
        draw  Size:1000
        scenario  Size:3

    1 global attributes:
        _NCProperties: version=1|netcdflibversion=4.4.1|hdf5libversion=1.8.17

In [11]:
## Now the important part: read only the subset of the data we need. When we subset,
## we must also make sure that `start` plus `count` never runs past the end of any dimension.

# head(example_nc[[2]])

In [36]:
## Read one slab of the variable, timing the read.
## `start` follows the variable's dimension order:
##   draw, year_id, sex_id, age_group_id, location_id, scenario
## No `count` is given, so each dimension is read from its start index to its
## end: age_group_id from its 6th entry (age group 7 and up) and scenario from
## its 3rd and last entry, which then collapses away as a length-1 dimension.
## NOTE(review): the original note says this slab is scenario == -1; the
## scenario values are not shown here -- confirm index 3 really is -1.
system.time(
    edu_pes <- ncvar_get(
        nc = example_nc[[1]],
        varid = "__xarray_dataarray_variable__",
        start = c(1, 1, 1, 6, 1, 3)
    )
)
str(edu_pes)

   user  system elapsed 
 13.733   6.844  20.582 

 num [1:1000, 1:91, 1:2, 1:15, 1:195] 1.28 1.38 1.39 1.19 1.25 ...


In [37]:
# Dimension labels for the slab that was read: start from the file's full
# metadata and trim the two dimensions that were subset
# array_dimnames[["draw"]] <- array_dimnames[["draw"]][c(996:1000)]
array_dimnames <- modifyList(
    example_nc[[2]],
    list(
        age_group_id = example_nc[[2]][["age_group_id"]][6:20],
        scenario = example_nc[[2]][["scenario"]][3]
    )
)

In [67]:
## Label the array's axes.
## scenario is left out: the read kept a single scenario, so that axis was
## collapsed and the array has no dimension for it. Selecting by name (rather
## than taking positions 1..5) keeps labels and names paired even if scenario
## is not the last entry of array_dimnames.
edu_dimnames <- array_dimnames[setdiff(names(array_dimnames), "scenario")]

## Fail loudly if the labels do not line up with the array's shape
stopifnot(identical(unname(lengths(edu_dimnames)), dim(edu_pes)))

## One named assignment sets both the labels and the axis names, instead of
## re-assigning the dimnames of this very large array once per axis
dimnames(edu_pes) <- lapply(edu_dimnames, as.character)

str(edu_pes)

 num [1:1000, 1:91, 1:2, 1:15, 1:195] 1.28 1.38 1.39 1.19 1.25 ...
 - attr(*, "dimnames")=List of 5
  ..$ draw        : chr [1:1000] "0" "1" "2" "3" ...
  ..$ year_id     : chr [1:91] "1950" "1951" "1952" "1953" ...
  ..$ sex_id      : chr [1:2] "1" "2"
  ..$ age_group_id: chr [1:15] "7" "8" "9" "10" ...
  ..$ location_id : chr [1:195] "6" "7" "8" "10" ...


In [68]:
## Finally, to a data table!
edu_pes_DT <- data.table::melt(edu_pes)
colnames(edu_pes_DT)[6] <- "edu"
head(edu_pes_DT)

draw,year_id,sex_id,age_group_id,location_id,value
0,1950,1,7,6,1.280493
1,1950,1,7,6,1.379191
2,1950,1,7,6,1.392498
3,1950,1,7,6,1.187493
4,1950,1,7,6,1.248056
5,1950,1,7,6,1.464628


In [71]:
## Write the long-format draws out as CSV
fwrite(
    x = edu_pes_DT,
    file = paste0(
        "/share/forecasting/data/fbd_scenarios_data/forecast/covariate/education//20170608_GBD2016Final/",
        "20170729_GBD2016Final_gpr_draws_scenario-1_from_Nafis.csv"
    )
)

Written 2.2% of 532350000 rows in 2 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=49%. Finished in 89 secs.      Written 7.6% of 532350000 rows in 3 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 36 secs.      Written 14.1% of 532350000 rows in 4 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 24 secs.      Written 20.6% of 532350000 rows in 5 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 19 secs.      Written 26.0% of 532350000 rows in 6 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 17 secs.      Written 32.4% of 532350000 rows in 7 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 14 secs.      Written 38.9% of 532350000 rows in 8 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 12 secs.      Written 44.3% of 532350000 rows in 9 secs using 48 threads. anyBufferGrown=no; maxBuffUsed=50%. Finished in 11 secs.      Written 50.8% of 