In [1]:
# rm(list=ls())
require(data.table)
require(MASS)
require(ggplot2)
require(gridExtra)
require(stringr)
require(doParallel)
require(abind)
require(Matrix)
set.seed(123)

## Resizing notebook plot space
options(repr.plot.width=16, repr.plot.height=9)

Loading required package: data.table
Loading required package: MASS
Loading required package: ggplot2
Loading required package: gridExtra
Loading required package: stringr
Loading required package: doParallel
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
Loading required package: abind
Loading required package: Matrix


## (0) Some functions

In [2]:
## Load in the R functions
source("/home/j/temp/central_comp/libraries/current/r/get_draws.R")

In [3]:
## Neal's multivariate copula function

draw2Dcopula <- function(X, cor_mat, df_return = F){
  L <- dim(X)[2]
  D <- dim(X)[3]
  Xsum <- apply(X, c(2, 3), sum)
  mvdat <- mvrnorm(n=D, mu=0 * 1:L, Sigma=cor_mat, empirical=TRUE)
  ranks <- apply(mvdat, 2, rank, ties.method="first")
  sortedXsim <- apply(Xsum, 1, function(x) sort(x, index.return=TRUE)$ix)
  sortedX <- X
  for(i in 1:L){
    sortedX[,i,] <- X[,i,sortedXsim[,i]]
  }
  Xcorr <- sortedX
  for(i in 1:L){
    Xcorr[,i,] <- sortedX[,i,ranks[,i]]
  }
  if (df_return==T) {
    return(data.table(melt(Xcorr)))
    }
      else {
          Xcorr
      }
}


In [53]:
## Stacking function: stack the names of the ones that matter and ones that don't
stacking_df <- function(data, to_stack, to_stack_name, no_stack, no_stack_name) {
    
    data[,paste0(to_stack_name) := do.call(paste, c(data[, .SD, .SDcols = to_stack], sep = "_")) ]
    data[, paste0(no_stack_name) := do.call(paste, c(data[, .SD, .SDcols = no_stack], sep = "_"))]
    
    return(data)
}

## (1) Prepping the data to be into the array we want

In [16]:
## Get some example draws (causes 587 and 495)
dalys_diabetes <- get_draws(gbd_id_field = "cause_id", gbd_id = 587, source = 'dalynator', metric_id = 1,
                            measure_ids = 2, location_id = c(6,7,8), 
                            age_group_ids = c(6,7,8,9,10), sex_ids = c(1,2))

dalys_ihd <- get_draws(gbd_id_field = "cause_id", gbd_id = 495, source = 'dalynator', metric_id = 1,
                            measure_ids = 2, location_id = c(6,7,8), 
                            age_group_ids = c(6,7,8,9,10), sex_ids = c(1,2))

In [17]:
## Clean up and only what we need (loc, year, age, sex, draws):
dalys_ihd <- dalys_ihd[, .SD, .SDcols = c("location_id", "year_id", "age_group_id", "sex_id", "cause_id", paste0("draw_",c(0:999)) )]
dalys_diabetes <- dalys_diabetes[, .SD, .SDcols = c("location_id", "year_id", "age_group_id", "sex_id", "cause_id", paste0("draw_",c(0:999)) )]

In [18]:
## Convert age group to a string with leading zeroes, so that the sorting is maintained in the arrays
dalys_ihd[, age_group_id:= formatC(age_group_id, width = 3, format = "d", flag = "0")]
dalys_diabetes[, age_group_id:= formatC(age_group_id, width = 3, format = "d", flag = "0")]

In [20]:
## Stack age_sex and loc_year
corring_over <- c("age_group_id", "sex_id", "cause_id")
corring_var_name <- "age_sex_cause"

not_corring_over <- c("location_id", "year_id")
not_corring_var_name <- "loc_year"

dalys_ihd <- stacking_df(dalys_ihd, to_stack =  corring_over, to_stack_name = corring_var_name,
                         no_stack = not_corring_over, no_stack_name = not_corring_var_name)

dalys_diabetes <- stacking_df(dalys_diabetes, to_stack =  corring_over, to_stack_name = corring_var_name,
                         no_stack = not_corring_over, no_stack_name = not_corring_var_name)

In [21]:
head(dalys_diabetes); tail(dalys_ihd)

location_id,year_id,age_group_id,sex_id,var_name,draw_0,draw_1,draw_2,draw_3,draw_4,⋯,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999,age_sex_var,loc_year
6,2016,6,1,diabeetus,1502.603,1464.278,1159.717,1123.799,1117.349,⋯,1749.5,1556.551,1537.333,1938.616,1579.557,1576.566,1485.885,1702.443,006_1_diabeetus,6_2016
6,2016,7,1,diabeetus,4544.98,5109.784,4331.432,4388.135,4026.03,⋯,7566.693,5161.448,4875.295,6053.514,4516.764,5316.403,5184.32,5352.031,007_1_diabeetus,6_2016
6,2016,8,1,diabeetus,22877.532,21976.269,16302.054,16337.912,16569.428,⋯,26773.15,20345.621,16477.317,20851.499,14993.269,17000.141,20112.642,16799.632,008_1_diabeetus,6_2016
6,2016,9,1,diabeetus,99663.02,85973.618,53148.5,54046.493,63074.437,⋯,83049.148,79542.014,68065.248,83753.853,46989.948,60156.015,61445.95,52345.268,009_1_diabeetus,6_2016
6,2016,10,1,diabeetus,207400.937,187044.589,116781.61,112546.733,133283.33,⋯,181121.544,170419.012,157672.722,193634.411,91804.074,137670.104,120549.784,122673.684,010_1_diabeetus,6_2016
6,2016,6,2,diabeetus,2129.917,1870.235,1835.528,1770.46,1846.798,⋯,2871.924,2508.616,2421.117,2665.646,2356.602,2619.106,2224.157,2340.772,006_2_diabeetus,6_2016


location_id,year_id,age_group_id,sex_id,var_name,draw_0,draw_1,draw_2,draw_3,draw_4,⋯,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999,age_sex_var,loc_year
8,2005,10,1,heart_stuff,233.988585,171.48719,209.9186,154.464707,135.579504,⋯,167.509717,174.953283,217.3493,140.951074,101.94097,205.146271,118.190764,233.869982,010_1_heart_stuff,8_2005
8,2005,6,2,heart_stuff,4.457144,6.132533,6.77355,7.412024,5.990928,⋯,5.085523,3.212157,8.599625,5.393792,4.949043,5.543227,5.685739,6.499283,006_2_heart_stuff,8_2005
8,2005,7,2,heart_stuff,11.385647,12.958085,13.54328,10.741678,8.925954,⋯,14.483783,11.895921,15.118872,15.822339,14.795711,11.584933,11.505027,13.801689,007_2_heart_stuff,8_2005
8,2005,8,2,heart_stuff,29.771521,31.065605,23.15046,22.519256,36.584111,⋯,20.610921,16.632874,30.001893,23.911072,24.180766,32.952451,30.363117,34.850801,008_2_heart_stuff,8_2005
8,2005,9,2,heart_stuff,128.309223,82.572138,85.95811,72.925669,69.914348,⋯,83.147467,74.205571,91.836777,101.789327,59.807383,82.2911,105.698706,122.497104,009_2_heart_stuff,8_2005
8,2005,10,2,heart_stuff,159.220627,158.660794,126.50677,161.854878,156.890885,⋯,144.133512,176.340762,124.226775,152.610968,129.468943,146.645636,127.134103,169.171297,010_2_heart_stuff,8_2005


In [22]:
## Melt the draws and add a column called "var_name"
dalys_ihd_long <- melt(dalys_ihd, id.vars = c(not_corring_var_name, corring_var_name), 
                       measure.vars = paste0("draw_", c(0:999)),
                      value.name = "dalys", variable.name = "draw_num")

dalys_diabetes_long <- melt(dalys_diabetes, id.vars =c(not_corring_var_name, corring_var_name), 
                            measure.vars = paste0("draw_", c(0:999)),
                            value.name = "dalys", variable.name = "draw_num")

In [23]:
## Bind the dataframes
dalys_binded_long <- rbind(dalys_diabetes_long, dalys_ihd_long)

In [24]:
head(dalys_binded_long); tail(dalys_binded_long)

loc_year,age_sex_var,draw_num,dalys
6_2016,006_1_diabeetus,draw_0,1502.603
6_2016,007_1_diabeetus,draw_0,4544.98
6_2016,008_1_diabeetus,draw_0,22877.532
6_2016,009_1_diabeetus,draw_0,99663.02
6_2016,010_1_diabeetus,draw_0,207400.937
6_2016,006_2_diabeetus,draw_0,2129.917


loc_year,age_sex_var,draw_num,dalys
8_2005,010_1_heart_stuff,draw_999,233.869982
8_2005,006_2_heart_stuff,draw_999,6.499283
8_2005,007_2_heart_stuff,draw_999,13.801689
8_2005,008_2_heart_stuff,draw_999,34.850801
8_2005,009_2_heart_stuff,draw_999,122.497104
8_2005,010_2_heart_stuff,draw_999,169.171297


## (2) Devising AR correlation matrix for each unique corring variable (age, sex and cause) and then combine... maybe?

In [25]:
## Number of groups
age_groups <- unique(dalys_ihd[year_id == 2016, age_group_id])
sex_groups <- unique(dalys_ihd[year_id == 2016, sex_id])
year_groups <- unique(dalys_ihd[year_id == 2016, year_id])
loc_groups <- unique(dalys_ihd[year_id == 2016, location_id])
var_groups <- unique(c("heart_stuff", "diabeetus"))

In [26]:
## Matrices within each group first (2x2 matrices do not follow AR processes)
age_corr_mat <- 0.75**abs(outer(1:length(age_groups), 1:length(age_groups), "-"))
colnames(age_corr_mat) = rownames(age_corr_mat) = age_groups

sex_corr_mat <- 0.4**abs(outer(1:length(sex_groups), 1:length(sex_groups), "-"))
colnames(sex_corr_mat) = rownames(sex_corr_mat) = sex_groups

var_corr_mat <- 0.6**abs(outer(1:length(var_groups), 1:length(var_groups), "-"))
colnames(var_corr_mat) = rownames(var_corr_mat) = var_groups

In [27]:
## Create the ultimate correlation matrix: the Kronecker product (IN THE REVERSE ORDER OF age_sex_var)
k1 <- kronecker(var_corr_mat, sex_corr_mat, make.dimnames = T)
k2 <- kronecker(k1, age_corr_mat, make.dimnames = T)

In [51]:
## The time has come: make the data.table into a multi dimensional array!
system.time(dalys_array <- reshape2::acast(dalys_binded_long, loc_year ~ age_sex_var ~ draw_num, value.var = "dalys"))
str(dalys_array)

   user  system elapsed 
  1.663   0.023   1.687 

 num [1:18, 1:20, 1:1000] 4956 6162 5783 2280 2266 ...
 - attr(*, "dimnames")=List of 3
  ..$ : chr [1:18] "6_1990" "6_1995" "6_2000" "6_2005" ...
  ..$ : chr [1:20] "006_1_diabeetus" "006_1_heart_stuff" "006_2_diabeetus" "006_2_heart_stuff" ...
  ..$ : chr [1:1000] "draw_0" "draw_1" "draw_2" "draw_3" ...


In [52]:
## Copulate all over my body
system.time(dalys_corr <- draw2Dcopula(X = dalys_array, cor_mat = k2, df_return = T))
names(dalys_corr) <- colnames(dalys_binded_long)
str(dalys_corr)

   user  system elapsed 
  1.218   0.019   1.237 

Classes ‘data.table’ and 'data.frame':	360000 obs. of  4 variables:
 $ loc_year   : Factor w/ 18 levels "6_1990","6_1995",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ age_sex_var: Factor w/ 20 levels "006_1_diabeetus",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ draw_num   : Factor w/ 1000 levels "draw_0","draw_1",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ dalys      : num  4558 6343 5643 2492 2274 ...
 - attr(*, ".internal.selfref")=<externalptr> 
