In [1]:
library(stringr)
library(doParallel)
# Leave one core free for the rest of the system, but always register at
# least one worker: detectCores() can return 1 or NA, and subtracting one
# would then request zero (or NA) workers.
registerDoParallel(cores = max(1L, parallel::detectCores() - 1L, na.rm = TRUE))
library(data.table)

Loading required package: foreach
Loading required package: iterators
Loading required package: parallel


In [2]:
# Polynomial exponents offered to the generator: every integer power from
# min_order through max_order, i.e. {x^min_order, ..., x^max_order}.
min_order <- 0
max_order <- 5

# Variables and interactions:
# + max_num_vars: how many input variables exist; they are named x1, x2, ...
# + max_interactions: the largest number of variables multiplied together in
#   a single term (a*b*c is a 3-way interaction); must be <= max_num_vars
max_num_vars <- 10
max_interactions <- 10
vars <- paste0("x", seq.int(max_num_vars))

# Term templates. The placeholder "A" is later substituted with a product of
# variables when a random function is built.
function_space <- c(
    paste0("(A)^", seq(from = min_order, to = max_order)),
    "sin(A)",
    "cos(A)",
    "tan(A)",
    "exp(A)"
)

# Candidate coefficients: the grid [-max_weight, max_weight] in steps of 0.01.
max_weight <- 50
weight_space <- seq(from = -max_weight, to = max_weight, by = 0.01)

# Number of terms summed together in each random function.
max_terms <- 5

# Build one random function as a weighted sum of randomly chosen terms.
#
# Arguments:
#   function_space   character vector of term templates; the first "A" in a
#                    template is the placeholder for the variable product
#   max_terms        number of terms drawn (with replacement) and summed
#   max_interactions upper bound on how many distinct variables are
#                    multiplied together inside one term
#   weight_space     vector of candidate coefficients, one drawn per term
#   variables        names of the function's arguments; defaults to the
#                    global `vars` so existing four-argument calls still work
#
# Returns a list with:
#   r_func      an R function taking every name in `variables` as an argument
#   definition  the function body as a string
#
# Stops with an error when max_interactions is outside
# [1, length(variables)].
generate_random_function <-
function(function_space, max_terms, max_interactions, weight_space,
         variables = vars) {
    # Fail up front instead of only on the draws that happen to ask for more
    # variables than exist.
    if (max_interactions < 1 || max_interactions > length(variables)) {
        stop("max_interactions must be between 1 and the number of variables (",
             length(variables), ")",
             call. = FALSE)
    }

    # Draw by index: sample(x, ...) treats a single number x >= 1 as 1:x,
    # which would silently pick the wrong values for a one-element numeric
    # weight_space. For longer vectors this draws exactly as sample() does.
    pick <- function(x, size, replace = FALSE) {
        x[sample.int(length(x), size = size, replace = replace)]
    }

    templates <- pick(function_space, max_terms, replace = TRUE)

    # limitation: polynomial terms all have the same order
    terms <-
        vapply(templates,
               function(template) {
                   num_interactions <- sample.int(max_interactions, size = 1)
                   interactions <- paste0(pick(variables, num_interactions),
                                          collapse = "*")
                   # literal replacement of the first "A" placeholder only
                   term <- sub("A", interactions, template, fixed = TRUE)
                   weight <- pick(weight_space, 1)
                   paste0(weight, "*", term)
               },
               character(1),
               USE.NAMES = FALSE)

    definition <- paste0(terms, collapse = "+")

    function_string <-
        paste0("function(",
               paste0(variables, collapse = ","),
               ") {",
               definition,
               "}")

    # The source text is assembled only from the templates, variable names and
    # weights passed in above, so parsing it here is how the closure is built.
    list(r_func = eval(expr = parse(text = function_string)),
         definition = definition)
}

In [3]:
# Fix the RNG state before the random functions are drawn.
set.seed(1234)

num_of_random_functions <- 5

# One generate_random_function() result per requested function.
# seq_len() gives an empty list for a count of 0, whereas seq.int(0) would
# iterate over c(1, 0) and build two functions.
random_functions <-
lapply(seq_len(num_of_random_functions),
       function(i) {
           generate_random_function(function_space, max_terms, max_interactions, weight_space)
       })

In [4]:
# Each variable is sampled from the grid [-domain_limit, domain_limit] in
# steps of 0.01.
domain_limit <- 10
function_domain <- seq(from = -domain_limit, to = domain_limit, by = 0.01)

size_of_dataset <- 1E1
noise_sd <- 1 # given the order of magnitude of function values, this may be too low

# write the dataset to disk
data_path <- "experimental_data.csv"
# Column layout: the input variables, then each function's exact value, then
# each function's value with Gaussian noise added.
column_names <-
c(vars,
  paste0("fun", seq_along(random_functions)),
  paste0("noisey_fun", seq_along(random_functions)))
header <- t(column_names)

write.table(header,
            sep = ",",
            append = FALSE,
            row.names = FALSE,
            col.names = FALSE,
            file = data_path)

# Workers only compute rows and return them; the rows are combined by
# rbindlist and written once from the parent process further down. Appending
# to the same file from several workers at once can interleave or lose rows.
# NOTE(review): the random draws happen inside %dopar% workers, so a
# set.seed() call in the parent may not make this dataset reproducible;
# confirm, and consider a parallel-safe RNG setup if that matters.
write_output <-
foreach(n = iter(seq_len(size_of_dataset)), .final = rbindlist) %dopar% {
    # one random value per variable, named so it can be passed by name
    test_point <-
    vapply(vars,
           function(x) {sample(function_domain, size = 1)},
           numeric(1))

    func_values <-
    vapply(random_functions,
           function(x) {do.call(x$r_func, as.list(test_point))},
           numeric(1))

    noisey_func_values <-
    func_values + rnorm(n = length(func_values), sd = noise_sd)

    # a named list becomes one row when combined by rbindlist
    setNames(as.list(c(test_point, func_values, noisey_func_values)),
             column_names)
}

# Nothing to append when no rows were requested.
if (nrow(write_output) > 0) {
    write.table(x = write_output,
                sep = ",",
                append = TRUE,
                row.names = FALSE,
                col.names = FALSE,
                file = data_path)
}

In [5]:
# Show the header row of the generated file. Reading it directly avoids
# depending on a shell `head` command and on the path needing no quoting.
readLines(data_path, n = 1)