In [27]:
library(tidyverse)

setwd("~/hbv_covar3/analysis/sim_seq/")

# result files
result_files <- list.files("./", pattern = "simresult.*test")


In [70]:
# get the running parameter for each result
files_strat <- map_chr(strsplit(result_files, "_"), 2)
files_strat <- strsplit(files_strat, '[lnf]')
strat_tb <- do.call(rbind, files_strat)[, -1]
strat_tb <- as.data.frame(apply(strat_tb, 2, as.numeric))
colnames(strat_tb) <- c("l", "n", "f")
l_unique <- unique(strat_tb$l)
n_unique <- unique(strat_tb$n)
f_unique <- unique(strat_tb$f)

# get the result for each parameter combination
coev_pair <- c("site1:site2", "site2:site3") # the true coevolving pairs
indir_coev_pair <- c("site1:site3") # the indirect coevolution

result <- list()
lnf_comb <- expand.grid(l_unique, n_unique, f_unique)
for (i in 1:nrow(lnf_comb)) {
    l <- lnf_comb[i, 1]
    n <- lnf_comb[i, 2]
    f <- lnf_comb[i, 3]
    param_set <- paste0("l", l, "n", n, "f", f)
    result[[param_set]] <- list()
    cur_strat_ind <- which(
                strat_tb$l == l &
                strat_tb$n == n &
                strat_tb$f == f
            )
    cur_files <- result_files[cur_strat_ind]

    # read each result files to get the simulation result
    for (file in cur_files) {
        # if there is no line, then there is no TP nor FP
        if (file.info(file)$size < 2) {
            tp_cnt <- 0
            fp1_cnt <- 0
            fp2_cnt <- 0
        } else {
            # get the pair id from the result files
            cur_result <- read.table(file, sep = " ", header = F)
            colnames(cur_result) <- c("siteA", "siteB")
            cur_result$pair_id <- map_chr(1:nrow(cur_result), function(x) {
                paste(sort(c(cur_result[x, 1], cur_result[x, 2])), collapse = ":")
            })
            pair_id_unique <- unique(cur_result$pair_id)
            # we are not interested in edge_length signal
            el_ind <- grep("edge_length", pair_id_unique)
            if (length(el_ind) > 0) {
                pair_id_unique <- pair_id_unique[-el_ind]
            }
            
            # use the pair_id to find the TP and FP
            tp_cnt <- sum(pair_id_unique %in% coev_pair)
            fp1_cnt <- sum(pair_id_unique %in% indir_coev_pair)
            fp2_cnt <- length(pair_id_unique) - tp_cnt - fp1_cnt
        }
        result[[param_set]][[file]] <- c(tp_cnt, fp1_cnt, fp2_cnt)
       
    }
}

# group each parameter set in to a table
result_tb <- map(result, function(x) {
    cur_result <- as.data.frame(do.call(rbind, x))
    colnames(cur_result) <- c("coev_found", "indir_coev_found", "false_coev_found")
    return(cur_result)
})

In [72]:
# From the result, get the performance of each parameter set
true_dir_assoc_cnt <- 2
true_indir_assoc_cnt <- 1

roc_result <- list()
lnf_comb <- expand.grid(l_unique, n_unique, f_unique)
for (i in 1:nrow(lnf_comb)) {    
    l <- lnf_comb[i, 1]
    n <- lnf_comb[i, 2]
    f <- lnf_comb[i, 3]
    param_set <- paste0("l", l, "n", n, "f", f)

    true_no_assoc_cnt <- choose(l, 2)
    cur_result <- result_tb[[param_set]]
    avg_tpr <- sum(cur_result$coev_found)/(true_dir_assoc_cnt*nrow(cur_result))
    avg_fpr <- (sum(cur_result$indir_coev_found) + sum(cur_result$false_coev_found))/(true_no_assoc_cnt * nrow(cur_result))
    avg_indir <- sum(cur_result$indir_coev_found) / nrow(cur_result)
    roc_result[[param_set]] <- c(avg_tpr, avg_fpr, avg_indir)
}

roc_result <- do.call(rbind, roc_result)
colnames(roc_result) <- c("avg_tpr", "avg_fpr", "avg_indir")

In [73]:
roc_result

Unnamed: 0,avg_tpr,avg_fpr,avg_indir
l100n1000f100,0.985,0.0058343434,0.03
l100n100f100,0.27,0.0004080808,0.03
l100n2000f100,0.985,0.0093070707,0.03
l100n1000f10,0.19,0.0057131313,0.01
l100n100f10,0.015,0.0004060606,0.0
l100n2000f10,0.43,0.0094888889,0.01
l100n1000f2,0.0,0.0057535354,0.0
l100n100f2,0.0,0.0004060606,0.0
l100n2000f2,0.005,0.0094464646,0.0
