In [1]:
library(phangorn)

Loading required package: ape


In [2]:
# Show the contents of the working directory (the relative 'rhdv/...' paths
# used by the later cells are resolved against it).
list.files()

In [None]:
# Exploratory load of the 'ucld_constant' training run: parameter log,
# test alignment and the matching phylograms.
log_data <- read.table('rhdv/rhdv_train_ucld_constant.log', header = TRUE)
# Keep the last 100 logged samples. tail() also copes with logs shorter than
# 100 rows, where an explicit (nrow - 99):nrow range would mix negative and
# positive subscripts and fail.
log_data <- tail(log_data, 100)
print(dim(log_data))

# Strip '_' and '-' from the sequence names; the same substitution is applied
# to the tree tip labels below so that both sets of labels agree.
test_data <- phyDat(read.dna('rhdv/rhdv_test.fasta', format = 'fasta'))
names(test_data) <- gsub('_|[-]', '', names(test_data))

tree_data <- read.tree('rhdv/rhdv_train_ucld_constant_phylogs.trees')
# seq_along() is safe when no trees were read (1:length() would yield 1, 0).
for(i in seq_along(tree_data)){
    tree_data[[i]]$tip.label <- gsub('_|[-]', '', tree_data[[i]]$tip.label)
}

head(log_data)

In [None]:
# Compute the phylogenetic log-likelihood of an alignment under each posterior
# sample (one set of substitution-model parameters plus one phylogram).
#
# Args:
#   log_data:  data frame with one row per sample. Must contain gammaShape,
#              freqParameter.1 .. freqParameter.4 and rateAC, rateAG, rateAT,
#              rateCG, rateGT. A rateCT column is used when present; otherwise
#              the C<->T rate is taken to be 1 (assumes it was the fixed
#              reference rate in the analysis that wrote the log -- confirm).
#   tree_data: list of phylograms (e.g. a multiPhylo), one per row of log_data.
#   test_data: alignment handed to pml() as `data`.
#
# Returns:
#   Numeric vector with one log-likelihood per sample (numeric(0) when there
#   are no samples).
#
# Stops if the number of rows and trees differ or a required column is missing.
get_likelihoods <- function(log_data, tree_data, test_data){
    if(nrow(log_data) != length(tree_data)) stop('log data and phylograms have different lengths')

    freq_cols <- c('freqParameter.1', 'freqParameter.2', 'freqParameter.3', 'freqParameter.4')
    rate_cols <- c('rateAC', 'rateAG', 'rateAT', 'rateCG', 'rateGT')
    missing_cols <- setdiff(c(freq_cols, rate_cols), names(log_data))
    if(length(missing_cols) > 0){
        stop('log data is missing columns: ', paste(missing_cols, collapse = ', '))
    }
    gamma <- log_data$gammaShape
    if(is.null(gamma)) stop('log data is missing columns: gammaShape')

    freqs <- log_data[freq_cols]
    leading_rates <- log_data[c('rateAC', 'rateAG', 'rateAT', 'rateCG')]
    rate_gt <- log_data[['rateGT']]
    rate_ct <- if('rateCT' %in% names(log_data)){
        log_data[['rateCT']]
    } else {
        rep(1, nrow(log_data))
    }

    # Log-likelihood of the test data for posterior sample i.
    sample_loglik <- function(i){
        # pml() reads Q as the lower triangle of the rate matrix, i.e. in the
        # order AC, AG, AT, CG, CT, GT -- so the C<->T rate must come *before*
        # rateGT, not after it.
        q_rates <- as.numeric(c(leading_rates[i, ], rate_ct[i], rate_gt[i]))
        pml(tree = tree_data[[i]], data = test_data, bf = as.numeric(freqs[i, ]),
            Q = q_rates, k = 4, shape = gamma[i])$logLik
    }

    # vapply() preallocates the result and returns numeric(0) for empty input.
    vapply(seq_along(tree_data), sample_loglik, numeric(1))
}

In [None]:
# Capture the output of the system `date` command as a character string.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
system('date', intern = TRUE)

In [None]:
# 'strict_constant' training run evaluated against the test alignment.
trees_strict_constant <- read.tree('rhdv/rhdv_train_strict_constant_phylogs.trees')
log_strict_constant <- read.table('rhdv/rhdv_train_strict_constant.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires (the
# former hard-coded `nrow - 100` offset kept 101 rows).
log_strict_constant <- tail(log_strict_constant, length(trees_strict_constant))
# NOTE(review): sequence names / tip labels are used as-is here (no '_' / '-'
# stripping) -- presumably they already agree for this run; confirm.
test_strict_constant <- phyDat(read.dna('rhdv/rhdv_test.fasta', format = 'fasta'))
strict_constant_liks <- get_likelihoods(log_strict_constant, trees_strict_constant, test_data = test_strict_constant)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_strict_constant$likelihood))
print(mean(strict_constant_liks))

In [None]:
# 'strict_exponential' training run evaluated against the test alignment.
trees_strict_exponential <- read.tree('rhdv/rhdv_train_strict_exponential_phylogs.trees')
log_strict_exponential <- read.table('rhdv/rhdv_train_strict_exponential.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires (the
# former hard-coded `nrow - 100` offset kept 101 rows).
log_strict_exponential <- tail(log_strict_exponential, length(trees_strict_exponential))
# NOTE(review): sequence names / tip labels are used as-is here (no '_' / '-'
# stripping) -- presumably they already agree for this run; confirm.
test_strict_exponential <- phyDat(read.dna('rhdv/rhdv_test.fasta', format = 'fasta'))
strict_exponential_liks <- get_likelihoods(log_strict_exponential,
                                        trees_strict_exponential, test_data = test_strict_exponential)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_strict_exponential$likelihood))
print(mean(strict_exponential_liks))

In [None]:
# 'ucld_constant' training run evaluated against the test alignment.
trees_ucld_constant <- read.tree('rhdv/rhdv_train_ucld_constant_phylogs.trees')
log_ucld_constant <- read.table('rhdv/rhdv_train_ucld_constant.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires.
log_ucld_constant <- tail(log_ucld_constant, length(trees_ucld_constant))
test_ucld_constant <- phyDat(read.dna('rhdv/rhdv_test.fasta', format = 'fasta'))

# Strip '_' and '-' from the tip labels and from the sequence names so that
# both sets of labels agree.
for(i in seq_along(trees_ucld_constant)){
    trees_ucld_constant[[i]]$tip.label <- gsub('_|[-]', '', trees_ucld_constant[[i]]$tip.label)
}

# Use this alignment's own names. They were previously copied from
# test_strict_constant, which made this cell depend on another cell having
# been run first (and on that object holding the same alignment).
names(test_ucld_constant) <- gsub('_|[-]', '', names(test_ucld_constant))

ucld_constant_liks <- get_likelihoods(log_ucld_constant, trees_ucld_constant, test_data = test_ucld_constant)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_ucld_constant$likelihood))
print(mean(ucld_constant_liks))

In [None]:
# 'ucld_exponential' training run evaluated against the test alignment.
trees_ucld_exponential <- read.tree('rhdv/rhdv_train_ucld_exponential_phylogs.trees')
log_ucld_exponential <- read.table('rhdv/rhdv_train_ucld_exponential.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires.
log_ucld_exponential <- tail(log_ucld_exponential, length(trees_ucld_exponential))
test_ucld_exponential <- phyDat(read.dna('rhdv/rhdv_test.fasta', format = 'fasta'))

# Strip '_' and '-' from the tip labels and from the sequence names so that
# both sets of labels agree.
for(i in seq_along(trees_ucld_exponential)){
    trees_ucld_exponential[[i]]$tip.label <- gsub('_|[-]', '', trees_ucld_exponential[[i]]$tip.label)
}

# Use this alignment's own names. They were previously copied from
# test_strict_exponential, which made this cell depend on another cell having
# been run first (and on that object holding the same alignment).
names(test_ucld_exponential) <- gsub('_|[-]', '', names(test_ucld_exponential))

ucld_exponential_liks <- get_likelihoods(log_ucld_exponential, trees_ucld_exponential, test_data = test_ucld_exponential)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_ucld_exponential$likelihood))
print(mean(ucld_exponential_liks))

In [None]:
# Second split ('train2' / 'test2' files), 'strict_constant' run. This reuses
# the variable names of the first split and therefore overwrites its objects.
trees_strict_constant <- read.tree('rhdv/rhdv_train2_strict_constant_phylogs.trees')
log_strict_constant <- read.table('rhdv/rhdv_train2_strict_constant.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires (the
# former hard-coded `nrow - 100` offset kept 101 rows).
log_strict_constant <- tail(log_strict_constant, length(trees_strict_constant))
# NOTE(review): sequence names / tip labels are used as-is here (no '_' / '-'
# stripping) -- presumably they already agree for this run; confirm.
test_strict_constant <- phyDat(read.dna('rhdv/rhdv_test2.fasta', format = 'fasta'))
strict_constant_liks <- get_likelihoods(log_strict_constant, trees_strict_constant, test_data = test_strict_constant)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_strict_constant$likelihood))
print(mean(strict_constant_liks))

In [None]:
# Second split ('train2' / 'test2' files), 'strict_exponential' run. This
# reuses the variable names of the first split and therefore overwrites its
# objects.
trees_strict_exponential <- read.tree('rhdv/rhdv_train2_strict_exponential_phylogs.trees')
log_strict_exponential <- read.table('rhdv/rhdv_train2_strict_exponential.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires (the
# former hard-coded `nrow - 100` offset kept 101 rows).
log_strict_exponential <- tail(log_strict_exponential, length(trees_strict_exponential))
# NOTE(review): sequence names / tip labels are used as-is here (no '_' / '-'
# stripping) -- presumably they already agree for this run; confirm.
test_strict_exponential <- phyDat(read.dna('rhdv/rhdv_test2.fasta', format = 'fasta'))
strict_exponential_liks <- get_likelihoods(log_strict_exponential,
                                        trees_strict_exponential, test_data = test_strict_exponential)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_strict_exponential$likelihood))
print(mean(strict_exponential_liks))

In [None]:
# Second split ('train2' / 'test2' files), 'ucld_constant' run. This reuses the
# variable names of the first split and therefore overwrites its objects.
trees_ucld_constant <- read.tree('rhdv/rhdv_train2_ucld_constant_phylogs.trees')
log_ucld_constant <- read.table('rhdv/rhdv_train2_ucld_constant.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires.
log_ucld_constant <- tail(log_ucld_constant, length(trees_ucld_constant))
test_ucld_constant <- phyDat(read.dna('rhdv/rhdv_test2.fasta', format = 'fasta'))

# Strip '_' and '-' from the tip labels and from the sequence names so that
# both sets of labels agree.
for(i in seq_along(trees_ucld_constant)){
    trees_ucld_constant[[i]]$tip.label <- gsub('_|[-]', '', trees_ucld_constant[[i]]$tip.label)
}

# Use this alignment's own names. They were previously copied from
# test_strict_constant, which may still hold the alignment of the first split
# (or not exist) depending on which cells were run.
names(test_ucld_constant) <- gsub('_|[-]', '', names(test_ucld_constant))

ucld_constant_liks <- get_likelihoods(log_ucld_constant, trees_ucld_constant, test_data = test_ucld_constant)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_ucld_constant$likelihood))
print(mean(ucld_constant_liks))

In [None]:
# Second split ('train2' / 'test2' files), 'ucld_exponential' run. This reuses
# the variable names of the first split and therefore overwrites its objects.
trees_ucld_exponential <- read.tree('rhdv/rhdv_train2_ucld_exponential_phylogs.trees')
log_ucld_exponential <- read.table('rhdv/rhdv_train2_ucld_exponential.log', header = TRUE)
# Keep one log row per phylogram, taken from the end of the log, so the row
# count always matches the tree count that get_likelihoods() requires.
log_ucld_exponential <- tail(log_ucld_exponential, length(trees_ucld_exponential))
test_ucld_exponential <- phyDat(read.dna('rhdv/rhdv_test2.fasta', format = 'fasta'))

# Strip '_' and '-' from the tip labels and from the sequence names so that
# both sets of labels agree.
for(i in seq_along(trees_ucld_exponential)){
    trees_ucld_exponential[[i]]$tip.label <- gsub('_|[-]', '', trees_ucld_exponential[[i]]$tip.label)
}

# Use this alignment's own names. They were previously copied from
# test_strict_exponential, which may still hold the alignment of the first
# split (or not exist) depending on which cells were run.
names(test_ucld_exponential) <- gsub('_|[-]', '', names(test_ucld_exponential))

ucld_exponential_liks <- get_likelihoods(log_ucld_exponential, trees_ucld_exponential, test_data = test_ucld_exponential)
# Mean logged training likelihood vs. mean likelihood of the test data.
print(mean(log_ucld_exponential$likelihood))
print(mean(ucld_exponential_liks))