In [1]:
library(phangorn)

Loading required package: ape


In [2]:
# Log-likelihood of `test_data` on every tree in `tree_data`, computed with
# pml().
#
# Arguments:
#   log_data  - data frame of logged parameter samples; row i is paired with
#               tree i. Only read when model == 'GTR', which needs the columns
#               gammaShape, freqParameter.1-4, rateAC, rateAG, rateAT, rateCG
#               and rateGT.
#   tree_data - list of trees (e.g. the object returned by read.tree()).
#   test_data - alignment handed to pml() as `data`.
#   model     - 'JC'  : pml() with its default substitution settings;
#               'GTR' : per-sample base frequencies, exchange rates and a
#                       4-category gamma with the logged shape.
#
# Returns a numeric vector with one log-likelihood per tree (numeric(0) for an
# empty tree list). Stops on an unsupported model, and for 'GTR' on missing
# columns or on fewer log rows than trees.
get_likelihoods <- function(log_data, tree_data, test_data, model){

    if(!is.character(model) || length(model) != 1 || !(model %in% c('JC', 'GTR'))){
        stop("model must be 'JC' or 'GTR'", call. = FALSE)
    }

    n_trees <- length(tree_data)
    liks <- numeric(n_trees)

    if(model == 'JC'){
        # seq_len() keeps the loop from running when there are no trees.
        for(i in seq_len(n_trees)){
            liks[i] <- pml(tree = tree_data[[i]], data = test_data)$logLik
        }
        return(liks)
    }

    # --- GTR: the logged parameters are only needed (and validated) here ---
    freq_cols <- c('freqParameter.1', 'freqParameter.2', 'freqParameter.3', 'freqParameter.4')
    rate_cols <- c('rateAC', 'rateAG', 'rateAT', 'rateCG', 'rateGT')
    absent <- setdiff(c('gammaShape', freq_cols, rate_cols), names(log_data))
    if(length(absent) > 0){
        stop('log_data is missing column(s): ', paste(absent, collapse = ', '), call. = FALSE)
    }
    if(nrow(log_data) < n_trees){
        stop('log_data has fewer rows than there are trees', call. = FALSE)
    }

    gamma <- log_data[['gammaShape']]
    freqs <- as.matrix(log_data[freq_cols])
    ex_rates <- as.matrix(log_data[rate_cols])

    for(i in seq_len(n_trees)){
        # pml() reads Q as the lower triangle of the rate matrix, i.e. in the
        # order AC, AG, AT, CG, CT, GT. The log has no rateCT column, so the
        # fixed rate of 1 goes in the CT (fifth) slot, not at the end.
        # NOTE(review): assumes CT was the rate held at 1 in the run - confirm.
        q_rates <- c(
            ex_rates[i, c('rateAC', 'rateAG', 'rateAT', 'rateCG')],
            1,
            ex_rates[i, 'rateGT']
        )
        liks[i] <- pml(
            tree = tree_data[[i]],
            data = test_data,
            bf = as.numeric(freqs[i, ]),
            Q = as.numeric(q_rates),
            k = 4,
            shape = gamma[i]
        )$logLik
    }
    liks
}


In [3]:
# 'strict_constant' run on the first training set: score the first test
# alignment on the sampled trees under JC.
log_strict_constant <- read.table('enterovir_training_strict_constant.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_strict_constant <- tail(log_strict_constant, 101)
trees_strict_constant <- read.tree('enterovir_training_strict_constant_phylogs.trees')
test_strict_constant <- phyDat(read.dna('enterovir_test.fasta', format = 'fasta'))
strict_constant_liks <- get_likelihoods(
    log_strict_constant,
    trees_strict_constant,
    test_data = test_strict_constant,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_strict_constant$likelihood))
print(mean(strict_constant_liks, na.rm = TRUE))
print(strict_constant_liks)

[1] -1010.835
[1] -1124.628
  [1] -1103.647 -1110.989 -1118.919 -1128.435 -1100.208 -1126.698 -1122.089
  [8] -1098.913 -1114.102 -1134.427 -1121.139 -1116.930 -1092.452 -1114.972
 [15] -1133.259 -1134.410 -1129.104 -1128.255 -1119.716 -1132.154 -1139.657
 [22] -1121.487 -1122.516 -1130.400 -1140.067 -1124.187 -1109.898 -1123.279
 [29] -1132.467 -1112.975 -1128.883 -1120.761 -1143.894 -1110.403 -1113.745
 [36] -1172.200 -1140.440 -1128.863 -1137.551 -1139.156 -1134.921 -1125.297
 [43] -1129.917 -1126.283 -1118.732 -1123.773 -1137.109 -1119.909 -1108.582
 [50] -1113.925 -1128.329 -1101.568 -1131.394 -1138.890 -1098.368 -1158.128
 [57] -1157.754 -1114.252 -1135.161 -1111.396 -1114.230 -1105.121 -1116.930
 [64] -1105.112 -1098.550 -1129.114 -1114.438 -1118.779 -1110.706 -1115.950
 [71] -1117.469 -1128.089 -1117.648 -1124.471 -1131.378 -1125.635 -1138.383
 [78] -1117.493 -1123.984 -1124.643 -1148.906 -1170.616 -1127.420 -1134.582
 [85] -1153.414 -1161.133 -1129.103 -1136.495 -1142.175 -111

In [4]:
# 'strict_constant' run on the second training set: score the second test
# alignment on the sampled trees under JC.
log_strict_constant <- read.table('enterovir_training2_strict_constant.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_strict_constant <- tail(log_strict_constant, 101)
trees_strict_constant <- read.tree('enterovir_training2_strict_constant_phylogs.trees')
test_strict_constant <- phyDat(read.dna('enterovir_test2.fasta', format = 'fasta'))
strict_constant_liks <- get_likelihoods(
    log_strict_constant,
    trees_strict_constant,
    test_data = test_strict_constant,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_strict_constant$likelihood))
print(mean(strict_constant_liks, na.rm = TRUE))
print(strict_constant_liks)

[1] -995.1767
[1] -1138.819
  [1] -1169.390 -1123.288 -1137.153 -1127.111 -1127.689 -1129.933 -1141.301
  [8] -1133.330 -1141.215 -1129.686 -1133.005 -1131.469 -1143.177 -1135.532
 [15] -1160.124 -1154.289 -1172.426 -1141.689 -1151.295 -1127.098 -1159.630
 [22] -1150.375 -1139.108 -1130.924 -1153.586 -1148.081 -1127.671 -1149.665
 [29] -1142.467 -1123.895 -1139.232 -1137.486 -1137.869 -1153.516 -1102.128
 [36] -1143.405 -1151.939 -1139.306 -1139.969 -1152.269 -1132.884 -1113.911
 [43] -1140.933 -1140.721 -1140.229 -1126.165 -1139.100 -1131.508 -1138.769
 [50] -1179.073 -1159.289 -1153.105 -1145.219 -1131.878 -1124.941 -1139.649
 [57] -1148.107 -1129.035 -1108.217 -1126.024 -1134.831 -1144.905 -1132.274
 [64] -1145.303 -1156.396 -1143.394 -1138.309 -1121.329 -1145.552 -1120.923
 [71] -1136.543 -1117.624 -1170.984 -1133.120 -1130.739 -1132.592 -1146.189
 [78] -1113.746 -1148.796 -1150.692 -1152.250 -1127.478 -1132.162 -1152.645
 [85] -1127.019 -1123.087 -1133.084 -1115.199 -1125.206 -115

In [6]:
# 'strict_exponential' run on the second training set: score the second test
# alignment on the sampled trees under JC.
log_strict_exponential <- read.table('enterovir_training2_strict_exponential.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_strict_exponential <- tail(log_strict_exponential, 101)
trees_strict_exponential <- read.tree('enterovir_training2_strict_exponential_phylogs.trees')
test_strict_exponential <- phyDat(read.dna('enterovir_test2.fasta', format = 'fasta'))
strict_exponential_liks <- get_likelihoods(
    log_strict_exponential,
    trees_strict_exponential,
    test_data = test_strict_exponential,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_strict_exponential$likelihood))
print(mean(strict_exponential_liks, na.rm = TRUE))
print(strict_exponential_liks)

[1] -995.4637
[1] -1126.377
  [1] -1091.739 -1119.946 -1120.957 -1152.378 -1126.996 -1121.046 -1112.079
  [8] -1122.972 -1138.954 -1127.421 -1122.505 -1134.365 -1140.392 -1124.798
 [15] -1116.465 -1114.733 -1127.547 -1133.352 -1137.704 -1133.821 -1141.646
 [22] -1120.791 -1139.326 -1112.802 -1124.656 -1123.273 -1131.283 -1111.307
 [29] -1120.343 -1131.990 -1130.822 -1112.378 -1126.027 -1155.340 -1125.736
 [36] -1150.266 -1132.742 -1131.302 -1123.366 -1133.038 -1132.516 -1121.163
 [43] -1141.308 -1140.360 -1127.630 -1128.608 -1123.932 -1126.764 -1116.397
 [50] -1112.711 -1126.091 -1113.440 -1128.847 -1137.918 -1134.726 -1131.485
 [57] -1139.675 -1133.734 -1136.240 -1107.679 -1137.750 -1128.331 -1128.032
 [64] -1110.715 -1109.339 -1101.132 -1138.201 -1141.287 -1115.268 -1143.360
 [71] -1136.795 -1119.036 -1134.693 -1129.641 -1108.337 -1125.815 -1133.548
 [78] -1110.518 -1125.986 -1132.563 -1126.033 -1136.143 -1136.881 -1128.700
 [85] -1122.014 -1125.429 -1098.567 -1117.365 -1117.084 -111

In [7]:
# 'strict_exponential' run on the first training set: score the first test
# alignment on the sampled trees under JC.
log_strict_exponential <- read.table('enterovir_training_strict_exponential.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_strict_exponential <- tail(log_strict_exponential, 101)
trees_strict_exponential <- read.tree('enterovir_training_strict_exponential_phylogs.trees')
test_strict_exponential <- phyDat(read.dna('enterovir_test.fasta', format = 'fasta'))
strict_exponential_liks <- get_likelihoods(
    log_strict_exponential,
    trees_strict_exponential,
    test_data = test_strict_exponential,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_strict_exponential$likelihood))
print(mean(strict_exponential_liks, na.rm = TRUE))
print(strict_exponential_liks)

[1] -1009.138
[1] -1115.28
  [1] -1104.237 -1121.394 -1111.456 -1117.996 -1135.046 -1115.134 -1121.969
  [8] -1097.878 -1119.003 -1114.066 -1097.172 -1107.440 -1111.005 -1110.336
 [15] -1120.916 -1127.024 -1122.472 -1121.268 -1120.495 -1102.494 -1118.175
 [22] -1108.316 -1117.528 -1112.634 -1108.610 -1105.547 -1113.889 -1101.612
 [29] -1126.969 -1126.084 -1099.054 -1135.975 -1126.375 -1125.271 -1124.495
 [36] -1133.446 -1083.363 -1150.267 -1130.056 -1113.998 -1104.280 -1121.100
 [43] -1108.391 -1095.994 -1105.496 -1117.310 -1105.576 -1128.535 -1102.489
 [50] -1104.201 -1087.912 -1112.424 -1134.509 -1109.220 -1144.908 -1123.487
 [57] -1114.387 -1111.953 -1112.334 -1105.016 -1102.321 -1114.763 -1107.058
 [64] -1107.365 -1106.749 -1140.701 -1115.602 -1100.490 -1108.770 -1100.472
 [71] -1111.927 -1100.981 -1104.750 -1113.444 -1126.519 -1115.371 -1123.694
 [78] -1121.588 -1112.767 -1133.491 -1097.555 -1122.156 -1122.143 -1110.665
 [85] -1116.460 -1110.262 -1116.296 -1145.413 -1121.851 -1121

In [8]:
# 'ucld_constant' run on the first training set: score the first test
# alignment on the sampled trees under JC.
log_ucld_constant <- read.table('enterovir_training_ucld_constant.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_ucld_constant <- tail(log_ucld_constant, 101)
trees_ucld_constant <- read.tree('enterovir_training_ucld_constant_phylogs.trees')
test_ucld_constant <- phyDat(read.dna('enterovir_test.fasta', format = 'fasta'))
ucld_constant_liks <- get_likelihoods(
    log_ucld_constant,
    trees_ucld_constant,
    test_data = test_ucld_constant,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_ucld_constant$likelihood))
print(mean(ucld_constant_liks, na.rm = TRUE))
print(ucld_constant_liks)

[1] -995.1652
[1] -4875.012
  [1]  -4657.903  -5242.491  -4067.869  -8757.183  -2311.819  -7628.511
  [7]  -3502.301  -2878.600  -3486.101  -2882.991  -2288.006  -4074.276
 [13]  -3508.391  -5841.374 -12794.863  -4080.337  -5849.045  -7577.893
 [19]  -2894.548  -3494.628  -2924.887  -4063.759  -2874.648  -2901.265
 [25]  -5262.912  -2893.483  -8133.203  -4674.219  -4077.227  -7007.682
 [31]  -6422.568  -5835.149  -5277.150  -3491.981  -5875.657  -9327.116
 [37]  -5259.570  -6439.404  -8747.151  -4669.554  -6402.705  -2924.052
 [43]  -3499.645  -2311.194  -2882.917  -2300.614  -3468.406  -5852.270
 [49]  -4063.400  -4089.640  -6432.738  -4046.223  -4058.793  -2311.692
 [55]  -2902.507  -2891.810  -4068.567  -4689.195  -3502.634  -4047.129
 [61]  -4063.270  -1754.468  -5827.262  -2891.849  -5820.758  -1717.265
 [67]  -4066.052  -2885.887  -4083.069  -4088.246  -4654.986  -2917.601
 [73]  -2889.419  -9308.820  -5859.165  -4645.829  -5247.955  -5244.931
 [79]  -4039.665  -7004.090  -4083.2

In [9]:
# 'ucld_constant' run on the second training set: score the second test
# alignment on the sampled trees under JC.
log_ucld_constant <- read.table('enterovir_training2_ucld_constant.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_ucld_constant <- tail(log_ucld_constant, 101)
trees_ucld_constant <- read.tree('enterovir_training2_ucld_constant_phylogs.trees')
test_ucld_constant <- phyDat(read.dna('enterovir_test2.fasta', format = 'fasta'))
ucld_constant_liks <- get_likelihoods(
    log_ucld_constant,
    trees_ucld_constant,
    test_data = test_ucld_constant,
    model = 'JC'
)
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_ucld_constant$likelihood))
print(mean(ucld_constant_liks, na.rm = TRUE))
print(ucld_constant_liks)

[1] -993.6443
[1] -2815.864
  [1] -2921.380 -1128.635 -2317.244 -2329.631 -3527.084 -4106.320 -3533.594
  [8] -1715.703 -2908.317 -1742.786 -1158.063 -2326.132 -1722.782 -2313.586
 [15] -2922.074 -4077.120 -4073.762 -2918.448 -2906.251 -2907.259 -4663.545
 [22] -2301.707 -1743.501 -2303.860 -2899.129 -1127.274 -2324.478 -2881.832
 [29] -4082.230 -1695.292 -1747.898 -1730.294 -2341.736 -2296.640 -3504.852
 [36] -1710.642 -4122.147 -5272.830 -2347.526 -2320.967 -2321.253 -3482.418
 [43] -2929.657 -3514.352 -4687.533 -1718.215 -2887.423 -2354.198 -2330.512
 [50] -1741.638 -2906.856 -3512.579 -4095.839 -1731.469 -3497.657 -1752.646
 [57] -2320.305 -2915.117 -1753.267 -2328.061 -1729.784 -2338.958 -4114.806
 [64] -4090.106 -1712.765 -2914.319 -2885.275 -2897.991 -2922.315 -2331.864
 [71] -3520.340 -2886.666 -1757.801 -1727.895 -2333.566 -3479.040 -4087.963
 [78] -2313.321 -4086.135 -2874.288 -4061.620 -2344.762 -2328.884 -4063.100
 [85] -3488.469 -4656.046 -3507.025 -1715.806 -2908.543 -348

In [11]:
# 'ucld_exponential' run on the second training set: score the second test
# alignment on the sampled trees under JC.
log_data <- read.table('enterovir_training2_ucld_exponential.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_data <- tail(log_data, 101)
trees_data <- read.tree('enterovir_training2_ucld_exponential_phylogs.trees')
test_data <- phyDat(read.dna('enterovir_test2.fasta', format = 'fasta'))

liks <- get_likelihoods(log_data, trees_data, test_data = test_data, model = 'JC')
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_data$likelihood))
print(mean(liks, na.rm = TRUE))
print(liks)

[1] -994.7062
[1] -3035.147
  [1] -1106.630 -2899.339 -4058.469 -2295.685 -2309.482 -4095.900 -2323.197
  [8] -4086.305 -2290.233 -1717.887 -2863.743 -1734.801 -2316.257 -2898.764
 [15] -3497.779 -5256.546 -3483.145 -2896.802 -4102.107 -4672.430 -3499.960
 [22] -1710.294 -1737.974 -3493.382 -2878.392 -1708.840 -3496.411 -2303.240
 [29] -3509.971 -4665.617 -2315.634 -3464.898 -4654.897 -2303.203 -1713.649
 [36] -3497.446 -3480.572 -3512.098 -2885.570 -2312.510 -2917.855 -2297.097
 [43] -2315.977 -2308.057 -4055.313 -1706.616 -2885.725 -3502.497 -2316.606
 [50] -4083.809 -4077.238 -2905.299 -1725.451 -3493.989 -2890.875 -3491.044
 [57] -4058.373 -1730.420 -2298.585 -4057.521 -4090.252 -4096.681 -1135.437
 [64] -6443.262 -3490.836 -4689.830 -3476.048 -4695.728 -2879.779 -2304.275
 [71] -3511.096 -2309.030 -1717.248 -2891.557 -3506.088 -5255.505 -2314.882
 [78] -1128.421 -3513.478 -3513.254 -1706.214 -2319.087 -3464.549 -3474.651
 [85] -2886.519 -2892.197 -2915.881 -4102.799 -3504.027 -290

In [12]:
# 'ucld_exponential' run on the first training set: score the first test
# alignment on the sampled trees under JC.
log_data <- read.table('enterovir_training_ucld_exponential.log', header = TRUE)
# Keep the last 101 logged samples (rows nrow - 100 through nrow, inclusive).
log_data <- tail(log_data, 101)
trees_data <- read.tree('enterovir_training_ucld_exponential_phylogs.trees')
test_data <- phyDat(read.dna('enterovir_test.fasta', format = 'fasta'))

liks <- get_likelihoods(log_data, trees_data, test_data = test_data, model = 'JC')
# Mean logged (training) likelihood, then mean and full vector of the
# test-alignment log-likelihoods.
print(mean(log_data$likelihood))
print(mean(liks, na.rm = TRUE))
print(liks)

[1] -998.0738
[1] -4391.366
  [1] -4675.924 -3470.155 -7019.046 -2871.159 -5253.557 -5236.515 -4640.084
  [8] -5238.473 -7579.442 -2296.599 -5803.930 -6416.287 -4060.008 -2867.271
 [15] -2279.058 -3478.916 -2898.655 -2882.233 -4659.850 -3465.679 -2886.758
 [22] -2886.418 -4052.897 -5819.803 -5210.617 -2882.085 -3500.545 -1118.565
 [29] -3473.828 -5238.146 -5805.334 -3474.938 -5836.304 -5204.475 -4074.946
 [36] -4669.851 -2321.493 -5813.878 -5253.627 -4668.713 -3486.251 -2888.379
 [43] -6417.038 -5831.392 -2885.646 -4085.806 -3492.979 -2904.186 -3475.360
 [50] -4043.456 -5265.853 -5235.400 -7009.198 -5212.126 -5827.406 -8096.303
 [57] -4047.770 -5771.972 -7608.225 -5248.063 -8764.305 -3479.953 -4653.110
 [64] -5229.691 -5242.815 -3494.478 -4084.987 -8721.254 -3492.008 -2882.309
 [71] -4715.008 -1132.576 -3481.953 -2311.483 -1135.261 -5276.255 -4634.133
 [78] -2290.915 -4064.890 -9869.323 -2304.608 -5815.431 -5240.609 -2880.569
 [85] -5251.634 -2286.391 -3485.005 -2876.453 -5828.389 -348