In [3]:
source("../functions.R")

Loading required package: geiger
Loading required package: laser


## Prepare empirical data sets for cross validation

### RHDV

In [170]:
# Read the RHDV alignment from a FASTA file.
rhdv_data <- read.dna('rhdv.fasta', format = 'fasta')

In [171]:
# Display a summary of the alignment (sequence count, length, base composition).
rhdv_data

72 DNA sequences in binary format stored in a matrix.

All sequences of same length: 1737 

Labels: AUS_NSW_Grafton_2013.52 AUS_NSW_Hall_WAT1-13_2007.94 AUS_NSW_Murrumbateman_BlueGums_2014.24 AUS_SA_BillaKallina_2004.50 AUS_SA_Bulgania_2009.65 AUS_SA_Coorong_2004.75 ...

Base composition:
    a     c     g     t 
0.235 0.289 0.252 0.224 

In [172]:
# Build a neighbour-joining starting tree from TN93 distances.
# NOTE(review): dist.dna() documents `gamma` as the gamma shape value used for
# the distance correction, so TRUE is presumably coerced to a shape of 1 --
# confirm that this is what is intended for the starting tree.
rhdv_tre <- nj(dist.dna(rhdv_data, model = 'TN93', gamma = TRUE))
# Fit a likelihood model with 6 discrete gamma rate categories, optimising the
# topology (NNI), base frequencies, rate matrix, gamma shape and edge lengths.
p1 <- optim.pml(
    pml(rhdv_tre, data = phyDat(rhdv_data), k = 6),
    optNni = TRUE, optBf = TRUE, optQ = TRUE, optGamma = TRUE, optEdge = TRUE
)

optimize edge weights:  -12002.74 --> -11918.91 
optimize base frequencies:  -11918.91 --> -11893.88 
optimize rate matrix:  -11893.88 --> -11053.3 
optimize shape parameter:  -11053.3 --> -10874.4 
optimize edge weights:  -10874.4 --> -10872.18 
optimize topology:  -10872.18 --> -10829.41 
optimize topology:  -10829.41 --> -10816.98 
optimize topology:  -10816.98 --> -10804.06 
16 
optimize base frequencies:  -10804.06 --> -10790.97 
optimize rate matrix:  -10790.97 --> -10789.7 
optimize shape parameter:  -10789.7 --> -10789.69 
optimize edge weights:  -10789.69 --> -10789.66 
optimize topology:  -10789.66 --> -10789.62 
optimize topology:  -10789.62 --> -10788.3 
optimize topology:  -10788.3 --> -10788.3 
4 
optimize base frequencies:  -10788.3 --> -10788.02 
optimize rate matrix:  -10788.02 --> -10787.96 
optimize shape parameter:  -10787.96 --> -10787.96 
optimize edge weights:  -10787.96 --> -10787.96 
optimize topology:  -10787.96 --> -10787.96 
0 
optimize base frequencies:  -1

In [173]:
# Display the fitted model (log-likelihood, gamma shape, rate matrix, base frequencies).
p1


 loglikelihood: -10787.94 

unconstrained loglikelihood: -6282.495 
Discrete gamma model
Number of rate categories: 6 
Shape parameter: 0.2673988 

Rate matrix:
          a         c         g         t
a  0.000000  1.112320 11.274530  1.663144
c  1.112320  0.000000  0.988718 15.051599
g 11.274530  0.988718  0.000000  1.000000
t  1.663144 15.051599  1.000000  0.000000

Base frequencies:  
0.2365411 0.2793954 0.248316 0.2357474 

- Remove one site from the RHDV data set so that the number of sites is even

In [174]:
# Keep an even number of sites so the alignment can be split into two halves of
# equal size: the last column is dropped only when the site count is odd (an
# alignment that is already even is left untouched).
rhdv_even_data <- rhdv_data[, seq_len(ncol(rhdv_data) - ncol(rhdv_data) %% 2L)]
# Half-sizes before and after trimming; the second must be a whole number.
print(ncol(rhdv_data) / 2)
print(ncol(rhdv_even_data) / 2)

[1] 868.5
[1] 868


In [175]:
# Fix the RNG state so that the random site partition is reproducible.
set.seed(0468671667)
# Draw half of the column indices at random; these sites form the training set.
rhdv_sample_sites <- sample(
    x = seq_len(ncol(rhdv_even_data)),
    size = ncol(rhdv_even_data) / 2
)
# Report the total number of sites and how many were drawn.
print(ncol(rhdv_even_data))
print(length(rhdv_sample_sites))

[1] 1736
[1] 868


In [176]:
# Training half: the randomly drawn columns of the trimmed alignment.
rhdv_train <- rhdv_even_data[, rhdv_sample_sites]
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = rhdv_train,
    file = 'rhdv_train.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

In [177]:
# Test half: every column that was not drawn for the training set.
rhdv_test <- rhdv_even_data[, -rhdv_sample_sites]
rhdv_test
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = rhdv_test,
    file = 'rhdv_test.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

72 DNA sequences in binary format stored in a matrix.

All sequences of same length: 868 

Labels: AUS_NSW_Grafton_2013.52 AUS_NSW_Hall_WAT1-13_2007.94 AUS_NSW_Murrumbateman_BlueGums_2014.24 AUS_SA_BillaKallina_2004.50 AUS_SA_Bulgania_2009.65 AUS_SA_Coorong_2004.75 ...

Base composition:
    a     c     g     t 
0.240 0.304 0.228 0.228 

### Check that the test and training sets have similar numbers of variable sites and site patterns

In [3]:
source('~/Desktop/sandbox/modadclocks/subst_modad/multlik.R')

Loading required package: phangorn
Loading required package: ape


In [179]:
# Summary statistics for the full RHDV alignment.
# NOTE(review): multlik() is defined outside this notebook; presumably it
# returns the unconstrained log-likelihood of the alignment -- confirm.
print(multlik(al = rhdv_data))
# Proportion of alignment columns that are segregating (variable) sites.
n_seg_sites <- length(seg.sites(rhdv_data))
print(n_seg_sites / ncol(rhdv_data))

[1] -6282.495
[1] 0.3287277


In [180]:
# Same summary statistics for the training half.
# NOTE(review): multlik() is defined outside this notebook; presumably it
# returns the unconstrained log-likelihood of the alignment -- confirm.
print(multlik(al = rhdv_train))
# Proportion of segregating (variable) sites, then the number of sites.
n_seg_sites <- length(seg.sites(rhdv_train))
print(n_seg_sites / ncol(rhdv_train))
print(ncol(rhdv_train))

[1] -2963.959
[1] 0.3329493
[1] 868


In [181]:
# Same summary statistics for the test half.
# NOTE(review): multlik() is defined outside this notebook; presumably it
# returns the unconstrained log-likelihood of the alignment -- confirm.
print(multlik(al = rhdv_test))
# Proportion of segregating (variable) sites, then the number of sites.
n_seg_sites <- length(seg.sites(rhdv_test))
print(n_seg_sites / ncol(rhdv_test))
print(ncol(rhdv_test))

[1] -2942.294
[1] 0.3248848
[1] 868


## West Nile Fever Virus

In [273]:
#setwd("/wnfv/")

In [274]:
# Read the West Nile virus alignment from a FASTA file.
wnfv <- read.dna('150810_WNV.NY99.fasta', format = 'fasta')

In [275]:
# Display a summary of the alignment (sequence count, length, base composition).
wnfv

68 DNA sequences in binary format stored in a matrix.

All sequences of same length: 10299 

Labels: HM488128|AMCR|CT|BIDV4189|1999|NorthEast|NY99 HM488132|mosquito|CT|BIDV4194|2000|NorthEast|NY99 HQ671712|mosquito|CT|BIDV4904|2000|NorthEast|NY99 V6478|AMCR|MD|04745118|2001|South|NY99 HM756662|AMCR|NY|BIDV4693|2001|NorthEast|NY99 V6450|AMCR|MI|04736009|2001|MidWest|NY99 ...

Base composition:
    a     c     g     t 
0.272 0.223 0.288 0.216 

In [276]:
# Show the first few sequence labels to inspect their format.
head(rownames(wnfv))

Keep only the accession number and the date field of each sequence label

In [277]:
# Shorten each '|'-delimited sequence label to '<first field>_<third-from-last
# field>', e.g. 'HM488128|AMCR|CT|BIDV4189|1999|NorthEast|NY99' becomes
# 'HM488128_1999'. Built in one vectorised pass instead of growing a vector
# inside a 1:length() loop.
new_names <- vapply(
    strsplit(rownames(wnfv), '[|]'),
    function(fields) paste0(fields[1], '_', fields[length(fields) - 2]),
    character(1)
)

In [278]:
# Replace the sequence labels with the shortened names.
rownames(wnfv) <- new_names

In [279]:
# Keep an even number of sites so the alignment can be split into two halves of
# equal size: the last column is dropped only when the site count is odd (an
# alignment that is already even is left untouched).
wnfv_even <- wnfv[, seq_len(ncol(wnfv) - ncol(wnfv) %% 2L)]
wnfv_even

68 DNA sequences in binary format stored in a matrix.

All sequences of same length: 10298 

Labels: HM488128_1999 HM488132_2000 HQ671712_2000 V6478_2001 HM756662_2001 V6450_2001 ...

Base composition:
    a     c     g     t 
0.272 0.224 0.288 0.216 

In [280]:
# Number of sites that will go into each half of the split.
ncol(wnfv_even) / 2

In [281]:
# Fix the RNG state so that the random site partition is reproducible.
set.seed(12345344)
# Draw half of the column indices at random; these sites form the training set.
wnfv_sample <- sample(
    x = seq_len(ncol(wnfv_even)),
    size = ncol(wnfv_even) / 2
)

In [282]:
# Training half: the randomly drawn columns of the trimmed alignment.
wnfv_train <- wnfv_even[, wnfv_sample]
# Summary statistics for comparison with the test half: the multlik() value
# and the number of segregating (variable) sites.
multlik(wnfv_train)
length(seg.sites(wnfv_train))
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = wnfv_train,
    file = 'wnv_train.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

In [283]:
# Test half: every column that was not drawn for the training set.
wnfv_test <- wnfv_even[, -wnfv_sample]
# Summary statistics for comparison with the training half: the multlik()
# value and the number of segregating (variable) sites.
multlik(wnfv_test)
length(seg.sites(wnfv_test))
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = wnfv_test,
    file = 'wnv_test.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

## Enterovirus

In [4]:
# Read the enterovirus alignment from a FASTA file and display its summary.
enterovir <- read.dna(
    file = 'VP1.B5.Vietnam.EVA71.fasta',
    format = 'fasta'
)
enterovir

34 DNA sequences in binary format stored in a matrix.

All sequences of same length: 859 

Labels: 1_042/12-Sep-13 0_070/28-Oct-13 0_024/26-Aug-13 1_008/20-Jul-13 0_090/25-Nov-13 49148/23-Oct-12 ...

Base composition:
    a     c     g     t 
0.265 0.254 0.246 0.235 

In [5]:
# Rename each enterovirus sequence from '<id>/<dd>-<Mon>-<yy>' to
# '<id>_<decimal year>', e.g. '1_042/12-Sep-13' becomes '1_042_2013.73'.
# Built with vapply() instead of growing a vector inside a 1:nrow() loop.
new_names <- vapply(rownames(enterovir), function(label) {
    split_name <- strsplit(label, '/')[[1]]
    date_raw <- strsplit(split_name[2], '[-]')[[1]]
    month_index <- match(date_raw[2], month.abb)
    # Fail loudly instead of silently producing a label without a date when
    # the month abbreviation is missing or not recognised.
    if (is.na(month_index)) {
        stop('Unrecognised month in sequence label: ', label, call. = FALSE)
    }
    # NOTE(review): month / 13 plus day / 365 only approximates the fraction
    # of the year elapsed (31-Dec, for instance, lands just past the following
    # 1 January). The formula is kept unchanged so the labels stay identical
    # to those already produced -- confirm whether an exact decimal date is
    # required.
    month <- month_index / 13
    day <- as.numeric(date_raw[1]) / 365
    # Two-digit years are taken to be 20xx.
    year <- as.numeric(date_raw[3]) + 2000
    paste0(split_name[1], '_', as.character(round(day + month + year, 2)))
}, character(1), USE.NAMES = FALSE)

In [6]:
# Apply the reformatted labels to the enterovirus alignment.
rownames(enterovir) <- new_names

In [8]:
# Display the relabelled alignment, then save it as FASTA.
enterovir
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = enterovir,
    file = 'enterovirus.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

34 DNA sequences in binary format stored in a matrix.

All sequences of same length: 859 

Labels: 1_042_2013.73 0_070_2013.85 0_024_2013.69 1_008_2013.59 0_090_2013.91 49148_2012.83 ...

Base composition:
    a     c     g     t 
0.265 0.254 0.246 0.235 

In [341]:
# Keep an even number of sites so the alignment can be split into two halves of
# equal size: the last column is dropped only when the site count is odd (an
# alignment that is already even is left untouched).
enterovir_even <- enterovir[, seq_len(ncol(enterovir) - ncol(enterovir) %% 2L)]
enterovir_even

34 DNA sequences in binary format stored in a matrix.

All sequences of same length: 858 

Labels: 1_042_2013.73 0_070_2013.85 0_024_2013.69 1_008_2013.59 0_090_2013.91 49148_2012.83 ...

Base composition:
    a     c     g     t 
0.265 0.253 0.247 0.235 

In [342]:
# Fix the RNG state so that the random site partition is reproducible.
set.seed(34567)
# Draw half of the column indices at random; these sites form the training set.
enterovir_samples <- sample(
    x = seq_len(ncol(enterovir_even)),
    size = ncol(enterovir_even) / 2
)

In [343]:
# Training half: the randomly drawn columns of the trimmed alignment.
enterovir_training <- enterovir_even[, enterovir_samples]
# Summary statistics for comparison with the test half: the multlik() value
# and the number of segregating (variable) sites.
multlik(enterovir_training)
length(seg.sites(enterovir_training))
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = enterovir_training,
    file = '../enterovirus/enterovir_training.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

In [344]:
# Test half: every column that was not drawn for the training set.
enterovir_test <- enterovir_even[, -enterovir_samples]
# Summary statistics for comparison with the training half: the multlik()
# value and the number of segregating (variable) sites.
multlik(enterovir_test)
length(seg.sites(enterovir_test))
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = enterovir_test,
    file = '../enterovirus/enterovir_test.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)

In [345]:
# Show the current working directory, which the relative file paths above resolve against.
getwd()

### Shigella sonnei

In [4]:
# Read the Shigella sonnei alignment from a FASTA file.
shigella_data <- read.dna('shigella/Shigella_sonnei.fasta', format = 'fasta')

In [5]:
# Display a summary of the alignment (sequence count, length, base composition).
shigella_data

161 DNA sequences in binary format stored in a matrix.

All sequences of same length: 1626 

Labels: MS0043_HCMC_1995 20263_HCMC_2009 20343_HCMC_2009 30003_HCMC_2009 30008_HCMC_2009 30010_HCMC_2009 ...

Base composition:
    a     c     g     t 
0.166 0.346 0.327 0.161 

In [16]:
# Fix the RNG state so that the random site partition is reproducible.
set.seed(543223234)
# Draw half of the column indices at random; these sites form the training set.
shigella_sample <- sample(
    x = seq_len(ncol(shigella_data)),
    size = ncol(shigella_data) / 2
)
# Number of sites drawn.
length(shigella_sample)

In [21]:
# Training half: the randomly drawn columns of the alignment.
shigella_training <- shigella_data[, shigella_sample]
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = shigella_training,
    file = 'shigella/shigella_training.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)
# Summary statistics for comparison with the test half: the number of
# segregating (variable) sites and the multlik() value.
length(seg.sites(shigella_training))
multlik(shigella_training)

In [22]:
# Test half: every column that was not drawn for the training set.
shigella_test <- shigella_data[, -shigella_sample]
# nbcol = -1 with an empty colsep writes each sequence unbroken on one line.
write.dna(
    x = shigella_test,
    file = 'shigella/shigella_test.fasta',
    format = 'fasta',
    colsep = '',
    nbcol = -1
)
# Summary statistics for comparison with the training half: the number of
# segregating (variable) sites and the multlik() value.
length(seg.sites(shigella_test))
multlik(shigella_test)