In [69]:
library(data.table)
library(ggplot2)
library(dplyr)
library(stringr)
# library(matrixStats)

get_cami_abundances <- function(path="/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Airways/") {
   
#     Retrieve the CAMI abundances for the test in the given path. also counts which strains are present based on the taxonomy profiles.
#     returns single table with OTU, strain count, and species
    # Deduplicates the final output based on species id and chooses the representative as the most abundant strain
    
    cami_abundances <- Sys.glob(paste0(path, "/short_read/abundanc*")) %>%
    lapply(fread) %>%
    bind_cols

    cami_abundances <- cami_abundances %>%
        select(c(1, seq(2, ncol(cami_abundances), 2)))
    colnames(cami_abundances) <- c("OTU", list.files(path=paste0(path, "/short_read/"), pattern="abundanc*", full.names=FALSE))
    
    # retrieve_taxonomy_distribution
    tax_test <- Sys.glob(paste0(path, "/short_read/strain_ta*")) %>%
        lapply(fread)

    # rename the column headers to uniquify them
    for (i in seq_along(tax_test)) {
      for (j in seq_along(colnames(tax_test[[i]]))) {
        if (colnames(tax_test[[i]])[j] != "V3" && colnames(tax_test[[i]])[j] != "V6") {
          colnames(tax_test[[i]])[j] <- paste(colnames(tax_test[[i]])[j], i, sep = ".")
        }
      }
    }

    # Combine the tables with renamed columns
    tax_test <- Reduce(function(x, y) {merge(x, y, all = F, by = c("V3", "V6"))}, tax_test)

    tax_test <- tax_test %>% 
        select(c("V3", "V6", grep("V5", names(tax_test), value=TRUE)))

    tax_test[, sum:=rowSums(.SD), .SDcols = grep("V5", names(tax_test))] # get the sum of the rows, strain abundance across samples

    # filter out any strains that aren't present
    present_strains <- tax_test[sum>=0.0]
    present_strains[, species_id := tstrsplit(V3, "\\|", keep=7L)]
    present_strains[, N:=.N, by=species_id]
    present_strains <- present_strains[, c("V6", "species_id", "N")]
    present_strains[, OTU:=V6]
    present_strains[, V6:=NULL]
    
    cami_abundances <- full_join(cami_abundances, present_strains, by="OTU")
    
    # deduplicate based on most abundant strain
    cami_abundances[, sum:=rowSums(.SD), .SDcols = grep("abundance", names(cami_abundances))]
    cami_abundances <- cami_abundances[order(species_id, -sum)]
    cami_abundances <- cami_abundances[!duplicated(cami_abundances$species_id)]
    
    # attach genome file name
    genome_to_id <- fread(paste0(path, "/short_read/genome_to_id.tsv"), header=FALSE)
    genome_to_id[, genome:=tstrsplit(V2, "/", keep=8L)]
    genome_to_id[, OTU:=V1]
    genome_to_id[, V1:=NULL]
    genome_to_id[, V2:=NULL]
    cami_abundances <- inner_join(cami_abundances, genome_to_id, by="OTU")
    
    # write genomes
    write.table(cami_abundances$genome, paste0(path, "/short_read/strain_representatives.txt"), row.names=F, quote=F, col.names=F)
    
    return(cami_abundances)
}

In [198]:
path <- "/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Airways/"

genome_to_id <- fread(paste0(path, "/short_read/genome_to_id.tsv"), header=FALSE)
genome_to_id[, genome:=tstrsplit(V2, "/", keep=8L)]
genome_to_id[, OTU:=V1]
genome_to_id[, V1:=NULL]
genome_to_id[, V2:=NULL]
# metadata <- inner_join(cami_airways_abundances, genome_to_id, by="OTU")
# metadata

OTU,abundance10.tsv,abundance11.tsv,abundance12.tsv,abundance23.tsv,abundance26.tsv,abundance27.tsv,abundance4.tsv,abundance7.tsv,abundance8.tsv,abundance9.tsv,species_id,N,sum,genome
OTU_97.4107.1,0.00000000,0,0.0000000,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,0.3368157,101385,1,3.368157e-01,GCA_000265385.1_ASM26538v1.fa
OTU_97.42457.0,0.00000000,0,0.0000000,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,3.0000000,102684,1,3.000000e+00,GCA_001477615.1_ASM147761v1.fa
OTU_97.20597.1,0.00000000,0,0.0000000,0.00000,5.380201e-03,0.0000000,0.0000000,0.0000000,0,0.0000000,1028989,1,5.380201e-03,GCA_000829415.1_ASM82941v1.fa
OTU_97.404.0,0.00000000,1,0.0000000,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,1.0000000,103817,1,2.000000e+00,GCA_001889125.1_ASM188912v1.fa
OTU_97.1940.1,0.00000000,0,0.9864051,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,0.0000000,1050174,1,9.864051e-01,GCA_001021025.1_ASM102102v1.fa
OTU_97.588.0,0.00000000,0,0.0000000,0.00000,0.000000e+00,4.0000000,0.0000000,0.0000000,0,0.0000000,1061,1,4.000000e+00,GCA_000021865.1_ASM2186v1.fa
OTU_97.11580.0,0.00000000,0,0.0000000,0.00000,0.000000e+00,1.0000000,0.0000000,0.0000000,0,0.0000000,1063,1,1.000000e+00,GCA_001576595.1_ASM157659v1.fa
OTU_97.32155.0,0.00000000,2,0.0000000,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,0.0000000,1072256,1,2.000000e+00,GCA_001021065.1_ASM102106v1.fa
OTU_97.21476.0,0.00000000,0,0.0000000,0.00000,1.000000e+00,0.0000000,0.0000000,0.0000000,0,0.0000000,1076,1,1.000000e+00,GCA_000013685.1_ASM1368v1.fa
OTU_97.7928.0,0.00000000,1,3.0000000,0.00000,0.000000e+00,0.0000000,0.0000000,0.0000000,0,0.0000000,108486,1,4.000000e+00,GCA_000525655.1_ASM52565v1.fa


genome,OTU
GCA_001688705.1_ASM168870v1.fa,OTU_97.34268.0
GCA_000056065.1_ASM5606v1.fa,OTU_97.2666.0
GCA_001514415.1_ASM151441v1.fa,OTU_97.2666.1
GCA_000953215.1_DG5.fa,OTU_97.9718.1
GCA_000184925.1_ASM18492v1.fa,OTU_97.9718.0
GCA_001879545.1_ASM187954v1.fa,OTU_97.16660.0
GCA_000017125.1_ASM1712v1.fa,OTU_97.6425.0
GCA_000317855.1_ASM31785v1.fa,OTU_97.44566.0
GCA_000604045.1_ASM60404v1.fa,OTU_97.42609.0
GCA_000400875.1_ASM40087v1.fa,OTU_97.14789.0


In [179]:
present_strains[N>100]
nrow(tax_test[V3 %like% "1280"])

species_id,N,OTU
1280,115,OTU_97.910.0
1280,115,OTU_97.34654.0
1280,115,OTU_97.34821.0
1280,115,OTU_97.34032.0
1280,115,OTU_97.25628.0
1280,115,OTU_97.37323.0
1280,115,OTU_97.34670.0
1280,115,OTU_97.11162.0
1280,115,OTU_97.3011.0
1280,115,OTU_97.7051.0


In [70]:
cami_airways_abundances <- get_cami_abundances("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Airways/")
cami_oral_abundances <- get_cami_abundances("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Oral/")
cami_uro_abundances <- get_cami_abundances("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_uro/")
cami_gi_abundances <- get_cami_abundances("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_gi/")
cami_skin_abundances <- get_cami_abundances("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Skin/")

New names:
* V1 -> V1...1
* V2 -> V2...2
* V1 -> V1...3
* V2 -> V2...4
* V1 -> V1...5
* ...
“Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.”New names:
* V1 -> V1...1
* V2 -> V2...2
* V1 -> V1...3
* V2 -> V2...4
* V1 -> V1...5
* ...
“Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). 

“Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.”New names:
* V1 -> V1...1
* V2 -> V2...2
* V1 -> V1...3
* V2 -> V2...4
* V1 -> V1...5
* ...
“Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use

In [72]:
cami_airways_abundances[genome == 'GCA_001617545.1_ASM161754v1.fa']

OTU,abundance10.tsv,abundance11.tsv,abundance12.tsv,abundance23.tsv,abundance26.tsv,abundance27.tsv,abundance4.tsv,abundance7.tsv,abundance8.tsv,abundance9.tsv,species_id,N,sum,genome
OTU_97.28645.0,0,0,0,0,0,0,0,0,1,0,48296,2,1,GCA_001617545.1_ASM161754v1.fa


In [64]:
metadata <- fread("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Airways/short_read/metadata.tsv", header=T)
genome_to_id <- fread("/lustre/scratch/microbiome/n10853499/00-rosella_testing/01-CAMI_II/CAMI_Airways/short_read/genome_to_id.tsv", header=F)
# metadata
genome_to_id[, file:=tstrsplit(V2, "/", keep=8L)]
genome_to_id[, V2:=NULL]
genome_to_id[, genome_ID:=V1]
genome_to_id[, V1:=NULL]
metadata <- inner_join(metadata, genome_to_id, by="genome_ID")
metadata[, .N, by=OTU]

OTU,N
541000,14
1578,149
1279,173
1301,199
724,79
848,15
286,26
407,9
194,13
416916,5


In [50]:
# tstrsplit("2|976|200643|171549|2005525|195950|712710|712710.0", "\\|")
tax_test[sum > 1]

V3,V6,V5.1,V5.2,V5.3,V5.4,V5.5,V5.6,V5.7,V5.8,V5.9,V5.10,V5.11,sum
2|1224|1236|72274|468|469|40214|1242245,OTU_97.165.0,0.0,0.0,0.0,0.0,0.0398,3.1175,0.0,0.0,0.0,0.0,0.0,3.1573
2|1224|1236|91347|543|160674|54291|1286170,OTU_97.1365.0,0.2943,0.0,0.0,0.0,0.4775,0.0,0.0,0.0,0.0,0.0,0.2943,1.0661
2|1224|1236|91347|543|544|35703|1261127,OTU_97.41428.0,2.8547,0.0,0.0,0.0,3.1487,0.0,0.0,0.0,0.0,0.053,2.8547,8.9111
2|1224|1236|91347|543|561|562|1045010,OTU_97.11086.0,0.3923,0.0,0.0,0.0,0.2388,0.0,0.0,0.0,0.0,0.0,0.3923,1.0234
2|1224|1236|91347|543|561|562|316401,OTU_97.1661.0,0.4414,0.0,0.0,7.8947,0.2388,0.0,0.0,0.0,0.0,0.0,0.4414,9.0163
2|1224|1236|91347|543|590|28901|28150,OTU_97.3377.0,1.079,0.0,0.0,0.7974,0.5571,0.0,0.0,0.0,0.0,0.0,1.079,3.5125
2|1224|1236|91347|543|590|28901|527001,OTU_97.161.0,11.182,0.0,0.0,1.1164,10.2666,0.0,0.0,0.0,0.0,0.0,11.182,33.747
2|1224|28216|206351|481|482|487|935589,OTU_97.3579.0,0.6866,0.0,0.0,0.0,0.2786,0.0,0.0,0.0,0.0,0.0,0.6866,1.6518
2|1224|28216|206351|481|482|487|935591,OTU_97.20828.0,1.128,0.0,0.0,0.0,1.6713,0.0,0.0,0.0,0.3255,0.0,1.128,4.2528
2|1224|28216|206351|481|482|487|942513,OTU_97.10505.0,0.2452,0.0,0.0,0.0,0.3183,0.0,0.0,0.0,0.3255,0.0,0.2452,1.1342


In [111]:
inner_join(cami_airways_abundances, taxonomy_profile, by="OTU")

OTU,abundance10.tsv,abundance11.tsv,abundance12.tsv,abundance23.tsv,abundance26.tsv,abundance27.tsv,abundance4.tsv,abundance7.tsv,abundance8.tsv,abundance9.tsv,strain_id,N
OTU_97.34268.0,0.0000000,0,0,0,0,0,0.00000,0,0,0,0,1
OTU_97.2666.0,0.0000000,0,0,0,0,0,0.00000,0,0,0,0,1
OTU_97.2666.1,0.0000000,0,0,0,0,0,0.00000,0,0,0,1,3
OTU_97.9718.1,0.0000000,0,0,0,0,0,0.00000,0,0,0,1,1
OTU_97.9718.0,0.0000000,0,0,0,0,0,0.00000,0,0,0,0,1
OTU_97.16660.0,0.0000000,1,0,0,0,0,0.00000,0,0,0,0,57
OTU_97.6425.0,1.0000000,0,0,0,0,0,0.00000,0,0,0,0,1
OTU_97.44566.0,0.0000000,0,0,0,0,0,1.00000,0,0,0,0,1
OTU_97.42609.0,0.0000000,0,1,0,0,0,0.00000,0,0,0,0,2
OTU_97.14789.0,0.0000000,0,0,0,0,0,1.00000,0,0,0,0,1
