# Exporting 16S data into relevant formats
Quang Nguyen.   
Last updated 2022-04-26.   

In [1]:
library(phyloseq)
library(here)
library(biomformat)
library(tidyverse)

here() starts at /Users/quangnguyen/research/microbe_set_trait

── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.8
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



In [2]:
# Project-root-relative locations of the exported crc_16s feature table
# (BIOM) and taxonomy assignments (TSV).
data_path <- here(
    "output", "sequence_process_16s", "crc_16s", "exports",
    "feature-table.biom"
)
tax_path <- here(
    "output", "sequence_process_16s", "crc_16s", "exports",
    "taxonomy.tsv"
)

First, we read the feature table from its BIOM file and convert it into an OTU table that can be used in a phyloseq object.

In [3]:
# Parse the BIOM file into an in-memory biom object.
data <- read_biom(biom_file = data_path)

In [4]:
# Pull the abundance data out of the biom object, densify it to a base
# matrix, and wrap it as a phyloseq OTU table with taxa on the rows.
otu <- data %>%
    biom_data() %>%
    as.matrix() %>%
    otu_table(taxa_are_rows = TRUE)
head(otu)

Unnamed: 0,DE-013,DE-029,DE-031,DE-034,DE-037,DE-038,DE-039,DE-044,DE-045,DE-046,⋯,FR-817,FR-820,FR-824,FR-825,FR-826,FR-827,FR-828,FR-829,FR-830,FR-835
10afda2baef44de4c584a6641de399b1,82,0,92,28,2164,12,4,0,5,712,⋯,14704,18287,5932,50641,10075,44440,20060,12938,46681,14023
99deb3c5ecb022ec05609ebd1112a557,11171,39,21042,1788,9849,8230,8703,11514,1678,0,⋯,1394,2962,0,4824,4560,11880,53574,617,0,208
c6c3ab4e828fb40d6e05967b7aac9338,7,878,305,753,169,860,212,2128,675,1318,⋯,8812,7645,59,33,8125,15433,1469,2814,39,2357
675c847bccbc53942ebb7b8cbb4efc4d,0,0,0,0,3,0,3,0,0,0,⋯,0,13,35,0,31,28,21,46743,9,20410
f5f5e0da89730462abaf6301a9557193,9,4132,2261,3458,209,1114,85,23022,1373,8091,⋯,13,1784,5,66,6542,1528,2573,2852,8,5038
403fb2a88868cc6e20138b715f157fea,0,0,0,0,0,0,0,0,0,0,⋯,4726,9224,2941,18254,4970,16591,8118,4696,20091,6915


In [42]:
# The taxonomy export is tab-separated; read.delim() is read.csv() with
# sep = "\t" as its default.
taxonomy <- read.delim(tax_path)
head(taxonomy)

Unnamed: 0_level_0,Feature.ID,Taxon,Confidence
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,10afda2baef44de4c584a6641de399b1,d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingomonas; s__Sphingomonas_paucimobilis,0.9846092
2,99deb3c5ecb022ec05609ebd1112a557,d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__Bacteroides_vulgatus,0.9830613
3,c6c3ab4e828fb40d6e05967b7aac9338,d__Bacteria; p__Firmicutes; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Blautia; s__Blautia_wexlerae,0.8237579
4,675c847bccbc53942ebb7b8cbb4efc4d,d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__Prevotella_copri,0.9839953
5,f5f5e0da89730462abaf6301a9557193,d__Bacteria; p__Firmicutes; c__Clostridia; o__Oscillospirales; f__Ruminococcaceae; g__Faecalibacterium; s__human_gut,0.8073996
6,403fb2a88868cc6e20138b715f157fea,d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae,0.9975062


Taxonomy is a bit trickier, since each taxon is stored as a single string rather than as separate columns. Therefore, we split each taxonomy string into its ranks based on their prefixes (e.g. `d__` is for domain/superkingdom).

In [43]:
# Convert the pieces of one split taxonomy string into a one-row tibble of
# rank names.
#
# vec_obj: character vector of rank entries such as "d__Bacteria" or
#          " p__Proteobacteria", i.e. one Taxon string split on ";".
#
# Returns a tibble with exactly one row and the columns superkingdom, phylum,
# class, order, family, genus and species. Each cell holds the entry's name
# with its rank prefix removed; ranks that are absent from vec_obj are
# NA_character_.
extract_names <- function(vec_obj) {
    rank_prefixes <- c(
        superkingdom = "d__",
        phylum = "p__",
        class = "c__",
        order = "o__",
        family = "f__",
        genus = "g__",
        species = "s__"
    )
    # Splitting "d__A; p__B" on ";" leaves the space that followed each
    # separator, so strip it here; otherwise every name but the first would
    # carry a leading blank into the taxonomy table.
    parts <- trimws(vec_obj)
    n_vec <- lapply(rank_prefixes, function(prefix) {
        # Only a prefix at the very start of an entry identifies its rank; a
        # "p__"-like sequence inside a name (e.g. "s__Clostridium_sp__AT4")
        # must neither be matched nor deleted. which() also drops NA entries.
        hits <- which(startsWith(parts, prefix))
        if (length(hits) == 0) {
            return(NA_character_)
        }
        # Keep the first match only so the result always has one row per
        # taxonomy string.
        substring(parts[[hits[[1]]]], nchar(prefix) + 1L)
    })
    as_tibble(n_vec)
}


Split by ";" and then convert to matrix to import into phyloseq objects

In [44]:
# Break every Taxon string into its ";"-separated pieces (list-column `tax`)
# and turn each set of pieces into a one-row tibble of ranks (list-column `t`).
taxonomy <- taxonomy %>%
    mutate(
        tax = str_split(Taxon, pattern = ";"),
        t = map(tax, extract_names)
    )

In [47]:
# Keep only the feature identifier and the per-rank columns, then move the
# identifier into the row names.
taxonomy <- taxonomy %>%
    select(-c(Taxon, tax, Confidence)) %>%
    unnest(t) %>%
    column_to_rownames(var = "Feature.ID")
    

In [52]:
# Convert the rank data frame to a character matrix and wrap it as a
# phyloseq taxonomy table.
taxonomy <- taxonomy %>% as.matrix()
taxtab <- taxonomy %>% tax_table()

Simple import into phyloseq

In [57]:
# Combine the OTU table and taxonomy table into one phyloseq object, show its
# summary, and store it under data/ for later use.
physeq <- phyloseq(otu, taxtab)
physeq
physeq %>%
    saveRDS(file = here("data", "pred_relabun_crc_16s_physeq.rds"))

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 6135 taxa and 141 samples ]
tax_table()   Taxonomy Table:    [ 6135 taxa by 7 taxonomic ranks ]

Let's do the same for the IBD data set.

In [58]:
# Project-root-relative locations of the exported ibd_16s feature table
# (BIOM) and taxonomy assignments (TSV).
data_path <- here(
    "output", "sequence_process_16s", "ibd_16s", "exports",
    "feature-table.biom"
)
tax_path <- here(
    "output", "sequence_process_16s", "ibd_16s", "exports",
    "taxonomy.tsv"
)

# Read the BIOM file and wrap its abundance matrix as a phyloseq OTU table
# with taxa on the rows.
data <- read_biom(biom_file = data_path)
otu <- data %>%
    biom_data() %>%
    as.matrix() %>%
    otu_table(taxa_are_rows = TRUE)
head(otu)

Unnamed: 0,1939.MGH100079,1939.MGH100698,1939.MGH100896.a,1939.MGH100896.b,1939.MGH101010,1939.MGH101089,1939.MGH102701,1939.MGH102797,1939.MGH103108,1939.MGH103128,⋯,1939.SKBTI089.b,1939.SKBTI090,1939.SKBTI091.a,1939.SKBTI091.b,1939.SKBTI092.a,1939.SKBTI092.b,1939.SKBTI093,1939.SKBTI094,1939.SKBTI095,1939.SKBTI096
b7d71352bb89f991f7a0023b7596bc1c,19,252,513,14539,63,2004,454,0,1481,38,⋯,293,16,1209,3110,3982,40131,26500,64,5897,8777
1ea34271f850146467c46488ea2b141a,130,174,0,24,412,30,200,789,1,17,⋯,4902,5,6,0,2824,22153,7920,23,1112,95
c6256b8ca548753525c7e45fc2faae7a,2738,373,703,15230,5,51,8,281,33,40,⋯,171,0,173,343,140,649,347,1799,2038,16354
1cce571d048c08affacf0c5018a2d16b,167,0,0,7,8,2,2,0,113,4,⋯,5902,0,140,303,753,5879,6604,79,439,192
b6861df288b33d1326996af77c70f680,47,0,0,11,0,0,0,0,11,8,⋯,96,0,368,440,0,86,356,49,241,7949
f86dfa006a15a63433aa50170c3b412f,6,0,0,0,37,0,0,0,403,0,⋯,8097,0,0,0,0,472,139,0,370,498


In [59]:
# Read the tab-separated taxonomy export, split each Taxon string on ";",
# expand the pieces into one column per rank, and finish with a character
# matrix whose row names are the feature identifiers.
taxonomy <- read.delim(tax_path) %>%
    mutate(
        tax = str_split(Taxon, pattern = ";"),
        t = map(tax, extract_names)
    ) %>%
    select(-c(Taxon, tax, Confidence)) %>%
    unnest(t) %>%
    column_to_rownames(var = "Feature.ID") %>%
    as.matrix()

# Wrap the matrix as a phyloseq taxonomy table.
taxtab <- tax_table(taxonomy)
head(taxtab)

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
b7d71352bb89f991f7a0023b7596bc1c,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides_vulgatus
1ea34271f850146467c46488ea2b141a,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides_fragilis
c6256b8ca548753525c7e45fc2faae7a,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia-Shigella,Escherichia_sp.
1cce571d048c08affacf0c5018a2d16b,Bacteria,Firmicutes,Clostridia,Oscillospirales,Ruminococcaceae,Faecalibacterium,human_gut
b6861df288b33d1326996af77c70f680,Bacteria,Firmicutes,Clostridia,Oscillospirales,Ruminococcaceae,Faecalibacterium,gut_metagenome
f86dfa006a15a63433aa50170c3b412f,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,


In [62]:
# Combine the OTU table and taxonomy table into one phyloseq object, show its
# summary, and store it under data/ for later use.
physeq <- phyloseq(otu, taxtab)
physeq
physeq %>%
    saveRDS(file = here("data", "pred_relabun_ibd_16s_physeq.rds"))

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4765 taxa and 639 samples ]
tax_table()   Taxonomy Table:    [ 4765 taxa by 7 taxonomic ranks ]