# Processing Weissman et al. database. 

Last updated 2022-04-14.    
Quang Nguyen.    

Notebook ran under R version 4.1.2 and dependencies were managed using `renv`. See `renv.lock` and the `DESCRIPTION` file for more details on dependencies.     

In [3]:
library(tidyverse)
library(dtplyr)
library(data.table)
library(here)
here::i_am("notebooks/weissman_proc.ipynb")

here() starts at /dartfs-hpc/rc/home/k/f00345k/research/microbe_set_trait



In [4]:
weissman <- read_csv(here("data", "weissman.csv"))

# select the relevant columns
weissman <- weissman %>% select(c("taxid_species",
                                  "kingdom", "phylum", "class", "order", "family", "genus", "species",
                                  "Motility_general", "Oxygen.Preference", "Cell.Shape",
                                  starts_with("Enzyme.Assays"), 
                                  starts_with("Volatile.Gas.Production"), 
                                  starts_with("Substrate.Utilization")))

[1mRows: [22m[34m3369[39m [1mColumns: [22m[34m174[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (16): Organism, kingdom, phylum, class, order, family, genus, species, ...
[32mdbl[39m (158): taxid_kingdom, taxid_phylum, taxid_class, taxid_order, taxid_fami...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Here, we're going to re-format it similar to Madin et al. `Enzyme.Assays` and `Volatile.Gas.Production` is equivalent to `pathways` while `Substrate.Utililization` is equivalent to `carbon_substrates`. Since there might be non-carbon compounds here, we're going to rename Madin et al.'s `carbon_substrates` into just `substrate` here similar to Weissman's database.  

In [5]:
weissman <- weissman %>% group_by(taxid_species, kingdom, phylum, 
                      class, order, family, genus, species) %>%
    nest(pathways = starts_with(c("Enzyme.Assays", "Volatile.Gas.Production")), 
         substrate = starts_with("Substrate.Utilization")) %>% ungroup() %>% 
    mutate(across(where(is.character), ~na_if(., "0"))) %>% 
    rename("species_tax_id" = taxid_species, "superkingdom" = kingdom, 
           "metabolism" = Oxygen.Preference,
           "motility" = Motility_general, 
           "cell_shape" = Cell.Shape)

# motility, cell shape, metabolism
weissman <- weissman %>% 
    mutate(motility = if_else(motility == "non-motile", "yes", "no"), 
                    cell_shape = case_when(
                        cell_shape == "rod" ~ "bacillus",
                        cell_shape == "ovoid/coccobacillus" ~ "coccus", 
                        cell_shape == "spirillum/corkscrew" ~ "spiral",
                        TRUE ~ cell_shape
                    ), 
                    metabolism = case_when(
                        metabolism == "microaerophile" ~ "microaerophilic",
                        TRUE ~ metabolism
                    ), 
                    metabolism = str_replace(metabolism, 
                                             pattern = "obe$", replacement = "obic"))

In [6]:
weissman <- weissman %>% drop_na(species_tax_id) 
head(weissman)[,1:6]

species_tax_id,superkingdom,phylum,class,order,family
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
46125,Bacteria,Firmicutes,Bacilli,Lactobacillales,Aerococcaceae
155978,Bacteria,Cyanobacteria,,Synechococcales,Acaryochloridaceae
435,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae
438,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae
33952,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae
33952,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae


Here we're going to define some functions to process the traits that is internal (collapse into one column) for pathways and for substrates

In [7]:
#' @param unit A single unit from a list of trait data frames
proc_pathways <- function(unit){
    string <- unit %>% pivot_longer(everything()) %>% 
        filter(value != 0)
    if (nrow(string) == 0){
        return(NA)
    } else {
        string <- string %>% 
            mutate(name = str_replace(name, pattern = "Enzyme.Assays..", replacement = "")) %>%
            mutate(name = str_replace(name, pattern = "Volatile.Gas.Production..", replacement = "synthesis_")) %>%
            mutate(name = str_replace(name, pattern = "\\.$", "")) %>%
            mutate(name = str_replace(name, pattern = "\\.\\.(.*)$", "")) %>%
            mutate(name = str_replace_all(name, pattern = "\\.", "_")) %>% 
            rowwise() %>%
            mutate(name = if_else(str_detect(name, "synthesis_"), 
                                  true = paste(rev(str_split(name, pattern = "_", 
                                                             n = 2)[[1]]), 
                                               collapse = "_"),
                                  false = name)) %>% 
            ungroup() %>%
            pull(name) %>% paste(., collapse = ", ")
    }
        
    return(string)
}

proc_substrate <- function(unit){
    string <- unit %>% pivot_longer(everything()) %>% filter(value != 0)
    if (nrow(string) == 0){
        return(NA)
    } else {
        string <- string %>% mutate(name = str_split(name, pattern = "\\.\\.", 
                                           n = 2, simplify = TRUE)[,2]) %>%
            mutate(name = str_replace(name, "(\\.\\.|\\.)$", "")) %>% 
            mutate(name = str_replace_all(name, "(\\.\\.|\\.)", "_")) %>% 
            pull(name) %>% paste(., collapse = ", ")
    }
    return(string)
}


Here, we apply the function to substrate and pathways

In [8]:
substr <- map_chr(weissman$substrate, proc_substrate)
pthway <- map_chr(weissman$pathways, proc_pathways)

weissman <- weissman %>% select(-c(pathways, substrate)) %>% 
  mutate(substrate = substr, pathways = pthway)

In [9]:
head(weissman)

species_tax_id,superkingdom,phylum,class,order,family,genus,species,motility,metabolism,cell_shape,substrate,pathways
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
46125,Bacteria,Firmicutes,Bacilli,Lactobacillales,Aerococcaceae,Abiotrophia,defectiva,yes,facultative anaerobic,,"sucrose, trehalose",alpha_galactosidase
155978,Bacteria,Cyanobacteria,,Synechococcales,Acaryochloridaceae,Acaryochloris,marina,yes,,,,
435,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,aceti,,obligate aerobic,,"butanol, ethanol, glucose, glycerol, mannitol, mannose, xylose","catalase, hydrogen_sulfide_synthesis"
438,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,pasteurianus,,obligate aerobic,,"alanine, butanol, ethanol, glucose, glycerol, proline",catalase
33952,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae,Acetobacterium,Woodii,no,obligate anaerobic,,"butanol, ethanol, formate, fructose, glucose, glycerol, lactose, pyruvate, 2_3_butanediol",acetoin
33952,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae,Acetobacterium,woodii,no,obligate anaerobic,,"formate, glucose, lactate, sugars_generic",


In [10]:
write_csv(weissman, here("output", "databases", "weissman_proc.csv"))
saveRDS(weissman, here("output", "databases", "weissman_proc.rds"))