In [2]:
library(piggyback)
library(data.table)
library(dtplyr)
library(targets)
library(here)
library(stringdist)
library(tidyverse)
here::i_am("analysis/db_prep.ipynb");

here() starts at /dartfs-hpc/rc/home/k/f00345k/research/microbe_set_trait

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mtidyr[39m::[32mextract()[39m   masks [34mstringdist[39m::extract()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[

The following code uploads the raw GOLD spreadsheet to a GitHub release (tag `0.1`) using the `piggyback` R package:
```r
piggyback::pb_upload(file = here("large_files", "goldData.xlsx"), tag = "0.1", overwrite = TRUE)
```

In [8]:
# Load the condensed species-level trait table and keep only the taxonomy
# and trait columns used below; `carbon_substrates` is renamed to `substrate`.
base <- read_csv(here("data", "condensed_species_NCBI.txt"))
base <- select(
    base,
    species_tax_id, superkingdom, phylum, class, order, family, genus, species,
    metabolism, gram_stain, pathways, carbon_substrates,
    sporulation, motility, cell_shape
)
base <- rename(base, substrate = carbon_substrates)

[1mRows: [22m[34m14893[39m [1mColumns: [22m[34m79[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (19): species, genus, family, order, class, phylum, superkingdom, gram_s...
[32mdbl[39m (60): species_tax_id, d1_lo, d1_up, d2_lo, d2_up, doubling_h, genome_siz...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [5]:
# Location of the cached CSV copy of the GOLD "Organism" sheet
pth <- here("large_files", "goldData.csv")

# On a cache miss: fetch the spreadsheet from the GitHub release, read the
# "Organism" sheet and write it out as CSV so later runs skip this step.
if (!file.exists(pth)){
    piggyback::pb_download(
        file = "goldData.xlsx",
        dest = here("large_files"),
        repo = "qpmnguyen/microbe_set_trait",
        tag = "0.1"
    )
    gold <- readxl::read_xlsx(here("large_files", "goldData.xlsx"), sheet = "Organism")
    readr::write_csv(gold, pth)
}

# Always work from the CSV copy
gold <- read_csv(pth)

“One or more parsing issues, see `problems()` for details”
[1mRows: [22m[34m428241[39m [1mColumns: [22m[34m42[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (35): ORGANISM GOLD ID, ORGANISM NAME, ORGANISM NCBI SUPERKINGDOM, ORGAN...
[32mdbl[39m  (4): ORGANISM NCBI TAX ID, ORGANISM ISOLATION PUBMED ID, ORGANISM ECOSY...
[33mlgl[39m  (3): ORGANISM SALINITY CONCENTRATION, ORGANISM PRESSURE, ORGANISM CARBO...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [7]:
# Normalise the GOLD column names: spaces become underscores, everything is
# lower-cased, and the "organism_" prefix is dropped.
colnames(gold) <- gsub(
    pattern = "organism_",
    replacement = "",
    x = tolower(gsub(pattern = " ", replacement = "_", x = colnames(gold)))
)

# Keep the taxonomy and trait columns and rename them to the shared schema.
# GOLD's `metabolism` column becomes `pathways`, and `oxygen_requirement`
# becomes `metabolism`.
gold_reduced <- as.data.table(
    rename(
        select(
            gold,
            ncbi_tax_id, ncbi_superkingdom,
            ncbi_phylum, ncbi_class, ncbi_order, ncbi_family,
            ncbi_genus, ncbi_species,
            name, gram_stain, metabolism, oxygen_requirement,
            sporulation, motility, cell_shape
        ),
        species_tax_id = ncbi_tax_id,
        superkingdom = ncbi_superkingdom,
        phylum = ncbi_phylum,
        class = ncbi_class,
        order = ncbi_order,
        family = ncbi_family,
        genus = ncbi_genus,
        species = ncbi_species,
        pathways = metabolism,
        metabolism = oxygen_requirement
    )
)

# Nest the six trait columns into a `traits` list-column, dropping the
# free-text `name` column first. Every column that is not nested stays as a
# key of the result.
# NOTE(review): `class` is missing from the group_by() below. Because it is
# not one of the nested columns it still ends up as a key column of the
# nested table (see the printed call in the cell output), but the grouping
# carried forward omits it -- confirm whether that is intentional.
tbl <- gold_reduced %>%
    select(-name) %>%
    group_by(species_tax_id, superkingdom, phylum, order, 
             family, genus, species) %>%
    nest(traits = c(gram_stain, pathways, metabolism, 
           cell_shape, motility, sporulation))
    

# Subset of `tbl` whose nested trait table has more than one row; these are
# the entries that need to be collapsed to a single row.
tbl_munge <- tbl %>% filter(map_lgl(traits, ~{nrow(.x) > 1}))

# Display the subset
tbl_munge

[1mSource: [22mlocal data table [11,177 x 9]
[1mGroups: [22mspecies_tax_id, superkingdom, phylum, order, family, genus, species
[1mCall:[22m
  _DT4 <- `_DT3`[, .(species_tax_id, superkingdom, phylum, class, order, family, genus,
  _DT4 <-   species, gram_stain, pathways, metabolism, sporulation, motility, cell_shape)][
  _DT4 <-   , .(traits = .(.SD)), by = .(species_tax_id, superkingdom, phylum, class,
  _DT4 <-     order, family, genus, species)]
  `_DT4`[`_DT4`[, .I[map_lgl(traits, ~{
    nrow(.x) > 1
})], by = .(species_tax_id, superkingdom, phylum, order, family, 
    genus, species)]$V1]

  species_tax_id superkingdom phylum       class      order family genus species
           [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m        [3m[90m<chr>[39m[23m        [3m[90m<chr>[39m[23m      [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m  
[90m1[39m          [4m5[24m[4m4[24m005 Bacteria     Firmicutes   Tissier

In [8]:
# Majority vote over one column of a trait table.
#
# @param df      A data frame (data.frame, tibble or data.table).
# @param column  Name of the column to summarise (single string).
# @return A length-one character vector: the value that accounts for at
#   least half of the non-missing entries, or NA_character_ when the column
#   has no non-missing entries, when no value reaches 50%, or when two values
#   tie at exactly 50% (no unambiguous winner).
#
# The result is always exactly one element. Previously a column without a
# majority produced character(0) and a 50/50 tie produced two values, which
# made the caller's map_dfc() return a 0-row or 2-row table.
select_best <- function(df, column){
    # `[[` works for data.table as well as plain data frames
    vec <- unlist(df[[column]], use.names = FALSE)
    # table() ignores NA by default, so only observed values are counted
    counts <- table(vec)
    total <- sum(counts)
    if (length(counts) == 0 || total == 0){
        return(NA_character_)
    }
    top <- max(counts)
    winners <- names(counts)[counts == top]
    if (top / total < 0.5 || length(winners) != 1){
        return(NA_character_)
    }
    winners
}

# Collapse a nested trait table that may hold several rows for one species.
# After removing exact duplicate rows, a single remaining row is returned
# unchanged. Otherwise every trait is reduced with select_best(), except
# `pathways`, whose non-missing entries are concatenated with ", " (the first
# space of each entry is replaced by "_").
# Returns a data.table.
process_duplicates <- function(df){
    deduped <- unique(df)
    # nothing to reconcile when only one distinct row is left
    if (nrow(deduped) == 1){
        return(deduped)
    }
    trait_names <- c("gram_stain", "pathways", "metabolism", 
                     "sporulation", "motility", "cell_shape")
    collapse_one <- function(trait){
        if (trait != "pathways"){
            return(select_best(deduped, trait))
        }
        observed <- as.vector(na.omit(deduped$pathways))
        if (length(observed) == 0){
            return(NA_character_)
        }
        paste(
            str_replace(observed, pattern = " ", replacement = "_"),
            collapse = ", "
        )
    }
    # map_dfc() auto-names the unnamed columns and reports it; silence that
    suppressMessages(collapsed <- map_dfc(trait_names, collapse_one))
    colnames(collapsed) <- trait_names
    as.data.table(collapsed)
}

In [9]:
# Reduce every multi-row trait table with process_duplicates(), then
# materialise as a tibble and spread the traits back into columns.
tbl_munge <- mutate(tbl_munge, traits = map(traits, process_duplicates))
tbl_munge <- unnest(as_tibble(tbl_munge), traits)

tbl_munge

In [None]:
# Tax ids that were already collapsed in `tbl_munge`
ids <- pull(tbl_munge, species_tax_id)

# Unnest everything that was not collapsed and append the collapsed rows
gold_final <- bind_rows(
    unnest(as_tibble(filter(tbl, !species_tax_id %in% ids)), traits),
    tbl_munge
)

# Final cleaning: recode GOLD trait values to the shared vocabulary
gold_final <- mutate(
    gold_final,
    metabolism = tolower(metabolism),
    gram_stain = if_else(gram_stain == "Gram-", "negative", "positive"),
    sporulation = if_else(sporulation == "Nonsporulating", "no", "yes"),
    motility = case_when(
        motility == "Motile" ~ "yes",
        motility == "Nonmotile" ~ "no",
        TRUE ~ motility
    ),
    # strip the "-shaped" suffix before mapping shapes
    cell_shape = tolower(str_replace(cell_shape, "-shaped", "")),
    cell_shape = case_when(
        cell_shape %in% "rod" ~ "bacillus",
        cell_shape %in% c("sphere", "oval", "bean",
                          "coccoid", "ovoid", "spore") ~ "coccus",
        cell_shape %in% "helical" ~ "spiral",
        cell_shape %in% "curved" ~ "vibrio",
        # flask: only Mycoplasma genitalium
        # open-ring: only Thiomicrospira cyclica
        # lancet: only Nitrolancea hollandica
        cell_shape %in% c("flask", "open-ring", "lancet") ~ "irregular",
        TRUE ~ cell_shape
    )
)

head(gold_final)
# free the large intermediates before moving on
rm(gold, gold_reduced, tbl, tbl_munge)
gc()

## Processing Weissman et al.  

In [10]:
# Read the Weissman et al. table and keep the species id, the taxonomy, the
# four general trait columns and every enzyme-assay, volatile-gas and
# substrate-utilization column.
weissman <- read_csv(here("data", "weissman.csv")) %>%
    select(
        taxid_species,
        kingdom, phylum, class, order, family, genus, species,
        Motility_general, Oxygen.Preference, Cell.Shape, Cell.Aggregation,
        starts_with("Enzyme.Assays"),
        starts_with("Volatile.Gas.Production"),
        starts_with("Substrate.Utilization")
    )

[1mRows: [22m[34m3369[39m [1mColumns: [22m[34m174[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (16): Organism, kingdom, phylum, class, order, family, genus, species, ...
[32mdbl[39m (158): taxid_kingdom, taxid_phylum, taxid_class, taxid_order, taxid_fami...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [None]:
# Pack the enzyme-assay and volatile-gas columns into a `pathways`
# list-column and the substrate-utilization columns into `substrate`.
weissman <- weissman %>%
    group_by(taxid_species, kingdom, phylum,
             class, order, family, genus, species) %>%
    nest(
        pathways = starts_with(c("Enzyme.Assays", "Volatile.Gas.Production")),
        substrate = starts_with("Substrate.Utilization")
    ) %>%
    ungroup()

# In character columns the string "0" stands for a missing value
weissman <- mutate(weissman, across(where(is.character), ~na_if(.x, "0")))

# Rename to the shared schema
weissman <- rename(
    weissman,
    species_tax_id = taxid_species,
    superkingdom = kingdom,
    metabolism = Oxygen.Preference,
    motility = Motility_general,
    cell_shape = Cell.Shape,
    cell_aggregation = Cell.Aggregation
)

# motility, cell shape, metabolism: inspect the cell-shape vocabulary
unique(weissman$cell_shape)

# Recode the Weissman trait values to the vocabulary used for the other
# sources.
weissman <- weissman %>% 
    mutate(
        # "non-motile" means the organism does NOT move, so it maps to "no";
        # any other non-missing value maps to "yes". (The two labels were
        # previously swapped, contradicting the GOLD recoding where
        # "Nonmotile" is "no".) Missing values stay NA.
        motility = if_else(motility == "non-motile", "no", "yes"), 
        cell_shape = case_when(
            cell_shape == "rod" ~ "bacillus",
            cell_shape == "ovoid/coccobacillus" ~ "coccus", 
            cell_shape == "spirillum/corkscrew" ~ "spiral",
            TRUE ~ cell_shape
        ), 
        metabolism = case_when(
            metabolism == "microaerophile" ~ "microaerophilic",
            TRUE ~ metabolism
        ), 
        # values ending in "obe" get the ending "obic"
        metabolism = str_replace(metabolism, 
                                 pattern = "obe$", replacement = "obic")
    )

In [None]:
#' Turn one nested pathways table into a single comma-separated string.
#'
#' Columns whose value is non-zero are kept, and their names are cleaned:
#' the "Enzyme.Assays.." prefix is removed, the "Volatile.Gas.Production.."
#' prefix becomes "synthesis_", a trailing dot and anything from a remaining
#' ".." onwards are dropped, and dots become underscores. Names containing
#' "synthesis_" are split at the first underscore and the two halves swapped.
#'
#' @param unit A single unit from a list of trait data frames
#' @return One string of cleaned names joined by ", ", or NA when no column
#'     has a non-zero value.
proc_pathways <- function(unit){
    positive <- filter(pivot_longer(unit, everything()), value != 0)
    if (nrow(positive) == 0){
        return(NA)
    }

    cleaned <- positive$name
    cleaned <- str_replace(cleaned, pattern = "Enzyme.Assays..", replacement = "")
    cleaned <- str_replace(cleaned, pattern = "Volatile.Gas.Production..", 
                           replacement = "synthesis_")
    cleaned <- str_replace(cleaned, pattern = "\\.$", "")
    cleaned <- str_replace(cleaned, pattern = "\\.\\.(.*)$", "")
    cleaned <- str_replace_all(cleaned, pattern = "\\.", "_")

    # swap the halves around the first underscore for gas-production names
    swap_synthesis <- function(nm){
        if (str_detect(nm, "synthesis_")){
            halves <- str_split(nm, pattern = "_", n = 2)[[1]]
            paste(rev(halves), collapse = "_")
        } else {
            nm
        }
    }

    paste(map_chr(cleaned, swap_synthesis), collapse = ", ")
}

# Turn one nested substrate table into a single comma-separated string.
# Columns with a non-zero value are kept; from each column name the part
# after the first ".." is taken, a trailing "." or ".." is removed, and the
# remaining dots (single or double) become "_".
# Returns NA when no column has a non-zero value.
proc_substrate <- function(unit){
    used <- filter(pivot_longer(unit, everything()), value != 0)
    if (nrow(used) == 0){
        return(NA)
    }
    labels <- str_split(used$name, pattern = "\\.\\.", 
                        n = 2, simplify = TRUE)[,2]
    labels <- str_replace(labels, "(\\.\\.|\\.)$", "")
    labels <- str_replace_all(labels, "(\\.\\.|\\.)", "_")
    paste(labels, collapse = ", ")
}


In [None]:
# Flatten the nested substrate / pathway tables to one string per species
substr <- map_chr(weissman[["substrate"]], proc_substrate)
pthway <- map_chr(weissman[["pathways"]], proc_pathways)

# Swap the nested list-columns for their string versions
weissman <- mutate(
    select(weissman, -c(pathways, substrate)),
    substrate = substr,
    pathways = pthway
)

## Checking for duplicates

In [None]:
# Find trait terms in `df` that are spelled almost, but not exactly, like a
# term in the global `base` table.
#
# @param df    Data frame holding a comma-separated string column named by
#              `type`.
# @param type  Name of the column to compare in both `df` and `base`. When
#              the default vector is left in place its first entry
#              ("pathways") is used. The second default used to be
#              "substrates", which is not a column of either table.
# @return A list of tibbles (columns `query`, `ref`), one per query term that
#   has at least one reference term at string distance 1 or 2. An empty list
#   when there are no such terms.
check_matches <- function(df, type = c("pathways", "substrate")){
    # a length > 1 `type` (i.e. the untouched default) means "use the first"
    type <- type[[1]]

    # unique, non-missing terms obtained by splitting the column on ", "
    split_terms <- function(tbl){
        tbl %>% pull(!!type) %>% unique() %>% str_split(pattern = ", ") %>%
            unlist() %>% unique() %>% na.omit() %>% as.vector()
    }
    b_val <- split_terms(base)
    q_val <- split_terms(df)

    check <- map(q_val, function(term){
        dist <- stringdist(a = term, b = b_val)
        # distance 0 is the same term, distance > 2 is too different
        ret <- b_val[dist > 0 & dist <= 2]
        if (length(ret) == 0){
            return(NULL)
        }
        tibble(query = rep(term, length(ret)), ref = ret)
    })

    # Drop terms without near matches. Filter() also copes with an empty
    # list, where the previous `!sapply(...)` raised an error.
    Filter(Negate(is.null), check)
}

In [None]:
# Near-duplicate pathway and substrate terms between Weissman and the base table
Reduce(rbind, check_matches(weissman, "pathways"))
Reduce(rbind, check_matches(weissman, "substrate"))

In [None]:
# Near-duplicate pathway terms between GOLD and the base table
Reduce(rbind, check_matches(gold_final, "pathways"))

In [None]:
# Split each string of `vec` into normalised terms.
# The input is lower-cased and split on ", " or "|"; each piece is trimmed,
# hyphens are removed, spaces become "_" and duplicates are dropped.
# Returns a list with one character vector per element of `vec`.
trim_path <- function(vec){
    normalise_terms <- function(terms){
        terms <- str_trim(terms)
        terms <- str_replace_all(terms, "\\-", "")
        terms <- str_replace_all(terms, " ", "_")
        unique(terms)
    }
    pieces <- str_split(tolower(vec), pattern = "(, |\\|)")
    map(pieces, normalise_terms)
}


# Convert the comma-separated pathway / substrate strings of every source
# into list-columns of normalised terms.
base[["pathways"]] <- trim_path(base[["pathways"]])
base[["substrate"]] <- trim_path(base[["substrate"]])
gold_final[["pathways"]] <- trim_path(gold_final[["pathways"]])
weissman[["pathways"]] <- trim_path(weissman[["pathways"]])
weissman[["substrate"]] <- trim_path(weissman[["substrate"]])

## Combine all data frames

In [None]:
# Stack the three sources, tagging every row with where it came from
complete <- bind_rows(
    base %>% mutate(source = "madin"), 
    gold_final %>% mutate(source = "gold"), 
    weissman %>% mutate(source = "weissman")
)

# Number of rows per species (tax id + full taxonomy); rows without a
# species name are not counted
tally <- complete %>% filter(!is.na(species)) %>% 
    group_by(species_tax_id, superkingdom, phylum, class, order, 
             family, genus, species) %>% 
    tally()

# Tax ids that appear more than once and therefore need reconciling
multiple_rows <- tally %>% filter(n >= 2) %>% pull(species_tax_id)

reconcile <- complete %>% filter(species_tax_id %in% multiple_rows)

# Nest all trait columns (and `source`) per species. `metabolism` is one of
# the nested columns, so it must not also be a grouping variable; it was
# previously listed in group_by() as well. The nest keys are the columns that
# are not nested, i.e. the tax id and the taxonomy.
reconcile <- reconcile %>% 
    group_by(species_tax_id, superkingdom, phylum, class, order,
             family, genus, species) %>%
    nest(data = c(metabolism, gram_stain, pathways, substrate, sporulation, 
                  motility, cell_shape, cell_aggregation, source)) %>% 
    ungroup()

In [None]:
#' Collapse the trait rows of one species (one row per source) into one row.
#'
#' Scalar traits are decided by majority vote over the non-missing values:
#' a value is kept only if it is the single most frequent one and accounts
#' for at least half of the votes. With no votes, no majority, or a 50/50
#' tie the trait is set to NA. List traits (`pathways`, `substrate`) are the
#' union of the entries of all rows.
#'
#' The vote always yields exactly one value per trait. Previously a trait
#' without a majority produced a zero-length column (the list-column
#' assignment below then failed) and a tie produced two rows for the species.
#'
#' @param df This is a data frame of multiple columns, where the columns of 
#'     pathways and substrates are themselves lists 
#' @return A one-row tibble with the scalar traits as character columns and
#'     `pathways` / `substrate` as list-columns.
collapse_trait <- function(df){
    nonlist <- c("metabolism", "gram_stain", "sporulation", "motility", 
                 "cell_shape", "cell_aggregation")

    # single winning value of a vector, or NA_character_ if there is none
    majority_vote <- function(values){
        # table() ignores NA by default
        counts <- table(values)
        total <- sum(counts)
        if (length(counts) == 0 || total == 0){
            return(NA_character_)
        }
        top <- max(counts)
        winners <- names(counts)[counts == top]
        if (top / total < 0.5 || length(winners) != 1){
            return(NA_character_)
        }
        winners
    }

    # map_dfc() auto-names the unnamed columns and reports it; silence that
    out <- suppressMessages(map_dfc(nonlist, ~{
        majority_vote(df %>% pull(.x))
    }))
    names(out) <- nonlist
    out$pathways <- list(df %>% pull("pathways") %>% 
                             Reduce(f = c, x = .) %>% unique())
    out$substrate <- list(df %>% pull("substrate") %>% 
                              Reduce(f = c, x = .) %>% 
                              unique())
    return(out)
}

# Collapse the per-source rows of every species and drop the raw nested data
reconcile <- select(
    mutate(reconcile, traits = map(data, collapse_trait)),
    -data
)

In [None]:
# Reconciled species first, then every species that needed no reconciling
complete <- bind_rows(
    unnest(reconcile, traits),
    filter(complete, !species_tax_id %in% multiple_rows)
)

# The RDS keeps pathways / substrate as list-columns
saveRDS(complete, file = here("output", "db_merged.rds"))

# For the CSV, each list entry is flattened to one ", "-separated string
complete %>%
    mutate(across(c(pathways, substrate), 
                  ~map_chr(.x, paste, collapse = ", "))) %>%
    write_csv(x = ., file = here("output", "db_merged.csv"))