ropensci · sckott · Oct 7, 2017 · Oct 2, 2017 · Oct 2, 2017 · Oct 2, 2017
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,6 +38,7 @@ Imports:
     plyr,
     foreach,
     ape,
+    zoo,
     bold (>= 0.3.5),
     data.table,
     rredlist (>= 0.3.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -326,6 +326,7 @@ importFrom(bold,bold_tax_id)
 importFrom(bold,bold_tax_name)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDF)
+importFrom(data.table,transpose)
 importFrom(foreach,"%do%")
 importFrom(foreach,foreach)
 importFrom(graphics,plot)
@@ -355,6 +356,7 @@ importFrom(reshape2,dcast)
 importFrom(reshape2,melt)
 importFrom(stats,aggregate)
 importFrom(stats,as.dist)
+importFrom(stats,complete.cases)
 importFrom(stats,hclust)
 importFrom(stats,na.omit)
 importFrom(stats,setNames)
@@ -376,3 +378,4 @@ importFrom(xml2,xml_find_first)
 importFrom(xml2,xml_name)
 importFrom(xml2,xml_ns)
 importFrom(xml2,xml_text)
+importFrom(zoo,na.locf)
diff --git a/R/class2tree.R b/R/class2tree.R
@@ -1,4 +1,4 @@
-#' Convert list of classifications to a tree.
+#' Convert a list of classifications to a tree.
 #'
 #' This function converts a list of hierarchies for individual species into
 #' a single species by taxonomic level matrix, then calculates a distance
@@ -56,30 +56,50 @@ class2tree <- function(input, varstep = TRUE, check = TRUE, ...) {
     message('Removed species without classification.')
     input <- input[!is.na(input)]
   }
+
   # Check that there is more than 2 taxon
   if (length(input) < 3)
     stop("Your input list of classifications must be 3 or longer.")
+
+  if (length(unique(names(input))) < length(names(input)))
+    stop("Input list of classifications contains duplicates")
+
   dat <- rbind.fill(lapply(input, class2tree_helper))
-  df <- dat[ , !apply(dat, 2, function(x) any(is.na(x))) ]
+  # Get rank and ID list
+  rankList <- rbind.fill(lapply(input, get_rank))
+  nameList <- rbind.fill(lapply(input, get_name))
+
+  # Create taxonomy matrix
+  df <- taxonomy_table_creator(nameList,rankList)
+
   if (!inherits(df, "data.frame")) {
     stop("no taxon ranks in common - try different inputs")
   }
+
   row.names(df) <- df[,1]
   df <- df[,-1]
+
+  # calculate distance matrix
   taxdis <- tryCatch(taxa2dist(df, varstep = varstep, check = check),
                      error = function(e) e)
+
+  tdf = t(df)
+  for (i in 1:ncol(tdf)){
+    tdf[,i][duplicated(tdf[,i])] <- NA
+  }
+
+
   # check for incorrect dimensions error
   if (is(taxdis, 'simpleError'))
     stop("Try check=FALSE, but see docs for taxa2dist function in the vegan package for details.")
   out <- as.phylo.hclust(hclust(taxdis, ...))
-  res <- list(phylo = out, classification = dat, distmat = taxdis,
+  res <- list(phylo = out, classification = as.data.frame(t(tdf)), distmat = taxdis,
               names = names(input))
   class(res) <- 'classtree'
   return( res )
 }
 
 class2tree_helper <- function(x){
-  x <- x[!x$rank == "no rank", ]
   df <- x[-nrow(x), 'name']
   names(df) <- x[-nrow(x), 'rank']
   df <- data.frame(t(data.frame(df)), stringsAsFactors = FALSE)
@@ -104,7 +124,6 @@ print.classtree <- function(x, ...) {
   print(x$phylo)
 }
 
-
 # Function from the vegan package
 # CRAN: http://cran.rstudio.com/web/packages/vegan/
 # License: GPL-2
@@ -152,3 +171,217 @@ taxa2dist <- function(x, varstep = FALSE, check = TRUE, labels) {
     warning("you used 'check=FALSE' and some distances are zero -- was this intended?")
   out
 }
+
+###############################################################################
+#################### GET LIST OF ALL TAXONOMY RANK AND IDs ####################
+###############################################################################
+get_rank <- function(x){
+  rankDf <- x[, 'rank']
+  names(rankDf) <- x[, 'rank']
+
+  idDf <- x[, 'id']
+  joinedDf <- cbind(data.frame(rankDf,stringsAsFactors=FALSE),
+                    data.frame(idDf,stringsAsFactors=FALSE))
+  joinedDf <- within(joinedDf,
+                     rankDf[rankDf=='no rank'] <-
+                     paste0("norank_",idDf[rankDf=='no rank']))
+
+  df <- data.frame(t(data.frame(rev(joinedDf$rankDf))),
+                   stringsAsFactors = FALSE)
+  outDf <- data.frame(tip = x[nrow(x), "name"], df, stringsAsFactors = FALSE)
+  return(outDf)
+}
+
+get_name <- function(x){
+  rankDf <- x[, 'rank']
+  names(rankDf) <- x[, 'rank']
+
+  nameDf <- x[, 'name']
+  idDf <- x[, 'id']
+
+  joinedDf <- cbind(data.frame(rankDf,stringsAsFactors=FALSE),
+                    data.frame(nameDf,stringsAsFactors=FALSE))
+  joinedDf <- within(joinedDf,
+                     rankDf[rankDf=='no rank'] <-
+                     paste0("norank_",idDf[rankDf=='no rank']))
+  joinedDf$name <- paste0(joinedDf$nameDf,"#",joinedDf$rankDf)
+
+  df <- data.frame(t(data.frame(rev(joinedDf$name))), stringsAsFactors = FALSE)
+  outDf <- data.frame(tip = x[nrow(x), "name"], df, stringsAsFactors = FALSE)
+  return(outDf)
+}
+
+###############################################################################
+#################### INDEXING ALL AVAILABLE RANKS (INCLUDING NORANK) ##########
+###############################################################################
+rank_indexing <- function(rankList){
+  ##### input is a dataframe, where each row is a rank list of a taxon
+
+  ### get all available ranks from input rankList
+  uList <- unlist(rankList)
+  # remove unique rank by replacing with NA
+  # (unique rank is uninformative for sorting taxa)
+  uList[!duplicated(uList)] <- NA
+  # get final list of available ranks (remove NA items)
+  allInputRank <- as.character(unique(uList))
+  allInputRank <- allInputRank[!is.na(allInputRank)]
+
+  ### initial index for main ranks
+  mainRank <- c("strain","forma","subspecies","varietas","species",
+                "speciessubgroup","speciesgroup","subgenus","genus","subtribe",
+                "tribe","subfamily","family","superfamily","parvorder",
+                "infraorder","suborder","order","superorder","infraclass",
+                "subclass","class","superclass","subphylum","phylum",
+                "superphylum","subkingdom","kingdom","superkingdom")
+  rank2Index <- new.env()
+  for(i in 1:length(mainRank)){
+    rank2Index[[mainRank[i]]] = i
+  }
+
+  ### the magic happens here
+  for(k in 1:nrow(rankList)){
+    ### get subset of rank list for current taxon which
+    ### contains only ranks existing in allInputRank
+    subList <- rankList[k,][!is.na(rankList[k,])]
+    filter <- sapply(subList, function(x) x %in% allInputRank)
+    subList <- subList[filter]
+
+    ### now go to each rank and check...
+    for(i in 1:length(subList)){
+      ## if it has no index (probably only for norank), then...
+      if(is.null(rank2Index[[subList[i]]])){
+        ## set index for this rank = the [available] index of previous rank + 1
+        for(j in 1:length(subList)){
+          if(!is.null(rank2Index[[subList[i-j]]])){
+            rank2Index[[subList[i]]] = rank2Index[[subList[i-j]]] + 1
+            break
+          } else {
+            j = j-1
+          }
+        }
+      }
+      ## else, check if the current index is smaller than
+      ## the index of previous rank,
+      else {
+        if(i>1){
+          preRank <- subList[i-1]
+          ## if so, increase index of this current rank
+          ## by (index of previous rank + 1)
+          if(rank2Index[[subList[i]]] <= rank2Index[[preRank]]){
+            rank2Index[[subList[i]]] = rank2Index[[preRank]] + 1
+          }
+        }
+      }
+    }
+  }
+
+  ### output a list of indexed ranks
+  index2RankDf <- data.frame("index"=character(),
+                             "rank"=character(),
+                             stringsAsFactors=FALSE)
+  for(i in 1:length(allInputRank)){
+    index2RankDf[i,] = c(rank2Index[[allInputRank[i]]],allInputRank[i])
+  }
+
+  index2RankDf$index <- as.numeric(index2RankDf$index)
+  index2RankDf <- index2RankDf[with(index2RankDf, order(index2RankDf$index)),]
+
+  return(index2RankDf)
+}
+
+###############################################################################
+#################### ARRANGE RANK IDs INTO SORTED RANK LIST (index2RankDf) ####
+###############################################################################
+taxonomy_table_creator <- function(nameList,rankList){
+  colnames(nameList)[1] <- "tip"
+
+  ### get indexed rank list
+  index2RankDf <- rank_indexing(rankList)
+
+  ### get ordered rank list
+  orderedRank <- factor(index2RankDf$rank, levels = index2RankDf$rank)
+
+  ### create a dataframe containing ordered ranks
+  full_rank_name_df <- data.frame("rank"=matrix(unlist(orderedRank),
+                             nrow=length(orderedRank),
+                             byrow=TRUE),
+                             stringsAsFactors=FALSE)
+  full_rank_name_df$index <- as.numeric(rownames(full_rank_name_df))
+
+  for(i in 1:nrow(nameList)){
+    ### get list of all IDs for this taxon
+    taxonDf <- data.frame(nameList[i,])
+    taxonName <- unlist(strsplit(as.character(nameList[i,]$tip),
+                                "#",
+                                fixed = TRUE))
+
+    ### convert into long format
+    mTaxonDf <- suppressWarnings(melt(taxonDf,id = "tip"))
+
+    ### get rank names and corresponding IDs
+    splitCol <- data.frame(do.call('rbind',
+                                   strsplit(as.character(mTaxonDf$value),
+                                   '#',
+                                   fixed=TRUE)))
+    mTaxonDf <- cbind(mTaxonDf,splitCol)
+
+    ### remove NA cases
+    mTaxonDf <- mTaxonDf[complete.cases(mTaxonDf),]
+
+    ### subselect mTaxonDf to keep only 2 column rank id and rank name
+    mTaxonDf <- mTaxonDf[,c("X1","X2")]
+    if(mTaxonDf$X2[1] != index2RankDf$rank[1]){
+      mTaxonDf <- rbind(data.frame("X1"=mTaxonDf$X1[1],
+                                   "X2"=index2RankDf$rank[1]),
+                                   mTaxonDf)
+    }
+
+    ### rename columns
+    colnames(mTaxonDf) <- c(taxonName[1],"rank")
+
+    ### merge with index2RankDf (Df contains all available ranks of input data)
+    full_rank_name_df <- merge(full_rank_name_df,mTaxonDf, by=c("rank"), all.x = T)
+
+    ### reorder ranks
+    full_rank_name_df <- full_rank_name_df[order(full_rank_name_df$index),]
+
+    ### replace NA id by id of previous rank
+    full_rank_name_df <- na.locf(full_rank_name_df)
+  }
+
+  ### remove index column
+  full_rank_name_df <- subset(full_rank_name_df, select = -c(index))
+
+  ### transpose into wide format
+  t_full_rank_name_df <- transpose(full_rank_name_df)
+
+  ### set first row to column names
+  colnames(t_full_rank_name_df) = as.character(unlist(t_full_rank_name_df[1,]))
+  t_full_rank_name_df <- t_full_rank_name_df[-1,]
+
+  ### replace NA values in the dataframe t_full_rank_name_df
+  if(nrow(t_full_rank_name_df[is.na(t_full_rank_name_df),]) > 0){
+    t_full_rank_name_dfTMP <- t_full_rank_name_df[complete.cases(t_full_rank_name_df),]
+    t_full_rank_name_dfEdit <- t_full_rank_name_df[is.na(t_full_rank_name_df),]
+
+    for(i in 1:nrow(t_full_rank_name_dfEdit)){
+      for(j in 1:(ncol(t_full_rank_name_df)-1)){
+        if(is.na(t_full_rank_name_dfEdit[i,j])){
+          t_full_rank_name_dfEdit[i,j] <- t_full_rank_name_dfEdit[i,j+1]
+        }
+      }
+      if(is.na(t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)])){
+        t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)] <-
+        t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)-1]
+      }
+    }
+    t_full_rank_name_df <- rbind(t_full_rank_name_dfEdit,t_full_rank_name_dfTMP)
+  }
+
+  ### add fullName column into t_full_rank_name_df
+  fullName <- colnames(full_rank_name_df)[-c(1)]
+  full_rank_name_df <- cbind(fullName,t_full_rank_name_df)
+
+  ### return
+  return(full_rank_name_df)
+}
diff --git a/R/taxize-package.R b/R/taxize-package.R
@@ -54,13 +54,14 @@
 #'
 #' @importFrom graphics plot
 #' @importFrom methods as is
-#' @importFrom stats as.dist hclust na.omit setNames aggregate
+#' @importFrom stats as.dist hclust na.omit setNames aggregate complete.cases
+#' @importFrom zoo na.locf
 #' @importFrom utils URLencode citation download.file read.delim write.table tail
 #' @importFrom ape read.tree as.phylo.hclust plot.phylo
 #' @importFrom jsonlite fromJSON toJSON
 #' @importFrom httr GET POST content stop_for_status upload_file warn_for_status
 #' add_headers timeout config
-#' @importFrom data.table rbindlist setDF
+#' @importFrom data.table rbindlist setDF transpose
 #' @importFrom foreach foreach %do%
 #' @importFrom stringr str_extract str_split str_replace str_replace_all
 #' @importFrom plyr failwith rbind.fill llply ldply ddply l_ply summarise colwise .

diff --git a/man/class2tree.Rd b/man/class2tree.Rd
diff --git a/tests/testthat/test-class2tree.R b/tests/testthat/test-class2tree.R
@@ -8,6 +8,9 @@ spnames <- c('Klattia flava', 'Trollius sibiricus', 'Arachis paraguariensis',
  'Helianthus annuus','Madia elegans','Lupinus albicaulis',
  'Pinus lambertiana')
 
+dupnames <- c('Mus musculus', 'Escherichia coli',
+              'Haloferax denitrificans','Mus musculus')
+
 test_that("class2tree returns the correct value and class", {
   skip_on_cran()
 
@@ -24,3 +27,9 @@ test_that("class2tree returns the correct value and class", {
   expect_is(tr$distmat, "dist")
   expect_is(tr$names, "character")
 })
+
+test_that("class2tree will abort when input contains duplicate taxa", {
+  skip_on_cran()
+  out <- classification(dupnames, db = 'ncbi', verbose = FALSE)
+  expect_error(class2tree(out), "Input list of classifications contains duplicates")
+})