Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

class2tree replacement #634

Merged
merged 16 commits into from Oct 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Expand Up @@ -38,6 +38,7 @@ Imports:
plyr,
foreach,
ape,
zoo,
bold (>= 0.3.5),
data.table,
rredlist (>= 0.3.0),
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Expand Up @@ -326,6 +326,7 @@ importFrom(bold,bold_tax_id)
importFrom(bold,bold_tax_name)
importFrom(data.table,rbindlist)
importFrom(data.table,setDF)
importFrom(data.table,transpose)
importFrom(foreach,"%do%")
importFrom(foreach,foreach)
importFrom(graphics,plot)
Expand Down Expand Up @@ -355,6 +356,7 @@ importFrom(reshape2,dcast)
importFrom(reshape2,melt)
importFrom(stats,aggregate)
importFrom(stats,as.dist)
importFrom(stats,complete.cases)
importFrom(stats,hclust)
importFrom(stats,na.omit)
importFrom(stats,setNames)
Expand All @@ -376,3 +378,4 @@ importFrom(xml2,xml_find_first)
importFrom(xml2,xml_name)
importFrom(xml2,xml_ns)
importFrom(xml2,xml_text)
importFrom(zoo,na.locf)
243 changes: 238 additions & 5 deletions R/class2tree.R
@@ -1,4 +1,4 @@
#' Convert list of classifications to a tree.
#' Convert a list of classifications to a tree.
#'
#' This function converts a list of hierarchies for individual species into
#' a single species by taxonomic level matrix, then calculates a distance
Expand Down Expand Up @@ -56,30 +56,50 @@ class2tree <- function(input, varstep = TRUE, check = TRUE, ...) {
message('Removed species without classification.')
input <- input[!is.na(input)]
}

# Check that there is more than 2 taxon
if (length(input) < 3)
stop("Your input list of classifications must be 3 or longer.")

if (length(unique(names(input))) < length(names(input)))
stop("Input list of classifications contains duplicates")

dat <- rbind.fill(lapply(input, class2tree_helper))
df <- dat[ , !apply(dat, 2, function(x) any(is.na(x))) ]
# Get rank and ID list
rankList <- rbind.fill(lapply(input, get_rank))
nameList <- rbind.fill(lapply(input, get_name))

# Create taxonomy matrix
df <- taxonomy_table_creator(nameList,rankList)

if (!inherits(df, "data.frame")) {
stop("no taxon ranks in common - try different inputs")
}

row.names(df) <- df[,1]
df <- df[,-1]

# calculate distance matrix
taxdis <- tryCatch(taxa2dist(df, varstep = varstep, check = check),
error = function(e) e)

tdf = t(df)
for (i in 1:ncol(tdf)){
tdf[,i][duplicated(tdf[,i])] <- NA
}


# check for incorrect dimensions error
if (is(taxdis, 'simpleError'))
stop("Try check=FALSE, but see docs for taxa2dist function in the vegan package for details.")
out <- as.phylo.hclust(hclust(taxdis, ...))
res <- list(phylo = out, classification = dat, distmat = taxdis,
res <- list(phylo = out, classification = as.data.frame(t(tdf)), distmat = taxdis,
names = names(input))
class(res) <- 'classtree'
return( res )
}

class2tree_helper <- function(x){
x <- x[!x$rank == "no rank", ]
df <- x[-nrow(x), 'name']
names(df) <- x[-nrow(x), 'rank']
df <- data.frame(t(data.frame(df)), stringsAsFactors = FALSE)
Expand All @@ -104,7 +124,6 @@ print.classtree <- function(x, ...) {
print(x$phylo)
}


# Function from the vegan package
# CRAN: http://cran.rstudio.com/web/packages/vegan/
# License: GPL-2
Expand Down Expand Up @@ -152,3 +171,217 @@ taxa2dist <- function(x, varstep = FALSE, check = TRUE, labels) {
warning("you used 'check=FALSE' and some distances are zero -- was this intended?")
out
}

###############################################################################
#################### GET LIST OF ALL TAXONOMY RANK AND IDs ####################
###############################################################################
get_rank <- function(x){
rankDf <- x[, 'rank']
names(rankDf) <- x[, 'rank']

idDf <- x[, 'id']
joinedDf <- cbind(data.frame(rankDf,stringsAsFactors=FALSE),
data.frame(idDf,stringsAsFactors=FALSE))
joinedDf <- within(joinedDf,
rankDf[rankDf=='no rank'] <-
paste0("norank_",idDf[rankDf=='no rank']))

df <- data.frame(t(data.frame(rev(joinedDf$rankDf))),
stringsAsFactors = FALSE)
outDf <- data.frame(tip = x[nrow(x), "name"], df, stringsAsFactors = FALSE)
return(outDf)
}

get_name <- function(x){
rankDf <- x[, 'rank']
names(rankDf) <- x[, 'rank']

nameDf <- x[, 'name']
idDf <- x[, 'id']

joinedDf <- cbind(data.frame(rankDf,stringsAsFactors=FALSE),
data.frame(nameDf,stringsAsFactors=FALSE))
joinedDf <- within(joinedDf,
rankDf[rankDf=='no rank'] <-
paste0("norank_",idDf[rankDf=='no rank']))
joinedDf$name <- paste0(joinedDf$nameDf,"#",joinedDf$rankDf)

df <- data.frame(t(data.frame(rev(joinedDf$name))), stringsAsFactors = FALSE)
outDf <- data.frame(tip = x[nrow(x), "name"], df, stringsAsFactors = FALSE)
return(outDf)
}

###############################################################################
#################### INDEXING ALL AVAILABLE RANKS (INCLUDING NORANK) ##########
###############################################################################
rank_indexing <- function(rankList){
##### input is a dataframe, where each row is a rank list of a taxon

### get all available ranks from input rankList
uList <- unlist(rankList)
# remove unique rank by replacing with NA
# (unique rank is uninformative for sorting taxa)
uList[!duplicated(uList)] <- NA
# get final list of available ranks (remove NA items)
allInputRank <- as.character(unique(uList))
allInputRank <- allInputRank[!is.na(allInputRank)]

### initial index for main ranks
mainRank <- c("strain","forma","subspecies","varietas","species",
"speciessubgroup","speciesgroup","subgenus","genus","subtribe",
"tribe","subfamily","family","superfamily","parvorder",
"infraorder","suborder","order","superorder","infraclass",
"subclass","class","superclass","subphylum","phylum",
"superphylum","subkingdom","kingdom","superkingdom")
rank2Index <- new.env()
for(i in 1:length(mainRank)){
rank2Index[[mainRank[i]]] = i
}

### the magic happens here
for(k in 1:nrow(rankList)){
### get subset of rank list for current taxon which
### contains only ranks existing in allInputRank
subList <- rankList[k,][!is.na(rankList[k,])]
filter <- sapply(subList, function(x) x %in% allInputRank)
subList <- subList[filter]

### now go to each rank and check...
for(i in 1:length(subList)){
## if it has no index (probably only for norank), then...
if(is.null(rank2Index[[subList[i]]])){
## set index for this rank = the [available] index of previous rank + 1
for(j in 1:length(subList)){
if(!is.null(rank2Index[[subList[i-j]]])){
rank2Index[[subList[i]]] = rank2Index[[subList[i-j]]] + 1
break
} else {
j = j-1
}
}
}
## else, check if the current index is smaller than
## the index of previous rank,
else {
if(i>1){
preRank <- subList[i-1]
## if so, increase index of this current rank
## by (index of previous rank + 1)
if(rank2Index[[subList[i]]] <= rank2Index[[preRank]]){
rank2Index[[subList[i]]] = rank2Index[[preRank]] + 1
}
}
}
}
}

### output a list of indexed ranks
index2RankDf <- data.frame("index"=character(),
"rank"=character(),
stringsAsFactors=FALSE)
for(i in 1:length(allInputRank)){
index2RankDf[i,] = c(rank2Index[[allInputRank[i]]],allInputRank[i])
}

index2RankDf$index <- as.numeric(index2RankDf$index)
index2RankDf <- index2RankDf[with(index2RankDf, order(index2RankDf$index)),]

return(index2RankDf)
}

###############################################################################
#################### ARRANGE RANK IDs INTO SORTED RANK LIST (index2RankDf) ####
###############################################################################
taxonomy_table_creator <- function(nameList,rankList){
colnames(nameList)[1] <- "tip"

### get indexed rank list
index2RankDf <- rank_indexing(rankList)

### get ordered rank list
orderedRank <- factor(index2RankDf$rank, levels = index2RankDf$rank)

### create a dataframe containing ordered ranks
full_rank_name_df <- data.frame("rank"=matrix(unlist(orderedRank),
nrow=length(orderedRank),
byrow=TRUE),
stringsAsFactors=FALSE)
full_rank_name_df$index <- as.numeric(rownames(full_rank_name_df))

for(i in 1:nrow(nameList)){
### get list of all IDs for this taxon
taxonDf <- data.frame(nameList[i,])
taxonName <- unlist(strsplit(as.character(nameList[i,]$tip),
"#",
fixed = TRUE))

### convert into long format
mTaxonDf <- suppressWarnings(melt(taxonDf,id = "tip"))

### get rank names and corresponding IDs
splitCol <- data.frame(do.call('rbind',
strsplit(as.character(mTaxonDf$value),
'#',
fixed=TRUE)))
mTaxonDf <- cbind(mTaxonDf,splitCol)

### remove NA cases
mTaxonDf <- mTaxonDf[complete.cases(mTaxonDf),]

### subselect mTaxonDf to keep only 2 column rank id and rank name
mTaxonDf <- mTaxonDf[,c("X1","X2")]
if(mTaxonDf$X2[1] != index2RankDf$rank[1]){
mTaxonDf <- rbind(data.frame("X1"=mTaxonDf$X1[1],
"X2"=index2RankDf$rank[1]),
mTaxonDf)
}

### rename columns
colnames(mTaxonDf) <- c(taxonName[1],"rank")

### merge with index2RankDf (Df contains all available ranks of input data)
full_rank_name_df <- merge(full_rank_name_df,mTaxonDf, by=c("rank"), all.x = T)

### reorder ranks
full_rank_name_df <- full_rank_name_df[order(full_rank_name_df$index),]

### replace NA id by id of previous rank
full_rank_name_df <- na.locf(full_rank_name_df)
}

### remove index column
full_rank_name_df <- subset(full_rank_name_df, select = -c(index))

### transpose into wide format
t_full_rank_name_df <- transpose(full_rank_name_df)

### set first row to column names
colnames(t_full_rank_name_df) = as.character(unlist(t_full_rank_name_df[1,]))
t_full_rank_name_df <- t_full_rank_name_df[-1,]

### replace NA values in the dataframe t_full_rank_name_df
if(nrow(t_full_rank_name_df[is.na(t_full_rank_name_df),]) > 0){
t_full_rank_name_dfTMP <- t_full_rank_name_df[complete.cases(t_full_rank_name_df),]
t_full_rank_name_dfEdit <- t_full_rank_name_df[is.na(t_full_rank_name_df),]

for(i in 1:nrow(t_full_rank_name_dfEdit)){
for(j in 1:(ncol(t_full_rank_name_df)-1)){
if(is.na(t_full_rank_name_dfEdit[i,j])){
t_full_rank_name_dfEdit[i,j] <- t_full_rank_name_dfEdit[i,j+1]
}
}
if(is.na(t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)])){
t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)] <-
t_full_rank_name_dfEdit[i,ncol(t_full_rank_name_df)-1]
}
}
t_full_rank_name_df <- rbind(t_full_rank_name_dfEdit,t_full_rank_name_dfTMP)
}

### add fullName column into t_full_rank_name_df
fullName <- colnames(full_rank_name_df)[-c(1)]
full_rank_name_df <- cbind(fullName,t_full_rank_name_df)

### return
return(full_rank_name_df)
}
5 changes: 3 additions & 2 deletions R/taxize-package.R
Expand Up @@ -54,13 +54,14 @@
#'
#' @importFrom graphics plot
#' @importFrom methods as is
#' @importFrom stats as.dist hclust na.omit setNames aggregate
#' @importFrom stats as.dist hclust na.omit setNames aggregate complete.cases
#' @importFrom zoo na.locf
#' @importFrom utils URLencode citation download.file read.delim write.table tail
#' @importFrom ape read.tree as.phylo.hclust plot.phylo
#' @importFrom jsonlite fromJSON toJSON
#' @importFrom httr GET POST content stop_for_status upload_file warn_for_status
#' add_headers timeout config
#' @importFrom data.table rbindlist setDF
#' @importFrom data.table rbindlist setDF transpose
#' @importFrom foreach foreach %do%
#' @importFrom stringr str_extract str_split str_replace str_replace_all
#' @importFrom plyr failwith rbind.fill llply ldply ddply l_ply summarise colwise .
Expand Down
2 changes: 1 addition & 1 deletion man/class2tree.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions tests/testthat/test-class2tree.R
Expand Up @@ -8,6 +8,9 @@ spnames <- c('Klattia flava', 'Trollius sibiricus', 'Arachis paraguariensis',
'Helianthus annuus','Madia elegans','Lupinus albicaulis',
'Pinus lambertiana')

dupnames <- c('Mus musculus', 'Escherichia coli',
'Haloferax denitrificans','Mus musculus')

test_that("class2tree returns the correct value and class", {
skip_on_cran()

Expand All @@ -24,3 +27,9 @@ test_that("class2tree returns the correct value and class", {
expect_is(tr$distmat, "dist")
expect_is(tr$names, "character")
})

test_that("class2tree will abort when input contains duplicate taxa", {
skip_on_cran()
out <- classification(dupnames, db = 'ncbi', verbose = FALSE)
expect_error(class2tree(out), "Input list of classifications contains duplicates")
})