undates all

shaileshtripathi · Sep 13, 2016 · b4c8543 · b4c8543
1 parent 00d8a6d
commit b4c8543
Show file tree

Hide file tree

Showing 16 changed files with 3,670 additions and 118 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: samExploreR
 Type: Package
 Title: samExploreR package: high-performance read summarisation to count vectors with avaliability of sequencing depth reduction simulation
-Version: 0.99.23
-Depends: ggplot2,pBrackets,colorspace,R (>= 3.3.0)
+Version: 0.99.24
+Depends: ggplot2,pBrackets,colorspace,Rsubread, edgeR,R (>= 3.2.0)
 Author: Alexey Stupnikov, Shailesh Tripathi and Frank Emmert-Streib with contributions from Wei Shi and Yang Liao
 Maintainer: shailesh tripathi <shailesh.tripathy@gmail.com>
 Description: This R package is designed for subsampling procedure to simulate sequencing experiments with reduced sequencing depth. This package can be used to anlayze data generated from all major sequencing platforms such as Illumina GA, HiSeq, MiSeq, Roche GS-FLX, ABI SOLiD and LifeTech Ion PGM Proton sequencers. It supports multiple operating systems incluidng Linux, Mac OS X, FreeBSD and Solaris. Was developed with usage of Rsubread 1.16.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,3 +1,4 @@
-useDynLib(samExploreR,.registration = TRUE, .fixes = "C_")
+#useDynLib(samExploreR,.registration = TRUE)
 import(ggplot2)
 exportPattern("^[^\\.]")
+export("samExplore")
diff --git a/R/featureCounts.R b/R/featureCounts.R
@@ -1,129 +1,87 @@
-samExplore <- function(files,annot.inbuilt="mm9",annot.ext=NULL,
-isGTFAnnotationFile=FALSE,GTF.featureType="exon",
-GTF.attrType="gene_id",useMetaFeatures=TRUE,allowMultiOverlap=FALSE,
-isPairedEnd=FALSE,requireBothEndsMapped=FALSE,checkFragLength=FALSE,
-minFragLength=50,maxFragLength=600,nthreads=1,strandSpecific=0,
-minMQS=0,readExtension5=0,readExtension3=0,
-read2pos=NULL,minReadOverlap=1,countSplitAlignmentsOnly=FALSE,
-countMultiMappingReads=FALSE,countPrimaryAlignmentsOnly=FALSE,
-countChimericFragments=TRUE,ignoreDup=FALSE,chrAliases=NULL,
-reportReads=FALSE, subsample_d=1, N_boot=1){
+samExplore <- function (files, annot.inbuilt = "mm10", annot.ext = NULL, isGTFAnnotationFile = FALSE, 
+    GTF.featureType = "exon", GTF.attrType = "gene_id", chrAliases = NULL, 
+    useMetaFeatures = TRUE, allowMultiOverlap = FALSE, minOverlap = 1, 
+    largestOverlap = FALSE, readExtension5 = 0, readExtension3 = 0, 
+    read2pos = NULL, countMultiMappingReads = FALSE, fraction = FALSE, 
+    minMQS = 0, splitOnly = FALSE, nonSplitOnly = FALSE, primaryOnly = FALSE, 
+    ignoreDup = FALSE, strandSpecific = 0, juncCounts = FALSE, 
+    genome = NULL, isPairedEnd = FALSE, requireBothEndsMapped = FALSE, 
+    checkFragLength = FALSE, minFragLength = 50, maxFragLength = 600, 
+    countChimericFragments = TRUE, autosort = TRUE, nthreads = 1, 
+    maxMOp = 10, reportReads = FALSE, subsample_d=1, N_boot=1,
+    countboot=c("all","Assigned", "Unassigned_Ambiguity",
+    "Unassigned_MultiMapping",   "Unassigned_NoFeatures",
+    "Unassigned_Unmapped",       "Unassigned_MappingQuality",
+    "Unassigned_FragmentLength", "Unassigned_Chimera",
+    "Unassigned_Secondary",      "Unassigned_Nonjunction",
+    "Unassigned_Duplicate" )){
+
  .printParam(subsample_d, N_boot) 
  resultlist <- list()
- for(i in 1:N_boot){
+
+fc <- featureCounts(
+	files=files, annot.inbuilt =annot.inbuilt , annot.ext = annot.ext,
+ isGTFAnnotationFile = isGTFAnnotationFile,
+    GTF.featureType = GTF.featureType, GTF.attrType =GTF.attrType, chrAliases =chrAliases,
+    useMetaFeatures = useMetaFeatures, allowMultiOverlap = allowMultiOverlap, minOverlap = minOverlap,
+    largestOverlap = largestOverlap, readExtension5 = readExtension5, readExtension3 = readExtension3,
+    read2pos = read2pos, countMultiMappingReads = countMultiMappingReads, fraction = fraction,
+    minMQS = minMQS, splitOnly = splitOnly, nonSplitOnly = nonSplitOnly, primaryOnly = primaryOnly,
+    ignoreDup = ignoreDup, strandSpecific = strandSpecific, juncCounts = juncCounts,
+    genome = genome, isPairedEnd = isPairedEnd, requireBothEndsMapped = requireBothEndsMapped,
+    checkFragLength = checkFragLength, minFragLength = minFragLength, maxFragLength = maxFragLength,
+    countChimericFragments = TRUE, autosort = TRUE, nthreads = 1,
+    maxMOp = maxMOp, reportRead=reportReads)
+
+ ts <- c()
+for(i in 1:ncol(fc$counts)){
+  fctmp <- list()
+  fctmp[["counts"]] <- as.matrix(fc$counts[,i])
+  fctmp[["stat"]] <- fc$stat[,c(1,(i+1))]	
  cat("//========================== Sub sampling ========================\\\\ \n")
  cat(paste("||		Sub sample N: ", i,"			||","\n",sep=""))
  cat("\\\\===================================================================//")
- resultlist[[i]] <- .featureCounts(files,annot.inbuilt,annot.ext,
-  isGTFAnnotationFile,GTF.featureType,GTF.attrType,useMetaFeatures,
-  allowMultiOverlap,isPairedEnd,requireBothEndsMapped,checkFragLength,
-  minFragLength,maxFragLength,nthreads,strandSpecific,minMQS,
-  readExtension5,readExtension3,read2pos,minReadOverlap,
-  countSplitAlignmentsOnly,countMultiMappingReads,
-  countPrimaryAlignmentsOnly,countChimericFragments,ignoreDup,
-  chrAliases,reportReads, subsample_d)	
+ restmp <- .featureCountsBoot(fctmp,
+  subsample_d=subsample_d, N_boot=N_boot, countboot=countboot)	
+
+ resultlist[[i]] <- restmp[[1]]
+ ts <- c(ts, restmp[[2]])
  }
- resultlist
+names(resultlist) <- fc$targets
+names(ts) <- fc$targets
+list(bootres=resultlist, target.size=ts, featuremain=fc)
 }
 
 
-.featureCounts <- function(files,annot.inbuilt="mm9",annot.ext=NULL,
-isGTFAnnotationFile=FALSE,GTF.featureType="exon",GTF.attrType="gene_id",
-useMetaFeatures=TRUE,allowMultiOverlap=FALSE,
-isPairedEnd=FALSE,requireBothEndsMapped=FALSE,
-checkFragLength=FALSE,minFragLength=50,
-maxFragLength=600,nthreads=1,strandSpecific=0,
-minMQS=0,readExtension5=0,readExtension3=0,
-read2pos=NULL,minReadOverlap=1,countSplitAlignmentsOnly=FALSE,
-countMultiMappingReads=FALSE,countPrimaryAlignmentsOnly=FALSE,
-countChimericFragments=TRUE,ignoreDup=FALSE,chrAliases=NULL,
-reportReads=FALSE, subsample_d=1)
+.featureCountsBoot <- function(flcount,
+subsample_d=1, N_boot=10, countboot="all")
 {
-flag <- FALSE
-if(is.null(annot.ext)){
- switch(tolower(as.character(annot.inbuilt)),
- mm9={
- ann <- system.file("annot","mm9_RefSeq_exon.txt",package="samExploreR")
- cat("NCBI RefSeq annotation for mm9 (build 37.2) is used.\n")
- },
- mm10={
- ann <- system.file("annot","mm10_RefSeq_exon.txt",package="samExploreR")
- cat("NCBI RefSeq annotation for mm10 (build 38.1) is used.\n")
- },
- hg19={
- ann <- system.file("annot","hg19_RefSeq_exon.txt",package="samExploreR")
- cat("NCBI RefSeq annotation for hg19 (build 37.2) is used.\n")
- },
- {
- stop("In-built annotation for ", annot.inbuilt, " is not available.\n")
- }
- ) # end switch
-}
-else{
- if(is.character(annot.ext)){
- ann <- annot.ext
-}
-else{
- annot_df <- as.data.frame(annot.ext,stringsAsFactors=FALSE)
- if(sum(c("geneid","chr","start","end", "strand") %in% 
-    tolower(colnames(annot_df))) != 5)
-      stop("One or more required columns are missing in the provided 
-      annotation data. Please refer to help page 
-      for annotation format.\n")
- colnames(annot_df) <- tolower(colnames(annot_df))
- annot_df <- data.frame(geneid=annot_df$geneid,chr=annot_df$chr,
- start=annot_df$start,end=annot_df$end,strand=annot_df$strand,
- stringsAsFactors=FALSE)
- annot_df$chr <- as.character(annot_df$chr)
- fout_annot <- file.path(".",paste(".Rsubread_UserProvidedAnnotation_pid",
- Sys.getpid(),sep=""))
- oldScipen <- options(scipen=999)
- write.table(x=annot_df,file=fout_annot,sep="\t",row.names=FALSE,quote=FALSE)
- options(oldScipen)
- ann <- fout_annot
- flag <- TRUE
-}
-}
-
-fout <- file.path(".",paste(".Rsubread_featureCounts_pid",Sys.getpid(),sep=""))
-files_C <- paste(files,collapse=";")
-#if(nchar(files_C) == 0) stop("No read files provided!")
- chrAliases_C <- chrAliases
- if(is.null(chrAliases))
- chrAliases_C <- " "
- countMultiMappingReads_C <- countMultiMappingReads
- if(countPrimaryAlignmentsOnly) countMultiMappingReads_C <- 2
- read2pos_C <- read2pos
- if(is.null(read2pos)) read2pos_C <- 0
- cmd <- paste("readSummary",ann,files_C,fout,as.numeric(isPairedEnd),
- minFragLength,maxFragLength,
- 0,as.numeric(allowMultiOverlap),as.numeric(useMetaFeatures),nthreads,
- as.numeric(isGTFAnnotationFile),strandSpecific,as.numeric(reportReads),
- as.numeric(requireBothEndsMapped),as.numeric(!countChimericFragments),
- as.numeric(checkFragLength),GTF.featureType,GTF.attrType,minMQS,
- as.numeric(countMultiMappingReads_C),chrAliases_C," ",as.numeric(FALSE),
- 14,readExtension5,readExtension3,minReadOverlap,
- as.numeric(countSplitAlignmentsOnly),
- read2pos_C," ",as.numeric(ignoreDup)," ", as.numeric(subsample_d),sep=",")
- n <- length(unlist(strsplit(cmd,",")))
- C_args <- .C("R_readSummary_wrapper",as.integer(n),as.character(cmd),
- PACKAGE="samExploreR")
+    ts <- seq <- NULL
+	totalcount <- as.numeric(flcount$stat[,2])
+	names(totalcount) <- as.vector(flcount$stat[,1])
+	if(countboot[1]=="all"){
+            ts <- round(sum(totalcount)*subsample_d)
+	    seqtmp <- as.vector(flcount$counts)
+	    names(seqtmp) <- rownames(flcount$counts)		
+	    seq <- c(seqtmp, totalcount[-which(names(totalcount)=="Assigned")])	
+        } 
+	else{
+	    countboot <- unique(c("Assigned",countboot))
+            ts <- round(sum(totalcount[countboot])*subsample_d)
+	    seqtmp <- as.vector(flcount$counts)
+            names(seqtmp) <- rownames(flcount$counts)
+	    tmpx <- totalcount[countboot]
+	    tmpx <- tmpx[-which(names(tmpx)=="Assigned")]
+	    seq <- c(seqtmp, tmpx)	
+	}
+	bootmat <- c()
+
+    for(i in 1:N_boot){	
+        tmp <- thinCounts(seq, target.size=ts)
+    	bootmat <- cbind(bootmat, tmp[rownames(flcount$counts),]) 
+    }
+    list(bootmat, target.size=unique(ts,sum(tmp)))
 
- x <- read.delim(fout,stringsAsFactors=FALSE)
- colnames(x)[1:6] <- c("GeneID","Chr","Start","End","Strand","Length")
- x_summary <- read.delim(paste(fout,".summary",sep=""), stringsAsFactors=FALSE)
- file.remove(fout)
- file.remove(paste(fout,".summary",sep=""))
- if(flag) 
-  file.remove(fout_annot)
-  if(ncol(x) == 6){
-   stop("No count data were generated.")
-  }
- y <- as.matrix(x[,-c(1:6)])
- colnames(y) <- colnames(x)[-c(1:6)]
- rownames(y) <- x$GeneID
- z <- list(counts=y,annotation=x[,1:6],targets=colnames(y),stat=x_summary)
- z
 }
 .printParam <- function(depth=1, boot=1){
  cat("\n\n\n		samExplorer \n")

diff --git a/data/df_intersect.RData b/data/df_intersect.RData
diff --git a/data/df_sole.RData b/data/df_sole.RData
diff --git a/man/df_intersect.Rd b/man/df_intersect.Rd
@@ -0,0 +1,27 @@
+\name{df_intersect}
+\alias{df_intersect}
+\docType{data}
+\title{
+    Example data for plotting output of samExplore function.
+}
+\description{
+    Example data for plotting output of samExplore function.
+}
+\usage{data("df_intersect")}
+\format{
+    A data frame with 1125 observations on the following 3 variables.
+      \describe{
+        \item{\code{Label}}{a character vector}
+        \item{\code{Variable}}{a numeric vector}
+        \item{\code{Value}}{a numeric vector}
+    }
+}
+\details{
+    is a dataframe object
+}
+\value{Example data for plotting results.}
+\examples{
+data(df_intersect)
+
+}
+\keyword{datasets}
diff --git a/man/df_sole.Rd b/man/df_sole.Rd
@@ -0,0 +1,28 @@
+\name{df_sole}
+\alias{df_sole}
+\docType{data}
+\title{
+    Example data for plotting output of samExplore function.
+}
+\description{
+
+    Example data for plotting output of samExplore function.
+}
+\usage{data("df_sole")}
+\format{
+  A data frame with 1125 observations on the following 3 variables.
+  \describe{
+    \item{\code{Label}}{a character vector}
+    \item{\code{Variable}}{a numeric vector}
+    \item{\code{Value}}{a numeric vector}
+  }
+}
+\details{
+ is a dataframe object
+}
+\value{Example data for plotting results.}
+\examples{
+data(df_sole)
+
+}
+\keyword{datasets}
diff --git a/man/exploreRep.Rd b/man/exploreRep.Rd
@@ -0,0 +1,67 @@
+\name{exploreRep}
+\alias{exploreRep}
+\title{exploreRep: function to explore the reproducibility}
+\description{This function explores the reproducibility of analysis with annotation altering}
+\usage{
+exploreRep((df_d, lbl_vect, f))
+}
+\arguments{
+  \item{df_d}{ a dataframe containing the dataset to explore with 3 columns: 
+   label, f ratio, value to compare (e.g. number of differentially 
+   expressed genes)}
+  \item{lbl_vect}{a vector of character strings specifing the labels for 
+   which the analysis should be run}
+  \item{f}{ A numeric value of f for which the analysis should be run}
+  }
+\details{
+\code{exploreRep} function to explore the reproducibility of the analysis 
+ with altering of annotation. It runs ANOVA test for values to compare 
+ (e.g. number of differentially expressed genes) corresponding to different 
+ Annotation labels (i.e. analysis' run for different annotation types)
+
+This function takes as input a dataframe containing the dataset to explore.
+
+Here is the example of the dataframe
+
+\preformatted{
+...
+AnnotA	0.1	13
+AnnotB	0.1	101	
+AnnotC	0.1	36
+AnnotA	0.1	13
+AnnotB	0.1	101	
+AnnotC	0.1	36
+AnnotA	0.4	40
+AnnotB	0.4	153	
+AnnotC	0.4	62
+AnnotA	0.8	71
+AnnotB	0.8	203
+AnnotC	0.8	160
+...
+}
+
+\code{exploreRob}
+Thired column gives the values to compare (here number of differentially
+ expressed genes).
+
+\code{exploreRep} function subsets the dataset to consider only valyes for one 
+f and runs ANOVA test for groups corresponding to annotations of interest.
+}
+\value{ 
+An output of aov function
+
+}
+\author{Alexey Stupnikov and Shailesh Tripathi}
+%\note{}
+\examples{
+%\dontrun{
+#library(samExploreR)
+data("df_sole")
+#run ANOVA for annotation types labeled 'New, Gene' and 'New, Exon' and 
+#f value 0.9
+exploreRep(df_sole, lbl_vect = c('New, Gene', 'Old, Gene'), f = 0.9)
+
+#run ANOVA for annotation type labeled 'Old' and 'New' and f value 0.5
+exploreRep(df_sole, lbl_vect = c('New, Gene', 'Old, Gene'), f = 0.5)
+%}
+}