Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
508 lines (412 sloc) 18.2 KB
annotateNearest <- function(x, subject, annotate=TRUE, ...) {
if (is.data.frame(x)) x <- makeGRangesFromDataFrame(x)
if(is.data.frame(subject)) subject <- makeGRangesFromDataFrame(subject)
if(class(x)!="GRanges" | class(subject)!="GRanges")
stop("x and subject must be GRanges or a data.frame with column names chr, start and end.")
dots <- list(...)
names.dots <- names(dots)
if ("select" %in% names.dots && dots$select == "all")
stop("select=\"all\" not supported (yet)")
# find nearests
NN <- smarterNearest(x, subject, ...)
nearest_exists <- !is.na(NN)
# full return value
Ret <- matrix(NA, nrow=length(NN), ncol=7)
colnames(Ret) <- c("distance", "subjectHits", "type", "amountOverlap",
"insideDist", "size1", "size2")
Ret <- data.frame(Ret, stringsAsFactors=FALSE)
# queries with a nearest target
x <- x[nearest_exists]
# corresponding nearest targets
y <- subject[NN[nearest_exists],]
# partial value for when nearest exists
ret <- matrix(NA, nrow=length(x), ncol=7)
colnames(ret) <- c("distance", "subjectHits", "type", "amountOverlap",
"insideDistance", "size1", "size2")
ret[,"subjectHits"] <- NN[nearest_exists]
ret <- data.frame(ret, stringsAsFactors=FALSE)
x.start <- start(x)
y.start <- start(y)
x.end <- end(x)
y.end <- end(y)
ret$type[(x.start > y.start & x.end <= y.end) |
(x.start >= y.start & x.end < y.end)] <- "inside"
ret$type[x.start <= y.start & x.end >= y.end] <- "cover"
ret$type[x.start > y.end] <- "disjointR"
ret$type[x.end < y.start] <- "disjointL"
ret$type[x.start > y.start & x.start <= y.end & x.end > y.end] <- "overlapR"
ret$type[x.start < y.start & x.end >= y.start & x.end < y.end] <- "overlapL"
ret$distance <- 0L
i <- ret$type == "disjointR"
ret$distance[i] <- y.end[i] - x.start[i]
i <- ret$type == "disjointL"
ret$distance[i] <- y.start[i] - x.end[i]
if (!annotate) {
Ret[nearest_exists,] <- ret
return(as.matrix(Ret[, c("distance", "subjectHits")]))
}
i <- ret$type == "overlapR"
ret$amountOverlap[i] <- -1L * (y.end[i] - x.start[i] + 1L)
i <- ret$type == "overlapL"
ret$amountOverlap[i] <- x.end[i] - y.start[i] + 1L
ret$type[ret$type%in%c("disjointR", "disjointL")] <- "disjoint"
ret$type[ret$type%in%c("overlapR", "overlapL")] <- "overlap"
## insideDistance column:
i <- ret$type == "inside" #no missing ret$type at this point
tmp0 <- cbind(x.end[i] - y.end[i], x.start[i] - y.start[i])
tmp <- apply(abs(tmp0), 1, which.min)
tmpinsidedist <- tmp0[, 1]
tmpIndex <- tmp == 2
tmpinsidedist[tmpIndex] <- tmp0[tmpIndex, 2]
ret$insideDistance[i] <- tmpinsidedist
## size1 and size2 columns:
ret$size1 <- x.end - x.start + 1L
ret$size2 <- y.end - y.start + 1L
Ret[nearest_exists,] <- ret
Ret
}
#######################################################
#######################################################
annotateTranscripts <-function(txdb, annotationPackage=NULL, by=c("tx","gene"),codingOnly=FALSE,verbose=TRUE,requireAnnotation=FALSE, mappingInfo = NULL, simplifyGeneID = FALSE){
if( class(txdb)!="TxDb") stop("txdb must be of class TxDb")
if(is.null(annotationPackage)){
organism <- organism(txdb)
organism <- strsplit(organism," ")[[1]]
organism <- paste0(substr(organism[1],1,1),
tolower(substr(organism[2],1,1)))
annotationPackage <- paste("org",organism,"eg.db",sep=".")
if(verbose)
message(sprintf("No annotationPackage supplied. Trying %s.",annotationPackage))
}
if(!require(annotationPackage,character.only=TRUE)){
if(requireAnnotation){
stop("Can't load ",annotationPackage,".\nMake sure library is installed.\nAnd make sure the organism argument follows the convention here http://www.bioconductor.org/packages/release/data/annotation/.\nFor example for human use Hs")}else{
message("Could not load ",annotationPackage,". Will continue without annotation")
annotationPackage <- NULL
}
}
##################################################
### Get TSS and TSE
#######################################################
if(verbose) message("Getting TSS and TSE.")
by <- match.arg(by)
if(by=="tx"){
tt <- transcriptsBy(txdb, by="gene")
seqinfo <- seqinfo(tt)
## the transcript names
LL <- tt@unlistData@elementMetadata@listData
txx <- unlist(tt)$tx_name
} else{
tt <- genes(txdb)
seqinfo <- seqinfo(tt)
txx <- names(tt)
}
chr <- unlist(CharacterList(seqnames(tt)))
strand <- unlist(CharacterList(strand(tt)))
RR <- ranges(tt)
## Useful for switching Gencode IDs to Ensembl IDs
if(simplifyGeneID) names(RR) <- gsub('\\..*', '', names(RR))
geneid <- names(RR)
## starts / ends (left/right endpoints):
TSS <- unlist(start(RR))
TSE <- unlist(end(RR))
######################################################
### Get CSS and CSE
#######################################################
if(verbose) message("Getting CSS and CSE.")
cds <- cdsBy(txdb, by=by, use.names=(by=="tx"))
with_coding <- which(txx %in% names(cds))
## take only the left- and right-most endpts:
cds_ranges <- range(cds[txx[with_coding]])
## starts and ends (really left / right endpts):
CSS <- CSE <- integer(length(txx))
CSS[with_coding] <- unlist(start(cds_ranges))
CSE[with_coding] <- unlist(end(cds_ranges))
CSS[-with_coding] <- NA
CSE[-with_coding] <- NA
##################################################
### Get Exons
#######################################################
if(verbose) message("Getting exons.")
ee <- exonsBy(txdb, by=by, use.names=(by=="tx"))
## exon groups of interest in transcript order,
## and sorted "left to right" within each group
Exons <- reduce(ranges(ee)[txx])
Nexons <- elementNROWS(Exons)
##now annotate genes
if(by=="tx") TT <- elementNROWS(tt) else TT <- rep(1,length(tt))
Geneid <- Rle(geneid, TT)
Tx <- txx
if(!is.null(annotationPackage)){
if(verbose) message("Annotating genes.")
if(!is.null(mappingInfo)) {
stopifnot(all(c('column', 'keytype', 'multiVals') %in% names(mappingInfo)))
geneid <- mapIds(get(annotationPackage), keys = geneid, column= mappingInfo$column, keytype = mappingInfo$keytype, multiVals = mappingInfo$multiVals)
}
##Annotate transcrtipt
##cant use select cause map is not 1-1
map <- get(gsub("\\.db", "SYMBOL", annotationPackage))
which <- mappedkeys(map)
symbols <- sapply(as.list(map[which]), paste, collapse=" ")
genes <- symbols[geneid]
map <- get(gsub("\\.db", "REFSEQ", annotationPackage))
which <- mappedkeys(map)
symbols <- sapply(as.list(map[which]), paste, collapse=" ")
refseq <- symbols[geneid] # a handful of NA
Gene <- Rle(genes, TT)
Refseq <- Rle(refseq, TT)
} else {
Gene <- Rle(NA, sum(TT))
Refseq <- Rle(NA, sum(TT))
}
transcripts=GRanges(ranges=IRanges(start=TSS, end=TSE),
seqnames=chr, strand=strand,
seqinfo=seqinfo,
CSS, CSE, Tx, Geneid, Gene, Refseq, Nexons, Exons)
if(codingOnly) transcripts <- transcripts[!is.na(values(transcripts)$CSS),]
attributes(transcripts)$description <- "annotatedTranscripts"
return(transcripts)
}
matchGenes <- function(x,subject, type=c("any","fiveprime"),
promoterDist=2500,
skipExons=FALSE,
verbose=TRUE){
if(attributes(subject)$description!="annotatedTranscripts")
stop("subject must be the output of function annotateTranscripts or have attribute(subject)$description=\"annotatedTranscripts\".")
if (is.data.frame(x)) x <- makeGRangesFromDataFrame(x)
if(class(x)!="GRanges") stop("x must be GRanges or a data.frame with column names chr, start and end.")
type=match.arg(type)
if(type=="fiveprime"){
map <- smarterNearest(x,resize(subject,width=1))
} else{
map <- smarterNearest(x,subject)
}
ind <- which(!is.na(map))
# dist <- rep(NA,length(map))
# dist[ind] <- distance(x[ind,],subject[map[ind],])
#
# dist[ind] <- dist[ind]*ifelse(
# (strand(subject[map[ind],])=="+" &
# end(subject[map[ind],]) < start(x[ind,])) |
# (strand(subject[map[ind],])=="-" &
# start(subject[map[ind],]) > end(x[ind,])),-1,1)
type=rep("",length(x))
subtype=rep("",length(x))
ctype=rep("",length(x))
dist=rep(0,length(x)) # distance to 5' end of the gene
insidedistance<-rep(NA,length(x))
exonnumber<-rep(NA,length(x))
nexons <- rep(NA,length(x))
geneL=rep(0,length(x))
codingL=rep(0,length(x))
subdist=rep(0,length(x))
genenames=rep("",length(x))
geneannotation=rep("",length(x))
geneid=rep("",length(x))
strands=rep("",length(x))
genenames[ind]<-as.character(values(subject[map[ind],])$Gene)
geneannotation[ind]<-as.character(values(subject[map[ind],])$Refseq)
geneid[ind]<-as.character(values(subject[map[ind],])$Geneid)
nexons[ind] <- values(subject[map[ind],])$Nexons
strands[ind] <- as.character(strand(subject[map[ind],]))
## This loop could be moved to C. Note: it uses nearest
for(j in ind){
i <- map[ind][j]
if(verbose & j%%100==0) cat(".")
TS = start(subject)[i]
TE = end(subject)[i]
geneL[j] = TE-TS
if(!is.na(subject$CSS[i])){
CS = subject$CSS[i]
CE = subject$CSE[i]
codingL[j]=CE-CS
} else {
CS <- CE <- codingL <- NA
}
exons <- subject[i,]$Exons[[1]]
Exons <- cbind(start(exons), end(exons))
Strand= ifelse(strand(subject[i,])=="+",1,-1)
S = start(x[j,])
E = end(x[j,])
# type[j]=""
if(S <= TS & E >= TE){
type[j]="covers"
subtype[j]="covers exon(s)"
} else{
if(E < TS){
if(Strand==1){
type[j]="upstream"
dist[j]=TS-E
} else{
type[j]="downstream"
dist[j]=TE-E
}
}
if(S > TE){
if(Strand==-1){
type[j]="upstream"
dist[j]=S-TE
} else{
type[j]="downstream"
dist[j]=S-TS
}
}
## totally within gene
if (S >= TS & E <= TE){
type[j]="inside"
if(Strand==-1) dist[j]=TE-E else dist[j]=S-TS
}
## overlaps exactly one side of gene ("covers" done above)
if(type[j]==""){
if(S < TS & E <= TE){
##OVERLAP FRONT
if(Strand==1) type[j]="overlaps 5'" else{
type[j]="overlaps 3'"
dist[j]=TE-E
}
}
else if (S >= TS & E > TE){
##OVERLAP BACK
if(Strand==-1) type[j]="overlaps 5'" else{
type[j]="overlaps 3'"
dist[j]=S-TS
}
}
}
}
m1=NA;m2=NA
if( type[j]%in%c("overlaps 5'","overlaps 3'","inside") & !skipExons ){
ir=IRanges(start=c(S,E),width=1)
map2 = smarterNearest(ir, exons)
dist2<-distance(ir,exons[map2])
pos <- start(ir) ##start end the same
dist2 <- dist2*ifelse(
(Strand==1 &
end(exons[map2,]) < pos ) |
(Strand==-1 &
start(exons[map2,]) > pos),-1,1)
tmp<-cbind(dist2,map2)
m1 = tmp[1,1]
m2 = tmp[2,1]
exon1 = tmp[1,2]
exon2 = tmp[2,2]
m1m2Index=which.min(abs(c(m1,m2)))
if(exon1==exon2 & m1==0 & m2==0){
subtype[j]="inside exon"
insidedistance[j]=0
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if( (sign(m1)==sign(m2) & (m1!=0 & m2!=0) & exon1==exon2) |
(sign(m1)==-1 & sign(m2)==1 & exon2-exon1==1) ){
subtype[j]="inside intron"
insidedistance[j]=c(m1,m2)[m1m2Index]
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if( (exon2-exon1 > 1) |
( (exon2-exon1 == 1) &
((sign(m1)==-1 & sign(m2)==-1) |
(sign(m1)==1 & sign(m2)==-1) |
(sign(m1)==1 & sign(m2)==1) |
(sign(m1)==0 & sign(m2)==-1)|
(sign(m1)==1 & sign(m2)==0))) |
(exon2==exon1 & sign(m1)!=sign(m2))){
subtype[j]="covers exon(s)"
insidedistance[j]=0
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if( (exon2-exon1 <= 1 & sign(m1)==-1 & sign(m2)==0) |
(exon2==exon1 & sign(m1)==1 & sign(m2)==0)){
if(Strand==1) subtype[j]="overlaps exon upstream" else subtype[j]="overlaps exon downstream"
insidedistance[j]=0
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if( (exon2-exon1 <= 1 & sign(m1)==0 & sign(m2)==1) |
(exon2==exon1 & sign(m1)==0 & sign(m2)==-1)){
if(Strand==-1) subtype[j]="overlaps exon upstream" else subtype[j]="overlaps exon downstream"
insidedistance[j]=0
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if( exon2-exon1 == 1 & sign(m1)==0 & sign(m2)==0){
subtype[j]="overlaps two exons"
insidedistance[j]=0
exonnumber[j]=c(exon1,exon2)[m1m2Index]
}
if(Strand!=1){
insidedistance[j]= -insidedistance[j]
exonnumber[j] = nrow(Exons) - exonnumber[j] + 1
}
ctype[j]="inside transcription region"
if(!is.na(CS)) {
if(S<CS & E<CS){
if(Strand==1) ctype[j]="5' UTR" else ctype[j]="3'UTR"
}
}
if(!is.na(CE)) {
if(S>CE & E>CE){
if(Strand==-1) ctype[j]="5' UTR" else ctype[j]="3'UTR"
}
}
if(!is.na(CS) & !is.na(CE)) {
if(S<CS & E>CE){
ctype[j]="covers coding region"
}
}
if(!is.na(CS)) {
if(S<CS & E>CS){
if(Strand==1) ctype[j]="overlaps 5' UTR" else ctype[j]="overlaps 3'UTR"
}
}
if(!is.na(CE)) {
if(S<CE & E>CE){
if(Strand==-1) ctype[j]="overlaps 5' UTR" else ctype[j]="overlaps 3'UTR"
}
}
}
## if(TE-TS<10^5){##graphical check
## plot(0,0,ylim=c(0,0.6),xlim=range(c(start(subject[i,]),end(subject[i,]),start(x[j,]),end(x[j,]))),
## xlab=paste("inside distance=",dist[j],insidedistance[j],m1,m2,exonnumber[j]))
## polygon(c(TS,TE,TE,TS),c(0,0,0.5,0.5),density=0,col=2)
## polygon(c(CS,CE,CE,CS),c(0.1,0.1,0.4,0.4),density=0,col=3)
## abline(h=0.25,lwd=2)
## apply(Exons,1,function(x) polygon(c(x[1],x[2],x[2],x[1]),
## c(0.2,0.2,0.3,0.3),col=4))
## polygon(c(start(x[j,]),end(x[j,]),end(x[j,]),start(x[j,])),c(0.4,0.4,0.5,0.5),col=5)
## lines(c(TS,TS+1000),c(0.55,0.55),lwd=3)
## title(paste(j,i,Strand,type[j],subtype[j],ctype[j],dist[j],sep=":"))
## }
}
type[dist<=promoterDist & type=="upstream"] <- "promoter"
type[dist<=promoterDist & type=="downstream"] <- "close to 3'"
description=type
tmpIndex=which(description=="inside")
description[tmpIndex] <- subtype[tmpIndex]
tmp <- data.frame(name=I(genenames),
annotation=I(geneannotation),
description=factor(description,levels=c("upstream","promoter","overlaps 5'","inside intron","inside exon","covers exon(s)","overlaps exon upstream","overlaps exon downstream","overlaps two exons","overlaps 3'","close to 3'","downstream","covers")),
region=factor(type,levels=c("upstream","promoter","overlaps 5'","inside","overlaps 3'","close to 3'","downstream","covers")),
distance=dist,
subregion=factor(subtype,levels=c("inside intron","inside exon","covers exon(s)","overlaps exon upstream","overlaps exon downstream","overlaps two exons")),
insideDistance=insidedistance,
exonnumber=exonnumber,
nexons=nexons,
UTR=factor(ctype,levels=c("inside transcription region","5' UTR","overlaps 5' UTR","3'UTR","overlaps 3'UTR","covers transcription region")),
strand=strands,
geneL=geneL,
codingL=codingL,
Geneid=geneid,
subjectHits=map)
return(tmp)
}
poverlapWidth <- function(query, subject) {
overlap_score <- pmin(end(query), end(subject)) -
pmax(start(query), start(subject)) + 1L
pmax(overlap_score, 0L)
}
smarterNearest <- function(x, subject) {
hits <- nearest(x, subject, select="all")
mcols(hits)$ovwidth <- poverlapWidth(x[queryHits(hits)],subject[subjectHits(hits)])
partitioning <- PartitioningByEnd(queryHits(hits),NG=queryLength(hits))
ovwidth <- relist(mcols(hits)$ovwidth, partitioning)
hits <- hits[unlist(ovwidth==max(ovwidth))]
selectHits(hits, select="arbitrary")
}
You can’t perform that action at this time.