-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added a command to compare exons from an isoscm assembly with reference
gene models, requested in issue #7
- Loading branch information
Showing
3 changed files
with
189 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package executable; | ||
|
||
import java.io.File; | ||
|
||
import javax.xml.bind.annotation.XmlAccessType; | ||
import javax.xml.bind.annotation.XmlAccessorType; | ||
import javax.xml.bind.annotation.XmlElement; | ||
import javax.xml.bind.annotation.XmlRootElement; | ||
|
||
import com.beust.jcommander.Parameter; | ||
|
||
@XmlRootElement(name="CompareCommand") | ||
@XmlAccessorType(XmlAccessType.PROPERTY) | ||
public class DiffCommand{ | ||
|
||
|
||
@Parameter(names="-x1", description="XML configuration file from the assembly step", converter=FileConverter.class, required=true) | ||
File assemblyXml; | ||
|
||
@Parameter(names="-G", description="Ensembl format GTF to which the assembled exons will be compared") | ||
String refGtf; | ||
|
||
public File getAssemblyXml() { | ||
return assemblyXml; | ||
} | ||
|
||
@XmlElement | ||
public void setAssemblyXml(File assemblyXml) { | ||
this.assemblyXml = assemblyXml; | ||
} | ||
|
||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
package processing; | ||
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.io.PrintStream; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
import org.apache.commons.lang.StringUtils; | ||
|
||
import tools.AnnotatedRegion; | ||
import tools.IntervalTools; | ||
import tools.StrandedGenomicIntervalTree; | ||
import tools.GTFTools.AnnotationParser; | ||
import tools.ParseGTF.TranscriptIterator; | ||
import util.Util; | ||
|
||
public class DiffReference { | ||
|
||
public static void labelExons(AnnotationParser annotation){ | ||
|
||
for(AnnotatedRegion transcript : annotation.transcripts){ | ||
List<AnnotatedRegion> exons = IntervalTools.SelectIntervals(annotation.exons, transcript.chr, transcript.start, transcript.end, transcript.strand, "transcript_id", transcript.getAttribute("transcript_id")); | ||
|
||
if(exons.size()>1){ | ||
Collections.sort(exons,IntervalTools.AnnotatedRegionComparator(true, !transcript.isNegativeStrand())); | ||
List<AnnotatedRegion> internal = exons.subList(1, exons.size()-1); | ||
for(AnnotatedRegion r : internal){ | ||
r.addAttribute("exon_type", "internal_exon"); | ||
} | ||
exons.get(0).addAttribute("exon_type", "5p_exon"); | ||
exons.get(exons.size()-1).addAttribute("exon_type", "3p_exon"); | ||
} | ||
else{ | ||
exons.get(0).addAttribute("exon_type", "unspliced"); | ||
} | ||
} | ||
|
||
|
||
} | ||
|
||
|
||
public static void diff(String gtfFile, File assemblyGtf, File assemblyUnsplicedGtf) throws FileNotFoundException { | ||
TranscriptIterator spliced = new TranscriptIterator(assemblyGtf); | ||
TranscriptIterator unspliced = new TranscriptIterator(assemblyUnsplicedGtf); | ||
AnnotationParser ap = new AnnotationParser(gtfFile); | ||
|
||
labelExons(ap); | ||
PrintStream out = System.out; | ||
out.printf("exon\ttype\tmatch_5p_3p\tmatch_5p_3p_type\tmatch_5p\tmatch_5p_type\tmatch_3p\tmatch_3p_type\toverlaps\toverlaps_type\n"); | ||
for(AnnotatedRegion r : spliced){ | ||
if(r.annotation.equals("exon")){ | ||
String type = (String) r.getAttribute("type"); | ||
|
||
Set<String> matched5p = new HashSet<String>(); | ||
Set<String> matched3p = new HashSet<String>(); | ||
Set<String> matched5p3p = new HashSet<String>(); | ||
Set<String> overlappingIds = new HashSet<String>(); | ||
Set<String> overlappingTypes = new HashSet<String>(); | ||
Set<String> matched5pTypes= new HashSet<String>(); | ||
Set<String> matched3pTypes= new HashSet<String>(); | ||
Set<String> matched5p3pTypes = new HashSet<String>(); | ||
for(AnnotatedRegion e : ap.exons.overlappingRegions(r.chr, r.start, r.end, r.strand)){ | ||
String transcriptId = (String) e.getAttribute("transcript_id"); | ||
String exon_type = (String) e.getAttribute("exon_type"); | ||
|
||
overlappingIds.add(transcriptId); | ||
overlappingTypes.add(exon_type); | ||
|
||
if(r.get5Prime()==e.get5Prime()){ | ||
matched5p.add(transcriptId); | ||
matched5pTypes.add(exon_type); | ||
} | ||
if(r.get3Prime()==e.get3Prime()){ | ||
matched3p.add(transcriptId); | ||
matched3pTypes.add(exon_type); | ||
} | ||
if(r.get5Prime()==e.get5Prime() && r.get3Prime()==e.get3Prime()){ | ||
matched5p3p.add(transcriptId); | ||
matched5p3pTypes.add(exon_type); | ||
} | ||
} | ||
|
||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", r, type, | ||
StringUtils.join(matched5p3p,","), | ||
StringUtils.join(matched5p3pTypes,","), | ||
StringUtils.join(matched5p,","), | ||
StringUtils.join(matched5pTypes,","), | ||
StringUtils.join(matched3p,","), | ||
StringUtils.join(matched3pTypes,","), | ||
StringUtils.join(overlappingIds,","), | ||
StringUtils.join(overlappingTypes,",") | ||
); | ||
} | ||
} | ||
for(AnnotatedRegion r : unspliced){ | ||
if(r.annotation.equals("exon")){ | ||
String type = "unspliced"; | ||
|
||
Set<String> matched5p = new HashSet<String>(); | ||
Set<String> matched3p = new HashSet<String>(); | ||
Set<String> matched5p3p = new HashSet<String>(); | ||
Set<String> overlappingIds = new HashSet<String>(); | ||
Set<String> overlappingTypes = new HashSet<String>(); | ||
Set<String> matched5pTypes= new HashSet<String>(); | ||
Set<String> matched3pTypes= new HashSet<String>(); | ||
Set<String> matched5p3pTypes = new HashSet<String>(); | ||
for(AnnotatedRegion e : ap.exons.overlappingRegions(r.chr, r.start, r.end, r.strand)){ | ||
String transcriptId = (String) e.getAttribute("transcript_id"); | ||
String exon_type = (String) e.getAttribute("exon_type"); | ||
|
||
overlappingIds.add(transcriptId); | ||
overlappingTypes.add(exon_type); | ||
|
||
if(r.get5Prime()==e.get5Prime()){ | ||
matched5p.add(transcriptId); | ||
matched5pTypes.add(exon_type); | ||
} | ||
if(r.get3Prime()==e.get3Prime()){ | ||
matched3p.add(transcriptId); | ||
matched3pTypes.add(exon_type); | ||
} | ||
if(r.get5Prime()==e.get5Prime() && r.get3Prime()==e.get3Prime()){ | ||
matched5p3p.add(transcriptId); | ||
matched5p3pTypes.add(exon_type); | ||
} | ||
} | ||
|
||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", r, type, | ||
StringUtils.join(matched5p3p,","), | ||
StringUtils.join(matched5p3pTypes,","), | ||
StringUtils.join(matched5p,","), | ||
StringUtils.join(matched5pTypes,","), | ||
StringUtils.join(matched3p,","), | ||
StringUtils.join(matched3pTypes,","), | ||
StringUtils.join(overlappingIds,","), | ||
StringUtils.join(overlappingTypes,",") | ||
); | ||
} | ||
} | ||
|
||
} | ||
} |