added a command to compare exons from an isoscm assembly with reference

gene models, requested in issue #7
shenkers · Jun 2, 2015 · 86d3819 · 86d3819
1 parent af9ef48
commit 86d3819
Show file tree

Hide file tree

Showing 3 changed files with 189 additions and 0 deletions.
diff --git a/IsoSCM/src/executable/DiffCommand.java b/IsoSCM/src/executable/DiffCommand.java
@@ -0,0 +1,34 @@
+package executable;
+
+import java.io.File;
+
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+
+import com.beust.jcommander.Parameter;
+
+@XmlRootElement(name="CompareCommand")
+@XmlAccessorType(XmlAccessType.PROPERTY)
+public class DiffCommand{	
+
+
+	@Parameter(names="-x1", description="XML configuration file from the assembly step", converter=FileConverter.class, required=true)
+	File assemblyXml;
+
+	@Parameter(names="-G", description="Ensembl format GTF to which the assembled exons will be compared")
+	String refGtf;
+
+	public File getAssemblyXml() {
+		return assemblyXml;
+	}
+
+	@XmlElement
+	public void setAssemblyXml(File assemblyXml) {
+		this.assemblyXml = assemblyXml;
+	}
+
+
+}
+
diff --git a/IsoSCM/src/executable/IsoSCM.java b/IsoSCM/src/executable/IsoSCM.java
@@ -163,13 +163,15 @@ public static void main(String[] args) throws IOException, JAXBException {
 		AssembleCommand assemble = new AssembleCommand();
 		SegmentCommand segment = new SegmentCommand();
 		CompareCommand compare = new CompareCommand();
+		DiffCommand diff = new DiffCommand();
 		EnumerateCommand enumerate = new EnumerateCommand();
 		HelpCommand help = new HelpCommand();
 
 		jc.addCommand("assemble", assemble);
 		jc.addCommand("segment", segment);
 		jc.addCommand("compare", compare);
 		jc.addCommand("enumerate", enumerate);
+		jc.addCommand("diff", diff);
 		jc.addCommand("-h", help);
 		try{
 			jc.parse(args);		
@@ -614,6 +616,13 @@ else if(jc.getParsedCommand().equals("compare")){
 
 			logger.info("compare comleted.");
 		}
+		else if(jc.getParsedCommand().equals("diff")){
+			AssembleCommand assemblyConfiguration = ConfigurationIO.readAssemblyConfiguration(diff.assemblyXml);
+			File assemblyGtf = new File(Util.sprintf("%s/%s.gtf",assemblyConfiguration.dir, assemblyConfiguration.base));
+			File assemblyUnsplicedGtf = new File(Util.sprintf("%s/%s.unspliced.gtf",assemblyConfiguration.dir, assemblyConfiguration.base));
+			processing.DiffReference.diff(diff.refGtf, assemblyGtf, assemblyUnsplicedGtf);
+
+		}
 		}
 		catch(ParameterException e){
 			e.printStackTrace(System.out);

diff --git a/IsoSCM/src/processing/DiffReference.java b/IsoSCM/src/processing/DiffReference.java
@@ -0,0 +1,146 @@
+package processing;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+
+import tools.AnnotatedRegion;
+import tools.IntervalTools;
+import tools.StrandedGenomicIntervalTree;
+import tools.GTFTools.AnnotationParser;
+import tools.ParseGTF.TranscriptIterator;
+import util.Util;
+
+public class DiffReference {
+
+	public static void labelExons(AnnotationParser annotation){
+
+		for(AnnotatedRegion transcript : annotation.transcripts){
+			List<AnnotatedRegion> exons = IntervalTools.SelectIntervals(annotation.exons, transcript.chr, transcript.start, transcript.end, transcript.strand, "transcript_id", transcript.getAttribute("transcript_id"));
+
+			if(exons.size()>1){
+				Collections.sort(exons,IntervalTools.AnnotatedRegionComparator(true, !transcript.isNegativeStrand()));
+				List<AnnotatedRegion> internal = exons.subList(1, exons.size()-1);
+				for(AnnotatedRegion r : internal){
+					r.addAttribute("exon_type", "internal_exon");
+				}
+				exons.get(0).addAttribute("exon_type", "5p_exon");
+				exons.get(exons.size()-1).addAttribute("exon_type", "3p_exon");
+			}
+			else{
+				exons.get(0).addAttribute("exon_type", "unspliced");
+			}
+		}
+
+
+	}
+
+
+	public static void diff(String gtfFile, File assemblyGtf, File assemblyUnsplicedGtf) throws FileNotFoundException {
+		TranscriptIterator spliced = new TranscriptIterator(assemblyGtf);
+		TranscriptIterator unspliced = new TranscriptIterator(assemblyUnsplicedGtf);
+		AnnotationParser ap = new AnnotationParser(gtfFile);
+
+		labelExons(ap);
+		PrintStream out = System.out;
+		out.printf("exon\ttype\tmatch_5p_3p\tmatch_5p_3p_type\tmatch_5p\tmatch_5p_type\tmatch_3p\tmatch_3p_type\toverlaps\toverlaps_type\n");
+		for(AnnotatedRegion r : spliced){
+			if(r.annotation.equals("exon")){
+				String type = (String) r.getAttribute("type");
+
+				Set<String> matched5p = new HashSet<String>();
+				Set<String> matched3p = new HashSet<String>();
+				Set<String> matched5p3p = new HashSet<String>();
+				Set<String> overlappingIds = new HashSet<String>();
+				Set<String> overlappingTypes = new HashSet<String>();
+				Set<String> matched5pTypes= new HashSet<String>();
+				Set<String> matched3pTypes= new HashSet<String>();
+				Set<String> matched5p3pTypes = new HashSet<String>();
+				for(AnnotatedRegion e : ap.exons.overlappingRegions(r.chr, r.start, r.end, r.strand)){
+					String transcriptId = (String) e.getAttribute("transcript_id");
+					String exon_type = (String) e.getAttribute("exon_type");
+
+					overlappingIds.add(transcriptId);
+					overlappingTypes.add(exon_type);
+
+					if(r.get5Prime()==e.get5Prime()){
+						matched5p.add(transcriptId);
+						matched5pTypes.add(exon_type);
+					}
+					if(r.get3Prime()==e.get3Prime()){
+						matched3p.add(transcriptId);
+						matched3pTypes.add(exon_type);
+					}
+					if(r.get5Prime()==e.get5Prime() && r.get3Prime()==e.get3Prime()){
+						matched5p3p.add(transcriptId);
+						matched5p3pTypes.add(exon_type);
+					}
+				}		
+
+				out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", r, type,
+						StringUtils.join(matched5p3p,","),
+						StringUtils.join(matched5p3pTypes,","),
+						StringUtils.join(matched5p,","),
+						StringUtils.join(matched5pTypes,","),
+						StringUtils.join(matched3p,","),
+						StringUtils.join(matched3pTypes,","),
+						StringUtils.join(overlappingIds,","),
+						StringUtils.join(overlappingTypes,",")
+						);
+			}
+		}
+		for(AnnotatedRegion r : unspliced){
+			if(r.annotation.equals("exon")){
+				String type = "unspliced";
+
+				Set<String> matched5p = new HashSet<String>();
+				Set<String> matched3p = new HashSet<String>();
+				Set<String> matched5p3p = new HashSet<String>();
+				Set<String> overlappingIds = new HashSet<String>();
+				Set<String> overlappingTypes = new HashSet<String>();
+				Set<String> matched5pTypes= new HashSet<String>();
+				Set<String> matched3pTypes= new HashSet<String>();
+				Set<String> matched5p3pTypes = new HashSet<String>();
+				for(AnnotatedRegion e : ap.exons.overlappingRegions(r.chr, r.start, r.end, r.strand)){
+					String transcriptId = (String) e.getAttribute("transcript_id");
+					String exon_type = (String) e.getAttribute("exon_type");
+
+					overlappingIds.add(transcriptId);
+					overlappingTypes.add(exon_type);
+
+					if(r.get5Prime()==e.get5Prime()){
+						matched5p.add(transcriptId);
+						matched5pTypes.add(exon_type);
+					}
+					if(r.get3Prime()==e.get3Prime()){
+						matched3p.add(transcriptId);
+						matched3pTypes.add(exon_type);
+					}
+					if(r.get5Prime()==e.get5Prime() && r.get3Prime()==e.get3Prime()){
+						matched5p3p.add(transcriptId);
+						matched5p3pTypes.add(exon_type);
+					}
+				}		
+
+				out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", r, type,
+						StringUtils.join(matched5p3p,","),
+						StringUtils.join(matched5p3pTypes,","),
+						StringUtils.join(matched5p,","),
+						StringUtils.join(matched5pTypes,","),
+						StringUtils.join(matched3p,","),
+						StringUtils.join(matched3pTypes,","),
+						StringUtils.join(overlappingIds,","),
+						StringUtils.join(overlappingTypes,",")
+						);
+			}
+		}
+
+	}
+}