diff --git a/README.md b/README.md
index 211f6c6..35f3f4c 100644
--- a/README.md
+++ b/README.md
@@ -452,3 +452,4 @@ For any of the listed reasons, or anything else, please leave us a Leave a message here
+
diff --git a/sample_relocaTE_run/sample_readme.txt b/sample_relocaTE_run/sample_readme.txt
index c2ac506..cba62e4 100644
--- a/sample_relocaTE_run/sample_readme.txt
+++ b/sample_relocaTE_run/sample_readme.txt
@@ -9,6 +9,7 @@ These scripts expect that the following programs are installed and included in y
3. BWA
4. BioPerl
5. Blat
+ 6. Blast: formatdb and fastacmd
STEPS A-C
A. RelocaTE
@@ -32,32 +33,32 @@ A. To find TE insertions in the included fastq reads run RelocaTE:
or
sh run_relocaTE.sh
- These scritps will run relocaTE directly or through a series of shell scripts.
- If you are running
+ These scripts will run relocaTE directly or through a series of shell scripts.
+ If you are running
- run_relocaTE_qsub.sh:
- 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
- 2) once complete, view the results in 02052012_sample/mping/results
+ 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
+ 2) once complete, view the results in 02052012_sample/mping/results
or
- run_relocaTE_shell.sh
- 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
- 2) once complete, view the results in 02052012_sample/mping/results
+ 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
+ 2) once complete, view the results in 02052012_sample/mping/results
or
- run_relocaTE.sh
- 1) once complete, view the results in 02052012_sample/mping/results
+ 1) once complete, view the results in 02052012_sample/mping/results
-B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the referenc
-e.
+B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the reference.
A BAM file is included in the sample data set, but one can be genreted by running the included script:
- create_bam.sh by:
- - changing directory into the directory of this script
- - and typing "sh create_bam.sh"
+ create_bam.sh by:
+ - changing directory into the directory of this script
+ - and typing "sh create_bam.sh"
C. To classify the insertions (homozygous, heterozygous, etc) run_characTErizer.sh
- - change directory into the directory of this script
- - type "sh run_characTErizer.sh"
+ - change directory into the directory of this script
+ - type "sh run_characTErizer.sh"
- once complete, view the resulting files in the directory you ran characTErizer.pl
- 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions
- 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions
- 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events
+ 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions
+ 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions
+ 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events
+
diff --git a/scripts/characterizer.pl b/scripts/characterizer.pl
index 028cc7b..dd79d20 100755
--- a/scripts/characterizer.pl
+++ b/scripts/characterizer.pl
@@ -310,3 +310,4 @@ sub getHelp {
}
}
}
+
diff --git a/scripts/characterizer.pl~ b/scripts/characterizer.pl~
deleted file mode 100755
index a16273c..0000000
--- a/scripts/characterizer.pl~
+++ /dev/null
@@ -1,312 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-use Data::Dumper;
-use Cwd;
-use Getopt::Long;
-
-if ( !defined @ARGV ) {
- &getHelp;
-}
-
-my $cwd = getcwd();
-my $sites_file;
-my $bam_dir;
-my $genome_fasta;
-## can be a single .bam file or a direcotory containing .bam files
-my @bam_files;
-my $excision = 0;
-
-GetOptions(
- 's|sites_file:s' => \$sites_file,
- 'b|bam_dir:s' => \$bam_dir,
- 'g|genome_fasta:s' => \$genome_fasta,
- 'x|excision:i' => \$excision,
-);
-
-sub getHelp {
- print '
-usage:
-./characterizer.pl [-s relocaTE table output file][-b bam file or dir of bam files][-g reference genome fasta file][-h]
-
-options:
--s file relocaTE output file: YOURSAMPLENAME.mping.all_nonref.txt [no default]
--b dir/file bam file of the orginal reads aligned to reference (before TE was trimmed) or directory of bam files [no default]
--g file reference genome fasta file [no default]
--x int find excision events that leave a footprint, yes or no(1|0) [0]
--h this help message
-
-For more information see documentation: http://srobb1.github.com/RelocaTE/
-
-';
- exit 1;
-}
-
-if ( -d $bam_dir ) {
-
- #remove trailing '/'
- $bam_dir =~ s/\/$//;
- @bam_files = <$bam_dir/*bam>;
-}
-elsif ( -f $bam_dir or -l $bam_dir ) {
- push @bam_files, $bam_dir;
-}
-open INSITES, "$sites_file" or die "cannot open $sites_file $!\n";
-my @dir_path = split '/', $sites_file;
-my $filename = pop @dir_path;
-my $file = $filename;
-$file =~ s/\..+?$//;
-$cwd =~ s/\/$//; #remove trailing /
-open OUTGFF, ">$cwd/$file.inserts_characTErized.gff";
-open OUT, ">$cwd/$file.inserts_characTErized.txt";
-print OUT
- "strain\tTE\tTSD\tchromosome.pos\tstrand\tavg_flankers\tspanners\tstatus\n";
-print OUTGFF "##gff-version 3\n";
-my %matches;
-my %TSDs;
-my %toPrint;
-
-while ( my $line = ) {
- next if $line =~ /TE.TSD.Exper.chromosome.insertion_site/;
- next if $line =~ /^\s*$/;
- chomp $line;
-
- # mping TTA A119 Chr1 1446..1448 + T:1 R:0 L:1
- my (
- $te, $TSD, $exp, $chromosome, $coor,
- $TE_orient, $total_string, $right_string, $left_string
- ) = split /\t/, $line;
- my ($pos) = $coor =~ /\d+\.\.(\d+)/;
- $TSDs{$chromosome}{$pos} = $TSD;
- my ($total_count) = $total_string =~ /T:(\d+)/;
- my ($left_count) = $left_string =~ /L:(\d+)/;
- my ($right_count) = $right_string =~ /R:(\d+)/;
-
- my $Mmatch = 0;
- my $cigar_all;
- if ( $left_count >= 1 and $right_count >= 1 and $total_count >= 2 ) {
- my @sam_all;
- foreach my $bam_file (@bam_files) {
- ## get any alignments that overlap the insertion site
- my @sam_out = `samtools view $bam_file \'$chromosome:$pos-$pos\'`;
- push @sam_all, @sam_out;
- }
-
- #remove redundant lines in sam file
- my %sorted_sam;
- my $order;
- foreach my $line (@sam_all) {
- $order++;
- if ( !exists $sorted_sam{$line} ) {
- $sorted_sam{$line} = $order;
- }
- }
-
- #make new sorted sam array by sorting on the value of the sort hash
- my @sorted_sam =
- sort { $sorted_sam{$a} <=> $sorted_sam{$b} } keys %sorted_sam;
-
- foreach my $sam_line (@sorted_sam) {
- chomp $sam_line;
- my @sam_line = split /\t/, $sam_line;
- my $cigar = $sam_line[5];
- my $flag = $sam_line[1];
- my $seqLen = length $sam_line[9];
- my $start = $sam_line[3];
- my $end = $start + $seqLen - 1;
- next unless $end >= $pos + 5;
- next unless $start <= $pos - 5;
- ## must be a all M match no soft clipping
- if ( $cigar =~ /^\d+M$/ ) {
- my ($NM) = $sam_line =~ /NM:i:(\d+)/; ## edit distance used
- ## bwa specific: mismatch count, often the same as NM, have not seen a XM>0 and NM==0
- my ($XM) = $sam_line =~ /XM:i:(\d+)/;
- if ( defined $XM and $XM == 0 ) {
- $Mmatch++;
- }
- elsif ( defined $NM and $NM == 0 ) {
- $Mmatch++;
- }
- elsif ( !defined $NM and !defined $XM ) {
- $Mmatch++;
- }
- }
- elsif ( $cigar =~ /[IND]/ ) {
- $matches{"$chromosome.$pos"}{sam}{$sam_line} = 1;
- }
- }
- my $spanners = $Mmatch;
- my $average_flankers = $total_count / 2;
- my $status = 0;
-
- if ( $spanners == 0 ) {
- $status = 'homozygous';
- }
- elsif ( $average_flankers >= 5 and $spanners < 5 ) {
- $status = 'homozygous/excision_no_footprint';
- }
- elsif ( $spanners < ( $average_flankers * .2 ) and $spanners <= 10 ) {
- $status = 'homozygous/excision_no_footprint';
- }
- elsif ( $average_flankers <= 2 and $spanners > 10 ) {
- $status = 'new_insertion';
- }
- elsif ( abs( $average_flankers - $spanners ) <= 5 ) {
- $status = 'heterozygous';
- }
- elsif (
- abs( $average_flankers - $spanners ) -
- ( ( $average_flankers + $spanners ) / 2 ) <= 10 )
- {
- $status = 'heterozygous?';
- }
- elsif ( $average_flankers > 10 and $spanners > 10 ) {
- $status = 'heterozygous';
- }
- elsif (
- (
- ( $spanners - $average_flankers ) >
- ( $spanners + $average_flankers ) / 2
- )
- and ( $average_flankers <= 10 )
- )
- {
- $status = 'new_insertion';
- }
- else {
- $status = 'other';
- }
-
- $matches{"$chromosome.$pos"}{status} = $status;
- $toPrint{$chromosome}{$pos}{$TSD}{TE} = $te;
- $toPrint{$chromosome}{$pos}{$TSD}{flank} = $average_flankers;
- $toPrint{$chromosome}{$pos}{$TSD}{span} = $spanners;
- $toPrint{$chromosome}{$pos}{$TSD}{status} = $status;
- $toPrint{$chromosome}{$pos}{$TSD}{strain} = $exp;
- $toPrint{$chromosome}{$pos}{$TSD}{coor} = $coor;
- $toPrint{$chromosome}{$pos}{$TSD}{TE_orient} = $TE_orient;
- }
-}
-
-if ($excision) {
-
-##generate vcf of spanners looking for excision events
- my @unlink_files;
- my @vcfs;
- foreach my $pos ( keys %matches ) {
- my ( $target, $loc ) = split /\./, $pos;
- next unless exists $toPrint{$target}{$loc};
- my $range = "$target:$pos";
- my $sam = "$cwd/$pos.sam";
- my $bam = "$cwd/$pos.bam";
- my $sorted_bam = "$cwd/$pos.sorted";
- my @sam_lines = keys %{ $matches{$pos}{sam} };
- if ( @sam_lines > 1 ) {
- my $pos_sam = join "\n", @sam_lines;
- open POSSAM, ">$sam";
- print POSSAM $pos_sam;
- `samtools view -bT $genome_fasta $sam > $bam`;
- `samtools sort $bam $sorted_bam`;
- `samtools index $sorted_bam.bam`;
-
-`samtools mpileup -C50 -ugf $genome_fasta -r $range $sorted_bam.bam | bcftools view -bvcg - > $cwd/$pos.var.raw.bcf`;
-`bcftools view $cwd/$pos.var.raw.bcf | vcfutils.pl varFilter -D 100 > $cwd/$pos.var.flt.vcf`;
- push @vcfs, "$cwd/$pos.var.flt.vcf";
- push @unlink_files, $sam, $bam, "$sorted_bam.bam.bai", "$sorted_bam.bam",
- "$cwd/$pos.var.raw.bcf", "$cwd/$pos.var.flt.vcf";
- close POSSAM;
- }
- }
-##Chr1 16327633 . GAGTACTACAATTAGTA GAGTA 1930.0% . INDEL;DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=29;FQ=-40.5 GT:PL:GQ 1/1:58,6,0:10
- open EXCISION, ">>$cwd/excisions_with_footprint.vcfinfo"
- or die "can't open $cwd/excisions_with_footprint.vcfinfo for writing $!";
- foreach my $vcf (@vcfs) {
- ##Chr2.30902247.var.flt.vcf
- my @path = split /\//, $vcf;
- my $file = pop @path;
- my ( $insert_ref, $insert_pos ) = $file =~ /(.+)\.(\d+)\.var\.flt\.vcf/;
- my $TSD = $TSDs{$insert_ref}{$insert_pos};
- my $TSD_len = length $TSD;
- open VCF, $vcf;
- while ( my $line = ) {
- next unless $line !~ /^#/;
- chomp $line;
- my ( $ref, $first_base, $col_3, $ref_seq, $strain_seq ) = split /\t/,
- $line;
- my $aln_start = $first_base - $insert_pos - 1;
- my $aln_end_ref = $first_base - length($ref_seq) - $insert_pos - 1;
- my $aln_end_strain = $first_base - length($strain_seq) - $insert_pos - 1;
- my $aln_end_ref_near_insert_pos = 0;
- my $aln_end_strain_near_insert_pos = 0;
- my $aln_start_near_insert_pos = 0;
- my $insert_bwt_ends = 0;
- my $all_values_after_insertion = 0;
-
- if ( ( $aln_start <= $TSD_len + 1 )
- and ( ( $aln_start * -1 <= $TSD_len + 1 ) ) )
- {
- $aln_start_near_insert_pos = 1;
- }
- if ( ( $aln_end_ref <= $TSD_len + 1 )
- and ( ( $aln_end_ref * -1 <= $TSD_len + 1 ) ) )
- {
- $aln_end_ref_near_insert_pos = 1;
- }
- if ( ( $aln_end_strain <= $TSD_len + 1 )
- and ( ( $aln_end_strain * -1 <= $TSD_len + 1 ) ) )
- {
- $aln_end_strain_near_insert_pos = 1;
- }
- my ( $end_ref, $end_strain ) = (
- ( $first_base + length($ref_seq) - 1 ),
- ( $first_base + length($strain_seq) + 1 )
- );
- if ( ( $end_ref < $insert_pos and $end_strain > $insert_pos )
- or ( $end_strain < $insert_pos and $end_ref > $insert_pos ) )
- {
- $insert_bwt_ends = 1;
- }
- if ( ( ( $first_base - $TSD_len + 1 ) > $insert_pos )
- and ( $end_ref > $insert_pos )
- and ( $end_strain > $insert_pos ) )
- {
- $all_values_after_insertion = 1;
- }
- ## if the alignment end in ref or in strain is close to the insertion postion,
- ## or if one is before and one is after the insertion postion,
- ## it is a potential excision with footprint
- if ( ( $aln_end_ref_near_insert_pos or $aln_end_strain_near_insert_pos )
- or ($insert_bwt_ends) )
- {
- ##make sure all values are not after the insertion
- if ( !$all_values_after_insertion ) {
- print EXCISION "$insert_ref.$insert_pos\t$line\n";
- my $status = $toPrint{$insert_ref}{$insert_pos}{$TSD}{status};
- ##only append if it already isnt there
- $toPrint{$insert_ref}{$insert_pos}{$TSD}{status} =
- $status . "/excision_with_footprint"
- if $status !~ /\/excision_with_footprint/;
- }
- }
- }
- }
- unlink @unlink_files;
-}
-
-foreach my $chr ( sort keys %toPrint ) {
- foreach my $pos ( sort { $a <=> $b } keys %{ $toPrint{$chr} } ) {
- foreach my $tsd ( sort keys %{ $toPrint{$chr}{$pos} } ) {
- my $TE = $toPrint{$chr}{$pos}{$tsd}{TE};
- my $flankers = $toPrint{$chr}{$pos}{$tsd}{flank};
- my $spanners = $toPrint{$chr}{$pos}{$tsd}{span};
- my $status = $toPrint{$chr}{$pos}{$tsd}{status};
- my $strain = $toPrint{$chr}{$pos}{$tsd}{strain};
- my $coor = $toPrint{$chr}{$pos}{$tsd}{coor};
- my $TE_orient = $toPrint{$chr}{$pos}{$tsd}{TE_orient};
- my ($start) = $coor =~ /(\d+)\.\.\d+/;
- print OUT
-"$strain\t$TE\t$tsd\t$chr:$coor\t$TE_orient\t$flankers\t$spanners\t$status\n";
- print OUTGFF
-"$chr\t$strain\ttransposable_element_attribute\t$start\t$pos\t$TE_orient\t.\t.\tID=$chr.$pos.spanners;avg_flankers=$flankers;spanners=$spanners;type=$status;TE=$TE;TSD=$tsd\n";
- }
- }
-}
diff --git a/scripts/relocaTE.pl b/scripts/relocaTE.pl
index 283164a..04c479a 100755
--- a/scripts/relocaTE.pl
+++ b/scripts/relocaTE.pl
@@ -147,8 +147,8 @@
}
else {
my $fq_path = File::Spec->rel2abs($fq_dir);
- @fq_files = <$fq_path/*fq>;
- my @fastq_files = <$fq_path/*fastq>;
+ @fq_files = <$fq_path/*.fq>;
+ my @fastq_files = <$fq_path/*.fastq>;
push @fq_files, @fastq_files;
if ( scalar @fq_files == 0 ) {
print "Must provide at least 1 short read file\n";
diff --git a/scripts/relocaTE.pl~ b/scripts/relocaTE.pl~
deleted file mode 100755
index 133cd8a..0000000
--- a/scripts/relocaTE.pl~
+++ /dev/null
@@ -1,862 +0,0 @@
-#!/usr/bin/perl -w
-use File::Spec;
-use Getopt::Long;
-use Cwd;
-use FindBin qw($RealBin);
-use File::Path qw(make_path);
-use strict;
-
-my $scripts = $RealBin;
-
-if ( !defined @ARGV ) {
- &getHelp();
-}
-my $genome_fasta = 'NONE';
-my $te_fasta;
-my $target = 'NONE';
-my $len_cutoff = 10;
-my $mismatch_allowance = 0;
-my $fq_dir;
-my $exper = 'not.given';
-my $mate_file_1 = '_p1';
-my $mate_file_2 = '_p2';
-my $mate_file_unpaired = '.unPaired';
-my $workingdir;
-my $outdir = 'outdir_teSearch';
-my $parallel = 0;
-my $qsub_array = 0;
-my $qsub_q;
-my ( $blat_minScore, $blat_tileSize ) = ( 10, 7 );
-my $flanking_seq_len = 100;
-my $existing_TE = 'NONE';
-my $bowtie2 = 0;
-GetOptions(
- 'p|parallel:i' => \$parallel,
- 'a|qsub_array:i' => \$qsub_array,
- 'q|qsub_q:s' => \$qsub_q,
- 'e|exper:s' => \$exper,
- 'w|workingdir:s' => \$workingdir,
- 'o|outdir:s' => \$outdir,
- 'd|fq_dir:s' => \$fq_dir,
- 'g|genome_fasta:s' => \$genome_fasta,
- 't|te_fasta:s' => \$te_fasta,
- 'l|len_cutoff:i' => \$len_cutoff,
- 'm|mismatch:f' => \$mismatch_allowance,
- '1|mate_1_id:s' => \$mate_file_1,
- '2|mate_2_id:s' => \$mate_file_2,
- 'u|unpaired_id:s' => \$mate_file_unpaired,
- 'bm|blat_minScore:i' => \$blat_minScore,
- 'bt|blat_tileSize:i' => \$blat_tileSize,
- 'f|flanking_seq_len:i' => \$flanking_seq_len,
- '-r|reference_ins:s' => \$existing_TE,
-## '-b2|bowtie2:i' => \$bowtie2,
- 'h|help' => \&getHelp,
-);
-my $current_dir;
-
-$parallel = 1 if $qsub_array == 1;
-if ( defined $qsub_q ) {
- $qsub_q = "-q $qsub_q";
-}
-else {
- $qsub_q = '';
-}
-
-if ( defined $workingdir and -d $workingdir ) {
- $current_dir = File::Spec->rel2abs($workingdir);
- $current_dir =~ s/\/$//;
-}
-else {
- $current_dir = cwd();
-}
-my $mapping = 1;
-
-if ( !defined $genome_fasta ) {
- print "\n\nPlease provide reference genome by using -g Genome fasta path\n";
- die "\nuse -h option to get help\n";
-}
-elsif ( $genome_fasta eq 'NONE' ) {
- print
-"A reference genome fasta was NOT provided. Proceeding without a reference will result in only the reads containing the TE being identified, no mapping of insertions will be performed\n";
- print "Proceed without mapping? (y|n) \n";
- my $answer = ;
- if ( $answer =~ /n/i ) {
- &getHelp();
- }
- elsif ( $answer =~ /y/i ) {
- $mapping = 0;
- }
- print "Great, proceeding without aligning to a reference genome.\n";
-}
-elsif ( !-e $genome_fasta ) {
- print "$genome_fasta does not exist. Check file name.\n";
- die "\nuse -h option to get help\n";
-}
-my $genome_path;
-if ( -e $genome_fasta ) {
- $genome_path = File::Spec->rel2abs($genome_fasta);
-}
-if ( !defined $te_fasta ) {
- print
-"\nPlease provide fasta file containing transposable elements by using -t TE fasta path
-
-SAMPLE TE FASTA:
->mping TSD=TTA
-GGCCAGTCACAATGGGGGTTTCACTGGTGTGTCATGCACATTTAATAGGGGTAAGACTGAATAAAAAATG
-ATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGGTGAAACTCGTCAGCGTCGTT
-TCCAAGTCCTCGGTAACAGAGTGAAACCCCCGTTGAGGCCGATTCGTTTCATTCACCGGATCTCTTGCGT
-CCGCCTCCGCCGTGCGACCTCCGCATTCTCCCGCGCCGCGCCGGATTTTGGGTACAAATGATCCCAGCAA
-CTTGTATCAATTAAATGCTTTGCTTAGTCTTGGAAACGTCAAAGTGAAACCCCTCCACTGTGGGGATTGT
-TTCATAAAAGATTTCATTTGAGAGAAGATGGTATAATATTTTGGGTAGCCGTGCAATGACACTAGCCATT
-GTGACTGGCC
-
-FASTA header must contain \"TSD=\", can be a Perl regular expression.
- Example: these exact characters TTA: TSD=TTA
- Example: any 4 characters: TSD=....
- Example: A or T followed by GCC: TSD=(A|T)GCC
- Example: CGA followed by any character then an A then CT or G: TSD=CGA.A(CT|G)
-\n";
- die "\nuse -h option to get help\n";
-}
-elsif ( !-e $te_fasta ) {
- print "$te_fasta does not exist. Check file name.\n";
- die "\nuse -h option to get help\n";
-}
-else {
- open INFILE, $te_fasta or die "Can't open $te_fasta\n";
- my $first_line = ;
- close INFILE;
- if ( $first_line !~ /^>\S+\s+TSD=\S+/ ) {
- die
-"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n";
- }
-}
-my @fq_files;
-my %fq_files;
-if ( !defined $fq_dir ) {
- print "\n\nPlease provide a directory of paired fastq files\n";
- die "\nuse -h option to get help\n";
-}
-elsif ( $fq_dir eq 'SKIP' ) {
- ##skip all other steps for processing the raw fq files
-}
-elsif ( !-d $fq_dir ) {
- print
-"\n\nCheck the spelling or location of $fq_dir, Please provide a directory of paired fastq files\n";
- die "\nuse -h option to get help\n";
-}
-else {
- my $fq_path = File::Spec->rel2abs($fq_dir);
- @fq_files = <$fq_path/*fq>;
- my @fastq_files = <$fq_path/*fastq>;
- push @fq_files, @fastq_files;
- if ( scalar @fq_files == 0 ) {
- print "Must provide at least 1 short read file\n";
- die "\nuse -h option to get help\n";
- }
-}
-my $existing_TE_path = 'NONE';
-my $existing_blat = 0;
-if ( $existing_TE ne 'NONE' ) {
- if ( $existing_TE eq '1' ) {
- ##run blat
- $existing_blat = 1;
- }
- elsif ( !-e $existing_TE ) {
- print "$existing_TE does not exist\n";
- print "Please use -r 1 or provide a file that exists\n";
- die "\nuse -h option to get help\n";
- }
- else {
- $existing_TE_path = File::Spec->rel2abs($existing_TE);
- open INFILE, $existing_TE or die "Can't open $existing_TE\n";
- my $first_line = ;
- close INFILE;
- if ( $first_line !~ /\S+\t\S+:\d+\.\.\d+/ ) {
- print "The existing_TE file is not in the appropriate format:
-
-SAMPLE Reference TEs (the two columns are tab-delimited):
-mping Chr12:839604..840033
-mping Chr11:23200534..23200105
-
-TE_nameref_seqname:first_Base_Of_TIR1..Last_base_of_TIR2
-
-or (recommended) use \'-r 1\' for RelocaTE to find your TE in the reference
- ";
- die "\nuse -h option to get help\n";
- }
- }
-}
-
-sub getHelp {
- print '
-usage:
-./relocaTE.pl [-t TE_fasta_file][-g chromosome_genome_fasta][-d dir_of_fq][-e short_sample_name][-h]
-
-options:
-
-**required:
--t |--te_fasta file fasta containing nucleotide sequences of transposable elements with
- TSD=xxx in the desc. [no default]
--d |--fq_dir dir directory of paired and unpaired fastq files (paired _p1.fq & _p2.fq)
- (.fq or .fastq is acceptable) [no default]
-
-**recommended:
--g |--genome_fasta file genome (reference) fasta file path. If not provided will only align
- reads to TE and remove TE seq from short reads. [no default]
--e |--exper STR Short sample name, will be used in the output files to create IDs for
- the insert (ex. A123) [not.given]
--o |--outdir STR name for directory to contain output directories and files, will be
- created for the run (ex. 04222012_A123) [outdir_teSearch]
-
-**optional:
--p |--parallel INT Break down the single big job of relocaTE into as many smaller jobs as
- possible. The alternative (0) would be to run one after the other
- (int, 0=false or 1=true) [0]
--q |--qsub_q STR same as qsub -q option, not required [no default]
--a |--qsus_array INT if \'-a 1\' , create qsub PBS array jobs to run the many shell scripts
- created in the \'-a 1\' option. (see: man qsub option -t).(
- 0=false or 1=true) [0]
--w |--workingdir dir base working directory, needs to exist, will not be creates, full path
- required [cwd]
--l |--len INT len cutoff for the TE trimmed reads to be aligned [10]
--m |--mismatch FRACTION mismatch allowance for alignment to TE (ex 0.1) [0]
--1 |--mate_1_id STR string to uniquely identify mate 1 paired files ex: file_p1.fq [_p1]
--2 |--mate_2_id STR pattern to uniquely identify mate 2 paired files ex: file_p2.fq [_p2]
--u |--unpaired_id STR pattern to uniquely identify unpaired files ex: file.unPaired.fq [.unPaired]
--bm|--blat_minScore INT blat minScore value, used by blat in the comparison of reads to TE sequence [10]
--bt|--blat_tileSize INT blat tileSize value, used by blat in the comparison of reads to TE sequence [7]
--f |--flanking_seq INT length of the sequence flanking the found insertion to be returned. This
- sequence is taken from the reference genome [100]
--r |--reference_ins STR To identify reference and shared insertions (reference and reads)
- choose option-1 or option-2.
- option-1) (recommended) use \'-r 1\' to have RelocaTE find the location of your TE in the
- reference.
- option-2) input the file name of a tab-delimited file containing the coordinates
- of TE insertions pre-existing in the reference sequence. [no default]
--h |--help this message
-
-
-See documentation for more information. http://srobb1.github.com/RelocaTE/
-
-';
-## use in V2
-## -b2 |--bowtie2 INT to use bowtie2 use \'-b2 1\' else for bowtie use \'-b2 0\' [0]
- exit 1;
-}
-if ( $outdir eq '' or $outdir =~ /^\s+$/ or !defined $outdir ) {
- die "your -o option has an incorrect value, it needs to be something
-\nuse -h option to get help\n";
-}
-else {
- $outdir =~ s/\/$//;
-}
-my $te_path = File::Spec->rel2abs($te_fasta);
-my @outdir = split /\//, $outdir;
-$outdir = pop @outdir;
-my $top_dir = $outdir;
-my @depend;
-my $shellscripts = "$current_dir/$top_dir/shellscripts";
-if ($qsub_array) {
- mkdir "$current_dir/$top_dir";
- mkdir "$shellscripts";
- open QSUBARRAY, ">$current_dir/$top_dir/run_these_jobs.sh"
- or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n";
-}
-elsif ($parallel) {
- mkdir "$current_dir/$top_dir";
- mkdir "$shellscripts";
- open PARALLEL, ">$current_dir/$top_dir/run_these_jobs.sh"
- or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n";
-}
-else {
- mkdir "$current_dir/$top_dir";
-}
-my $qsub_formatGenome_cmd = 0;
-## get names of each ref sequecne
-my @genome_seqs;
-if ( $mapping > 0 ) {
- open( INFASTA, "$genome_path" ) || die "$!\n";
- while ( my $line = ) {
- next unless $line =~ /^>(\S+)/;
- if ( $line =~ /^>(\S+)/ ) {
- my $id = $1;
- if ( $id =~ /\|/ ) {
- $id =~ s/\|/_/g;
- }
- push @genome_seqs, $id;
- }
- else {
- die "Your genome FASTA file is in a unexpected format.
->SEQNAME
-SEQUENCE
->SEQNAME2
-SEQUENCE2\n";
- }
- }
- close(INFASTA);
-
- #create bowtie index
- my $cmd;
- if ( !$bowtie2 and !-e "$genome_path.bowtie_build_index.1.ebwt" ) {
- $cmd =
-"bowtie-build -f $genome_path $genome_path.bowtie_build_index 12> $current_dir/$top_dir/bowtie-build.out";
- $qsub_formatGenome_cmd = 1;
- }
- elsif ( $bowtie2 and !-e "$genome_path.bowtie2_build_index.1.bt2" ) {
- $cmd =
-"bowtie2-build -f $genome_path $genome_path.bowtie2_build_index 12> $current_dir/$top_dir/bowtie-build2.out";
- $qsub_formatGenome_cmd = 1;
- }
- my $ref = 'ref';
- if ( $genome_path =~ /(?:.+\/)?(.+)\.(fa|fasta)$/ ) {
- $ref = $1;
- }
- if ( $parallel and defined $cmd ) {
- my $shell_dir = "$shellscripts/step_1";
- mkdir $shell_dir;
- open OUTSH, ">$shell_dir/step_1.$ref.formatGenome.sh";
- print OUTSH "$cmd\n";
- close OUTSH;
- chmod 0755, "$shell_dir/step_1.$ref.formatGenome.sh";
- print PARALLEL "sh $shell_dir/step_1.$ref.formatGenome.sh\n"
- if !$qsub_array;
- }
- elsif ( $parallel and !defined $cmd ) {
- my $step1_file =
- "$shellscripts/step_1_not_needed_genome_fasta_already_formatted";
- my $shell_dir = "$shellscripts";
- mkdir $shell_dir;
- if ($parallel) {
- open STEP1, ">$step1_file" or die "Can't Open $step1_file\n";
- print STEP1 '';
- close STEP1;
- }
- }
- elsif ( defined $cmd ) {
- ##run it now
- print "Formatting the reference genome: $genome_path\n";
- system($cmd);
- }
- if ($qsub_array) {
- if ( !-e "$shellscripts/step_1_not_needed_genome_fasta_already_formatted" )
- {
- my $job = "$shellscripts/step_1/step_1.$ref.formatGenome.sh";
- print QSUBARRAY
- "STEP1=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\`
-echo \$STEP1\n";
-
- }
- }
-} ##end if($mapping)
-
-##run existing TE blat against ref if the file does not exsit
-my $qsub_existingTE_cmd = 0;
-my $existing_blat_cmd =
-"blat $genome_path $te_path $current_dir/$top_dir/existingTE.blatout 1> $current_dir/$top_dir/existingTE.blat.stdout";
-if ($existing_blat) {
- ##if running blat set existing_TE_path to blatout
- $existing_TE_path = "$current_dir/$top_dir/existingTE.blatout";
- if ( $parallel
- and !-e "$current_dir/$top_dir/existingTE.blatout" )
- {
- my $shell_dir = "$shellscripts";
- if ( !-d $shell_dir ) {
- mkdir $shell_dir;
- }
- open OUTSH, ">$shell_dir/step_0.existingTE_blat.sh"
- or die "Can't open $shell_dir/step_0.existingTE_blat.sh for writing $!\n";
- print PARALLEL "sh $shell_dir/step_0.existingTE_blat.sh\n" if !$qsub_array;
- print OUTSH "$existing_blat_cmd\n";
- if ($qsub_array) {
- $qsub_existingTE_cmd = 1;
- print QSUBARRAY
-"EXISTINGTE=`qsub -e $shellscripts -o $shellscripts $qsub_q $shellscripts/step_0.existingTE_blat.sh`
-echo \$EXISTINGTE\n";
- }
- close OUTSH;
- }
- elsif ( !-e "$current_dir/$top_dir/existingTE.blatout" ) {
- ## do it now
- print "finding TEs ($te_path) in the reference genome ($genome_path)\n";
- system($existing_blat_cmd);
- }
-}
-
-my @fq;
-my @fa;
-
-#convert fq files to fa for blat
-open QSUBARRAY2, ">$shellscripts/step_2.fq2fa.sh"
- if $qsub_array;
-my $fq_count = 0;
-if ( $fq_dir ne 'SKIP' ) {
- foreach my $fq (@fq_files) {
- my $fq_path = File::Spec->rel2abs($fq);
- push @fq, $fq_path;
- my $fa = $fq;
- if ( $fa =~ s/\.(fq|fastq)$/.fa/ ) {
- push @fa, $fa;
- if ( !-e $fa ) {
- my $cmd = "$scripts/relocaTE_fq2fa.pl $fq_path $fa";
- if ($parallel) {
- my @fq_path = split '/', $fq_path;
- my $fq_name = pop @fq_path;
- my $shell_dir = "$shellscripts/step_2";
-
- mkdir $shell_dir;
- my $outsh = "$shell_dir/$fq_count." . "fq2fa.sh";
- open OUTSH, ">$outsh";
- print PARALLEL "sh $outsh\n" if !$qsub_array;
- print OUTSH "$cmd\n";
- }
- else {
- ##run it now
- print "Converting $fq_path to fasta for blat\n";
- system($cmd);
- }
- }
- else {
- my $shell_dir = "$shellscripts";
-
- mkdir $shell_dir;
- my $step2_file =
- "$shellscripts/step_2_not_needed_fq_already_converted_2_fa";
-
- if ($parallel) {
- open STEP2, ">$step2_file" or die "Can't Open $step2_file\n";
- print STEP2 '';
- close STEP2;
- }
- }
- }
- else {
- print
-"$fq does not seem to be a fastq based on the file extension. It should be fq or fastq\n";
- &getHelp();
- }
- $fq_count++;
- }
- if ( !-e "$shellscripts/step_2_not_needed_fq_already_converted_2_fa"
- and $qsub_array )
- {
- my $end = $fq_count - 1;
- my $job = "$shellscripts/step_2.fq2fa.sh";
- if ( !@depend ) {
- print QSUBARRAY
- "STEP2=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\`
-echo \$STEP2\n";
- @depend = ( "STEP2", "afterokarray" );
- }
- else {
- my ( $last_job, $afterok ) = @depend;
- @depend = ( "STEP2", "afterokarray" );
- my $jobName = $depend[0];
- print QSUBARRAY
-"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job`
-echo \$$jobName\n";
- }
- print QSUBARRAY2 "sh $shellscripts/step_2/\$PBS_ARRAYID.fq2fa.sh";
- }
- elsif ($qsub_array) {
- unlink "$shellscripts/step_2.fq2fa.sh";
- }
-} ##end if $fq_dir ne 'SKIP'
-close QSUBARRAY2;
-
-##split the TE fasta of many seqs into individual files
-my @te_fastas;
-my %TSD;
-open( INFASTA, "$te_fasta" ) || die "$!\n";
-my $i = 0;
-while ( my $line = ) {
- if ( $line =~ /^>(\S+)\s+TSD=(\S+)/ ) {
- my $id = $1;
- $TSD{$id} = $2;
- if ( $i > 0 ) {
- close(OUTFASTA);
- $i = 0;
- }
- my $te_file = "$id.fa";
- $te_file =~ s/\|/_/g;
- ##create new dir for files: workingDir/outdir/TE/
- my $te_dir = "$current_dir/$top_dir/$id";
- push @te_fastas, "$te_dir/$te_file";
-
- mkdir $te_dir;
- open( OUTFASTA, ">$te_dir/$te_file" ) or die "$!\n";
- print OUTFASTA $line;
- $i++;
- }
- elsif ( $line =~ /^>/ and $line !~ /TSD=/ ) {
- die
-"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n";
- }
- else { ##should be sequence
- print OUTFASTA $line;
- }
-}
-close(INFASTA);
-close(OUTFASTA);
-
-#foreach TE fasta blat against target chromosome and parse and find insertion sites
-my $depend = 1 if @depend;
-foreach my $te_path (@te_fastas) {
- if ($depend) {
- @depend = ( "STEP2", "afterokarray" );
- }
- else {
- @depend = ();
- }
- my @path = split '/', $te_path;
- my $te_fasta = pop @path;
- my $path = join '/', @path;
- my $TE = $te_fasta;
- $TE =~ s/\.fa//;
- mkdir "$path/blat_output";
- mkdir "$path/flanking_seq";
- mkdir "$path/te_containing_fq";
- mkdir "$path/te_only_read_portions_fa";
-
- #blat fa files against te.fa
- my @flanking_fq;
- my $fq_file_count = scalar @fq;
-
- open QSUBARRAY3, ">$shellscripts/step_3.$TE.blat.sh"
- if $qsub_array;
- open QSUBARRAY4, ">$shellscripts/step_5.$TE.finder.sh"
- if $qsub_array;
- for ( my $i = 0 ; $i < $fq_file_count ; $i++ ) {
- my $fa = $fa[$i];
- my $fq = $fq[$i];
-
- #remove and save filename part of path
- my @fa_path = split '/', $fa;
- my $fa_name = pop @fa_path;
- $fa_name =~ s/\.fa$//;
- my $shell_dir = "$shellscripts/step_3/$TE";
- if ($parallel) {
- make_path( $shell_dir, { mode => 0755 } );
- open OUTSH, ">$shell_dir/$i.$TE.blat.sh"
- or die "Can't open $shell_dir/$i.$TE.blat.sh $!\n";
- print PARALLEL "sh $shell_dir/$i.$TE.blat.sh\n" if !$qsub_array;
- }
-
- #use pre-existing blatout files
- if ( !-e "$path/blat_output/$fa_name.te_$TE.blatout" ) {
- my $cmd =
-"blat -minScore=$blat_minScore -tileSize=$blat_tileSize $te_path $fa $path/blat_output/$fa_name.te_$TE.blatout 1>> $path/blat_output/blat.out";
- print OUTSH "$cmd\n" if $parallel;
- print "Finding reads in $fa_name that contain sequence of $TE\n"
- if !$parallel;
- system($cmd) if !$parallel;
- }
-
- #use pre-existing te_containing_fq files
- my $te_Containing_fq =
- "$path/te_containing_fq/$fa_name.te_$TE.ContainingReads.fq";
- if ( -e $te_Containing_fq ) {
- $fq = $te_Containing_fq;
- }
- my $cmd =
-"perl $scripts/relocaTE_trim.pl $path/blat_output/$fa_name.te_$TE.blatout $fq $len_cutoff $mismatch_allowance > $path/flanking_seq/$fa_name.te_$TE.flankingReads.fq";
- if ($parallel) {
- print OUTSH "$cmd\n";
- close OUTSH;
- chmod 0755, "$shell_dir/*blat.sh";
- }
- else {
- ##run it now
- print "Trimming $fq reads of $TE sequence\n" if !$parallel;
- system($cmd) if !$parallel;
- }
- }
- if ($qsub_array) {
- my $end = $fq_file_count - 1;
- my $job = "$shellscripts/step_3.$TE.blat.sh";
- my $desc = $TE;
- $desc =~ s/\W/_/;
- if ( !@depend ) {
- print QSUBARRAY
-"STEP_3_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\`
-echo \$STEP_3_$desc\n";
- @depend = ( "STEP_3_$desc", "afterokarray" );
- }
- else {
- my ( $last_job, $afterok ) = @depend;
- @depend = ( "STEP_3_$desc", "afterokarray" );
- my $jobName = $depend[0];
- print QSUBARRAY
-"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job`
-echo \$$jobName\n";
-
- }
- print QSUBARRAY3 "sh $shellscripts/step_3/$TE/\$PBS_ARRAYID.$TE.blat.sh";
- }
- ##if a genome file was provided, align seqs to genome
- ##if no genome file was provided, will only blat and trim reads of te seq
- if ($mapping) {
- my $param_path = "$current_dir/$top_dir/$TE";
- my $outregex = "$param_path/regex.txt";
- open OUTREGEX, ">$outregex" or die $!;
- print OUTREGEX "$mate_file_1\t$mate_file_2\t$mate_file_unpaired\t$TSD{$TE}";
- my $cmd =
-"$scripts/relocaTE_align.pl $scripts $param_path $genome_path $outregex $TE $exper $bowtie2";
- if ( !$parallel ) {
- ## run now
- print "Aligning $TE trimmed reads to the reference ($genome_path)\n";
- system($cmd);
- }
- else {
- my $shell_dir = "$shellscripts/step_4/$TE";
- $genome_path =~ /.+\/(.+)\.(fa|fasta)$/;
- my $ref = $1;
-
- #`mkdir -p $shell_dir`;
- make_path( $shell_dir, { mode => 0755 } );
-
- #mkdir $shell_dir;
- open OUTSH, ">$shell_dir/step_4.$ref.$TE.align.sh";
- print OUTSH "$cmd\n";
- print PARALLEL "sh $shell_dir/step_4.$ref.$TE.align.sh\n" if !$qsub_array;
- close OUTSH;
-
- #`chmod +x $shell_dir/step_4.$ref.$TE.align.sh`;
- chmod 0755, "$shell_dir/step_4.$ref.$TE.align.sh";
- if ($qsub_array) {
- my $existing_depend = '';
- if ( $qsub_formatGenome_cmd ) {
- $existing_depend = "-W depend=afterok:\$STEP1" if !@depend;
- $existing_depend = ",depend=afterok:\$STEP1" if @depend;
- }
- my $job = "$shell_dir/step_4.$ref.$TE.align.sh";
- my $desc = $TE;
- $desc =~ s/\W/_/;
- if ( !@depend ) {
- print QSUBARRAY
-"STEP_4_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q $existing_depend $job\`
-echo \$STEP_4_$desc\n";
- @depend = ( "STEP_4_$desc", "afterok" );
- }
- else {
- my ( $last_job, $afterok ) = @depend;
- @depend = ( "STEP_4_$desc", "afterok" );
- my $jobName = $depend[0];
- print QSUBARRAY
-"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job","$existing_depend $job`
-echo \$$jobName\n";
- }
- }
- }
-
- my $genome_count = 0;
- foreach my $seq_id (@genome_seqs) {
- $genome_path =~ /.+\/(.+)\.(fa|fasta)$/;
- my $ref = $1;
- my $merged_bowtie = "$path/bowtie_aln/$ref.$TE.bowtie.out";
- my $cmd =
-"$scripts/relocaTE_insertionFinder.pl $merged_bowtie $seq_id $genome_path $TE $outregex $exper $flanking_seq_len $existing_TE_path $mismatch_allowance $bowtie2";
- if ( !$parallel ) {
- ##run it now
- print "Finding $TE insertions in $seq_id\n";
- system($cmd);
- }
- else {
- my $shell_dir = "$shellscripts/step_5/$TE";
- make_path( $shell_dir, { mode => 0755 } );
- open OUTSH, ">$shell_dir/$genome_count.$TE.findSites.sh";
- print OUTSH "$cmd\n";
- close OUTSH;
- print PARALLEL "sh $shell_dir/$genome_count.$TE.findSites.sh\n"
- if !$qsub_array;
- chmod 0755, "$shell_dir/$genome_count.$TE.findSites.sh";
- }
- $genome_count++;
- }
- if ($qsub_array) {
- my $end = $genome_count - 1;
- my $job = "$shellscripts/step_5.$TE.finder.sh";
- my $existing_depend = '';
- if ($qsub_existingTE_cmd) {
- $existing_depend = "-W depend=afterok:\$EXISTINGTE" if !@depend;
- $existing_depend = ":\$EXISTINGTE" if @depend;
- }
- #if ( $qsub_formatGenome_cmd and $existing_depend eq '' ) {
- # $existing_depend = "-W depend=afterok:\$STEP1" if !@depend;
- #}
- #elsif ( $qsub_formatGenome_cmd and ( $existing_depend ne '' or @depend ) )
- #{
- # $existing_depend .= ":\$STEP1";
- #}
- my $desc = $TE;
- $desc =~ s/\W/_/;
- if ( !@depend ) {
- print QSUBARRAY
-"STEP_5_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $existing_depend $job\`
-echo \$STEP_5_$desc\n";
- @depend = ( "STEP_5_$desc", "afterokarray" );
- }
- else {
- my ( $last_job, $afterok ) = @depend;
- @depend = ( "STEP_5_$desc", "afterokarray" );
- my $jobName = $depend[0];
- print QSUBARRAY
-"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job",
- "$existing_depend $job`
-echo \$$jobName\n";
- }
- print QSUBARRAY4
- "sh $shellscripts/step_5/$TE/\$PBS_ARRAYID.$TE.findSites.sh";
- }
- }
- if ($qsub_array) {
- close QSUBARRAY3;
- close QSUBARRAY4;
- }
-}
-## Finished, clean up, cat files
-##cat all '.te_insertion_sites.table.txt' results into one file
-foreach my $te_path (@te_fastas) {
- my @path = split '/', $te_path;
- my $te_fasta = pop @path;
- my $path = join '/', @path;
- my $TE = $te_fasta;
- $TE =~ s/\.fa//;
- if ($parallel) {
- my $shell_dir = "$shellscripts/step_6/$TE";
- make_path( $shell_dir, { mode => 0755 } );
- open FINISH, ">$shellscripts/step_6/$TE/step_6.$TE.finishing.sh";
- print PARALLEL "sh $shellscripts/step_6/$TE/step_6.$TE.finishing.sh\n"
- if !$qsub_array;
-## toDo: Need to put all of this in a perl script then have the
-## finishing shell script execute that perl script
- print FINISH "
-`mkdir -p $path/results/all_files`
-
-#combine confident insertions to one file
-echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp
-for i in \`ls $path/results/*.$TE.confident_nonref_insert.txt\` ; do grep -v flanking_read_count \$i >> $path/results/temp ; done
-mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt
-mv $path/results/*.$TE.confident_nonref_insert.txt $path/results/all_files
-
-#combine all insertions to one file
-echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2
-for i in \`ls $path/results/*.$TE.all_nonref_insert.txt\` ; do grep -v total \$i | grep -v Note >> $path/results/temp2 ; done
-mv $path/results/temp2 $path/results/$exper.$TE.all_nonref.txt
-mv $path/results/*.$TE.all_nonref_insert.txt $path/results/all_files
-
-#combine confident insertions ref seqs to one file
-for i in \`ls $path/results/*.$TE.confident_nonref_genomeflank.fa\` ; do cat \$i >> $path/results/temp3 ; done
-mv $path/results/temp3 $path/results/$exper.$TE.confident_nonref_genomeflanks.fa
-mv $path/results/*.$TE.confident_nonref_genomeflank.fa $path/results/all_files
-
-#combine confident insertions gff to one file
-echo \"##gff-version 3\" > $path/results/temp4
-for i in \`ls $path/results/*.$TE.all_insert.gff\` ; do grep -v gff \$i >> $path/results/temp4 ; done
-mv $path/results/temp4 $path/results/$exper.$TE.all_inserts.gff
-mv $path/results/*.$TE.all_insert.gff $path/results/all_files
-
-#combine confident insertions reads to one file
-for i in \`ls $path/results/*.$TE.confident_nonref_insert_reads_list.txt\` ; do cat \$i >> $path/results/temp5 ; done
-mv $path/results/temp5 $path/results/$exper.$TE.confident_nonref_reads_list.txt
-mv $path/results/*.$TE.confident_nonref_insert_reads_list.txt $path/results/all_files
-
-";
- `chmod +x $shellscripts/step_6/$TE/step_6.$TE.finishing.sh`;
- }
- if ($qsub_array) {
- my $job = "$shellscripts/step_6/$TE/step_6.$TE.finishing.sh";
- my $desc = $TE;
- $desc =~ s/\W/_/;
- if ( !@depend ) {
- my $jobName = "STEP_6_$desc";
- print QSUBARRAY
- "$jobName=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\`
-echo \$$jobName\n";
- @depend = ( "STEP_6_$desc", "afterok" );
- }
- else {
- my ( $last_job, $afterok ) = ( "STEP_5_$desc", "afterokarray" );
- @depend = ( "STEP_6_$desc", "afterok" );
- my $jobName = $depend[0];
- print QSUBARRAY
-"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job $job`
-echo \$$jobName\n";
- }
- }
- if ( !$parallel and !$qsub_array ) {
- ##do it now
- ##combine and delete individual chr files for confident sites
- print "Finishing and cleaning up\n";
-`echo \"TE\tTSD\tEper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp`;
- my @files = `ls $path/results/*.$TE.confident_nonref_insert.txt`;
- foreach my $file (@files) {
- chomp $file;
- `grep -v flanking_read_count $file >> $path/results/temp`;
- unlink $file;
- }
- `mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt`;
-
- ##combine and delete individual chr files for all sites
-`echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2`;
- @files = `ls $path/results/*.$TE.all_nonref_insert.txt`;
- foreach my $file (@files) {
- chomp $file;
- `grep -v total $file | grep -v Note >> $path/results/temp2`;
- unlink $file;
- }
- `mv $path/results/temp2 $path/results/$exper.$TE.all_nonref.txt`;
-
- ##combine and delete individual chr fasta files
- @files = `ls $path/results/*.$TE.confident_nonref_genomeflank.fa`;
- foreach my $file (@files) {
- chomp $file;
- `cat $file >> $path/results/temp3`;
- unlink $file;
- }
-`mv $path/results/temp3 $path/results/$exper.$TE.confident_nonref_genomeflanks.fa`;
-
- ##combine and delete individual chr gff files
- `echo \"##gff-version 3\" > $path/results/temp4`;
- @files = `ls $path/results/*.$TE.all_insert.gff`;
- foreach my $file (@files) {
- chomp $file;
- `grep -v gff $file >> $path/results/temp4`;
- unlink $file;
- }
- `mv $path/results/temp4 $path/results/$exper.$TE.all_inserts.gff`;
-
- ##combine and delete individual chr reads list
- @files = `ls $path/results/*.$TE.confident_nonref_insert_reads_list.txt`;
- foreach my $file (@files) {
- chomp $file;
- `cat $file >> $path/results/temp5`;
- unlink $file;
- }
-`mv $path/results/temp5 $path/results/$exper.$TE.confident_nonref_reads_list.txt`;
- print "$TE results are found in $path/results\n";
- }
- close FINISH;
-}
-
-if ($qsub_array) {
- close QSUBARRAY;
-## this would happen before IO was finished on the file
- # system ("qsub $qsub_q $current_dir/$top_dir/run_these_jobs.sh");
- print "$current_dir/$top_dir/run_these_jobs.sh was created
--- Run this script, \'sh $current_dir/$top_dir/run_these_jobs.sh\'
--- This script will submit all jobs to the queue in the appropriate order.
--- Be sure to check the error files in $current_dir/$top_dir/shellscripts. They should all be file size 0.
-\n";
-}
-elsif ($parallel) {
-
- #system (sort "$current_dir/$top_dir/run_these_jobs.sh");
- print
- "Run each command line statement in $current_dir/$top_dir/run_these_jobs.sh.
---Run these in order (step_1,step_2,step_3, so on) for each TE.
---For example, all the step_3 scripts for a specific TE should be successfully completed (finished without errors)
-before running a step_4 script of the same TE.
---All scripts of the same step can be run in parallel (at the same time).
-\n";
-}
diff --git a/scripts/relocaTE_insertionFinder.pl b/scripts/relocaTE_insertionFinder.pl
index ce3b46e..8d6381d 100755
--- a/scripts/relocaTE_insertionFinder.pl
+++ b/scripts/relocaTE_insertionFinder.pl
@@ -444,3 +444,4 @@ sub TSD_check {
}
}
}
+