Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

clean up

  • Loading branch information...
commit 6c1cb507fe22ff003012826ba77eee446cbcf711 1 parent 424b8ec
Sofia Robb authored
1  README.md
Source Rendered
@@ -452,3 +452,4 @@ For any of the listed reasons, or anything else, please leave us a <a href="http
452 452
453 453 <a href="https://github.com/srobb1/RelocaTE/issues?page=1&sort=comments&state=open">Leave a message here</a>
454 454
  455 +
35 sample_relocaTE_run/sample_readme.txt
@@ -9,6 +9,7 @@ These scripts expect that the following programs are installed and included in y
9 9 3. BWA
10 10 4. BioPerl
11 11 5. Blat
  12 + 6. Blast: formatdb and fastacmd
12 13
13 14 STEPS A-C
14 15 A. RelocaTE
@@ -32,32 +33,32 @@ A. To find TE insertions in the included fastq reads run RelocaTE:
32 33 or
33 34 sh run_relocaTE.sh
34 35
35   - These scritps will run relocaTE directly or through a series of shell scripts.
36   - If you are running
  36 + These scripts will run relocaTE directly or through a series of shell scripts.
  37 + If you are running
37 38 - run_relocaTE_qsub.sh:
38   - 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
39   - 2) once complete, view the results in 02052012_sample/mping/results
  39 + 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
  40 + 2) once complete, view the results in 02052012_sample/mping/results
40 41 or
41 42 - run_relocaTE_shell.sh
42   - 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
43   - 2) once complete, view the results in 02052012_sample/mping/results
  43 + 1) follow the instructions that are printed to the screen (run run_these_jobs.sh)
  44 + 2) once complete, view the results in 02052012_sample/mping/results
44 45 or
45 46 - run_relocaTE.sh
46   - 1) once complete, view the results in 02052012_sample/mping/results
  47 + 1) once complete, view the results in 02052012_sample/mping/results
47 48
48 49
49   -B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the referenc
50   -e.
  50 +B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the reference.
51 51 A BAM file is included in the sample data set, but one can be genreted by running the included script:
52   - create_bam.sh by:
53   - - changing directory into the directory of this script
54   - - and typing "sh create_bam.sh"
  52 + create_bam.sh by:
  53 + - changing directory into the directory of this script
  54 + - and typing "sh create_bam.sh"
55 55
56 56
57 57 C. To classify the insertions (homozygous, heterozygous, etc) run_characTErizer.sh
58   - - change directory into the directory of this script
59   - - type "sh run_characTErizer.sh"
  58 + - change directory into the directory of this script
  59 + - type "sh run_characTErizer.sh"
60 60 - once complete, view the resulting files in the directory you ran characTErizer.pl
61   - 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions
62   - 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions
63   - 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events
  61 + 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions
  62 + 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions
  63 + 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events
  64 +
1  scripts/characterizer.pl
@@ -310,3 +310,4 @@ sub getHelp {
310 310 }
311 311 }
312 312 }
  313 +
312 scripts/characterizer.pl~
... ... @@ -1,312 +0,0 @@
1   -#!/usr/bin/perl -w
2   -use strict;
3   -use Data::Dumper;
4   -use Cwd;
5   -use Getopt::Long;
6   -
7   -if ( !defined @ARGV ) {
8   - &getHelp;
9   -}
10   -
11   -my $cwd = getcwd();
12   -my $sites_file;
13   -my $bam_dir;
14   -my $genome_fasta;
15   -## can be a single .bam file or a direcotory containing .bam files
16   -my @bam_files;
17   -my $excision = 0;
18   -
19   -GetOptions(
20   - 's|sites_file:s' => \$sites_file,
21   - 'b|bam_dir:s' => \$bam_dir,
22   - 'g|genome_fasta:s' => \$genome_fasta,
23   - 'x|excision:i' => \$excision,
24   -);
25   -
26   -sub getHelp {
27   - print '
28   -usage:
29   -./characterizer.pl [-s relocaTE table output file][-b bam file or dir of bam files][-g reference genome fasta file][-h]
30   -
31   -options:
32   --s file relocaTE output file: YOURSAMPLENAME.mping.all_nonref.txt [no default]
33   --b dir/file bam file of the orginal reads aligned to reference (before TE was trimmed) or directory of bam files [no default]
34   --g file reference genome fasta file [no default]
35   --x int find excision events that leave a footprint, yes or no(1|0) [0]
36   --h this help message
37   -
38   -For more information see documentation: http://srobb1.github.com/RelocaTE/
39   -
40   -';
41   - exit 1;
42   -}
43   -
44   -if ( -d $bam_dir ) {
45   -
46   - #remove trailing '/'
47   - $bam_dir =~ s/\/$//;
48   - @bam_files = <$bam_dir/*bam>;
49   -}
50   -elsif ( -f $bam_dir or -l $bam_dir ) {
51   - push @bam_files, $bam_dir;
52   -}
53   -open INSITES, "$sites_file" or die "cannot open $sites_file $!\n";
54   -my @dir_path = split '/', $sites_file;
55   -my $filename = pop @dir_path;
56   -my $file = $filename;
57   -$file =~ s/\..+?$//;
58   -$cwd =~ s/\/$//; #remove trailing /
59   -open OUTGFF, ">$cwd/$file.inserts_characTErized.gff";
60   -open OUT, ">$cwd/$file.inserts_characTErized.txt";
61   -print OUT
62   - "strain\tTE\tTSD\tchromosome.pos\tstrand\tavg_flankers\tspanners\tstatus\n";
63   -print OUTGFF "##gff-version 3\n";
64   -my %matches;
65   -my %TSDs;
66   -my %toPrint;
67   -
68   -while ( my $line = <INSITES> ) {
69   - next if $line =~ /TE.TSD.Exper.chromosome.insertion_site/;
70   - next if $line =~ /^\s*$/;
71   - chomp $line;
72   -
73   - # mping TTA A119 Chr1 1446..1448 + T:1 R:0 L:1
74   - my (
75   - $te, $TSD, $exp, $chromosome, $coor,
76   - $TE_orient, $total_string, $right_string, $left_string
77   - ) = split /\t/, $line;
78   - my ($pos) = $coor =~ /\d+\.\.(\d+)/;
79   - $TSDs{$chromosome}{$pos} = $TSD;
80   - my ($total_count) = $total_string =~ /T:(\d+)/;
81   - my ($left_count) = $left_string =~ /L:(\d+)/;
82   - my ($right_count) = $right_string =~ /R:(\d+)/;
83   -
84   - my $Mmatch = 0;
85   - my $cigar_all;
86   - if ( $left_count >= 1 and $right_count >= 1 and $total_count >= 2 ) {
87   - my @sam_all;
88   - foreach my $bam_file (@bam_files) {
89   - ## get any alignments that overlap the insertion site
90   - my @sam_out = `samtools view $bam_file \'$chromosome:$pos-$pos\'`;
91   - push @sam_all, @sam_out;
92   - }
93   -
94   - #remove redundant lines in sam file
95   - my %sorted_sam;
96   - my $order;
97   - foreach my $line (@sam_all) {
98   - $order++;
99   - if ( !exists $sorted_sam{$line} ) {
100   - $sorted_sam{$line} = $order;
101   - }
102   - }
103   -
104   - #make new sorted sam array by sorting on the value of the sort hash
105   - my @sorted_sam =
106   - sort { $sorted_sam{$a} <=> $sorted_sam{$b} } keys %sorted_sam;
107   -
108   - foreach my $sam_line (@sorted_sam) {
109   - chomp $sam_line;
110   - my @sam_line = split /\t/, $sam_line;
111   - my $cigar = $sam_line[5];
112   - my $flag = $sam_line[1];
113   - my $seqLen = length $sam_line[9];
114   - my $start = $sam_line[3];
115   - my $end = $start + $seqLen - 1;
116   - next unless $end >= $pos + 5;
117   - next unless $start <= $pos - 5;
118   - ## must be a all M match no soft clipping
119   - if ( $cigar =~ /^\d+M$/ ) {
120   - my ($NM) = $sam_line =~ /NM:i:(\d+)/; ## edit distance used
121   - ## bwa specific: mismatch count, often the same as NM, have not seen a XM>0 and NM==0
122   - my ($XM) = $sam_line =~ /XM:i:(\d+)/;
123   - if ( defined $XM and $XM == 0 ) {
124   - $Mmatch++;
125   - }
126   - elsif ( defined $NM and $NM == 0 ) {
127   - $Mmatch++;
128   - }
129   - elsif ( !defined $NM and !defined $XM ) {
130   - $Mmatch++;
131   - }
132   - }
133   - elsif ( $cigar =~ /[IND]/ ) {
134   - $matches{"$chromosome.$pos"}{sam}{$sam_line} = 1;
135   - }
136   - }
137   - my $spanners = $Mmatch;
138   - my $average_flankers = $total_count / 2;
139   - my $status = 0;
140   -
141   - if ( $spanners == 0 ) {
142   - $status = 'homozygous';
143   - }
144   - elsif ( $average_flankers >= 5 and $spanners < 5 ) {
145   - $status = 'homozygous/excision_no_footprint';
146   - }
147   - elsif ( $spanners < ( $average_flankers * .2 ) and $spanners <= 10 ) {
148   - $status = 'homozygous/excision_no_footprint';
149   - }
150   - elsif ( $average_flankers <= 2 and $spanners > 10 ) {
151   - $status = 'new_insertion';
152   - }
153   - elsif ( abs( $average_flankers - $spanners ) <= 5 ) {
154   - $status = 'heterozygous';
155   - }
156   - elsif (
157   - abs( $average_flankers - $spanners ) -
158   - ( ( $average_flankers + $spanners ) / 2 ) <= 10 )
159   - {
160   - $status = 'heterozygous?';
161   - }
162   - elsif ( $average_flankers > 10 and $spanners > 10 ) {
163   - $status = 'heterozygous';
164   - }
165   - elsif (
166   - (
167   - ( $spanners - $average_flankers ) >
168   - ( $spanners + $average_flankers ) / 2
169   - )
170   - and ( $average_flankers <= 10 )
171   - )
172   - {
173   - $status = 'new_insertion';
174   - }
175   - else {
176   - $status = 'other';
177   - }
178   -
179   - $matches{"$chromosome.$pos"}{status} = $status;
180   - $toPrint{$chromosome}{$pos}{$TSD}{TE} = $te;
181   - $toPrint{$chromosome}{$pos}{$TSD}{flank} = $average_flankers;
182   - $toPrint{$chromosome}{$pos}{$TSD}{span} = $spanners;
183   - $toPrint{$chromosome}{$pos}{$TSD}{status} = $status;
184   - $toPrint{$chromosome}{$pos}{$TSD}{strain} = $exp;
185   - $toPrint{$chromosome}{$pos}{$TSD}{coor} = $coor;
186   - $toPrint{$chromosome}{$pos}{$TSD}{TE_orient} = $TE_orient;
187   - }
188   -}
189   -
190   -if ($excision) {
191   -
192   -##generate vcf of spanners looking for excision events
193   - my @unlink_files;
194   - my @vcfs;
195   - foreach my $pos ( keys %matches ) {
196   - my ( $target, $loc ) = split /\./, $pos;
197   - next unless exists $toPrint{$target}{$loc};
198   - my $range = "$target:$pos";
199   - my $sam = "$cwd/$pos.sam";
200   - my $bam = "$cwd/$pos.bam";
201   - my $sorted_bam = "$cwd/$pos.sorted";
202   - my @sam_lines = keys %{ $matches{$pos}{sam} };
203   - if ( @sam_lines > 1 ) {
204   - my $pos_sam = join "\n", @sam_lines;
205   - open POSSAM, ">$sam";
206   - print POSSAM $pos_sam;
207   - `samtools view -bT $genome_fasta $sam > $bam`;
208   - `samtools sort $bam $sorted_bam`;
209   - `samtools index $sorted_bam.bam`;
210   -
211   -`samtools mpileup -C50 -ugf $genome_fasta -r $range $sorted_bam.bam | bcftools view -bvcg - > $cwd/$pos.var.raw.bcf`;
212   -`bcftools view $cwd/$pos.var.raw.bcf | vcfutils.pl varFilter -D 100 > $cwd/$pos.var.flt.vcf`;
213   - push @vcfs, "$cwd/$pos.var.flt.vcf";
214   - push @unlink_files, $sam, $bam, "$sorted_bam.bam.bai", "$sorted_bam.bam",
215   - "$cwd/$pos.var.raw.bcf", "$cwd/$pos.var.flt.vcf";
216   - close POSSAM;
217   - }
218   - }
219   -##Chr1 16327633 . GAGTACTACAATTAGTA GAGTA 1930.0% . INDEL;DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=29;FQ=-40.5 GT:PL:GQ 1/1:58,6,0:10
220   - open EXCISION, ">>$cwd/excisions_with_footprint.vcfinfo"
221   - or die "can't open $cwd/excisions_with_footprint.vcfinfo for writing $!";
222   - foreach my $vcf (@vcfs) {
223   - ##Chr2.30902247.var.flt.vcf
224   - my @path = split /\//, $vcf;
225   - my $file = pop @path;
226   - my ( $insert_ref, $insert_pos ) = $file =~ /(.+)\.(\d+)\.var\.flt\.vcf/;
227   - my $TSD = $TSDs{$insert_ref}{$insert_pos};
228   - my $TSD_len = length $TSD;
229   - open VCF, $vcf;
230   - while ( my $line = <VCF> ) {
231   - next unless $line !~ /^#/;
232   - chomp $line;
233   - my ( $ref, $first_base, $col_3, $ref_seq, $strain_seq ) = split /\t/,
234   - $line;
235   - my $aln_start = $first_base - $insert_pos - 1;
236   - my $aln_end_ref = $first_base - length($ref_seq) - $insert_pos - 1;
237   - my $aln_end_strain = $first_base - length($strain_seq) - $insert_pos - 1;
238   - my $aln_end_ref_near_insert_pos = 0;
239   - my $aln_end_strain_near_insert_pos = 0;
240   - my $aln_start_near_insert_pos = 0;
241   - my $insert_bwt_ends = 0;
242   - my $all_values_after_insertion = 0;
243   -
244   - if ( ( $aln_start <= $TSD_len + 1 )
245   - and ( ( $aln_start * -1 <= $TSD_len + 1 ) ) )
246   - {
247   - $aln_start_near_insert_pos = 1;
248   - }
249   - if ( ( $aln_end_ref <= $TSD_len + 1 )
250   - and ( ( $aln_end_ref * -1 <= $TSD_len + 1 ) ) )
251   - {
252   - $aln_end_ref_near_insert_pos = 1;
253   - }
254   - if ( ( $aln_end_strain <= $TSD_len + 1 )
255   - and ( ( $aln_end_strain * -1 <= $TSD_len + 1 ) ) )
256   - {
257   - $aln_end_strain_near_insert_pos = 1;
258   - }
259   - my ( $end_ref, $end_strain ) = (
260   - ( $first_base + length($ref_seq) - 1 ),
261   - ( $first_base + length($strain_seq) + 1 )
262   - );
263   - if ( ( $end_ref < $insert_pos and $end_strain > $insert_pos )
264   - or ( $end_strain < $insert_pos and $end_ref > $insert_pos ) )
265   - {
266   - $insert_bwt_ends = 1;
267   - }
268   - if ( ( ( $first_base - $TSD_len + 1 ) > $insert_pos )
269   - and ( $end_ref > $insert_pos )
270   - and ( $end_strain > $insert_pos ) )
271   - {
272   - $all_values_after_insertion = 1;
273   - }
274   - ## if the alignment end in ref or in strain is close to the insertion postion,
275   - ## or if one is before and one is after the insertion postion,
276   - ## it is a potential excision with footprint
277   - if ( ( $aln_end_ref_near_insert_pos or $aln_end_strain_near_insert_pos )
278   - or ($insert_bwt_ends) )
279   - {
280   - ##make sure all values are not after the insertion
281   - if ( !$all_values_after_insertion ) {
282   - print EXCISION "$insert_ref.$insert_pos\t$line\n";
283   - my $status = $toPrint{$insert_ref}{$insert_pos}{$TSD}{status};
284   - ##only append if it already isnt there
285   - $toPrint{$insert_ref}{$insert_pos}{$TSD}{status} =
286   - $status . "/excision_with_footprint"
287   - if $status !~ /\/excision_with_footprint/;
288   - }
289   - }
290   - }
291   - }
292   - unlink @unlink_files;
293   -}
294   -
295   -foreach my $chr ( sort keys %toPrint ) {
296   - foreach my $pos ( sort { $a <=> $b } keys %{ $toPrint{$chr} } ) {
297   - foreach my $tsd ( sort keys %{ $toPrint{$chr}{$pos} } ) {
298   - my $TE = $toPrint{$chr}{$pos}{$tsd}{TE};
299   - my $flankers = $toPrint{$chr}{$pos}{$tsd}{flank};
300   - my $spanners = $toPrint{$chr}{$pos}{$tsd}{span};
301   - my $status = $toPrint{$chr}{$pos}{$tsd}{status};
302   - my $strain = $toPrint{$chr}{$pos}{$tsd}{strain};
303   - my $coor = $toPrint{$chr}{$pos}{$tsd}{coor};
304   - my $TE_orient = $toPrint{$chr}{$pos}{$tsd}{TE_orient};
305   - my ($start) = $coor =~ /(\d+)\.\.\d+/;
306   - print OUT
307   -"$strain\t$TE\t$tsd\t$chr:$coor\t$TE_orient\t$flankers\t$spanners\t$status\n";
308   - print OUTGFF
309   -"$chr\t$strain\ttransposable_element_attribute\t$start\t$pos\t$TE_orient\t.\t.\tID=$chr.$pos.spanners;avg_flankers=$flankers;spanners=$spanners;type=$status;TE=$TE;TSD=$tsd\n";
310   - }
311   - }
312   -}
4 scripts/relocaTE.pl
@@ -147,8 +147,8 @@
147 147 }
148 148 else {
149 149 my $fq_path = File::Spec->rel2abs($fq_dir);
150   - @fq_files = <$fq_path/*fq>;
151   - my @fastq_files = <$fq_path/*fastq>;
  150 + @fq_files = <$fq_path/*.fq>;
  151 + my @fastq_files = <$fq_path/*.fastq>;
152 152 push @fq_files, @fastq_files;
153 153 if ( scalar @fq_files == 0 ) {
154 154 print "Must provide at least 1 short read file\n";
862 scripts/relocaTE.pl~
... ... @@ -1,862 +0,0 @@
1   -#!/usr/bin/perl -w
2   -use File::Spec;
3   -use Getopt::Long;
4   -use Cwd;
5   -use FindBin qw($RealBin);
6   -use File::Path qw(make_path);
7   -use strict;
8   -
9   -my $scripts = $RealBin;
10   -
11   -if ( !defined @ARGV ) {
12   - &getHelp();
13   -}
14   -my $genome_fasta = 'NONE';
15   -my $te_fasta;
16   -my $target = 'NONE';
17   -my $len_cutoff = 10;
18   -my $mismatch_allowance = 0;
19   -my $fq_dir;
20   -my $exper = 'not.given';
21   -my $mate_file_1 = '_p1';
22   -my $mate_file_2 = '_p2';
23   -my $mate_file_unpaired = '.unPaired';
24   -my $workingdir;
25   -my $outdir = 'outdir_teSearch';
26   -my $parallel = 0;
27   -my $qsub_array = 0;
28   -my $qsub_q;
29   -my ( $blat_minScore, $blat_tileSize ) = ( 10, 7 );
30   -my $flanking_seq_len = 100;
31   -my $existing_TE = 'NONE';
32   -my $bowtie2 = 0;
33   -GetOptions(
34   - 'p|parallel:i' => \$parallel,
35   - 'a|qsub_array:i' => \$qsub_array,
36   - 'q|qsub_q:s' => \$qsub_q,
37   - 'e|exper:s' => \$exper,
38   - 'w|workingdir:s' => \$workingdir,
39   - 'o|outdir:s' => \$outdir,
40   - 'd|fq_dir:s' => \$fq_dir,
41   - 'g|genome_fasta:s' => \$genome_fasta,
42   - 't|te_fasta:s' => \$te_fasta,
43   - 'l|len_cutoff:i' => \$len_cutoff,
44   - 'm|mismatch:f' => \$mismatch_allowance,
45   - '1|mate_1_id:s' => \$mate_file_1,
46   - '2|mate_2_id:s' => \$mate_file_2,
47   - 'u|unpaired_id:s' => \$mate_file_unpaired,
48   - 'bm|blat_minScore:i' => \$blat_minScore,
49   - 'bt|blat_tileSize:i' => \$blat_tileSize,
50   - 'f|flanking_seq_len:i' => \$flanking_seq_len,
51   - '-r|reference_ins:s' => \$existing_TE,
52   -## '-b2|bowtie2:i' => \$bowtie2,
53   - 'h|help' => \&getHelp,
54   -);
55   -my $current_dir;
56   -
57   -$parallel = 1 if $qsub_array == 1;
58   -if ( defined $qsub_q ) {
59   - $qsub_q = "-q $qsub_q";
60   -}
61   -else {
62   - $qsub_q = '';
63   -}
64   -
65   -if ( defined $workingdir and -d $workingdir ) {
66   - $current_dir = File::Spec->rel2abs($workingdir);
67   - $current_dir =~ s/\/$//;
68   -}
69   -else {
70   - $current_dir = cwd();
71   -}
72   -my $mapping = 1;
73   -
74   -if ( !defined $genome_fasta ) {
75   - print "\n\nPlease provide reference genome by using -g Genome fasta path\n";
76   - die "\nuse -h option to get help\n";
77   -}
78   -elsif ( $genome_fasta eq 'NONE' ) {
79   - print
80   -"A reference genome fasta was NOT provided. Proceeding without a reference will result in only the reads containing the TE being identified, no mapping of insertions will be performed\n";
81   - print "Proceed without mapping? (y|n) \n";
82   - my $answer = <STDIN>;
83   - if ( $answer =~ /n/i ) {
84   - &getHelp();
85   - }
86   - elsif ( $answer =~ /y/i ) {
87   - $mapping = 0;
88   - }
89   - print "Great, proceeding without aligning to a reference genome.\n";
90   -}
91   -elsif ( !-e $genome_fasta ) {
92   - print "$genome_fasta does not exist. Check file name.\n";
93   - die "\nuse -h option to get help\n";
94   -}
95   -my $genome_path;
96   -if ( -e $genome_fasta ) {
97   - $genome_path = File::Spec->rel2abs($genome_fasta);
98   -}
99   -if ( !defined $te_fasta ) {
100   - print
101   -"\nPlease provide fasta file containing transposable elements by using -t TE fasta path
102   -
103   -SAMPLE TE FASTA:
104   ->mping TSD=TTA
105   -GGCCAGTCACAATGGGGGTTTCACTGGTGTGTCATGCACATTTAATAGGGGTAAGACTGAATAAAAAATG
106   -ATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGGTGAAACTCGTCAGCGTCGTT
107   -TCCAAGTCCTCGGTAACAGAGTGAAACCCCCGTTGAGGCCGATTCGTTTCATTCACCGGATCTCTTGCGT
108   -CCGCCTCCGCCGTGCGACCTCCGCATTCTCCCGCGCCGCGCCGGATTTTGGGTACAAATGATCCCAGCAA
109   -CTTGTATCAATTAAATGCTTTGCTTAGTCTTGGAAACGTCAAAGTGAAACCCCTCCACTGTGGGGATTGT
110   -TTCATAAAAGATTTCATTTGAGAGAAGATGGTATAATATTTTGGGTAGCCGTGCAATGACACTAGCCATT
111   -GTGACTGGCC
112   -
113   -FASTA header must contain \"TSD=\", can be a Perl regular expression.
114   - Example: these exact characters TTA: TSD=TTA
115   - Example: any 4 characters: TSD=....
116   - Example: A or T followed by GCC: TSD=(A|T)GCC
117   - Example: CGA followed by any character then an A then CT or G: TSD=CGA.A(CT|G)
118   -\n";
119   - die "\nuse -h option to get help\n";
120   -}
121   -elsif ( !-e $te_fasta ) {
122   - print "$te_fasta does not exist. Check file name.\n";
123   - die "\nuse -h option to get help\n";
124   -}
125   -else {
126   - open INFILE, $te_fasta or die "Can't open $te_fasta\n";
127   - my $first_line = <INFILE>;
128   - close INFILE;
129   - if ( $first_line !~ /^>\S+\s+TSD=\S+/ ) {
130   - die
131   -"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n";
132   - }
133   -}
134   -my @fq_files;
135   -my %fq_files;
136   -if ( !defined $fq_dir ) {
137   - print "\n\nPlease provide a directory of paired fastq files\n";
138   - die "\nuse -h option to get help\n";
139   -}
140   -elsif ( $fq_dir eq 'SKIP' ) {
141   - ##skip all other steps for processing the raw fq files
142   -}
143   -elsif ( !-d $fq_dir ) {
144   - print
145   -"\n\nCheck the spelling or location of $fq_dir, Please provide a directory of paired fastq files\n";
146   - die "\nuse -h option to get help\n";
147   -}
148   -else {
149   - my $fq_path = File::Spec->rel2abs($fq_dir);
150   - @fq_files = <$fq_path/*fq>;
151   - my @fastq_files = <$fq_path/*fastq>;
152   - push @fq_files, @fastq_files;
153   - if ( scalar @fq_files == 0 ) {
154   - print "Must provide at least 1 short read file\n";
155   - die "\nuse -h option to get help\n";
156   - }
157   -}
158   -my $existing_TE_path = 'NONE';
159   -my $existing_blat = 0;
160   -if ( $existing_TE ne 'NONE' ) {
161   - if ( $existing_TE eq '1' ) {
162   - ##run blat
163   - $existing_blat = 1;
164   - }
165   - elsif ( !-e $existing_TE ) {
166   - print "$existing_TE does not exist\n";
167   - print "Please use -r 1 or provide a file that exists\n";
168   - die "\nuse -h option to get help\n";
169   - }
170   - else {
171   - $existing_TE_path = File::Spec->rel2abs($existing_TE);
172   - open INFILE, $existing_TE or die "Can't open $existing_TE\n";
173   - my $first_line = <INFILE>;
174   - close INFILE;
175   - if ( $first_line !~ /\S+\t\S+:\d+\.\.\d+/ ) {
176   - print "The existing_TE file is not in the appropriate format:
177   -
178   -SAMPLE Reference TEs (the two columns are tab-delimited):
179   -mping Chr12:839604..840033
180   -mping Chr11:23200534..23200105
181   -
182   -TE_name<tab>ref_seqname:first_Base_Of_TIR1..Last_base_of_TIR2
183   -
184   -or (recommended) use \'-r 1\' for RelocaTE to find your TE in the reference
185   - ";
186   - die "\nuse -h option to get help\n";
187   - }
188   - }
189   -}
190   -
191   -sub getHelp {
192   - print '
193   -usage:
194   -./relocaTE.pl [-t TE_fasta_file][-g chromosome_genome_fasta][-d dir_of_fq][-e short_sample_name][-h]
195   -
196   -options:
197   -
198   -**required:
199   --t |--te_fasta file fasta containing nucleotide sequences of transposable elements with
200   - TSD=xxx in the desc. [no default]
201   --d |--fq_dir dir directory of paired and unpaired fastq files (paired _p1.fq & _p2.fq)
202   - (.fq or .fastq is acceptable) [no default]
203   -
204   -**recommended:
205   --g |--genome_fasta file genome (reference) fasta file path. If not provided will only align
206   - reads to TE and remove TE seq from short reads. [no default]
207   --e |--exper STR Short sample name, will be used in the output files to create IDs for
208   - the insert (ex. A123) [not.given]
209   --o |--outdir STR name for directory to contain output directories and files, will be
210   - created for the run (ex. 04222012_A123) [outdir_teSearch]
211   -
212   -**optional:
213   --p |--parallel INT Break down the single big job of relocaTE into as many smaller jobs as
214   - possible. The alternative (0) would be to run one after the other
215   - (int, 0=false or 1=true) [0]
216   --q |--qsub_q STR same as qsub -q option, not required [no default]
217   --a |--qsus_array INT if \'-a 1\' , create qsub PBS array jobs to run the many shell scripts
218   - created in the \'-a 1\' option. (see: man qsub option -t).(
219   - 0=false or 1=true) [0]
220   --w |--workingdir dir base working directory, needs to exist, will not be creates, full path
221   - required [cwd]
222   --l |--len INT len cutoff for the TE trimmed reads to be aligned [10]
223   --m |--mismatch FRACTION mismatch allowance for alignment to TE (ex 0.1) [0]
224   --1 |--mate_1_id STR string to uniquely identify mate 1 paired files ex: file_p1.fq [_p1]
225   --2 |--mate_2_id STR pattern to uniquely identify mate 2 paired files ex: file_p2.fq [_p2]
226   --u |--unpaired_id STR pattern to uniquely identify unpaired files ex: file.unPaired.fq [.unPaired]
227   --bm|--blat_minScore INT blat minScore value, used by blat in the comparison of reads to TE sequence [10]
228   --bt|--blat_tileSize INT blat tileSize value, used by blat in the comparison of reads to TE sequence [7]
229   --f |--flanking_seq INT length of the sequence flanking the found insertion to be returned. This
230   - sequence is taken from the reference genome [100]
231   --r |--reference_ins STR To identify reference and shared insertions (reference and reads)
232   - choose option-1 or option-2.
233   - option-1) (recommended) use \'-r 1\' to have RelocaTE find the location of your TE in the
234   - reference.
235   - option-2) input the file name of a tab-delimited file containing the coordinates
236   - of TE insertions pre-existing in the reference sequence. [no default]
237   --h |--help this message
238   -
239   -
240   -See documentation for more information. http://srobb1.github.com/RelocaTE/
241   -
242   -';
243   -## use in V2
244   -## -b2 |--bowtie2 INT to use bowtie2 use \'-b2 1\' else for bowtie use \'-b2 0\' [0]
245   - exit 1;
246   -}
247   -if ( $outdir eq '' or $outdir =~ /^\s+$/ or !defined $outdir ) {
248   - die "your -o option has an incorrect value, it needs to be something
249   -\nuse -h option to get help\n";
250   -}
251   -else {
252   - $outdir =~ s/\/$//;
253   -}
254   -my $te_path = File::Spec->rel2abs($te_fasta);
255   -my @outdir = split /\//, $outdir;
256   -$outdir = pop @outdir;
257   -my $top_dir = $outdir;
258   -my @depend;
259   -my $shellscripts = "$current_dir/$top_dir/shellscripts";
260   -if ($qsub_array) {
261   - mkdir "$current_dir/$top_dir";
262   - mkdir "$shellscripts";
263   - open QSUBARRAY, ">$current_dir/$top_dir/run_these_jobs.sh"
264   - or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n";
265   -}
266   -elsif ($parallel) {
267   - mkdir "$current_dir/$top_dir";
268   - mkdir "$shellscripts";
269   - open PARALLEL, ">$current_dir/$top_dir/run_these_jobs.sh"
270   - or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n";
271   -}
272   -else {
273   - mkdir "$current_dir/$top_dir";
274   -}
275   -my $qsub_formatGenome_cmd = 0;
276   -## get names of each ref sequecne
277   -my @genome_seqs;
278   -if ( $mapping > 0 ) {
279   - open( INFASTA, "$genome_path" ) || die "$!\n";
280   - while ( my $line = <INFASTA> ) {
281   - next unless $line =~ /^>(\S+)/;
282   - if ( $line =~ /^>(\S+)/ ) {
283   - my $id = $1;
284   - if ( $id =~ /\|/ ) {
285   - $id =~ s/\|/_/g;
286   - }
287   - push @genome_seqs, $id;
288   - }
289   - else {
290   - die "Your genome FASTA file is in a unexpected format.
291   ->SEQNAME
292   -SEQUENCE
293   ->SEQNAME2
294   -SEQUENCE2\n";
295   - }
296   - }
297   - close(INFASTA);
298   -
299   - #create bowtie index
300   - my $cmd;
301   - if ( !$bowtie2 and !-e "$genome_path.bowtie_build_index.1.ebwt" ) {
302   - $cmd =
303   -"bowtie-build -f $genome_path $genome_path.bowtie_build_index 12> $current_dir/$top_dir/bowtie-build.out";
304   - $qsub_formatGenome_cmd = 1;
305   - }
306   - elsif ( $bowtie2 and !-e "$genome_path.bowtie2_build_index.1.bt2" ) {
307   - $cmd =
308   -"bowtie2-build -f $genome_path $genome_path.bowtie2_build_index 12> $current_dir/$top_dir/bowtie-build2.out";
309   - $qsub_formatGenome_cmd = 1;
310   - }
311   - my $ref = 'ref';
312   - if ( $genome_path =~ /(?:.+\/)?(.+)\.(fa|fasta)$/ ) {
313   - $ref = $1;
314   - }
315   - if ( $parallel and defined $cmd ) {
316   - my $shell_dir = "$shellscripts/step_1";
317   - mkdir $shell_dir;
318   - open OUTSH, ">$shell_dir/step_1.$ref.formatGenome.sh";
319   - print OUTSH "$cmd\n";
320   - close OUTSH;
321   - chmod 0755, "$shell_dir/step_1.$ref.formatGenome.sh";
322   - print PARALLEL "sh $shell_dir/step_1.$ref.formatGenome.sh\n"
323   - if !$qsub_array;
324   - }
325   - elsif ( $parallel and !defined $cmd ) {
326   - my $step1_file =
327   - "$shellscripts/step_1_not_needed_genome_fasta_already_formatted";
328   - my $shell_dir = "$shellscripts";
329   - mkdir $shell_dir;
330   - if ($parallel) {
331   - open STEP1, ">$step1_file" or die "Can't Open $step1_file\n";
332   - print STEP1 '';
333   - close STEP1;
334   - }
335   - }
336   - elsif ( defined $cmd ) {
337   - ##run it now
338   - print "Formatting the reference genome: $genome_path\n";
339   - system($cmd);
340   - }
341   - if ($qsub_array) {
342   - if ( !-e "$shellscripts/step_1_not_needed_genome_fasta_already_formatted" )
343   - {
344   - my $job = "$shellscripts/step_1/step_1.$ref.formatGenome.sh";
345   - print QSUBARRAY
346   - "STEP1=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\`
347   -echo \$STEP1\n";
348   -
349   - }
350   - }
351   -} ##end if($mapping)
352   -
353   -##run existing TE blat against ref if the file does not exsit
354   -my $qsub_existingTE_cmd = 0;
355   -my $existing_blat_cmd =
356   -"blat $genome_path $te_path $current_dir/$top_dir/existingTE.blatout 1> $current_dir/$top_dir/existingTE.blat.stdout";
357   -if ($existing_blat) {
358   - ##if running blat set existing_TE_path to blatout
359   - $existing_TE_path = "$current_dir/$top_dir/existingTE.blatout";
360   - if ( $parallel
361   - and !-e "$current_dir/$top_dir/existingTE.blatout" )
362   - {
363   - my $shell_dir = "$shellscripts";
364   - if ( !-d $shell_dir ) {
365   - mkdir $shell_dir;
366   - }
367   - open OUTSH, ">$shell_dir/step_0.existingTE_blat.sh"
368   - or die "Can't open $shell_dir/step_0.existingTE_blat.sh for writing $!\n";
369   - print PARALLEL "sh $shell_dir/step_0.existingTE_blat.sh\n" if !$qsub_array;
370   - print OUTSH "$existing_blat_cmd\n";
371   - if ($qsub_array) {
372   - $qsub_existingTE_cmd = 1;
373   - print QSUBARRAY
374   -"EXISTINGTE=`qsub -e $shellscripts -o $shellscripts $qsub_q $shellscripts/step_0.existingTE_blat.sh`
375   -echo \$EXISTINGTE\n";
376   - }
377   - close OUTSH;
378   - }
379   - elsif ( !-e "$current_dir/$top_dir/existingTE.blatout" ) {
380   - ## do it now
381   - print "finding TEs ($te_path) in the reference genome ($genome_path)\n";
382   - system($existing_blat_cmd);
383   - }
384   -}
385   -
386   -my @fq;
387   -my @fa;
388   -
389   -#convert fq files to fa for blat
390   -open QSUBARRAY2, ">$shellscripts/step_2.fq2fa.sh"
391   - if $qsub_array;
392   -my $fq_count = 0;
393   -if ( $fq_dir ne 'SKIP' ) {
394   - foreach my $fq (@fq_files) {
395   - my $fq_path = File::Spec->rel2abs($fq);
396   - push @fq, $fq_path;
397   - my $fa = $fq;
398   - if ( $fa =~ s/\.(fq|fastq)$/.fa/ ) {
399   - push @fa, $fa;
400   - if ( !-e $fa ) {
401   - my $cmd = "$scripts/relocaTE_fq2fa.pl $fq_path $fa";
402   - if ($parallel) {
403   - my @fq_path = split '/', $fq_path;
404   - my $fq_name = pop @fq_path;
405   - my $shell_dir = "$shellscripts/step_2";
406   -
407   - mkdir $shell_dir;
408   - my $outsh = "$shell_dir/$fq_count." . "fq2fa.sh";
409   - open OUTSH, ">$outsh";
410   - print PARALLEL "sh $outsh\n" if !$qsub_array;
411   - print OUTSH "$cmd\n";
412   - }
413   - else {
414   - ##run it now
415   - print "Converting $fq_path to fasta for blat\n";
416   - system($cmd);
417   - }
418   - }
419   - else {
420   - my $shell_dir = "$shellscripts";
421   -
422   - mkdir $shell_dir;
423   - my $step2_file =
424   - "$shellscripts/step_2_not_needed_fq_already_converted_2_fa";
425   -
426   - if ($parallel) {
427   - open STEP2, ">$step2_file" or die "Can't Open $step2_file\n";
428   - print STEP2 '';
429   - close STEP2;
430   - }
431   - }
432   - }
433   - else {
434   - print
435   -"$fq does not seem to be a fastq based on the file extension. It should be fq or fastq\n";
436   - &getHelp();
437   - }
438   - $fq_count++;
439   - }
440   - if ( !-e "$shellscripts/step_2_not_needed_fq_already_converted_2_fa"
441   - and $qsub_array )
442   - {
443   - my $end = $fq_count - 1;
444   - my $job = "$shellscripts/step_2.fq2fa.sh";
445   - if ( !@depend ) {
446   - print QSUBARRAY
447   - "STEP2=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\`
448   -echo \$STEP2\n";
449   - @depend = ( "STEP2", "afterokarray" );
450   - }
451   - else {
452   - my ( $last_job, $afterok ) = @depend;
453   - @depend = ( "STEP2", "afterokarray" );
454   - my $jobName = $depend[0];
455   - print QSUBARRAY
456   -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job`
457   -echo \$$jobName\n";
458   - }
459   - print QSUBARRAY2 "sh $shellscripts/step_2/\$PBS_ARRAYID.fq2fa.sh";
460   - }
461   - elsif ($qsub_array) {
462   - unlink "$shellscripts/step_2.fq2fa.sh";
463   - }
464   -} ##end if $fq_dir ne 'SKIP'
465   -close QSUBARRAY2;
466   -
467   -##split the TE fasta of many seqs into individual files
468   -my @te_fastas;
469   -my %TSD;
470   -open( INFASTA, "$te_fasta" ) || die "$!\n";
471   -my $i = 0;
472   -while ( my $line = <INFASTA> ) {
473   - if ( $line =~ /^>(\S+)\s+TSD=(\S+)/ ) {
474   - my $id = $1;
475   - $TSD{$id} = $2;
476   - if ( $i > 0 ) {
477   - close(OUTFASTA);
478   - $i = 0;
479   - }
480   - my $te_file = "$id.fa";
481   - $te_file =~ s/\|/_/g;
482   - ##create new dir for files: workingDir/outdir/TE/
483   - my $te_dir = "$current_dir/$top_dir/$id";
484   - push @te_fastas, "$te_dir/$te_file";
485   -
486   - mkdir $te_dir;
487   - open( OUTFASTA, ">$te_dir/$te_file" ) or die "$!\n";
488   - print OUTFASTA $line;
489   - $i++;
490   - }
491   - elsif ( $line =~ /^>/ and $line !~ /TSD=/ ) {
492   - die
493   -"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n";
494   - }
495   - else { ##should be sequence
496   - print OUTFASTA $line;
497   - }
498   -}
499   -close(INFASTA);
500   -close(OUTFASTA);
501   -
502   -#foreach TE fasta blat against target chromosome and parse and find insertion sites
503   -my $depend = 1 if @depend;
504   -foreach my $te_path (@te_fastas) {
505   - if ($depend) {
506   - @depend = ( "STEP2", "afterokarray" );
507   - }
508   - else {
509   - @depend = ();
510   - }
511   - my @path = split '/', $te_path;
512   - my $te_fasta = pop @path;
513   - my $path = join '/', @path;
514   - my $TE = $te_fasta;
515   - $TE =~ s/\.fa//;
516   - mkdir "$path/blat_output";
517   - mkdir "$path/flanking_seq";
518   - mkdir "$path/te_containing_fq";
519   - mkdir "$path/te_only_read_portions_fa";
520   -
521   - #blat fa files against te.fa
522   - my @flanking_fq;
523   - my $fq_file_count = scalar @fq;
524   -
525   - open QSUBARRAY3, ">$shellscripts/step_3.$TE.blat.sh"
526   - if $qsub_array;
527   - open QSUBARRAY4, ">$shellscripts/step_5.$TE.finder.sh"
528   - if $qsub_array;
529   - for ( my $i = 0 ; $i < $fq_file_count ; $i++ ) {
530   - my $fa = $fa[$i];
531   - my $fq = $fq[$i];
532   -
533   - #remove and save filename part of path
534   - my @fa_path = split '/', $fa;
535   - my $fa_name = pop @fa_path;
536   - $fa_name =~ s/\.fa$//;
537   - my $shell_dir = "$shellscripts/step_3/$TE";
538   - if ($parallel) {
539   - make_path( $shell_dir, { mode => 0755 } );
540   - open OUTSH, ">$shell_dir/$i.$TE.blat.sh"
541   - or die "Can't open $shell_dir/$i.$TE.blat.sh $!\n";
542   - print PARALLEL "sh $shell_dir/$i.$TE.blat.sh\n" if !$qsub_array;
543   - }
544   -
545   - #use pre-existing blatout files
546   - if ( !-e "$path/blat_output/$fa_name.te_$TE.blatout" ) {
547   - my $cmd =
548   -"blat -minScore=$blat_minScore -tileSize=$blat_tileSize $te_path $fa $path/blat_output/$fa_name.te_$TE.blatout 1>> $path/blat_output/blat.out";
549   - print OUTSH "$cmd\n" if $parallel;
550   - print "Finding reads in $fa_name that contain sequence of $TE\n"
551   - if !$parallel;
552   - system($cmd) if !$parallel;
553   - }
554   -
555   - #use pre-existing te_containing_fq files
556   - my $te_Containing_fq =
557   - "$path/te_containing_fq/$fa_name.te_$TE.ContainingReads.fq";
558   - if ( -e $te_Containing_fq ) {
559   - $fq = $te_Containing_fq;
560   - }
561   - my $cmd =
562   -"perl $scripts/relocaTE_trim.pl $path/blat_output/$fa_name.te_$TE.blatout $fq $len_cutoff $mismatch_allowance > $path/flanking_seq/$fa_name.te_$TE.flankingReads.fq";
563   - if ($parallel) {
564   - print OUTSH "$cmd\n";
565   - close OUTSH;
566   - chmod 0755, "$shell_dir/*blat.sh";
567   - }
568   - else {
569   - ##run it now
570   - print "Trimming $fq reads of $TE sequence\n" if !$parallel;
571   - system($cmd) if !$parallel;
572   - }
573   - }
574   - if ($qsub_array) {
575   - my $end = $fq_file_count - 1;
576   - my $job = "$shellscripts/step_3.$TE.blat.sh";
577   - my $desc = $TE;
578   - $desc =~ s/\W/_/;
579   - if ( !@depend ) {
580   - print QSUBARRAY
581   -"STEP_3_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\`
582   -echo \$STEP_3_$desc\n";
583   - @depend = ( "STEP_3_$desc", "afterokarray" );
584   - }
585   - else {
586   - my ( $last_job, $afterok ) = @depend;
587   - @depend = ( "STEP_3_$desc", "afterokarray" );
588   - my $jobName = $depend[0];
589   - print QSUBARRAY
590   -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job`
591   -echo \$$jobName\n";
592   -
593   - }
594   - print QSUBARRAY3 "sh $shellscripts/step_3/$TE/\$PBS_ARRAYID.$TE.blat.sh";
595   - }
596   - ##if a genome file was provided, align seqs to genome
597   - ##if no genome file was provided, will only blat and trim reads of te seq
598   - if ($mapping) {
599   - my $param_path = "$current_dir/$top_dir/$TE";
600   - my $outregex = "$param_path/regex.txt";
601   - open OUTREGEX, ">$outregex" or die $!;
602   - print OUTREGEX "$mate_file_1\t$mate_file_2\t$mate_file_unpaired\t$TSD{$TE}";
603   - my $cmd =
604   -"$scripts/relocaTE_align.pl $scripts $param_path $genome_path $outregex $TE $exper $bowtie2";
605   - if ( !$parallel ) {
606   - ## run now
607   - print "Aligning $TE trimmed reads to the reference ($genome_path)\n";
608   - system($cmd);
609   - }
610   - else {
611   - my $shell_dir = "$shellscripts/step_4/$TE";
612   - $genome_path =~ /.+\/(.+)\.(fa|fasta)$/;
613   - my $ref = $1;
614   -
615   - #`mkdir -p $shell_dir`;
616   - make_path( $shell_dir, { mode => 0755 } );
617   -
618   - #mkdir $shell_dir;
619   - open OUTSH, ">$shell_dir/step_4.$ref.$TE.align.sh";
620   - print OUTSH "$cmd\n";
621   - print PARALLEL "sh $shell_dir/step_4.$ref.$TE.align.sh\n" if !$qsub_array;
622   - close OUTSH;
623   -
624   - #`chmod +x $shell_dir/step_4.$ref.$TE.align.sh`;
625   - chmod 0755, "$shell_dir/step_4.$ref.$TE.align.sh";
626   - if ($qsub_array) {
627   - my $existing_depend = '';
628   - if ( $qsub_formatGenome_cmd ) {
629   - $existing_depend = "-W depend=afterok:\$STEP1" if !@depend;
630   - $existing_depend = ",depend=afterok:\$STEP1" if @depend;
631   - }
632   - my $job = "$shell_dir/step_4.$ref.$TE.align.sh";
633   - my $desc = $TE;
634   - $desc =~ s/\W/_/;
635   - if ( !@depend ) {
636   - print QSUBARRAY
637   -"STEP_4_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q $existing_depend $job\`
638   -echo \$STEP_4_$desc\n";
639   - @depend = ( "STEP_4_$desc", "afterok" );
640   - }
641   - else {
642   - my ( $last_job, $afterok ) = @depend;
643   - @depend = ( "STEP_4_$desc", "afterok" );
644   - my $jobName = $depend[0];
645   - print QSUBARRAY
646   -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job","$existing_depend $job`
647   -echo \$$jobName\n";
648   - }
649   - }
650   - }
651   -
652   - my $genome_count = 0;
653   - foreach my $seq_id (@genome_seqs) {
654   - $genome_path =~ /.+\/(.+)\.(fa|fasta)$/;
655   - my $ref = $1;
656   - my $merged_bowtie = "$path/bowtie_aln/$ref.$TE.bowtie.out";
657   - my $cmd =
658   -"$scripts/relocaTE_insertionFinder.pl $merged_bowtie $seq_id $genome_path $TE $outregex $exper $flanking_seq_len $existing_TE_path $mismatch_allowance $bowtie2";
659   - if ( !$parallel ) {
660   - ##run it now
661   - print "Finding $TE insertions in $seq_id\n";
662   - system($cmd);
663   - }
664   - else {
665   - my $shell_dir = "$shellscripts/step_5/$TE";
666   - make_path( $shell_dir, { mode => 0755 } );
667   - open OUTSH, ">$shell_dir/$genome_count.$TE.findSites.sh";
668   - print OUTSH "$cmd\n";
669   - close OUTSH;
670   - print PARALLEL "sh $shell_dir/$genome_count.$TE.findSites.sh\n"
671   - if !$qsub_array;
672   - chmod 0755, "$shell_dir/$genome_count.$TE.findSites.sh";
673   - }
674   - $genome_count++;
675   - }
676   - if ($qsub_array) {
677   - my $end = $genome_count - 1;
678   - my $job = "$shellscripts/step_5.$TE.finder.sh";
679   - my $existing_depend = '';
680   - if ($qsub_existingTE_cmd) {
681   - $existing_depend = "-W depend=afterok:\$EXISTINGTE" if !@depend;
682   - $existing_depend = ":\$EXISTINGTE" if @depend;
683   - }
684   - #if ( $qsub_formatGenome_cmd and $existing_depend eq '' ) {
685   - # $existing_depend = "-W depend=afterok:\$STEP1" if !@depend;
686   - #}
687   - #elsif ( $qsub_formatGenome_cmd and ( $existing_depend ne '' or @depend ) )
688   - #{
689   - # $existing_depend .= ":\$STEP1";
690   - #}
691   - my $desc = $TE;
692   - $desc =~ s/\W/_/;
693   - if ( !@depend ) {
694   - print QSUBARRAY
695   -"STEP_5_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $existing_depend $job\`
696   -echo \$STEP_5_$desc\n";
697   - @depend = ( "STEP_5_$desc", "afterokarray" );
698   - }
699   - else {
700   - my ( $last_job, $afterok ) = @depend;
701   - @depend = ( "STEP_5_$desc", "afterokarray" );
702   - my $jobName = $depend[0];
703   - print QSUBARRAY
704   -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job",
705   - "$existing_depend $job`
706   -echo \$$jobName\n";
707   - }
708   - print QSUBARRAY4
709   - "sh $shellscripts/step_5/$TE/\$PBS_ARRAYID.$TE.findSites.sh";
710   - }
711   - }
712   - if ($qsub_array) {
713   - close QSUBARRAY3;
714   - close QSUBARRAY4;
715   - }
716   -}
717   -## Finished, clean up, cat files
718   -##cat all '.te_insertion_sites.table.txt' results into one file
719   -foreach my $te_path (@te_fastas) {
720   - my @path = split '/', $te_path;
721   - my $te_fasta = pop @path;
722   - my $path = join '/', @path;
723   - my $TE = $te_fasta;
724   - $TE =~ s/\.fa//;
725   - if ($parallel) {
726   - my $shell_dir = "$shellscripts/step_6/$TE";
727   - make_path( $shell_dir, { mode => 0755 } );
728   - open FINISH, ">$shellscripts/step_6/$TE/step_6.$TE.finishing.sh";
729   - print PARALLEL "sh $shellscripts/step_6/$TE/step_6.$TE.finishing.sh\n"
730   - if !$qsub_array;
731   -## toDo: Need to put all of this in a perl script then have the
732   -## finishing shell script execute that perl script
733   - print FINISH "
734   -`mkdir -p $path/results/all_files`
735   -
736   -#combine confident insertions to one file
737   -echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp
738   -for i in \`ls $path/results/*.$TE.confident_nonref_insert.txt\` ; do grep -v flanking_read_count \$i >> $path/results/temp ; done
739   -mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt
740   -mv $path/results/*.$TE.confident_nonref_insert.txt $path/results/all_files
741   -
742   -#combine all insertions to one file
743   -echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2
744   -for i in \`ls $path/results/*.$TE.all_nonref_insert.txt\` ; do grep -v total \$i | grep -v Note >> $path/results/temp2 ; done
745   -mv $path/results/temp2 $path/results/$exper.$TE.all_nonref.txt
746   -mv $path/results/*.$TE.all_nonref_insert.txt $path/results/all_files
747   -
748   -#combine confident insertions ref seqs to one file
749   -for i in \`ls $path/results/*.$TE.confident_nonref_genomeflank.fa\` ; do cat \$i >> $path/results/temp3 ; done
750   -mv $path/results/temp3 $path/results/$exper.$TE.confident_nonref_genomeflanks.fa
751   -mv $path/results/*.$TE.confident_nonref_genomeflank.fa $path/results/all_files
752   -
753   -#combine confident insertions gff to one file
754   -echo \"##gff-version 3\" > $path/results/temp4
755   -for i in \`ls $path/results/*.$TE.all_insert.gff\` ; do grep -v gff \$i >> $path/results/temp4 ; done
756   -mv $path/results/temp4 $path/results/$exper.$TE.all_inserts.gff
757   -mv $path/results/*.$TE.all_insert.gff $path/results/all_files
758   -
759   -#combine confident insertions reads to one file
760   -for i in \`ls $path/results/*.$TE.confident_nonref_insert_reads_list.txt\` ; do cat \$i >> $path/results/temp5 ; done
761   -mv $path/results/temp5 $path/results/$exper.$TE.confident_nonref_reads_list.txt
762   -mv $path/results/*.$TE.confident_nonref_insert_reads_list.txt $path/results/all_files
763   -
764   -";
765   - `chmod +x $shellscripts/step_6/$TE/step_6.$TE.finishing.sh`;
766   - }
767   - if ($qsub_array) {
768   - my $job = "$shellscripts/step_6/$TE/step_6.$TE.finishing.sh";
769   - my $desc = $TE;
770   - $desc =~ s/\W/_/;
771   - if ( !@depend ) {
772   - my $jobName = "STEP_6_$desc";
773   - print QSUBARRAY
774   - "$jobName=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\`
775   -echo \$$jobName\n";
776   - @depend = ( "STEP_6_$desc", "afterok" );
777   - }
778   - else {
779   - my ( $last_job, $afterok ) = ( "STEP_5_$desc", "afterokarray" );
780   - @depend = ( "STEP_6_$desc", "afterok" );
781   - my $jobName = $depend[0];
782   - print QSUBARRAY
783   -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job $job`
784   -echo \$$jobName\n";
785   - }
786   - }
787   - if ( !$parallel and !$qsub_array ) {
788   - ##do it now
789   - ##combine and delete individual chr files for confident sites
790   - print "Finishing and cleaning up\n";
791   -`echo \"TE\tTSD\tEper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp`;
792   - my @files = `ls $path/results/*.$TE.confident_nonref_insert.txt`;
793   - foreach my $file (@files) {
794   - chomp $file;
795   - `grep -v flanking_read_count $file >> $path/results/temp`;
796   - unlink $file;
797   - }
798   - `mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt`;
799   -
800   - ##combine and delete individual chr files for all sites
801   -`echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2`;
802   - @files = `ls $path/results/*.$TE.all_nonref_insert.txt`;
803   - foreach my $file (@files) {
804   - chomp $file;