diff --git a/scripts/construcTEr.pl b/scripts/construcTEr.pl index 868b5d2..8865895 100755 --- a/scripts/construcTEr.pl +++ b/scripts/construcTEr.pl @@ -8,6 +8,7 @@ my $working_dir = shift; my $regex_file = shift; my $insert_pos_file = shift; +my $blat_dir = shift; my %seqs; ##get the regelar expression patterns for mates and for the TE @@ -40,9 +41,12 @@ } } my %seq_storage; -my @blat_files = <$working_dir/blat_output/*blatout>; +if (!defined $blat_dir){ + $blat_dir = "$working_dir/blat_output"; +} +my @blat_files = <$blat_dir/*blatout>; if (scalar @blat_files == 0 ){ - die "Can't find blatout files in $working_dir/blat_output/*blatout\n"; + die "Can't find blatout files in $blat_dir\n"; } foreach my $blat_file (@blat_files) { next if $blat_file =~ /unpaired/i; @@ -88,19 +92,13 @@ my $id = $qName; my $aln_bp = $matches + $qBaseInsert + $mismatches; - ## throw out if gap is too big - if ($qBaseInsert > 5 or $tBaseInsert > 5){ + ### want to find hits to the TE + ## throw out if gap is too big + if ($qBaseInsert > 5 or $tBaseInsert > 5){ next; } - - #### if the hit does not overlap the edge of the TE and not most of the read is not aligning - #### throw it out - next if (( $tStart > 1 and $tEnd < $tLen ) and ($aln_bp < ( $qLen * .98 ) )); - - #### if the hit does overlap the edge of the TE and the query match count doesnot just about equal the target match length - ### throw if out - next if ( (( $tStart == 1 or $tEnd == $tLen ) and ( ($aln_bp) < ( $tEnd - $tStart + 1 + $tBaseInsert - 5)) or ($aln_bp) > ( $tEnd - $tStart + 1 + $tBaseInsert + 5)) ); - + ## if 50% or more of the read matches to the TE, keep it + next unless ($matches + $mismatches) >= $qLen*.50 ; my $add = 0; if ( exists $seqs{$te}{$id}{$te_mate}{blat_hit}{matches} ) { my $stored_matches = $seqs{$te}{$id}{$te_mate}{blat_hit}{matches}; @@ -120,7 +118,10 @@ ##doesnt exist so add it $add = 1; } - if ($add) { + + + + if ($add){ $seqs{$te}{$id}{$te_mate}{blat_hit}{qlen} = $qLen; $seqs{$te}{$id}{$te_mate}{blat_hit}{qBaseInsert} = $qBaseInsert; $seqs{$te}{$id}{$te_mate}{blat_hit}{tBaseInsert} = $tBaseInsert; @@ -138,13 +139,12 @@ } } } + +## retrieve seq and pair of any read that matches to the TE foreach my $te (keys %seq_storage){ foreach my $mate_fa ( keys %{$seq_storage{$te}} ) { my @mate_fa_path = split '/' , $mate_fa; my $file_name = pop @mate_fa_path; - ##my $fa_out_dir = "$working_dir/construcTEr_collected_fa"; - ##`mkdir -p $fa_out_dir`; - ##open OUTFA , ">$fa_out_dir/$file_name"; my $mate = $file_name; $mate =~ s/\.fa//; my $fastacmd_str = join ',' , sort keys %{$seq_storage{$te}{$mate_fa}}; @@ -160,8 +160,6 @@ $seq_recs .= `fastacmd -d $mate_fa -s $fastacmd_str`; } - #my $seq_recs = - # `fastacmd -d $mate_fa -s $fastacmd_str`; if ( defined $seq_recs ) { my @seq_recs = split />/, $seq_recs; ## get rid of first empty record @@ -178,7 +176,7 @@ } } } -##get reads to align to genome +##get reads to align to genome and print to a file for bowtie foreach my $te ( keys %seqs ) { my $bowtie_2_aln = "$working_dir/$te.construcTEr.bowtie2aln.fa"; open BOWTIEFA, ">$bowtie_2_aln" or die "Can't open $bowtie_2_aln, $!\n"; @@ -374,6 +372,7 @@ ## if record is a TE hit - get range info my $range_str = ''; + ## only TEs are in %ranges if ( exists $ranges{$name} ) { my $read_range = range_create( $start, $end ); foreach my $range_name ( sort keys %{ $ranges{$name} } ) { @@ -389,7 +388,8 @@ my ($r_s, $r_e) = split /\.\./ , $range; foreach my $insert_str(keys %{$inserts{$target}}){ my $pos = $inserts{$target}{$insert_str}{pos}; - if ( ((abs ($r_s -$pos)) < 500) or ((abs ($r_e -$pos)) < 500) ){ + if ( (((abs ($r_s -$pos)) < 1000) or ((abs ($r_e -$pos)) < 1000)) + and $mismatch == 0){ push @{$inserts{$target}{$insert_str}{rec}} , ">$id $name:$start..$end ($strand) mismatches=$mismatch $range_str\n$seq\n"; } } diff --git a/scripts/run_construcTEr.pl b/scripts/run_construcTEr.pl index b133dba..0ab5fd8 100755 --- a/scripts/run_construcTEr.pl +++ b/scripts/run_construcTEr.pl @@ -23,12 +23,14 @@ my $arrayJob = 1; my $mate_file_1 = '_1\D*?fq'; my $mate_file_2 = '_2\D*?fq'; +my $blat_dir; #my $unpaired = 'unPaired\D*?fq'; my $insert_file = 0; GetOptions( 'p|parallel:i' => \$parallel, 'a|arrayJob:i' => \$arrayJob, 'i|insert_file:s' => \$insert_file, + 'b|blat_dir:s' => \$blat_dir, 'w|workingdir:s' => \$workingdir, 'o|outdir:s' => \$outdir, 'd|fq_dir:s' => \$fq_dir, @@ -90,6 +92,12 @@ print "$insert_file does not exist. Check file name.\n"; &getHelp(); } +if ( defined $blat_dir and -d $blat_dir ) { + $blat_dir = File::Spec->rel2abs($blat_dir); + $blat_dir =~ s/\/$//; +}else { + print "$blat_dir does not exist, need to run blats\n"; +} if ( !defined $fq_dir ) { print "\n\nPlease provide a directory of complete set of paired fastq files\n"; &getHelp(); @@ -145,8 +153,8 @@ sub getHelp { ## get fq files $fq_dir = File::Spec->rel2abs($fq_dir); -my @fq_files = <$fq_dir/*fq>; -push @fq_files , <$fq_dir/*fastq>; +my @fq_files = <$fq_dir/*.fq>; +push @fq_files , <$fq_dir/*.fastq>; ## create bowtie index my $genome_file = File::Spec->rel2abs($genomeFasta); if ( !-e "$genome_file.bowtie_build_index.1.ebwt" ) { @@ -256,7 +264,7 @@ sub getHelp { #foreach TE fasta blat against target chromosome and parse and find insertion sites - +if (!defined $blat_dir){ if ($parallel){ $shell_script_count++; } @@ -301,6 +309,7 @@ sub getHelp { close SH; } } +} if ($parallel){ $shell_script_count++; @@ -312,7 +321,7 @@ sub getHelp { my $outregex = "$current_dir/$top_dir/$TE/$TE.regex.txt"; open OUTREGEX, ">$outregex" or die $!; print OUTREGEX "$mate_file_1\t$mate_file_2"; - my $cmd = "$scripts/construcTEr.pl $fq_dir $genome_file $te_path $current_dir/$top_dir/$TE $outregex $insert_file"; + my $cmd = "$scripts/construcTEr.pl $fq_dir $genome_file $te_path $current_dir/$top_dir/$TE $outregex $insert_file $blat_dir"; if ($parallel) { $shell_dir = "$current_dir/$top_dir/shellscripts/step_$shell_script_count/$TE"; `mkdir -p $shell_dir`;