Skip to content

Commit

Permalink
will now use RelocaTE blat, no need to run again
Browse files Browse the repository at this point in the history
  • Loading branch information
srobb1 committed Apr 29, 2013
1 parent f0f8378 commit e67c2de
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 25 deletions.
42 changes: 21 additions & 21 deletions scripts/construcTEr.pl
Expand Up @@ -8,6 +8,7 @@
my $working_dir = shift;
my $regex_file = shift;
my $insert_pos_file = shift;
my $blat_dir = shift;
my %seqs;

##get the regelar expression patterns for mates and for the TE
Expand Down Expand Up @@ -40,9 +41,12 @@
}
}
my %seq_storage;
my @blat_files = <$working_dir/blat_output/*blatout>;
if (!defined $blat_dir){
$blat_dir = "$working_dir/blat_output";
}
my @blat_files = <$blat_dir/*blatout>;
if (scalar @blat_files == 0 ){
die "Can't find blatout files in $working_dir/blat_output/*blatout\n";
die "Can't find blatout files in $blat_dir\n";
}
foreach my $blat_file (@blat_files) {
next if $blat_file =~ /unpaired/i;
Expand Down Expand Up @@ -88,19 +92,13 @@
my $id = $qName;
my $aln_bp = $matches + $qBaseInsert + $mismatches;

## throw out if gap is too big
if ($qBaseInsert > 5 or $tBaseInsert > 5){
### want to find hits to the TE
## throw out if gap is too big
if ($qBaseInsert > 5 or $tBaseInsert > 5){
next;
}

#### if the hit does not overlap the edge of the TE and not most of the read is not aligning
#### throw it out
next if (( $tStart > 1 and $tEnd < $tLen ) and ($aln_bp < ( $qLen * .98 ) ));

#### if the hit does overlap the edge of the TE and the query match count doesnot just about equal the target match length
### throw if out
next if ( (( $tStart == 1 or $tEnd == $tLen ) and ( ($aln_bp) < ( $tEnd - $tStart + 1 + $tBaseInsert - 5)) or ($aln_bp) > ( $tEnd - $tStart + 1 + $tBaseInsert + 5)) );

## if 50% or more of the read matches to the TE, keep it
next unless ($matches + $mismatches) >= $qLen*.50 ;
my $add = 0;
if ( exists $seqs{$te}{$id}{$te_mate}{blat_hit}{matches} ) {
my $stored_matches = $seqs{$te}{$id}{$te_mate}{blat_hit}{matches};
Expand All @@ -120,7 +118,10 @@
##doesnt exist so add it
$add = 1;
}
if ($add) {



if ($add){
$seqs{$te}{$id}{$te_mate}{blat_hit}{qlen} = $qLen;
$seqs{$te}{$id}{$te_mate}{blat_hit}{qBaseInsert} = $qBaseInsert;
$seqs{$te}{$id}{$te_mate}{blat_hit}{tBaseInsert} = $tBaseInsert;
Expand All @@ -138,13 +139,12 @@
}
}
}

## retrieve seq and pair of any read that matches to the TE
foreach my $te (keys %seq_storage){
foreach my $mate_fa ( keys %{$seq_storage{$te}} ) {
my @mate_fa_path = split '/' , $mate_fa;
my $file_name = pop @mate_fa_path;
##my $fa_out_dir = "$working_dir/construcTEr_collected_fa";
##`mkdir -p $fa_out_dir`;
##open OUTFA , ">$fa_out_dir/$file_name";
my $mate = $file_name;
$mate =~ s/\.fa//;
my $fastacmd_str = join ',' , sort keys %{$seq_storage{$te}{$mate_fa}};
Expand All @@ -160,8 +160,6 @@
$seq_recs .= `fastacmd -d $mate_fa -s $fastacmd_str`;
}

#my $seq_recs =
# `fastacmd -d $mate_fa -s $fastacmd_str`;
if ( defined $seq_recs ) {
my @seq_recs = split />/, $seq_recs;
## get rid of first empty record
Expand All @@ -178,7 +176,7 @@
}
}
}
##get reads to align to genome
##get reads to align to genome and print to a file for bowtie
foreach my $te ( keys %seqs ) {
my $bowtie_2_aln = "$working_dir/$te.construcTEr.bowtie2aln.fa";
open BOWTIEFA, ">$bowtie_2_aln" or die "Can't open $bowtie_2_aln, $!\n";
Expand Down Expand Up @@ -374,6 +372,7 @@
## if record is a TE hit - get range info

my $range_str = '';
## only TEs are in %ranges
if ( exists $ranges{$name} ) {
my $read_range = range_create( $start, $end );
foreach my $range_name ( sort keys %{ $ranges{$name} } ) {
Expand All @@ -389,7 +388,8 @@
my ($r_s, $r_e) = split /\.\./ , $range;
foreach my $insert_str(keys %{$inserts{$target}}){
my $pos = $inserts{$target}{$insert_str}{pos};
if ( ((abs ($r_s -$pos)) < 500) or ((abs ($r_e -$pos)) < 500) ){
if ( (((abs ($r_s -$pos)) < 1000) or ((abs ($r_e -$pos)) < 1000))
and $mismatch == 0){
push @{$inserts{$target}{$insert_str}{rec}} , ">$id $name:$start..$end ($strand) mismatches=$mismatch $range_str\n$seq\n";
}
}
Expand Down
17 changes: 13 additions & 4 deletions scripts/run_construcTEr.pl
Expand Up @@ -23,12 +23,14 @@
my $arrayJob = 1;
my $mate_file_1 = '_1\D*?fq';
my $mate_file_2 = '_2\D*?fq';
my $blat_dir;
#my $unpaired = 'unPaired\D*?fq';
my $insert_file = 0;
GetOptions(
'p|parallel:i' => \$parallel,
'a|arrayJob:i' => \$arrayJob,
'i|insert_file:s' => \$insert_file,
'b|blat_dir:s' => \$blat_dir,
'w|workingdir:s' => \$workingdir,
'o|outdir:s' => \$outdir,
'd|fq_dir:s' => \$fq_dir,
Expand Down Expand Up @@ -90,6 +92,12 @@
print "$insert_file does not exist. Check file name.\n";
&getHelp();
}
if ( defined $blat_dir and -d $blat_dir ) {
$blat_dir = File::Spec->rel2abs($blat_dir);
$blat_dir =~ s/\/$//;
}else {
print "$blat_dir does not exist, need to run blats\n";
}
if ( !defined $fq_dir ) {
print "\n\nPlease provide a directory of complete set of paired fastq files\n";
&getHelp();
Expand Down Expand Up @@ -145,8 +153,8 @@ sub getHelp {

## get fq files
$fq_dir = File::Spec->rel2abs($fq_dir);
my @fq_files = <$fq_dir/*fq>;
push @fq_files , <$fq_dir/*fastq>;
my @fq_files = <$fq_dir/*.fq>;
push @fq_files , <$fq_dir/*.fastq>;
## create bowtie index
my $genome_file = File::Spec->rel2abs($genomeFasta);
if ( !-e "$genome_file.bowtie_build_index.1.ebwt" ) {
Expand Down Expand Up @@ -256,7 +264,7 @@ sub getHelp {


#foreach TE fasta blat against target chromosome and parse and find insertion sites

if (!defined $blat_dir){
if ($parallel){
$shell_script_count++;
}
Expand Down Expand Up @@ -301,6 +309,7 @@ sub getHelp {
close SH;
}
}
}

if ($parallel){
$shell_script_count++;
Expand All @@ -312,7 +321,7 @@ sub getHelp {
my $outregex = "$current_dir/$top_dir/$TE/$TE.regex.txt";
open OUTREGEX, ">$outregex" or die $!;
print OUTREGEX "$mate_file_1\t$mate_file_2";
my $cmd = "$scripts/construcTEr.pl $fq_dir $genome_file $te_path $current_dir/$top_dir/$TE $outregex $insert_file";
my $cmd = "$scripts/construcTEr.pl $fq_dir $genome_file $te_path $current_dir/$top_dir/$TE $outregex $insert_file $blat_dir";
if ($parallel) {
$shell_dir = "$current_dir/$top_dir/shellscripts/step_$shell_script_count/$TE";
`mkdir -p $shell_dir`;
Expand Down

0 comments on commit e67c2de

Please sign in to comment.