diff --git a/README.md b/README.md index 211f6c6..35f3f4c 100644 --- a/README.md +++ b/README.md @@ -452,3 +452,4 @@ For any of the listed reasons, or anything else, please leave us a Leave a message here + diff --git a/sample_relocaTE_run/sample_readme.txt b/sample_relocaTE_run/sample_readme.txt index c2ac506..cba62e4 100644 --- a/sample_relocaTE_run/sample_readme.txt +++ b/sample_relocaTE_run/sample_readme.txt @@ -9,6 +9,7 @@ These scripts expect that the following programs are installed and included in y 3. BWA 4. BioPerl 5. Blat + 6. Blast: formatdb and fastacmd STEPS A-C A. RelocaTE @@ -32,32 +33,32 @@ A. To find TE insertions in the included fastq reads run RelocaTE: or sh run_relocaTE.sh - These scritps will run relocaTE directly or through a series of shell scripts. - If you are running + These scripts will run relocaTE directly or through a series of shell scripts. + If you are running - run_relocaTE_qsub.sh: - 1) follow the instructions that are printed to the screen (run run_these_jobs.sh) - 2) once complete, view the results in 02052012_sample/mping/results + 1) follow the instructions that are printed to the screen (run run_these_jobs.sh) + 2) once complete, view the results in 02052012_sample/mping/results or - run_relocaTE_shell.sh - 1) follow the instructions that are printed to the screen (run run_these_jobs.sh) - 2) once complete, view the results in 02052012_sample/mping/results + 1) follow the instructions that are printed to the screen (run run_these_jobs.sh) + 2) once complete, view the results in 02052012_sample/mping/results or - run_relocaTE.sh - 1) once complete, view the results in 02052012_sample/mping/results + 1) once complete, view the results in 02052012_sample/mping/results -B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the referenc -e. +B. Find Spanners to help classify the insertions (homozygous, heterozygous, etc) by generating a BAM file of the reads not trimmed of TE to the reference. A BAM file is included in the sample data set, but one can be genreted by running the included script: - create_bam.sh by: - - changing directory into the directory of this script - - and typing "sh create_bam.sh" + create_bam.sh by: + - changing directory into the directory of this script + - and typing "sh create_bam.sh" C. To classify the insertions (homozygous, heterozygous, etc) run_characTErizer.sh - - change directory into the directory of this script - - type "sh run_characTErizer.sh" + - change directory into the directory of this script + - type "sh run_characTErizer.sh" - once complete, view the resulting files in the directory you ran characTErizer.pl - 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions - 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions - 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events + 1) sample.inserts_characTErized.gff: GFF file of the classified insertions including excisions + 2) sample.inserts_characTErized.txt: Text file of the classified insertions including excisions + 3) excisions_with_footprint.vcfinfo: additional information on the insertions that have been classified as exicision events + diff --git a/scripts/characterizer.pl b/scripts/characterizer.pl index 028cc7b..dd79d20 100755 --- a/scripts/characterizer.pl +++ b/scripts/characterizer.pl @@ -310,3 +310,4 @@ sub getHelp { } } } + diff --git a/scripts/characterizer.pl~ b/scripts/characterizer.pl~ deleted file mode 100755 index a16273c..0000000 --- a/scripts/characterizer.pl~ +++ /dev/null @@ -1,312 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Data::Dumper; -use Cwd; -use Getopt::Long; - -if ( !defined @ARGV ) { - &getHelp; -} - -my $cwd = getcwd(); -my $sites_file; -my $bam_dir; -my $genome_fasta; -## can be a single .bam file or a direcotory containing .bam files -my @bam_files; -my $excision = 0; - -GetOptions( - 's|sites_file:s' => \$sites_file, - 'b|bam_dir:s' => \$bam_dir, - 'g|genome_fasta:s' => \$genome_fasta, - 'x|excision:i' => \$excision, -); - -sub getHelp { - print ' -usage: -./characterizer.pl [-s relocaTE table output file][-b bam file or dir of bam files][-g reference genome fasta file][-h] - -options: --s file relocaTE output file: YOURSAMPLENAME.mping.all_nonref.txt [no default] --b dir/file bam file of the orginal reads aligned to reference (before TE was trimmed) or directory of bam files [no default] --g file reference genome fasta file [no default] --x int find excision events that leave a footprint, yes or no(1|0) [0] --h this help message - -For more information see documentation: http://srobb1.github.com/RelocaTE/ - -'; - exit 1; -} - -if ( -d $bam_dir ) { - - #remove trailing '/' - $bam_dir =~ s/\/$//; - @bam_files = <$bam_dir/*bam>; -} -elsif ( -f $bam_dir or -l $bam_dir ) { - push @bam_files, $bam_dir; -} -open INSITES, "$sites_file" or die "cannot open $sites_file $!\n"; -my @dir_path = split '/', $sites_file; -my $filename = pop @dir_path; -my $file = $filename; -$file =~ s/\..+?$//; -$cwd =~ s/\/$//; #remove trailing / -open OUTGFF, ">$cwd/$file.inserts_characTErized.gff"; -open OUT, ">$cwd/$file.inserts_characTErized.txt"; -print OUT - "strain\tTE\tTSD\tchromosome.pos\tstrand\tavg_flankers\tspanners\tstatus\n"; -print OUTGFF "##gff-version 3\n"; -my %matches; -my %TSDs; -my %toPrint; - -while ( my $line = ) { - next if $line =~ /TE.TSD.Exper.chromosome.insertion_site/; - next if $line =~ /^\s*$/; - chomp $line; - - # mping TTA A119 Chr1 1446..1448 + T:1 R:0 L:1 - my ( - $te, $TSD, $exp, $chromosome, $coor, - $TE_orient, $total_string, $right_string, $left_string - ) = split /\t/, $line; - my ($pos) = $coor =~ /\d+\.\.(\d+)/; - $TSDs{$chromosome}{$pos} = $TSD; - my ($total_count) = $total_string =~ /T:(\d+)/; - my ($left_count) = $left_string =~ /L:(\d+)/; - my ($right_count) = $right_string =~ /R:(\d+)/; - - my $Mmatch = 0; - my $cigar_all; - if ( $left_count >= 1 and $right_count >= 1 and $total_count >= 2 ) { - my @sam_all; - foreach my $bam_file (@bam_files) { - ## get any alignments that overlap the insertion site - my @sam_out = `samtools view $bam_file \'$chromosome:$pos-$pos\'`; - push @sam_all, @sam_out; - } - - #remove redundant lines in sam file - my %sorted_sam; - my $order; - foreach my $line (@sam_all) { - $order++; - if ( !exists $sorted_sam{$line} ) { - $sorted_sam{$line} = $order; - } - } - - #make new sorted sam array by sorting on the value of the sort hash - my @sorted_sam = - sort { $sorted_sam{$a} <=> $sorted_sam{$b} } keys %sorted_sam; - - foreach my $sam_line (@sorted_sam) { - chomp $sam_line; - my @sam_line = split /\t/, $sam_line; - my $cigar = $sam_line[5]; - my $flag = $sam_line[1]; - my $seqLen = length $sam_line[9]; - my $start = $sam_line[3]; - my $end = $start + $seqLen - 1; - next unless $end >= $pos + 5; - next unless $start <= $pos - 5; - ## must be a all M match no soft clipping - if ( $cigar =~ /^\d+M$/ ) { - my ($NM) = $sam_line =~ /NM:i:(\d+)/; ## edit distance used - ## bwa specific: mismatch count, often the same as NM, have not seen a XM>0 and NM==0 - my ($XM) = $sam_line =~ /XM:i:(\d+)/; - if ( defined $XM and $XM == 0 ) { - $Mmatch++; - } - elsif ( defined $NM and $NM == 0 ) { - $Mmatch++; - } - elsif ( !defined $NM and !defined $XM ) { - $Mmatch++; - } - } - elsif ( $cigar =~ /[IND]/ ) { - $matches{"$chromosome.$pos"}{sam}{$sam_line} = 1; - } - } - my $spanners = $Mmatch; - my $average_flankers = $total_count / 2; - my $status = 0; - - if ( $spanners == 0 ) { - $status = 'homozygous'; - } - elsif ( $average_flankers >= 5 and $spanners < 5 ) { - $status = 'homozygous/excision_no_footprint'; - } - elsif ( $spanners < ( $average_flankers * .2 ) and $spanners <= 10 ) { - $status = 'homozygous/excision_no_footprint'; - } - elsif ( $average_flankers <= 2 and $spanners > 10 ) { - $status = 'new_insertion'; - } - elsif ( abs( $average_flankers - $spanners ) <= 5 ) { - $status = 'heterozygous'; - } - elsif ( - abs( $average_flankers - $spanners ) - - ( ( $average_flankers + $spanners ) / 2 ) <= 10 ) - { - $status = 'heterozygous?'; - } - elsif ( $average_flankers > 10 and $spanners > 10 ) { - $status = 'heterozygous'; - } - elsif ( - ( - ( $spanners - $average_flankers ) > - ( $spanners + $average_flankers ) / 2 - ) - and ( $average_flankers <= 10 ) - ) - { - $status = 'new_insertion'; - } - else { - $status = 'other'; - } - - $matches{"$chromosome.$pos"}{status} = $status; - $toPrint{$chromosome}{$pos}{$TSD}{TE} = $te; - $toPrint{$chromosome}{$pos}{$TSD}{flank} = $average_flankers; - $toPrint{$chromosome}{$pos}{$TSD}{span} = $spanners; - $toPrint{$chromosome}{$pos}{$TSD}{status} = $status; - $toPrint{$chromosome}{$pos}{$TSD}{strain} = $exp; - $toPrint{$chromosome}{$pos}{$TSD}{coor} = $coor; - $toPrint{$chromosome}{$pos}{$TSD}{TE_orient} = $TE_orient; - } -} - -if ($excision) { - -##generate vcf of spanners looking for excision events - my @unlink_files; - my @vcfs; - foreach my $pos ( keys %matches ) { - my ( $target, $loc ) = split /\./, $pos; - next unless exists $toPrint{$target}{$loc}; - my $range = "$target:$pos"; - my $sam = "$cwd/$pos.sam"; - my $bam = "$cwd/$pos.bam"; - my $sorted_bam = "$cwd/$pos.sorted"; - my @sam_lines = keys %{ $matches{$pos}{sam} }; - if ( @sam_lines > 1 ) { - my $pos_sam = join "\n", @sam_lines; - open POSSAM, ">$sam"; - print POSSAM $pos_sam; - `samtools view -bT $genome_fasta $sam > $bam`; - `samtools sort $bam $sorted_bam`; - `samtools index $sorted_bam.bam`; - -`samtools mpileup -C50 -ugf $genome_fasta -r $range $sorted_bam.bam | bcftools view -bvcg - > $cwd/$pos.var.raw.bcf`; -`bcftools view $cwd/$pos.var.raw.bcf | vcfutils.pl varFilter -D 100 > $cwd/$pos.var.flt.vcf`; - push @vcfs, "$cwd/$pos.var.flt.vcf"; - push @unlink_files, $sam, $bam, "$sorted_bam.bam.bai", "$sorted_bam.bam", - "$cwd/$pos.var.raw.bcf", "$cwd/$pos.var.flt.vcf"; - close POSSAM; - } - } -##Chr1 16327633 . GAGTACTACAATTAGTA GAGTA 1930.0% . INDEL;DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=29;FQ=-40.5 GT:PL:GQ 1/1:58,6,0:10 - open EXCISION, ">>$cwd/excisions_with_footprint.vcfinfo" - or die "can't open $cwd/excisions_with_footprint.vcfinfo for writing $!"; - foreach my $vcf (@vcfs) { - ##Chr2.30902247.var.flt.vcf - my @path = split /\//, $vcf; - my $file = pop @path; - my ( $insert_ref, $insert_pos ) = $file =~ /(.+)\.(\d+)\.var\.flt\.vcf/; - my $TSD = $TSDs{$insert_ref}{$insert_pos}; - my $TSD_len = length $TSD; - open VCF, $vcf; - while ( my $line = ) { - next unless $line !~ /^#/; - chomp $line; - my ( $ref, $first_base, $col_3, $ref_seq, $strain_seq ) = split /\t/, - $line; - my $aln_start = $first_base - $insert_pos - 1; - my $aln_end_ref = $first_base - length($ref_seq) - $insert_pos - 1; - my $aln_end_strain = $first_base - length($strain_seq) - $insert_pos - 1; - my $aln_end_ref_near_insert_pos = 0; - my $aln_end_strain_near_insert_pos = 0; - my $aln_start_near_insert_pos = 0; - my $insert_bwt_ends = 0; - my $all_values_after_insertion = 0; - - if ( ( $aln_start <= $TSD_len + 1 ) - and ( ( $aln_start * -1 <= $TSD_len + 1 ) ) ) - { - $aln_start_near_insert_pos = 1; - } - if ( ( $aln_end_ref <= $TSD_len + 1 ) - and ( ( $aln_end_ref * -1 <= $TSD_len + 1 ) ) ) - { - $aln_end_ref_near_insert_pos = 1; - } - if ( ( $aln_end_strain <= $TSD_len + 1 ) - and ( ( $aln_end_strain * -1 <= $TSD_len + 1 ) ) ) - { - $aln_end_strain_near_insert_pos = 1; - } - my ( $end_ref, $end_strain ) = ( - ( $first_base + length($ref_seq) - 1 ), - ( $first_base + length($strain_seq) + 1 ) - ); - if ( ( $end_ref < $insert_pos and $end_strain > $insert_pos ) - or ( $end_strain < $insert_pos and $end_ref > $insert_pos ) ) - { - $insert_bwt_ends = 1; - } - if ( ( ( $first_base - $TSD_len + 1 ) > $insert_pos ) - and ( $end_ref > $insert_pos ) - and ( $end_strain > $insert_pos ) ) - { - $all_values_after_insertion = 1; - } - ## if the alignment end in ref or in strain is close to the insertion postion, - ## or if one is before and one is after the insertion postion, - ## it is a potential excision with footprint - if ( ( $aln_end_ref_near_insert_pos or $aln_end_strain_near_insert_pos ) - or ($insert_bwt_ends) ) - { - ##make sure all values are not after the insertion - if ( !$all_values_after_insertion ) { - print EXCISION "$insert_ref.$insert_pos\t$line\n"; - my $status = $toPrint{$insert_ref}{$insert_pos}{$TSD}{status}; - ##only append if it already isnt there - $toPrint{$insert_ref}{$insert_pos}{$TSD}{status} = - $status . "/excision_with_footprint" - if $status !~ /\/excision_with_footprint/; - } - } - } - } - unlink @unlink_files; -} - -foreach my $chr ( sort keys %toPrint ) { - foreach my $pos ( sort { $a <=> $b } keys %{ $toPrint{$chr} } ) { - foreach my $tsd ( sort keys %{ $toPrint{$chr}{$pos} } ) { - my $TE = $toPrint{$chr}{$pos}{$tsd}{TE}; - my $flankers = $toPrint{$chr}{$pos}{$tsd}{flank}; - my $spanners = $toPrint{$chr}{$pos}{$tsd}{span}; - my $status = $toPrint{$chr}{$pos}{$tsd}{status}; - my $strain = $toPrint{$chr}{$pos}{$tsd}{strain}; - my $coor = $toPrint{$chr}{$pos}{$tsd}{coor}; - my $TE_orient = $toPrint{$chr}{$pos}{$tsd}{TE_orient}; - my ($start) = $coor =~ /(\d+)\.\.\d+/; - print OUT -"$strain\t$TE\t$tsd\t$chr:$coor\t$TE_orient\t$flankers\t$spanners\t$status\n"; - print OUTGFF -"$chr\t$strain\ttransposable_element_attribute\t$start\t$pos\t$TE_orient\t.\t.\tID=$chr.$pos.spanners;avg_flankers=$flankers;spanners=$spanners;type=$status;TE=$TE;TSD=$tsd\n"; - } - } -} diff --git a/scripts/relocaTE.pl b/scripts/relocaTE.pl index 283164a..04c479a 100755 --- a/scripts/relocaTE.pl +++ b/scripts/relocaTE.pl @@ -147,8 +147,8 @@ } else { my $fq_path = File::Spec->rel2abs($fq_dir); - @fq_files = <$fq_path/*fq>; - my @fastq_files = <$fq_path/*fastq>; + @fq_files = <$fq_path/*.fq>; + my @fastq_files = <$fq_path/*.fastq>; push @fq_files, @fastq_files; if ( scalar @fq_files == 0 ) { print "Must provide at least 1 short read file\n"; diff --git a/scripts/relocaTE.pl~ b/scripts/relocaTE.pl~ deleted file mode 100755 index 133cd8a..0000000 --- a/scripts/relocaTE.pl~ +++ /dev/null @@ -1,862 +0,0 @@ -#!/usr/bin/perl -w -use File::Spec; -use Getopt::Long; -use Cwd; -use FindBin qw($RealBin); -use File::Path qw(make_path); -use strict; - -my $scripts = $RealBin; - -if ( !defined @ARGV ) { - &getHelp(); -} -my $genome_fasta = 'NONE'; -my $te_fasta; -my $target = 'NONE'; -my $len_cutoff = 10; -my $mismatch_allowance = 0; -my $fq_dir; -my $exper = 'not.given'; -my $mate_file_1 = '_p1'; -my $mate_file_2 = '_p2'; -my $mate_file_unpaired = '.unPaired'; -my $workingdir; -my $outdir = 'outdir_teSearch'; -my $parallel = 0; -my $qsub_array = 0; -my $qsub_q; -my ( $blat_minScore, $blat_tileSize ) = ( 10, 7 ); -my $flanking_seq_len = 100; -my $existing_TE = 'NONE'; -my $bowtie2 = 0; -GetOptions( - 'p|parallel:i' => \$parallel, - 'a|qsub_array:i' => \$qsub_array, - 'q|qsub_q:s' => \$qsub_q, - 'e|exper:s' => \$exper, - 'w|workingdir:s' => \$workingdir, - 'o|outdir:s' => \$outdir, - 'd|fq_dir:s' => \$fq_dir, - 'g|genome_fasta:s' => \$genome_fasta, - 't|te_fasta:s' => \$te_fasta, - 'l|len_cutoff:i' => \$len_cutoff, - 'm|mismatch:f' => \$mismatch_allowance, - '1|mate_1_id:s' => \$mate_file_1, - '2|mate_2_id:s' => \$mate_file_2, - 'u|unpaired_id:s' => \$mate_file_unpaired, - 'bm|blat_minScore:i' => \$blat_minScore, - 'bt|blat_tileSize:i' => \$blat_tileSize, - 'f|flanking_seq_len:i' => \$flanking_seq_len, - '-r|reference_ins:s' => \$existing_TE, -## '-b2|bowtie2:i' => \$bowtie2, - 'h|help' => \&getHelp, -); -my $current_dir; - -$parallel = 1 if $qsub_array == 1; -if ( defined $qsub_q ) { - $qsub_q = "-q $qsub_q"; -} -else { - $qsub_q = ''; -} - -if ( defined $workingdir and -d $workingdir ) { - $current_dir = File::Spec->rel2abs($workingdir); - $current_dir =~ s/\/$//; -} -else { - $current_dir = cwd(); -} -my $mapping = 1; - -if ( !defined $genome_fasta ) { - print "\n\nPlease provide reference genome by using -g Genome fasta path\n"; - die "\nuse -h option to get help\n"; -} -elsif ( $genome_fasta eq 'NONE' ) { - print -"A reference genome fasta was NOT provided. Proceeding without a reference will result in only the reads containing the TE being identified, no mapping of insertions will be performed\n"; - print "Proceed without mapping? (y|n) \n"; - my $answer = ; - if ( $answer =~ /n/i ) { - &getHelp(); - } - elsif ( $answer =~ /y/i ) { - $mapping = 0; - } - print "Great, proceeding without aligning to a reference genome.\n"; -} -elsif ( !-e $genome_fasta ) { - print "$genome_fasta does not exist. Check file name.\n"; - die "\nuse -h option to get help\n"; -} -my $genome_path; -if ( -e $genome_fasta ) { - $genome_path = File::Spec->rel2abs($genome_fasta); -} -if ( !defined $te_fasta ) { - print -"\nPlease provide fasta file containing transposable elements by using -t TE fasta path - -SAMPLE TE FASTA: ->mping TSD=TTA -GGCCAGTCACAATGGGGGTTTCACTGGTGTGTCATGCACATTTAATAGGGGTAAGACTGAATAAAAAATG -ATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGGTGAAACTCGTCAGCGTCGTT -TCCAAGTCCTCGGTAACAGAGTGAAACCCCCGTTGAGGCCGATTCGTTTCATTCACCGGATCTCTTGCGT -CCGCCTCCGCCGTGCGACCTCCGCATTCTCCCGCGCCGCGCCGGATTTTGGGTACAAATGATCCCAGCAA -CTTGTATCAATTAAATGCTTTGCTTAGTCTTGGAAACGTCAAAGTGAAACCCCTCCACTGTGGGGATTGT -TTCATAAAAGATTTCATTTGAGAGAAGATGGTATAATATTTTGGGTAGCCGTGCAATGACACTAGCCATT -GTGACTGGCC - -FASTA header must contain \"TSD=\", can be a Perl regular expression. - Example: these exact characters TTA: TSD=TTA - Example: any 4 characters: TSD=.... - Example: A or T followed by GCC: TSD=(A|T)GCC - Example: CGA followed by any character then an A then CT or G: TSD=CGA.A(CT|G) -\n"; - die "\nuse -h option to get help\n"; -} -elsif ( !-e $te_fasta ) { - print "$te_fasta does not exist. Check file name.\n"; - die "\nuse -h option to get help\n"; -} -else { - open INFILE, $te_fasta or die "Can't open $te_fasta\n"; - my $first_line = ; - close INFILE; - if ( $first_line !~ /^>\S+\s+TSD=\S+/ ) { - die -"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n"; - } -} -my @fq_files; -my %fq_files; -if ( !defined $fq_dir ) { - print "\n\nPlease provide a directory of paired fastq files\n"; - die "\nuse -h option to get help\n"; -} -elsif ( $fq_dir eq 'SKIP' ) { - ##skip all other steps for processing the raw fq files -} -elsif ( !-d $fq_dir ) { - print -"\n\nCheck the spelling or location of $fq_dir, Please provide a directory of paired fastq files\n"; - die "\nuse -h option to get help\n"; -} -else { - my $fq_path = File::Spec->rel2abs($fq_dir); - @fq_files = <$fq_path/*fq>; - my @fastq_files = <$fq_path/*fastq>; - push @fq_files, @fastq_files; - if ( scalar @fq_files == 0 ) { - print "Must provide at least 1 short read file\n"; - die "\nuse -h option to get help\n"; - } -} -my $existing_TE_path = 'NONE'; -my $existing_blat = 0; -if ( $existing_TE ne 'NONE' ) { - if ( $existing_TE eq '1' ) { - ##run blat - $existing_blat = 1; - } - elsif ( !-e $existing_TE ) { - print "$existing_TE does not exist\n"; - print "Please use -r 1 or provide a file that exists\n"; - die "\nuse -h option to get help\n"; - } - else { - $existing_TE_path = File::Spec->rel2abs($existing_TE); - open INFILE, $existing_TE or die "Can't open $existing_TE\n"; - my $first_line = ; - close INFILE; - if ( $first_line !~ /\S+\t\S+:\d+\.\.\d+/ ) { - print "The existing_TE file is not in the appropriate format: - -SAMPLE Reference TEs (the two columns are tab-delimited): -mping Chr12:839604..840033 -mping Chr11:23200534..23200105 - -TE_nameref_seqname:first_Base_Of_TIR1..Last_base_of_TIR2 - -or (recommended) use \'-r 1\' for RelocaTE to find your TE in the reference - "; - die "\nuse -h option to get help\n"; - } - } -} - -sub getHelp { - print ' -usage: -./relocaTE.pl [-t TE_fasta_file][-g chromosome_genome_fasta][-d dir_of_fq][-e short_sample_name][-h] - -options: - -**required: --t |--te_fasta file fasta containing nucleotide sequences of transposable elements with - TSD=xxx in the desc. [no default] --d |--fq_dir dir directory of paired and unpaired fastq files (paired _p1.fq & _p2.fq) - (.fq or .fastq is acceptable) [no default] - -**recommended: --g |--genome_fasta file genome (reference) fasta file path. If not provided will only align - reads to TE and remove TE seq from short reads. [no default] --e |--exper STR Short sample name, will be used in the output files to create IDs for - the insert (ex. A123) [not.given] --o |--outdir STR name for directory to contain output directories and files, will be - created for the run (ex. 04222012_A123) [outdir_teSearch] - -**optional: --p |--parallel INT Break down the single big job of relocaTE into as many smaller jobs as - possible. The alternative (0) would be to run one after the other - (int, 0=false or 1=true) [0] --q |--qsub_q STR same as qsub -q option, not required [no default] --a |--qsus_array INT if \'-a 1\' , create qsub PBS array jobs to run the many shell scripts - created in the \'-a 1\' option. (see: man qsub option -t).( - 0=false or 1=true) [0] --w |--workingdir dir base working directory, needs to exist, will not be creates, full path - required [cwd] --l |--len INT len cutoff for the TE trimmed reads to be aligned [10] --m |--mismatch FRACTION mismatch allowance for alignment to TE (ex 0.1) [0] --1 |--mate_1_id STR string to uniquely identify mate 1 paired files ex: file_p1.fq [_p1] --2 |--mate_2_id STR pattern to uniquely identify mate 2 paired files ex: file_p2.fq [_p2] --u |--unpaired_id STR pattern to uniquely identify unpaired files ex: file.unPaired.fq [.unPaired] --bm|--blat_minScore INT blat minScore value, used by blat in the comparison of reads to TE sequence [10] --bt|--blat_tileSize INT blat tileSize value, used by blat in the comparison of reads to TE sequence [7] --f |--flanking_seq INT length of the sequence flanking the found insertion to be returned. This - sequence is taken from the reference genome [100] --r |--reference_ins STR To identify reference and shared insertions (reference and reads) - choose option-1 or option-2. - option-1) (recommended) use \'-r 1\' to have RelocaTE find the location of your TE in the - reference. - option-2) input the file name of a tab-delimited file containing the coordinates - of TE insertions pre-existing in the reference sequence. [no default] --h |--help this message - - -See documentation for more information. http://srobb1.github.com/RelocaTE/ - -'; -## use in V2 -## -b2 |--bowtie2 INT to use bowtie2 use \'-b2 1\' else for bowtie use \'-b2 0\' [0] - exit 1; -} -if ( $outdir eq '' or $outdir =~ /^\s+$/ or !defined $outdir ) { - die "your -o option has an incorrect value, it needs to be something -\nuse -h option to get help\n"; -} -else { - $outdir =~ s/\/$//; -} -my $te_path = File::Spec->rel2abs($te_fasta); -my @outdir = split /\//, $outdir; -$outdir = pop @outdir; -my $top_dir = $outdir; -my @depend; -my $shellscripts = "$current_dir/$top_dir/shellscripts"; -if ($qsub_array) { - mkdir "$current_dir/$top_dir"; - mkdir "$shellscripts"; - open QSUBARRAY, ">$current_dir/$top_dir/run_these_jobs.sh" - or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n"; -} -elsif ($parallel) { - mkdir "$current_dir/$top_dir"; - mkdir "$shellscripts"; - open PARALLEL, ">$current_dir/$top_dir/run_these_jobs.sh" - or die "Can't open $current_dir/$top_dir/run_these_jobs.sh\n"; -} -else { - mkdir "$current_dir/$top_dir"; -} -my $qsub_formatGenome_cmd = 0; -## get names of each ref sequecne -my @genome_seqs; -if ( $mapping > 0 ) { - open( INFASTA, "$genome_path" ) || die "$!\n"; - while ( my $line = ) { - next unless $line =~ /^>(\S+)/; - if ( $line =~ /^>(\S+)/ ) { - my $id = $1; - if ( $id =~ /\|/ ) { - $id =~ s/\|/_/g; - } - push @genome_seqs, $id; - } - else { - die "Your genome FASTA file is in a unexpected format. ->SEQNAME -SEQUENCE ->SEQNAME2 -SEQUENCE2\n"; - } - } - close(INFASTA); - - #create bowtie index - my $cmd; - if ( !$bowtie2 and !-e "$genome_path.bowtie_build_index.1.ebwt" ) { - $cmd = -"bowtie-build -f $genome_path $genome_path.bowtie_build_index 12> $current_dir/$top_dir/bowtie-build.out"; - $qsub_formatGenome_cmd = 1; - } - elsif ( $bowtie2 and !-e "$genome_path.bowtie2_build_index.1.bt2" ) { - $cmd = -"bowtie2-build -f $genome_path $genome_path.bowtie2_build_index 12> $current_dir/$top_dir/bowtie-build2.out"; - $qsub_formatGenome_cmd = 1; - } - my $ref = 'ref'; - if ( $genome_path =~ /(?:.+\/)?(.+)\.(fa|fasta)$/ ) { - $ref = $1; - } - if ( $parallel and defined $cmd ) { - my $shell_dir = "$shellscripts/step_1"; - mkdir $shell_dir; - open OUTSH, ">$shell_dir/step_1.$ref.formatGenome.sh"; - print OUTSH "$cmd\n"; - close OUTSH; - chmod 0755, "$shell_dir/step_1.$ref.formatGenome.sh"; - print PARALLEL "sh $shell_dir/step_1.$ref.formatGenome.sh\n" - if !$qsub_array; - } - elsif ( $parallel and !defined $cmd ) { - my $step1_file = - "$shellscripts/step_1_not_needed_genome_fasta_already_formatted"; - my $shell_dir = "$shellscripts"; - mkdir $shell_dir; - if ($parallel) { - open STEP1, ">$step1_file" or die "Can't Open $step1_file\n"; - print STEP1 ''; - close STEP1; - } - } - elsif ( defined $cmd ) { - ##run it now - print "Formatting the reference genome: $genome_path\n"; - system($cmd); - } - if ($qsub_array) { - if ( !-e "$shellscripts/step_1_not_needed_genome_fasta_already_formatted" ) - { - my $job = "$shellscripts/step_1/step_1.$ref.formatGenome.sh"; - print QSUBARRAY - "STEP1=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\` -echo \$STEP1\n"; - - } - } -} ##end if($mapping) - -##run existing TE blat against ref if the file does not exsit -my $qsub_existingTE_cmd = 0; -my $existing_blat_cmd = -"blat $genome_path $te_path $current_dir/$top_dir/existingTE.blatout 1> $current_dir/$top_dir/existingTE.blat.stdout"; -if ($existing_blat) { - ##if running blat set existing_TE_path to blatout - $existing_TE_path = "$current_dir/$top_dir/existingTE.blatout"; - if ( $parallel - and !-e "$current_dir/$top_dir/existingTE.blatout" ) - { - my $shell_dir = "$shellscripts"; - if ( !-d $shell_dir ) { - mkdir $shell_dir; - } - open OUTSH, ">$shell_dir/step_0.existingTE_blat.sh" - or die "Can't open $shell_dir/step_0.existingTE_blat.sh for writing $!\n"; - print PARALLEL "sh $shell_dir/step_0.existingTE_blat.sh\n" if !$qsub_array; - print OUTSH "$existing_blat_cmd\n"; - if ($qsub_array) { - $qsub_existingTE_cmd = 1; - print QSUBARRAY -"EXISTINGTE=`qsub -e $shellscripts -o $shellscripts $qsub_q $shellscripts/step_0.existingTE_blat.sh` -echo \$EXISTINGTE\n"; - } - close OUTSH; - } - elsif ( !-e "$current_dir/$top_dir/existingTE.blatout" ) { - ## do it now - print "finding TEs ($te_path) in the reference genome ($genome_path)\n"; - system($existing_blat_cmd); - } -} - -my @fq; -my @fa; - -#convert fq files to fa for blat -open QSUBARRAY2, ">$shellscripts/step_2.fq2fa.sh" - if $qsub_array; -my $fq_count = 0; -if ( $fq_dir ne 'SKIP' ) { - foreach my $fq (@fq_files) { - my $fq_path = File::Spec->rel2abs($fq); - push @fq, $fq_path; - my $fa = $fq; - if ( $fa =~ s/\.(fq|fastq)$/.fa/ ) { - push @fa, $fa; - if ( !-e $fa ) { - my $cmd = "$scripts/relocaTE_fq2fa.pl $fq_path $fa"; - if ($parallel) { - my @fq_path = split '/', $fq_path; - my $fq_name = pop @fq_path; - my $shell_dir = "$shellscripts/step_2"; - - mkdir $shell_dir; - my $outsh = "$shell_dir/$fq_count." . "fq2fa.sh"; - open OUTSH, ">$outsh"; - print PARALLEL "sh $outsh\n" if !$qsub_array; - print OUTSH "$cmd\n"; - } - else { - ##run it now - print "Converting $fq_path to fasta for blat\n"; - system($cmd); - } - } - else { - my $shell_dir = "$shellscripts"; - - mkdir $shell_dir; - my $step2_file = - "$shellscripts/step_2_not_needed_fq_already_converted_2_fa"; - - if ($parallel) { - open STEP2, ">$step2_file" or die "Can't Open $step2_file\n"; - print STEP2 ''; - close STEP2; - } - } - } - else { - print -"$fq does not seem to be a fastq based on the file extension. It should be fq or fastq\n"; - &getHelp(); - } - $fq_count++; - } - if ( !-e "$shellscripts/step_2_not_needed_fq_already_converted_2_fa" - and $qsub_array ) - { - my $end = $fq_count - 1; - my $job = "$shellscripts/step_2.fq2fa.sh"; - if ( !@depend ) { - print QSUBARRAY - "STEP2=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\` -echo \$STEP2\n"; - @depend = ( "STEP2", "afterokarray" ); - } - else { - my ( $last_job, $afterok ) = @depend; - @depend = ( "STEP2", "afterokarray" ); - my $jobName = $depend[0]; - print QSUBARRAY -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job` -echo \$$jobName\n"; - } - print QSUBARRAY2 "sh $shellscripts/step_2/\$PBS_ARRAYID.fq2fa.sh"; - } - elsif ($qsub_array) { - unlink "$shellscripts/step_2.fq2fa.sh"; - } -} ##end if $fq_dir ne 'SKIP' -close QSUBARRAY2; - -##split the TE fasta of many seqs into individual files -my @te_fastas; -my %TSD; -open( INFASTA, "$te_fasta" ) || die "$!\n"; -my $i = 0; -while ( my $line = ) { - if ( $line =~ /^>(\S+)\s+TSD=(\S+)/ ) { - my $id = $1; - $TSD{$id} = $2; - if ( $i > 0 ) { - close(OUTFASTA); - $i = 0; - } - my $te_file = "$id.fa"; - $te_file =~ s/\|/_/g; - ##create new dir for files: workingDir/outdir/TE/ - my $te_dir = "$current_dir/$top_dir/$id"; - push @te_fastas, "$te_dir/$te_file"; - - mkdir $te_dir; - open( OUTFASTA, ">$te_dir/$te_file" ) or die "$!\n"; - print OUTFASTA $line; - $i++; - } - elsif ( $line =~ /^>/ and $line !~ /TSD=/ ) { - die -"The TE_fasta:$te_fasta does not have the proper format:\n>TE_NAME TSD=TSD\nSEQUENCE\n"; - } - else { ##should be sequence - print OUTFASTA $line; - } -} -close(INFASTA); -close(OUTFASTA); - -#foreach TE fasta blat against target chromosome and parse and find insertion sites -my $depend = 1 if @depend; -foreach my $te_path (@te_fastas) { - if ($depend) { - @depend = ( "STEP2", "afterokarray" ); - } - else { - @depend = (); - } - my @path = split '/', $te_path; - my $te_fasta = pop @path; - my $path = join '/', @path; - my $TE = $te_fasta; - $TE =~ s/\.fa//; - mkdir "$path/blat_output"; - mkdir "$path/flanking_seq"; - mkdir "$path/te_containing_fq"; - mkdir "$path/te_only_read_portions_fa"; - - #blat fa files against te.fa - my @flanking_fq; - my $fq_file_count = scalar @fq; - - open QSUBARRAY3, ">$shellscripts/step_3.$TE.blat.sh" - if $qsub_array; - open QSUBARRAY4, ">$shellscripts/step_5.$TE.finder.sh" - if $qsub_array; - for ( my $i = 0 ; $i < $fq_file_count ; $i++ ) { - my $fa = $fa[$i]; - my $fq = $fq[$i]; - - #remove and save filename part of path - my @fa_path = split '/', $fa; - my $fa_name = pop @fa_path; - $fa_name =~ s/\.fa$//; - my $shell_dir = "$shellscripts/step_3/$TE"; - if ($parallel) { - make_path( $shell_dir, { mode => 0755 } ); - open OUTSH, ">$shell_dir/$i.$TE.blat.sh" - or die "Can't open $shell_dir/$i.$TE.blat.sh $!\n"; - print PARALLEL "sh $shell_dir/$i.$TE.blat.sh\n" if !$qsub_array; - } - - #use pre-existing blatout files - if ( !-e "$path/blat_output/$fa_name.te_$TE.blatout" ) { - my $cmd = -"blat -minScore=$blat_minScore -tileSize=$blat_tileSize $te_path $fa $path/blat_output/$fa_name.te_$TE.blatout 1>> $path/blat_output/blat.out"; - print OUTSH "$cmd\n" if $parallel; - print "Finding reads in $fa_name that contain sequence of $TE\n" - if !$parallel; - system($cmd) if !$parallel; - } - - #use pre-existing te_containing_fq files - my $te_Containing_fq = - "$path/te_containing_fq/$fa_name.te_$TE.ContainingReads.fq"; - if ( -e $te_Containing_fq ) { - $fq = $te_Containing_fq; - } - my $cmd = -"perl $scripts/relocaTE_trim.pl $path/blat_output/$fa_name.te_$TE.blatout $fq $len_cutoff $mismatch_allowance > $path/flanking_seq/$fa_name.te_$TE.flankingReads.fq"; - if ($parallel) { - print OUTSH "$cmd\n"; - close OUTSH; - chmod 0755, "$shell_dir/*blat.sh"; - } - else { - ##run it now - print "Trimming $fq reads of $TE sequence\n" if !$parallel; - system($cmd) if !$parallel; - } - } - if ($qsub_array) { - my $end = $fq_file_count - 1; - my $job = "$shellscripts/step_3.$TE.blat.sh"; - my $desc = $TE; - $desc =~ s/\W/_/; - if ( !@depend ) { - print QSUBARRAY -"STEP_3_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $job\` -echo \$STEP_3_$desc\n"; - @depend = ( "STEP_3_$desc", "afterokarray" ); - } - else { - my ( $last_job, $afterok ) = @depend; - @depend = ( "STEP_3_$desc", "afterokarray" ); - my $jobName = $depend[0]; - print QSUBARRAY -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job $job` -echo \$$jobName\n"; - - } - print QSUBARRAY3 "sh $shellscripts/step_3/$TE/\$PBS_ARRAYID.$TE.blat.sh"; - } - ##if a genome file was provided, align seqs to genome - ##if no genome file was provided, will only blat and trim reads of te seq - if ($mapping) { - my $param_path = "$current_dir/$top_dir/$TE"; - my $outregex = "$param_path/regex.txt"; - open OUTREGEX, ">$outregex" or die $!; - print OUTREGEX "$mate_file_1\t$mate_file_2\t$mate_file_unpaired\t$TSD{$TE}"; - my $cmd = -"$scripts/relocaTE_align.pl $scripts $param_path $genome_path $outregex $TE $exper $bowtie2"; - if ( !$parallel ) { - ## run now - print "Aligning $TE trimmed reads to the reference ($genome_path)\n"; - system($cmd); - } - else { - my $shell_dir = "$shellscripts/step_4/$TE"; - $genome_path =~ /.+\/(.+)\.(fa|fasta)$/; - my $ref = $1; - - #`mkdir -p $shell_dir`; - make_path( $shell_dir, { mode => 0755 } ); - - #mkdir $shell_dir; - open OUTSH, ">$shell_dir/step_4.$ref.$TE.align.sh"; - print OUTSH "$cmd\n"; - print PARALLEL "sh $shell_dir/step_4.$ref.$TE.align.sh\n" if !$qsub_array; - close OUTSH; - - #`chmod +x $shell_dir/step_4.$ref.$TE.align.sh`; - chmod 0755, "$shell_dir/step_4.$ref.$TE.align.sh"; - if ($qsub_array) { - my $existing_depend = ''; - if ( $qsub_formatGenome_cmd ) { - $existing_depend = "-W depend=afterok:\$STEP1" if !@depend; - $existing_depend = ",depend=afterok:\$STEP1" if @depend; - } - my $job = "$shell_dir/step_4.$ref.$TE.align.sh"; - my $desc = $TE; - $desc =~ s/\W/_/; - if ( !@depend ) { - print QSUBARRAY -"STEP_4_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q $existing_depend $job\` -echo \$STEP_4_$desc\n"; - @depend = ( "STEP_4_$desc", "afterok" ); - } - else { - my ( $last_job, $afterok ) = @depend; - @depend = ( "STEP_4_$desc", "afterok" ); - my $jobName = $depend[0]; - print QSUBARRAY -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job","$existing_depend $job` -echo \$$jobName\n"; - } - } - } - - my $genome_count = 0; - foreach my $seq_id (@genome_seqs) { - $genome_path =~ /.+\/(.+)\.(fa|fasta)$/; - my $ref = $1; - my $merged_bowtie = "$path/bowtie_aln/$ref.$TE.bowtie.out"; - my $cmd = -"$scripts/relocaTE_insertionFinder.pl $merged_bowtie $seq_id $genome_path $TE $outregex $exper $flanking_seq_len $existing_TE_path $mismatch_allowance $bowtie2"; - if ( !$parallel ) { - ##run it now - print "Finding $TE insertions in $seq_id\n"; - system($cmd); - } - else { - my $shell_dir = "$shellscripts/step_5/$TE"; - make_path( $shell_dir, { mode => 0755 } ); - open OUTSH, ">$shell_dir/$genome_count.$TE.findSites.sh"; - print OUTSH "$cmd\n"; - close OUTSH; - print PARALLEL "sh $shell_dir/$genome_count.$TE.findSites.sh\n" - if !$qsub_array; - chmod 0755, "$shell_dir/$genome_count.$TE.findSites.sh"; - } - $genome_count++; - } - if ($qsub_array) { - my $end = $genome_count - 1; - my $job = "$shellscripts/step_5.$TE.finder.sh"; - my $existing_depend = ''; - if ($qsub_existingTE_cmd) { - $existing_depend = "-W depend=afterok:\$EXISTINGTE" if !@depend; - $existing_depend = ":\$EXISTINGTE" if @depend; - } - #if ( $qsub_formatGenome_cmd and $existing_depend eq '' ) { - # $existing_depend = "-W depend=afterok:\$STEP1" if !@depend; - #} - #elsif ( $qsub_formatGenome_cmd and ( $existing_depend ne '' or @depend ) ) - #{ - # $existing_depend .= ":\$STEP1"; - #} - my $desc = $TE; - $desc =~ s/\W/_/; - if ( !@depend ) { - print QSUBARRAY -"STEP_5_$desc=\`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end $existing_depend $job\` -echo \$STEP_5_$desc\n"; - @depend = ( "STEP_5_$desc", "afterokarray" ); - } - else { - my ( $last_job, $afterok ) = @depend; - @depend = ( "STEP_5_$desc", "afterokarray" ); - my $jobName = $depend[0]; - print QSUBARRAY -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -t 0-$end -W depend=$afterok:\$$last_job", - "$existing_depend $job` -echo \$$jobName\n"; - } - print QSUBARRAY4 - "sh $shellscripts/step_5/$TE/\$PBS_ARRAYID.$TE.findSites.sh"; - } - } - if ($qsub_array) { - close QSUBARRAY3; - close QSUBARRAY4; - } -} -## Finished, clean up, cat files -##cat all '.te_insertion_sites.table.txt' results into one file -foreach my $te_path (@te_fastas) { - my @path = split '/', $te_path; - my $te_fasta = pop @path; - my $path = join '/', @path; - my $TE = $te_fasta; - $TE =~ s/\.fa//; - if ($parallel) { - my $shell_dir = "$shellscripts/step_6/$TE"; - make_path( $shell_dir, { mode => 0755 } ); - open FINISH, ">$shellscripts/step_6/$TE/step_6.$TE.finishing.sh"; - print PARALLEL "sh $shellscripts/step_6/$TE/step_6.$TE.finishing.sh\n" - if !$qsub_array; -## toDo: Need to put all of this in a perl script then have the -## finishing shell script execute that perl script - print FINISH " -`mkdir -p $path/results/all_files` - -#combine confident insertions to one file -echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp -for i in \`ls $path/results/*.$TE.confident_nonref_insert.txt\` ; do grep -v flanking_read_count \$i >> $path/results/temp ; done -mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt -mv $path/results/*.$TE.confident_nonref_insert.txt $path/results/all_files - -#combine all insertions to one file -echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2 -for i in \`ls $path/results/*.$TE.all_nonref_insert.txt\` ; do grep -v total \$i | grep -v Note >> $path/results/temp2 ; done -mv $path/results/temp2 $path/results/$exper.$TE.all_nonref.txt -mv $path/results/*.$TE.all_nonref_insert.txt $path/results/all_files - -#combine confident insertions ref seqs to one file -for i in \`ls $path/results/*.$TE.confident_nonref_genomeflank.fa\` ; do cat \$i >> $path/results/temp3 ; done -mv $path/results/temp3 $path/results/$exper.$TE.confident_nonref_genomeflanks.fa -mv $path/results/*.$TE.confident_nonref_genomeflank.fa $path/results/all_files - -#combine confident insertions gff to one file -echo \"##gff-version 3\" > $path/results/temp4 -for i in \`ls $path/results/*.$TE.all_insert.gff\` ; do grep -v gff \$i >> $path/results/temp4 ; done -mv $path/results/temp4 $path/results/$exper.$TE.all_inserts.gff -mv $path/results/*.$TE.all_insert.gff $path/results/all_files - -#combine confident insertions reads to one file -for i in \`ls $path/results/*.$TE.confident_nonref_insert_reads_list.txt\` ; do cat \$i >> $path/results/temp5 ; done -mv $path/results/temp5 $path/results/$exper.$TE.confident_nonref_reads_list.txt -mv $path/results/*.$TE.confident_nonref_insert_reads_list.txt $path/results/all_files - -"; - `chmod +x $shellscripts/step_6/$TE/step_6.$TE.finishing.sh`; - } - if ($qsub_array) { - my $job = "$shellscripts/step_6/$TE/step_6.$TE.finishing.sh"; - my $desc = $TE; - $desc =~ s/\W/_/; - if ( !@depend ) { - my $jobName = "STEP_6_$desc"; - print QSUBARRAY - "$jobName=\`qsub -e $shellscripts -o $shellscripts $qsub_q $job\` -echo \$$jobName\n"; - @depend = ( "STEP_6_$desc", "afterok" ); - } - else { - my ( $last_job, $afterok ) = ( "STEP_5_$desc", "afterokarray" ); - @depend = ( "STEP_6_$desc", "afterok" ); - my $jobName = $depend[0]; - print QSUBARRAY -"$jobName=`qsub -e $shellscripts -o $shellscripts $qsub_q -W depend=$afterok:\$$last_job $job` -echo \$$jobName\n"; - } - } - if ( !$parallel and !$qsub_array ) { - ##do it now - ##combine and delete individual chr files for confident sites - print "Finishing and cleaning up\n"; -`echo \"TE\tTSD\tEper\tchromosome\tinsertion_site\tstrand\tleft_flanking_read_count\tright_flanking_read_count\tleft_flanking_seq\tright_flanking_seq\tTE_orientation\" > $path/results/temp`; - my @files = `ls $path/results/*.$TE.confident_nonref_insert.txt`; - foreach my $file (@files) { - chomp $file; - `grep -v flanking_read_count $file >> $path/results/temp`; - unlink $file; - } - `mv $path/results/temp $path/results/$exper.$TE.confident_nonref.txt`; - - ##combine and delete individual chr files for all sites -`echo \"TE\tTSD\tExper\tchromosome\tinsertion_site\tstrand\tcombined_read_count\tright_flanking_read_count\tleft_flanking_read_count\" > $path/results/temp2`; - @files = `ls $path/results/*.$TE.all_nonref_insert.txt`; - foreach my $file (@files) { - chomp $file; - `grep -v total $file | grep -v Note >> $path/results/temp2`; - unlink $file; - } - `mv $path/results/temp2 $path/results/$exper.$TE.all_nonref.txt`; - - ##combine and delete individual chr fasta files - @files = `ls $path/results/*.$TE.confident_nonref_genomeflank.fa`; - foreach my $file (@files) { - chomp $file; - `cat $file >> $path/results/temp3`; - unlink $file; - } -`mv $path/results/temp3 $path/results/$exper.$TE.confident_nonref_genomeflanks.fa`; - - ##combine and delete individual chr gff files - `echo \"##gff-version 3\" > $path/results/temp4`; - @files = `ls $path/results/*.$TE.all_insert.gff`; - foreach my $file (@files) { - chomp $file; - `grep -v gff $file >> $path/results/temp4`; - unlink $file; - } - `mv $path/results/temp4 $path/results/$exper.$TE.all_inserts.gff`; - - ##combine and delete individual chr reads list - @files = `ls $path/results/*.$TE.confident_nonref_insert_reads_list.txt`; - foreach my $file (@files) { - chomp $file; - `cat $file >> $path/results/temp5`; - unlink $file; - } -`mv $path/results/temp5 $path/results/$exper.$TE.confident_nonref_reads_list.txt`; - print "$TE results are found in $path/results\n"; - } - close FINISH; -} - -if ($qsub_array) { - close QSUBARRAY; -## this would happen before IO was finished on the file - # system ("qsub $qsub_q $current_dir/$top_dir/run_these_jobs.sh"); - print "$current_dir/$top_dir/run_these_jobs.sh was created --- Run this script, \'sh $current_dir/$top_dir/run_these_jobs.sh\' --- This script will submit all jobs to the queue in the appropriate order. --- Be sure to check the error files in $current_dir/$top_dir/shellscripts. They should all be file size 0. -\n"; -} -elsif ($parallel) { - - #system (sort "$current_dir/$top_dir/run_these_jobs.sh"); - print - "Run each command line statement in $current_dir/$top_dir/run_these_jobs.sh. ---Run these in order (step_1,step_2,step_3, so on) for each TE. ---For example, all the step_3 scripts for a specific TE should be successfully completed (finished without errors) -before running a step_4 script of the same TE. ---All scripts of the same step can be run in parallel (at the same time). -\n"; -} diff --git a/scripts/relocaTE_insertionFinder.pl b/scripts/relocaTE_insertionFinder.pl index ce3b46e..8d6381d 100755 --- a/scripts/relocaTE_insertionFinder.pl +++ b/scripts/relocaTE_insertionFinder.pl @@ -444,3 +444,4 @@ sub TSD_check { } } } +