add util/plot_distribution.py, and some changes

shenwei356 · Dec 3, 2014 · e5b0fec · e5b0fec
1 parent 4172c3e
commit e5b0fec
Show file tree

Hide file tree

Showing 17 changed files with 264 additions and 93 deletions.
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ See more utilitis in BioUtil module from [CPAN](http://search.cpan.org/search?qu
 
 -------
 
-Copyright (c) 2013, Wei Shen (shenwei356@gmail.com)
+Copyright (c) 2014, Wei Shen (shenwei356@gmail.com)
 
 
 [MIT License](https://github.com/shenwei356/bio_scripts/blob/master/LICENSE)
diff --git a/blast/.directory b/blast/.directory
diff --git a/enzyme/.directory b/enzyme/.directory
diff --git a/for_education/.directory b/for_education/.directory
diff --git a/sequence/fasta_seq_gc_content_plot.py → not_used/fasta_seq_gc_content_plot.py b/sequence/fasta_seq_gc_content_plot.py → not_used/fasta_seq_gc_content_plot.py
diff --git a/sequence/fasta_seq_length_plot.py → not_used/fasta_seq_length_plot.py b/sequence/fasta_seq_length_plot.py → not_used/fasta_seq_length_plot.py
diff --git a/protein/.directory b/protein/.directory
diff --git a/protein/protein_batch_compute_pI.pl b/protein/protein_batch_compute_pI.pl
@@ -7,15 +7,7 @@
 # Update  : 2014-07-29
 
 use strict;
-
-# try to use BioUtil::Seq
-if ( eval { require BioUtil::Seq; 1; } ne 1 ) {
-    die "\nPlease install BioUtil::Seq by CPAN:\n"
-        . "  cpan install BioUtil\n\n";
-}
-else {
-    BioUtil::Seq->import();
-}
+use BioUtil::Seq;
 
 my $usage = <<"USAGE";
 

diff --git a/sequence/.directory b/sequence/.directory
diff --git a/sequence/fasta2tab b/sequence/fasta2tab
@@ -4,6 +4,7 @@
 use strict;
 use Getopt::Long;
 use BioUtil::Seq;
+use BioUtil::Util;
 
 my $usage = q(
 fasta2tab - transfrom the fasta fromat to two-column table
@@ -19,8 +20,10 @@ Options:
     -lc, --lowercase           Lowercase
     -uc, --uppercase           Uppercase
 
-    -l,  --length              Ouput sequence length in another column
-    -gc, --gc                  Ouput GC content in another column
+    -l,  --length              Output sequence length in another column
+    -l2, --length2             Output number of Latin letters in the sequence
+                               in another column
+    -gc, --gc                  Output GC content in another column
 
     -h,  --help                Show this help information
 
@@ -33,7 +36,10 @@ Examples:
     2. extract sequence longer than 1000 bp
        cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
 
-    3. reverse complement sequence, uppercase, and trim gaps
+    3. extract aligned sequences whose original sequences are longer than 1000 bp
+       cat seq.fa | fasta2tab   -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
+
+    4. reverse complement sequence, uppercase, and trim gaps
        zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta
 
 This script is usually used in pair with tab2fasta.
@@ -53,23 +59,15 @@ GetOptions(
     'lowercase|lc' => \$$para{lc},
     'uppercase|uc' => \$$para{uc},
 
-    'length|l' => \$$para{len},
-    'gc'       => \$$para{gc},
+    'length|l'   => \$$para{len},
+    'length2|l2' => \$$para{len2},
+    'gc'         => \$$para{gc},
 
 ) or die $usage;
 
 die $usage if $$para{help};
 
-# get the file list
-my @files = ();
-for my $file (@ARGV) {
-    for my $f ( glob $file ) {
-        push @files, $f;
-    }
-}
-if ( @files == 0 ) {
-    push @files, 'STDIN';
-}
+my @files = file_list_from_argv(@ARGV);
 
 for my $file (@files) {
     my $next_seq = FastaReader($file);
@@ -101,6 +99,15 @@ for my $file (@files) {
 
         print "$header\t$seq";
         print "\t", length $seq if $$para{len};
+        if ($$para{len2}){
+            if ($$para{trim}){
+                print "\t", length $seq;
+            } else {
+                my $seq2 = $seq;
+                $seq2 =~ s/[^a-zA-Z]+//g;
+                print "\t", length $seq2;
+            }
+        }
         print "\t", base_content( 'gc', $seq ) if $$para{gc};
         print "\n";
     }

diff --git a/sequence/fasta_extract_by_pattern.pl b/sequence/fasta_extract_by_pattern.pl
@@ -59,15 +59,7 @@
 die "no patterns given. Type \"$0 -h\" for help.\n" if @patterns == 0;
 
 # get the file list
-my @files = ();
-for my $file (@ARGV) {
-    for my $f ( glob $file ) {
-        push @files, $f;
-    }
-}
-if ( @files == 0 ) {
-    push @files, 'STDIN';
-}
+my @files = file_list_from_argv(@ARGV);
 
 # patterns_map for rapid matching with full pattern
 my %patterns_map = ();

diff --git a/sequence/fasta_extract_randomly.pl b/sequence/fasta_extract_randomly.pl
@@ -5,6 +5,7 @@
 
 use File::Basename;
 use BioUtil::Seq;
+use BioUtil::Util;
 
 $0 = basename($0);
 my $usage = <<USAGE;
@@ -31,16 +32,7 @@
 
 srand();
 
-my @files = ();
-
-for my $file (@ARGV) {
-    for my $f ( glob $file ) {
-        push @files, $f;
-    }
-}
-if ( @files == 0 ) {
-    push @files, 'STDIN';
-}
+my @files = file_list_from_argv(@ARGV);
 
 my $n = 0;
 for my $file (@files) {

diff --git a/sequence/fasta_format.pl b/sequence/fasta_format.pl
diff --git a/sequence/tab2fasta b/sequence/tab2fasta
@@ -3,7 +3,7 @@
 
 use strict;
 use Getopt::Long;
-use BioUtil::Seq;
+use BioUtil::Util;
 
 my $usage = q(
 tab2fasta - transfrom column table to fasta fromat
@@ -28,16 +28,7 @@ GetOptions(
 
 die $usage if $$para{help};
 
-# get the file list
-my @files = ();
-for my $file (@ARGV) {
-    for my $f ( glob $file ) {
-        push @files, $f;
-    }
-}
-if ( @files == 0 ) {
-    push @files, 'STDIN';
-}
+my @files = file_list_from_argv(@ARGV);
 
 for my $file (@files) {
     my $fh = undef;