/
make_sliding_fasta_tiles
executable file
·110 lines (85 loc) · 2.31 KB
/
make_sliding_fasta_tiles
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/perl
use strict;
use Getopt::Long;
use warnings;

# Tiling parameters. The values assigned here are the defaults; they can be
# overridden with the -w, -o and -s command line options.
my $window_size = 400;
my $offset = 50;
my $print_usage = 0;
my $print_window_start = 0;

# Help text. The default values above are interpolated into it, so it has to
# be built before the options are parsed.
my $usage = <<USAGE;
This script takes sequences in a fasta file and converts them into sub-sequences
of specified length and offset (offset of starting positions).
The subsequences will be named like the original sequences with _N appended,
where _N is the tile number.
Prints to stdout.
Mark Stenglein Dec 18, 2013
usage: make_sliding_fasta_tiles [-h] [-w tile_size] [-o offset] fasta_file
[-h] print this message
[-w tile_size] specify size of subsequences (default = $window_size)
[-o offset] specify offset between subsequence start points (default = $offset)
[-s] print tile start position as fasta header instead of usual header (original_name_tile_#)
USAGE

# Parse the command line. GetOptions returns false (after warning on stderr)
# when it meets an unknown or malformed option; stop instead of silently
# running with the defaults.
unless (GetOptions ( "h" => \$print_usage,
                     "w=i" => \$window_size,
                     "s" => \$print_window_start,
                     "o=i" => \$offset))
{
   print STDERR $usage;
   exit 1;
}

# An explicit request for help wins over everything else.
if ($print_usage)
{
   print $usage;
   exit;
}

# No file arguments and nothing piped in: there is no input to read.
# This is checked after option parsing so that an options-only invocation
# at a terminal prints the usage instead of waiting on the keyboard.
if ( -t STDIN and not @ARGV)
{
   print $usage;
   exit;
}

# Reject values that make no sense. Errors go to stderr with a non-zero exit
# status, because stdout carries the FASTA output.
if ($window_size <= 0)
{
   print STDERR "error: invalid tile size: $window_size\n";
   exit 1;
}
if ($offset <= 0)
{
   print STDERR "error: invalid offset: $offset\n";
   exit 1;
}

# Read the FASTA input (named files or stdin), collecting each record's
# sequence lines into one string and tiling the record once it is complete.
my $header = '';
my $seq = '';
while (my $line = <>)
{
   # Strip the line terminator, including the \r left behind by CRLF files.
   $line =~ s/[\r\n]+\z//;

   # A header line starts with '>'. The match is anchored so that a '>'
   # elsewhere in a line is not mistaken for the start of a new record.
   if ($line =~ /^>(.*)/)
   {
      # Flush the record that just ended; records without sequence are skipped.
      if (length $seq)
      {
         output_seq_tiles($header, $seq);
      }
      $header = $1;
      $seq = '';
      next;
   }

   $seq .= $line;
}

# last record
if (length $seq)
{
   output_seq_tiles($header, $seq);
}
# Print the sliding tiles of one sequence to stdout as FASTA records.
#
# Arguments:
#   $header - name of the source sequence (the text after '>' in the input)
#   $seq    - the complete sequence as a single string
#
# Uses the file-level settings $window_size (tile length), $offset (distance
# between consecutive tile start points) and $print_window_start.
#
# Each tile is written as ">header_N" followed by the tile sequence, where N
# is the 1-based tile number, or the 1-based start position of the tile when
# $print_window_start is set. The last tile is shortened when a full window
# no longer fits before the end of the sequence.
sub output_seq_tiles
{
   my ($header, $seq) = @_;

   # Nothing to tile.
   return unless defined $seq and length $seq;

   my $seq_length = length $seq;
   my $tile_number = 1;

   for (my $start = 0; $start < $seq_length; $start += $offset)
   {
      # substr() stops at the end of the string, so a tile that would run
      # past the end of the sequence is truncated automatically.
      my $sub_seq = substr($seq, $start, $window_size);

      my $suffix = $print_window_start ? $start + 1 : $tile_number;
      print ">${header}_$suffix\n";
      print "$sub_seq\n";

      # Stop as soon as a tile reaches the end of the sequence. The test is
      # >= rather than >: a tile ending exactly on the last base already
      # covers the rest of the sequence, and continuing would only emit a
      # shorter tile contained entirely within it.
      last if $start + $window_size >= $seq_length;

      $tile_number++;
   }
}