-
Notifications
You must be signed in to change notification settings - Fork 5
/
align2profile_qc_full.pl
executable file
·320 lines (305 loc) · 11.5 KB
/
align2profile_qc_full.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#!/usr/bin/perl -w
#align2profile_qc_full.pl - alignment cleanup (columns and rows - SLOW)
#Copyright (C) 2011 Thomas J. Sharpton
#author contact: thomas.sharpton@gladstone.ucsf.edu
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program (see LICENSE.txt). If not, see
#<http://www.gnu.org/licenses/>.
use strict;
use Bio::AlignIO;
use Getopt::Long;
use Bio::Align::Utilities qw(:all);
#CONSIDERATIONS/RESTRICTIONS
#Currently, all thresholds are inclusive (if you equal threshold, you stay)
#For all gap based cutoffs, only gaps relative to the profile are considered (-). "." are ignored for cutoff purposes.
#Currently, score/p-vals can only be parsed from INFERNAL output. Will update as needed.
#Alignment file can only contain a single alignment
#Currently, alignment masking is the excition of columns, so not ideal for protein space (should replace with N|?)
#Var initializatoin
my ($input, $output, $seq_scores_tab, $lookup_list, $flat);
my ($seq_len_cut, $aln_seq_score_cut, $aln_seq_p_cut, $aln_seq_score_rat, $gap_ratio_cut, $mm_ratio_cut, $pcnt_cov_cut, $ngaps_internal_cut, $gappy_col_cut);
my ($gap_window, $mm_window, $aln_format);
#Set a few default settings
$seq_len_cut = 0;
$aln_seq_score_cut = 0.0;
$aln_seq_p_cut = 0.0;
$aln_seq_score_rat = 1.0;
$pcnt_cov_cut = 15;
$ngaps_internal_cut = 50;
$gap_ratio_cut = 50; #Currently not configured
$mm_ratio_cut = 50; #Currently not configured
$mm_window = 50; #IN BP/AA POSITIONS
$gappy_col_cut = 0.75;
$gap_window = $mm_window;
$aln_format = "fasta";
#Command line parameter space settings
GetOptions(
"i=s" => \$input, #Path to alignment object(s)
"o=s" => \$output, #Output file path
"s:s" => \$seq_scores_tab, #Formatted for INFERNAL verbose output
"l:s" => \$lookup_list, #Path to list of sequence ids to prune from alignment
"lc:i" => \$seq_len_cut, #Minimum sequence length cutoff
"sc:i" => \$aln_seq_score_cut, #Minimum alignment score for each sequence cutoff
"sr:i" => \$aln_seq_score_rat, #Minimum bit score per base in sequence cutoff
"pc:i" => \$aln_seq_p_cut, #Minimum p-val/e-val for each sequence cutoff
"grc:i" => \$gap_ratio_cut, #Minimum number of gaps/window for each sequence cutoff "mrc=i" => \$mm_ratio_cut, #Minimum number of mismatches/window per sequence cutoff
"cov:i" => \$pcnt_cov_cut, #Percentage of sequence positions that must be covered by model/concensus (minimum)
"ngc:i" => \$ngaps_internal_cut, #Maximum number of sequence bounded gaps tolerated per sequence (TOTAL GAPS)
"gcc:i" => \$gappy_col_cut, #Maximum number of gaps in a column divided by sequences in an alignment
"mmw:i" => \$mm_window, #Size of mismatch window (Default = 50)
"gpw:i" => \$gap_window, #Size of gap window (Default = mm_window)
"f:s" => \$aln_format, #Alignment format (Default = FASTA)
"flat" => \$flat, #use flat to keep coords out of headers
);
print "processing $input with format $aln_format and referencing $seq_scores_tab, will print output to $output\n";
#You're not a bonehead, right?
if(!$input || !$output){
die "You must specific an input and output file!\n"
}
if(!$seq_scores_tab && ( $aln_seq_score_cut || $aln_seq_p_cut) ) {
warn "You didn't specify a place to obtain sequence score or pvalue, so the 'sc' and 'pc' cutoffs you set will be ignored!\n"
}
#Initialize the framework
my $in_aln = Bio::AlignIO->new(-file => "$input", -format => "$aln_format");
my $out_aln = Bio::AlignIO->new(-file => ">$output", -format => "$aln_format");
my %seq_scores = ();
if (defined($seq_scores_tab)){
%seq_scores = %{ process_seq_score_tab($seq_scores_tab) };
}
my %init_prunes = ();
if(defined($lookup_list)){
%init_prunes = %{ process_lookup_list($lookup_list) };
}
#Let's loop through the alignment and look at each sequence, one at a time. Prune out the bad ones!
while( my $aln = $in_aln->next_aln() ){
#make sure all sequences are the same length
if( !$aln->is_flush() ){
warn "Alignment isn't flush, passing!\n";
next;
}
my $nseqs = $aln->num_sequences();
my $alnlen = $aln->length();
#build a sequence, gap, insert map of the alignment
#my %pre_alignment_map = %{ build_alignment_map( $aln ) };
foreach my $seq ($aln->each_seq) {
my $to_prune = 0;
my @reasons = ();
my $id = $seq->display_id();
my $sequence = $seq->seq();
#sequence length shouldn't count gaps/inserts, so copy $sequence and count seq length
my $seq_seq = $sequence;
$seq_seq =~ s/\-//g;
$seq_seq =~ s/\.//g;
my $seqlen = length( $seq_seq );
my $residues = $sequence;
$residues =~ s/(\-|\.)|//g;
my $n_residues = length($residues);
# my @seqarray = split("", $sequence);
# for(my $i=0; $i < $alnlen; $i++){
# $pre_alignment_map{ $i+1 }{ $seqarray[$i] }++;
# }
#SEQUENCE IDS A LOOKUP FILE SAYS TO PASS ON
foreach my $prune_id ( keys( %init_prunes ) ){
if($id =~ m/$prune_id/){
$to_prune = 1;
last;
}
}
#SEQUENCE LENGTH
if($seqlen < $seq_len_cut){
$to_prune = 1;
}
#INTERNAL GAPS
if(defined($ngaps_internal_cut)){
my $cpseq = $sequence;
#only want internal gaps (bounded by sequence) so let's remove leading and lagging
$cpseq =~ s/^(\-|\.)+//;
$cpseq =~ s/(\-|\.)+$//;
#now count the total number of gaps
my $ngaps = $cpseq =~ tr/\-/\-/;
if ($ngaps > $ngaps_internal_cut){
$to_prune = 1;
}
}
#SCORE AND P-VAL BASED CUTOFF PROCESING
if(defined($seq_scores_tab)){
if(defined($seq_scores{$id})){ #IF NOT, ASSUME IT'S REFERENCE SEQUENCE USED TO BUILD MODEL INCLUDED IN ALIGNMENT (MEANS THAT IT'S A GOOD READ)
if(defined($aln_seq_score_cut) && $seq_scores{$id}{"score"} < $aln_seq_score_cut){
$to_prune = 1;
}
if(defined($aln_seq_p_cut) && $seq_scores{$id}{"pval"} < $aln_seq_p_cut){
$to_prune = 1;
}
if(defined($aln_seq_score_rat) ){
my $score_per_residue = $seq_scores{$id}{"score"} / $n_residues;
if( $score_per_residue < $aln_seq_score_rat ){
$to_prune = 1;
}
}
}
}
#DO THE PRUNING OF THE SEQUENCES HERE
if($to_prune){
#print "Pruning $id\n";
$aln->remove_seq($seq);
}
}
#%pre_alignment_map = ();
#POST PRUNING ALIGNMENT CLEANUP/MASKING
#Now iterate through the sequences that passed and remove bad columns from the alignment.
#make sure all sequences are the same length
if( !$aln->is_flush() ){
warn "Alignment is no longer flush, passing!\n";
next;
}
my $post_nseqs = $aln->num_sequences();
my $postalnlen = $aln->length();
#Build a processed alignment map
my %post_alignment_map = %{ build_alignment_map( $aln ) };
#purge gappy columns using %post_alignment_map
my @gappy_cols = ();
foreach my $pos( sort { $a <=> $b } ( keys( %post_alignment_map ) ) ){
if(defined($post_alignment_map{$pos}{"."}) && $post_alignment_map{$pos}{"."}/$post_nseqs > $gappy_col_cut){
#print "$pos\t$post_alignment_map{$pos}{'.'}\n";
push(@gappy_cols, $pos);
}
}
my $n_gappy_cols = @gappy_cols;
my $gap_start = $gappy_cols[0]; #Keep all start and stop vars in aln coordinate space (1 based)
my $gap_stop = 1;
my $aln_start = 1;
my $aln_stop = $postalnlen;
my @aln_slices = ();
$aln->map_chars('-', 'N'); #Must mask gaps for splicing to work later
for ( my $n = 0; $n < $n_gappy_cols; $n++ ){
my $slice;
if ( exists( $gappy_cols[ $n + 1] ) && $gappy_cols[ $n ] + 1 == $gappy_cols[ $n + 1 ] ){
#then monotonic increase, move on
next;
}
else{
#check to see if gaps start at beginning of aln, if so, reset gap_start and move on
if( $aln_start == $gap_start ){
$gap_start = $gappy_cols[ $n + 1 ];
$gap_stop = $gappy_cols[ $n ];
next;
}
#deal with the lefthand-most aln block
elsif( !@aln_slices ){
$slice = $aln->slice( $gap_stop, $gap_start - 1 );
push( @aln_slices, $slice );
$gap_start = $gappy_cols[ $n + 1 ];
$gap_stop = $gappy_cols[ $n ];
}
#deal with the righthand-most aln block
elsif( $n+1 == $n_gappy_cols && $gappy_cols[$n] < $aln_stop ){
#need to process the aln blocks on both sides of the gappy column block
#get the left side:
$slice = $aln->slice( $gap_stop + 1, $gap_start - 1 );
push( @aln_slices, $slice );
$gap_stop = $gappy_cols[$n];
#get the right side:
$slice = $aln->slice( $gap_stop + 1, $aln_stop );
push( @aln_slices, $slice );
}
else{
$slice = $aln->slice( $gap_stop + 1, $gap_start - 1);
push( @aln_slices, $slice );
$gap_start = $gappy_cols[ $n + 1];
$gap_stop = $gappy_cols[$n];
}
}
}
%post_alignment_map=();
#now stitch alignment together
my $clean_aln = cat(@aln_slices);
#map back to gap chars
$aln->map_chars('N','-');
$clean_aln->map_chars('N','-');
#Build final alignment map
my %final_alignment_map = %{ build_alignment_map( $clean_aln ) };
produce_aln_profile_data( \%final_alignment_map );
if( $flat ){
$clean_aln->set_displayname_flat();
}
$out_aln->write_aln($clean_aln);
}
####################################
#SUBROUTINES
####################################
sub produce_aln_profile_data{
my %alignment_map = %{ $_[0] };
foreach my $pos ( sort { $a <=> $b } ( keys ( %alignment_map ) ) ){
my $coverage = 0;
foreach my $char ( keys( %{ $alignment_map{ $pos } } ) ){
if ($char eq "-" || $char eq "."){
next;
}
$coverage = $coverage + $alignment_map{ $pos }{ $char };
}
print "$pos\t$coverage\n";
}
}
sub build_alignment_map{
my $aln = shift; #An AlignI object;
my %alignment_map = ();
my $alnlen = $aln->length();
foreach my $seq ($aln->each_seq) {
my $sequence = $seq->seq();
my @seqarray = split("", $sequence);
for(my $i=0; $i < $alnlen; $i++){
$alignment_map{ $i+1 }{ $seqarray[$i] }++;
}
}
return \%alignment_map
}
sub process_lookup_list{
my $file = shift;
my %init_prunes = ();
open(IN, $file) || die "can't open $file for score parsing: $!\n";
while(<IN>){
chomp $_;
my $id = $_;
$init_prunes{$id}{"count"}++;
if($init_prunes{$id}{"count"} > 1){
warn "Found lookup list id $_ inside lookup list file more than once!\n"
}
}
return \%init_prunes;
}
sub process_seq_score_tab{
my $file = shift;
open(SCORES, $file) || die "can't open $file for score parsing: $!\n";
my %seq_scores = ();
while(<SCORES>){
chomp $_;
if($_ =~ m/^\#/ || $_ =~ m/^$/ ){
next;
}
my @data = split(" ", $_);
my $read = $data[1];
my $score = $data[3];
my $pval = $data[5];
$seq_scores{$read}{"count"}++;
if($seq_scores{$read}{"count"} > 1){
warn "Sequence $read has more than one set of results. Overwriting previous results which are\n\tscore\t" .
$seq_scores{$read}{"score"} . "\n\tpval\t" .
$seq_scores{$read}{"pval"} . "\n";
}
$seq_scores{$read}{"score"} = $score;
$seq_scores{$read}{"pval"} = $pval;
}
return \%seq_scores;
close SCORES;
}