-
Notifications
You must be signed in to change notification settings - Fork 28
/
pairwise.py
108 lines (91 loc) · 4.16 KB
/
pairwise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Copyright 2020 Ryan Wick (rrwick@gmail.com)
https://github.com/rrwick/Trycycler
This file is part of Trycycler. Trycycler is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version. Trycycler is distributed
in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License along with Trycycler.
If not, see <http://www.gnu.org/licenses/>.
"""
import edlib
import re
from .log import log, section_header, explanation
def get_pairwise_alignments(seqs):
section_header('Pairwise global alignments')
explanation('Trycycler uses the edlib aligner to get global alignments between all pairs of '
'sequences. This can help you to spot any problematic sequences that should be '
'excluded before continuing. If you see any sequences with notably worse '
'identities, you can remove them (delete the contig\'s FASTA) and run this '
'command again.')
seq_names = list(seqs.keys())
max_seq_name_len = max(len(x) for x in seq_names)
pairwise_cigars, percent_identities, worst_1kbp_identities = {}, {}, {}
for i, a in enumerate(seq_names):
seq_a = seqs[a]
for j in range(i+1, len(seq_names)):
b = seq_names[j]
seq_b = seqs[b]
log(' ' * (max_seq_name_len - len(a)) + a, end='')
log(' vs ', end='')
log(b + '...' + ' ' * (max_seq_name_len - len(b)), end=' ')
result = edlib.align(seq_a, seq_b, mode='NW', task='path')
cigar = result['cigar']
percent_identity, worst_1kbp = identity_and_worst_1kbp_from_cigar(cigar)
log(f'{percent_identity:.3f}% overall identity, '
f'{worst_1kbp:.1f}% worst-1kbp identity')
pairwise_cigars[(a, b)] = cigar
percent_identities[(a, b)] = percent_identity
percent_identities[(b, a)] = percent_identity
worst_1kbp_identities[(a, b)] = worst_1kbp
worst_1kbp_identities[(b, a)] = worst_1kbp
log()
return pairwise_cigars, percent_identities, worst_1kbp_identities
def identity_and_max_indel_from_cigar(cigar):
cigar_parts = re.findall(r'\d+[IDX=]', cigar)
total, matches, max_indel = 0, 0, 0
for p in cigar_parts:
size = int(p[:-1])
letter = p[-1]
total += size
if letter == '=':
matches += size
if letter == 'I' or letter == 'D':
max_indel = max(max_indel, size)
percent_identity = 100.0 * matches / total
return percent_identity, max_indel
def identity_and_worst_1kbp_from_cigar(cigar):
percent_identity, _ = identity_and_max_indel_from_cigar(cigar)
expanded_cigar = get_expanded_cigar(cigar)
worst_1kbp = worst_window_identity(expanded_cigar, 1000)
return percent_identity, worst_1kbp
def get_expanded_cigar(cigar):
expanded_cigar = []
cigar_parts = re.findall(r'\d+[IDX=]', cigar)
for p in cigar_parts:
size = int(p[:-1])
letter = p[-1]
expanded_cigar.append(letter * size)
return ''.join(expanded_cigar)
def worst_window_identity(expanded_cigar, window_size):
"""
This function returns the worst percent identity in a sliding window across an expanded CIGAR
string.
"""
cigar_len = len(expanded_cigar)
if cigar_len <= window_size:
return 100.0 * expanded_cigar.count('=') / cigar_len
start, end = 0, window_size
window_match_count = expanded_cigar[start:end].count('=')
min_match_count = window_match_count
while end < cigar_len:
if expanded_cigar[start] == '=':
window_match_count -= 1
if expanded_cigar[end] == '=':
window_match_count += 1
if window_match_count < min_match_count:
min_match_count = window_match_count
start += 1
end += 1
return 100.0 * min_match_count / window_size