From 1b99dd81b25f3661f54a736e152b185bb3d00f7f Mon Sep 17 00:00:00 2001 From: Daniel Esteban Palma Igor Date: Tue, 1 Aug 2023 21:22:03 -0400 Subject: [PATCH] Parallelization of get_pairwise_alignments --- trycycler/pairwise.py | 30 ++++++++++++++++++++---------- trycycler/reconcile.py | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/trycycler/pairwise.py b/trycycler/pairwise.py index 896c416..fbc5fbc 100644 --- a/trycycler/pairwise.py +++ b/trycycler/pairwise.py @@ -11,13 +11,20 @@ If not, see . """ +from concurrent.futures import as_completed, ProcessPoolExecutor import edlib import re from .log import log, section_header, explanation -def get_pairwise_alignments(seqs): +def align_sequences(seq_a, seq_b): + result = edlib.align(seq_a, seq_b, mode='NW', task='path') + cigar = result['cigar'] + percent_identity, worst_1kbp = identity_and_worst_1kbp_from_cigar(cigar) + return cigar, percent_identity, worst_1kbp + +def get_pairwise_alignments(seqs, threads=1): section_header('Pairwise global alignments') explanation('Trycycler uses the edlib aligner to get global alignments between all pairs of ' 'sequences. This can help you to spot any problematic sequences that should be ' @@ -28,18 +35,21 @@ def get_pairwise_alignments(seqs): max_seq_name_len = max(len(x) for x in seq_names) pairwise_cigars, percent_identities, worst_1kbp_identities = {}, {}, {} - for i, a in enumerate(seq_names): - seq_a = seqs[a] - for j in range(i+1, len(seq_names)): - b = seq_names[j] - seq_b = seqs[b] + with ProcessPoolExecutor(max_workers=threads) as executor: + futures = {} + for i, a in enumerate(seq_names): + seq_a = seqs[a] + for j in range(i+1, len(seq_names)): + b = seq_names[j] + seq_b = seqs[b] + future = executor.submit(align_sequences, seq_a, seq_b) + futures[future] = (a, b) + for future in as_completed(futures): + a, b = futures[future] + cigar, percent_identity, worst_1kbp = future.result() log(' ' * (max_seq_name_len - len(a)) + a, end='') log(' vs ', end='') log(b + '...' + ' ' * (max_seq_name_len - len(b)), end=' ') - - result = edlib.align(seq_a, seq_b, mode='NW', task='path') - cigar = result['cigar'] - percent_identity, worst_1kbp = identity_and_worst_1kbp_from_cigar(cigar) log(f'{percent_identity:.3f}% overall identity, ' f'{worst_1kbp:.1f}% worst-1kbp identity') diff --git a/trycycler/reconcile.py b/trycycler/reconcile.py index 207be78..a7ebdc3 100644 --- a/trycycler/reconcile.py +++ b/trycycler/reconcile.py @@ -36,7 +36,7 @@ def reconcile(args): seqs = circularise(seqs, args) seqs, starting_seq = get_starting_seq(seqs, args.threads) seqs = rotate_to_starting_seq(seqs, starting_seq) - pairwise_cigars, percent_identities, worst_1kbp_identities = get_pairwise_alignments(seqs) + pairwise_cigars, percent_identities, worst_1kbp_identities = get_pairwise_alignments(seqs, args.threads) print_identity_matrix(seqs, percent_identities, args.min_identity) print_worst_1kbp_matrix(seqs, worst_1kbp_identities, args.min_1kbp_identity) finished_message()