-
Notifications
You must be signed in to change notification settings - Fork 2
/
alignment.smk
194 lines (156 loc) · 6.32 KB
/
alignment.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def get_orthologs_path(wildcards):
orthologs_checkpoint = checkpoints.min_seq_filter_orthofisher if config.get('use_orthofisher', USE_ORTHOFISHER_DEFAULT) else checkpoints.orthofinder_all
orthologs_path = orthologs_checkpoint.get(**wildcards).output[0]
return orthologs_path
def get_alignment_inputs(wildcards):
"""
Gets the results of one of the orthologs modules to pass to the alignment.
"""
return f"{get_orthologs_path(wildcards)}/{{og}}.fa"
rule mafft:
"""
Aligns the protein (amino acid) ortholog with MAFFT.
"""
input:
get_alignment_inputs
output:
temp("results/alignment/aligned_proteins/{og}.protein.alignment.fa")
# bibs:
# "../bibs/mafft7.bib"
log:
LOG_DIR / "alignment/mafft/mafft-{og}.log"
threads:
calculate_filesize_threads
conda:
ENV_DIR / "mafft.yaml"
shell:
"""
{{ mafft --thread {threads} --auto {input} > {output} ; }} 2>&1 | tee {log}
"""
rule get_cds_seq:
"""
This rule creates an unaligned mfasta file of the corresponding nucleotide sequences.
Locates the original CDSs so that the aligned (amino acid) sequences can be translated back.
"""
input:
cds_dir=Path(rules.rename_sequences.output[0]).parent,
alignment=rules.mafft.output
output:
temp("results/alignment/seqs_cds/{og}.cds.seqs.fa")
# bibs:
# "../bibs/biopython.bib"
conda:
ENV_DIR / "biopython.yaml"
shell:
"""
python {SCRIPT_DIR}/get_cds_seq.py --cds-dir {input.cds_dir} --alignment {input.alignment} --output-file {output}
"""
# "python {SCRIPT_DIR}/get_cds_seq.py --cds-dir {input.cds_dir} --alignment {input.alignment} --output-file {output} 2>&1 | tee {log}"
rule taxon_only:
"""
Trim sequence IDs to taxon.
At the end, the sequence IDs need to be trimmed down to contain just the taxon identifier
and produce clean output for the next stages.
"""
input:
rules.mafft.output
output:
"results/alignment/taxon_only/{og}.taxon_only.protein.alignment.fa"
conda:
ENV_DIR / "typer.yaml"
shell:
"python {SCRIPT_DIR}/taxon_only.py {input} {output}"
rule thread_dna:
"""
Back-translates the alignment to codons based on the CDS sequences, yielding a correspond alignment of nucleotide sequences.
https://jlsteenwyk.com/PhyKIT/usage/index.html#protein-to-nucleotide-alignment
The --stop argument keeps in stop codons which are otherwise removed.
"""
input:
alignment=rules.taxon_only.output,
cds=rules.get_cds_seq.output
output:
"results/alignment/threaded_cds/{og}.cds.alignment.fa"
# bibs:
# "../bibs/phykit.bib"
conda:
ENV_DIR / "phykit.yaml"
log:
LOG_DIR / "alignment/thread_dna/{og}.log"
shell:
"""
{{ phykit thread_dna --protein {input.alignment} --nucleotide {input.cds} --stop > {output} ; }} 2>&1 | tee {log}
"""
def get_alignments_to_trim(wildcards):
if wildcards.alignment_type == "cds":
return rules.thread_dna.output
return rules.taxon_only.output
rule trim_alignments:
"""
Trim multiple-sequence alignments using ClipKIT.
https://jlsteenwyk.com/ClipKIT
"""
input:
get_alignments_to_trim
output:
# These cannot be marked as 'temp' because they are used in later rules which refer to them in the alignments list file.
"results/alignment/trimmed_{alignment_type}/{og}.trimmed.{alignment_type}.alignment.fa"
# bibs:
# "../bibs/clipkit.bib"
conda:
ENV_DIR / "clipkit.yaml"
log:
LOG_DIR / "alignment/trim_alignments/{og}.{alignment_type}.log"
shell:
"""
clipkit {input} -m smart-gap -o {output} 2>&1 | tee {log}
"""
def get_trimmed_alignments(wildcards):
orthologs_path = get_orthologs_path(wildcards)
all_ogs = glob_wildcards(os.path.join(orthologs_path, "{og}.fa")).og
return expand(rules.trim_alignments.output, og=all_ogs, alignment_type=wildcards.alignment_type)
def get_untrimmed_alignments(wildcards):
orthologs_path = get_orthologs_path(wildcards)
all_ogs = glob_wildcards(os.path.join(orthologs_path, "{og}.fa")).og
if wildcards.alignment_type == "cds":
return expand(rules.thread_dna.output, og=all_ogs)
return expand(rules.taxon_only.output, og=all_ogs)
def get_min_length(wildcards):
if wildcards.alignment_type == "cds":
return config.get("minimum_trimmed_alignment_length_cds", MINIMUM_TRIMMED_ALIGNMENT_LENGTH_CDS_DEFAULT)
return config.get("minimum_trimmed_alignment_length_proteins", MINIMUM_TRIMMED_ALIGNMENT_LENGTH_PROTEINS_DEFAULT)
checkpoint list_alignments:
"""
List path to alignment files into a single text file for use in PhyKIT.
"""
input:
trimmed=get_trimmed_alignments,
untrimmed=get_untrimmed_alignments,
output:
"results/alignment/alignments_list.{alignment_type}.txt",
params:
min_length=get_min_length,
max_trimmed_proportion=config.get("max_trimmed_proportion", MAX_TRIMMED_PROPORTION_DEFAULT),
threads: workflow.cores
log:
LOG_DIR / "alignment/list_alignments.{alignment_type}.log"
script:
f"{SCRIPT_DIR}/filter_alignments.py"
def get_alignment_files(wildcards):
files = []
if config.get('infer_tree_with_protein_seqs', INFER_TREE_WITH_PROTEIN_SEQS_DEFAULT):
files.append("results/alignment/alignments_list.protein.txt")
if config.get('infer_tree_with_cds_seqs', INFER_TREE_WITH_CDS_SEQS_DEFAULT):
files.append("results/alignment/alignments_list.cds.txt")
return files
rule missing_taxa:
input:
input_sources="results/intake/input_sources.csv",
alignments_list_protein="results/alignment/alignments_list.protein.txt" if config.get('infer_tree_with_protein_seqs', INFER_TREE_WITH_PROTEIN_SEQS_DEFAULT) else ".",
alignments_list_cds="results/alignment/alignments_list.cds.txt" if config.get('infer_tree_with_cds_seqs', INFER_TREE_WITH_CDS_SEQS_DEFAULT) else ".",
output:
"logs/warnings/missing_taxa.txt"
conda:
ENV_DIR / "typer.yaml"
shell:
"python {SCRIPT_DIR}/missing_taxa.py {input.input_sources} results/alignment/taxon_only/ {input.alignments_list_protein} {input.alignments_list_cds} > {output}"