-
Notifications
You must be signed in to change notification settings - Fork 2
/
orthofisher.smk
87 lines (76 loc) · 2.82 KB
/
orthofisher.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
rule orthofisher_input_generation:
"""
Runs `orthofisher <https://github.com/JLSteenwyk/orthofisher>`_ on input files of FASTA file and pHMM paths the intake rule.
:config orthofisher_hmmer_files: list of hmmer files for orthofisher
"""
input:
protein_files
output:
tsv="results/orthofisher/input_protein_files.tsv",
hmm="results/orthofisher/hmms.txt",
params:
hmm_list=config["orthofisher_hmmer_files"],
log:
LOG_DIR / "orthofisher/orthofisher_input_generation.log",
shell:
"""
echo "Running orthofisher_input_generation" 2>&1 | tee {log}
for FILE in {params.hmm_list}; do
if [ -s "$FILE" ]; then
echo "$FILE" >> {output.hmm} 2>&1 | tee -a {log}
fi
done
{{ echo {input} | tr " " "\n" > {output.tsv} ; }} 2>&1 | tee -a {log}
"""
checkpoint orthofisher:
"""
Runs `orthofisher <https://github.com/JLSteenwyk/orthofisher>`_ on input files of FASTA file and pHMM paths the intake rule.
"""
input:
tsv=rules.orthofisher_input_generation.output.tsv,
hmm=rules.orthofisher_input_generation.output.hmm,
output:
directory("results/orthofisher/output"),
conda:
ENV_DIR / "orthofisher.yaml"
log:
LOG_DIR / "orthofisher/orthofisher.log",
# bibs:
# "../bibs/orthofisher.nbib",
shell:
"""
orthofisher -m {input.hmm} -f {input.tsv} -o {output} 2>&1 | tee {log}
"""
def list_orthofisher_scogs(wildcards):
checkpoint_output = checkpoints.orthofisher.get(**wildcards).output[0]
scog = Path(checkpoint_output)/"scog"
return list(scog.glob("*.hmm.orthofisher"))
checkpoint min_seq_filter_orthofisher:
"""
List all the ortholog ids and puts them in a file.
Only keeps the orthologs with a minimum number of sequences.
It also removes suffixes added to the IDs by orthofisher.
:config ortholog_min_seqs: Minimum number of sequences that needs to be in an alignment for it to proceed to phylogenetic analysis
"""
input:
list_orthofisher_scogs
output:
directory("results/orthofisher/min-seq-filtered"),
params:
min_seqs=max(3,config.get("ortholog_min_seqs", ORTHOLOG_MIN_SEQS_DEFAULT)),
log:
LOG_DIR / "orthofisher/min_seq_filter_orthofisher.log"
shell:
"""
mkdir -p {output} 2>&1 | tee {log}
for i in {input}; do
nseq=$(grep ">" $i | wc -l)
if [[ $nseq -ge {params.min_seqs} ]]; then
og=$(basename $i | sed 's/\..*//g')
path={output}/$og.fa
echo "Copying $i to $path and editing IDs" 2>&1 | tee -a {log}
{{ cat $i | cut -f1,2,3,4 -d'|' > $path ; }} 2>&1 | tee -a {log}
fi
done
"""