-
Notifications
You must be signed in to change notification settings - Fork 2
/
MiMut.sh
283 lines (243 loc) · 9.11 KB
/
MiMut.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/bin/bash
#!/usr/bin/env bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
##### Functions
usage()
{
echo "Program: MiMut"
echo "Version: 1.0"
echo "Usage:"
echo " sh /path/to/script/MiMut.sh [options] -r <ref.fa> -g <ref.gtf> -c <cds.fa> -p <protein.fa> -l <lib.txt> -f <reads.txt> "
echo "Mandatory parameters:"
echo " -r | --reference To provide the genome reference. "
echo " -g | --gtf To provide the annotation file in GTF format for annotation of SNPs. "
echo " -c | --cds To provide the CDS sequences in FASTA format for annotation of SNPs. "
echo " -p | --protein To provide the protein sequences in FASTA format for annotation of SNPs. "
echo " -l | --library To set the library for mutant SNPs comparing. Please reserve library files in the MiMut folder. A library file should contain SNP files in VCF format with absolute paths. "
echo " -f | --files To provide raw reads. Save all paired reads files with their full paths to one txt file. One pair per line. Forward and reverse seperated by a space. "
echo "Optional parameters:"
echo " -b | --bsa To use the secondary workflow of MiMut, such as using SL9 and LF10 inbred lines. Provide the genome of the second line, e.g. SL9 genome. "
echo " -s | --step To restart with a certain step(trim/align/call-SNPs/filter/annotate). The results from previous run will be overwritten. Example: -s align"
echo " -t | --thread To set the number of threads. Default:8 "
echo " -h | --help To see the usage. "
}
trim()
{
for n in "${!array[@]}"
do
trimmomatic PE \
-phred33 \
-threads $thread \
${array[$n]} \
$n\_FP.fq.gz $n\_FU.fq.gz \
$n\_RP.fq.gz $n\_RU.fq.gz \
ILLUMINACLIP:$SCRIPT_DIR/adapters.fa:2:30:10:2:True \
LEADING:20 \
TRAILING:20 \
SLIDINGWINDOW:4:20 \
MINLEN:50
done
echo Finish reads trimming
}
align()
{
bamarray=()
for n in "${!array[@]}"
do
bowtie2-build --quiet $ref ref
bowtie2 --threads $thread --phred33 --no-discordant --no-unal -x ref -1 $n\_FP.fq.gz -2 $n\_RP.fq.gz -U $n\_FU.fq.gz,$n\_RU.fq.gz -S $n\_reads.sam
samtools view -bST $ref -@ $thread $n\_reads.sam > $n\_reads.bam
bamarray=("${bamarray[@]}" $n"_reads.bam")
done
samtools merge reads.bam ${bamarray[@]}
samtools sort -@ $thread -o reads.sort.bam -T samtmp -O bam reads.bam
rm *_reads.sam *_reads.bam reads.bam
bamtools filter -in reads.sort.bam -out reads.sort.filter1.bam -tag "XM:<3"
samtools depth -aa -q 20 -Q 20 reads.sort.filter1.bam > depth.txt
echo Finish reads alignment and filtering
}
call()
{
samtools stats -c 1,1000,1 reads.sort.filter1.bam > stats.txt
abn=`grep '^SN' stats.txt |cut -f2-|grep '^bases mapped (cigar)'|cut -f2`
avg=`grep '^COV' stats.txt |cut -f3-|sort -nrk2,2|head -1|cut -f1`
ulim=`echo "$(($avg*25/10))"`
blim=`echo "$(($avg*10/25))"`
bcftools mpileup -Ou --threads $thread -Q 20 -q 20 -f $ref reads.sort.filter1.bam | bcftools call -Ou --threads $thread -mv | bcftools filter -e '%QUAL<20' > snp0.vcf
snpcount0=`grep -c -v '^#' snp0.vcf`
echo Round 0: $snpcount0 SNPs
covflt="(DP4[0]+DP4[1]+DP4[2]+DP4[3])<$blim||(DP4[0]+DP4[1]+DP4[2]+DP4[3])>$ulim"
bcftools filter -e $covflt snp0.vcf > snp1.vcf
echo Filter SNPs by coverage: $covflt
snpcount1=`grep -c -v '^#' snp1.vcf`
echo Round 1: $snpcount1 SNPs
frqflt="(DP4[2]+DP4[3])/(DP4[0]+DP4[1]+DP4[2]+DP4[3])<0.25"
bcftools filter -e $frqflt snp1.vcf > snp2.vcf
bcftools norm -f $ref snp2.vcf -Ov -o snp2.norm.vcf
mv snp2.norm.vcf snp2.vcf
echo Filter SNPs by alt frequency
snpcount2=`grep -c -v '^#' snp2.vcf`
echo Round 2: $snpcount2 SNPs
frqflt2="(DP4[2]+DP4[3])/(DP4[0]+DP4[1]+DP4[2]+DP4[3])<0.9"
bcftools filter -e $frqflt2 snp2.vcf | bcftools filter -e '%QUAL<200' | bcftools filter -e 'MQ<40' | bcftools filter -e 'MQSB<0.5' | awk '{split($10, a, ":");split(a[2], b, ",");if (b[1]>40&&b[2]>20&&b[3]==0) print}' > snp3.vcf
grep '^#' snp2.vcf |cat - snp3.vcf > tmp
mv tmp snp3.vcf
echo Filter SNPs by homozygousity and mapping quality
snpcount3=`grep -c -v '^#' snp3.vcf`
echo Round 3: $snpcount3 SNPs
cp snp2.vcf ../
cp snp3.vcf ../
echo Finish SNP calling and filtering
}
filter()
{
perl $SCRIPT_DIR/MiMut.pl -f=snp3.vcf -v=$lib -o=SNP -m=off
echo Finish SNPs filtering
}
filterb()
{
perl $SCRIPT_DIR/MiMut.pl -f=snp3.vcf -m=on -v=off -o=SNP
cp SNP.eps ../../SNP.eps
echo Finish SNPs filtering
}
annotate()
{
tmpref=$RANDOM
mkdir $(which snpEff | perl -ae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data/;print $_')
mkdir $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref/;print $_' -- -ref=$tmpref)
cp $ref $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref\/sequences.fa/;print $_' -- -ref=$tmpref)
cp $gtf $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref\/genes.gtf/;print $_' -- -ref=$tmpref)
cp $cds $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref\/cds.fa/;print $_' -- -ref=$tmpref)
cp $prot $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref\/protein.fa/;print $_' -- -ref=$tmpref)
configpath=$(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/$ref.config/;print $_' -- -ref=$tmpref)
echo $tmpref.genome : $tmpref > $configpath
snpEff build -gtf22 -c $configpath -v $tmpref
snpEff $tmpref SNP.vcf -c $configpath > SNP.ann.vcf
rm $configpath
rm -r $(which snpEff | perl -sae '$_=~s/bin\/snpEff/share\/snpeff-5.1-2\/data\/$ref/;print $_' -- -ref=$tmpref)
SnpSift filter "ANN[0].IMPACT has 'HIGH'" SNP.ann.vcf > SNP.ann.1.vcf
SnpSift filter "ANN[0].IMPACT has 'MODERATE'" SNP.ann.vcf > SNP.ann.2.vcf
cp SNP.ann*vcf ../
echo Finish annotation
}
annotateb()
{
awk '$3>0{print $1"\t"$2"\t"$2+20000}' SNP.txt |tail -n +2 > tmp.bed
bedtools merge -i tmp.bed |awk '$3-$2>80000{print}'|bedtools merge -i - -d 200000 |awk '{print "LF10_"$1"\t"$2"\t"$3}'> ../high_count_regions.bed
rm tmp.bed
cd ..
cat SNP.ann.vcf | SnpSift intervals high_count_regions.bed > SNP.ann.high_count_regions.vcf
SnpSift filter "ANN[0].IMPACT has 'HIGH'" SNP.ann.high_count_regions.vcf > SNP.ann.1.vcf
SnpSift filter "ANN[0].IMPACT has 'MODERATE'" SNP.ann.high_count_regions.vcf > SNP.ann.2.vcf
cp SNP.ann*vcf ../
echo Finish annotation
}
##### Main
thread=8
if [ $# -lt 12 ]; then
usage
exit 1
else
while [ "$1" != "" ]; do
case $1 in
-r | --reference ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -r"
exit 1
fi
ref=$1
;;
-g | --gtf ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -g"
exit 1
fi
gtf=$1
;;
-c | --cds ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -c"
exit 1
fi
cds=$1
;;
-p | --protein ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -p"
exit 1
fi
prot=$1
;;
-l | --library ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -l"
exit 1
fi
lib=$1
;;
-f | --files) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -f"
exit 1
fi
file=$1
;;
-b | --bsa ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -b"
exit 1
fi
bsaref=$1
;;
-s | --step ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -s"
exit 1
fi
step=$1
;;
-t | --thread ) shift
if [[ "$1" == "" || "$1" =~ ^- ]]; then
echo "missing value for -t"
exit 1
fi
thread=$1
;;
-h | --help ) usage
exit
;;
* ) usage
exit 1
esac
shift
done
fi
sed -i '/^$/d' $file
array=()
while read line; do array+=("$line"); done < $file
if [[ `ls -l | grep tmp` ]]; then echo overwrite previous results;else mkdir tmp;fi
cd tmp
case $step in
"" ) ;&
"trim" ) trim ;&
"align" ) align ;&
"call-SNPs" ) call ;&
"filter" ) filter ;&
"annotate" ) annotate ;;
*) echo "Error: no such a step"; exit 1 ;;
esac
if [ "$bsaref" != "" ]; then
mkdir tmpb
cd tmpb
ref=$bsaref
echo Start the analysis on the second genome...
case $step in
"" ) ;&
"trim" ) mv ../*fq.gz . ;&
"align" ) align ;&
"call-SNPs" ) call ;&
"filter" ) filterb ;&
"annotate" ) annotateb ;;
*) echo "Error: no such a step"; exit 1 ;;
esac
fi