-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_preprocessing_pipeline_one_sample_btv
executable file
·103 lines (79 loc) · 3.93 KB
/
run_preprocessing_pipeline_one_sample_btv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
#
# This script runs a "pre-processing" analysis pipeline on a paired-end sequencing dataset.
# What it does is:
#
# (1) uses cutadapt to trim low quality sequences, low quality bases, and adapter sequences
# You may need to modify the adapter sequence file (see below)
#
# outputs <file_base>_R1_f.fastq <file_base>_R2_f.fastq
#
# (2) uses cd-hit-est to collapse duplicate read pairs. This is done by concatenating the first 30nt of
# the read1 and read2 sequences and collapsing read pairs that have at least 58/60 of these nt identical
# Do it this way cause faster.
#
# outputs <file_base>_R1_fu.fastq <file_base>_R2_fu.fastq
#
#
# Input:
#
# This script takes as an argument a file base name.
# It expects files of the form: <base_name>_R1.fastq and <base_name>_R2.fastq to exist
# These should correspond to the read1 and read2 paired-end data.
#
#
# Dependencies:
#
# This script calls in turn a bunch of other scripts I've written. These should be in your PATH.
#
# concat_fasta_records
# fastq_to_fasta
# reconcile_read2_file
# reconcile_fastq_to_fasta
# fasta_from_sam
#
# Mark Stenglein
#
# 10/6/2015
#
file_base=$1
log=${file_base}.pipeline.log
#
# Hardcoded paths --> may need to change
#
# bracket to redirect all stdout output to log file
{
echo "***********************************"
echo "begin processing sample: $file_base"
date
f1=${file_base}_R1.fastq
f2=${file_base}_R2.fastq
# use cutadapt to trim adapter sequences and do quality trimming and throw out too-short reads
cutadapt -g file:BTV_primers.fasta -G file:BTV_primers.fasta -q 30,30 --minimum-length 100 -u 1 -o ${file_base}_R1_f.fastq -p ${file_base}_R2_f.fastq $f1 $f2
# parts of this command:
# cutadapt \
# -a AGATCGGAAGAGC -A AGATCGGAAGAGC -g GCTCTTCCGATCT -G GCTCTTCCGATCT \ # TruSeq style adapters
# -a AGATGTGTATAAGAGACAG -A AGATGTGTATAAGAGACAG -g CTGTCTCTTATACACATCT -G CTGTCTCTTATACACATCT \ # Nextera adapters
# -q 30,30 \ # filter low qual seqs -> see cutadapt documentation
# --minimum-length 80 \ # ditch seqs shorter than this and their pairs
# -u 1 \ # trim the last (1 3') base
# -o ${file_base}_R1_f.fastq \ # trimmed (R1) output
# -p ${file_base}_R2_f.fastq \ # paired read (R2) trimmed output
# $f1 $f2 # the name of the input files
# name of cutadapt output
#f1=${file_base}_R1_f.fastq
#f2=${file_base}_R2_f.fastq
# fastq_to_fasta
#fastq_to_fasta < $f1 > ${file_base}_R1_f.fa
#fastq_to_fasta < $f2 > ${file_base}_R2_f.fa
# collapse to "unique" reads based on 1st 30 bases of R1 and R2
# allow 2 mismatches in these 60 bases (58/60 = 96.66% -> so cluster seqs w/ >= 96% pairwise id)
# this collapses reads w/ identical start and end points, likely PCR duplicates
# doing it based on 60 bases only speeds up process and is enough info to identify dups
#concat_fasta_records -5 30 ${file_base}_R1_f.fa ${file_base}_R2_f.fa > ${file_base}_R12_30_f.fa
#cd-hit-est -c 1.0 -i ${file_base}_R12_30_f.fa -o ${file_base}_R12_30_f.fa.cdhit -T 12 -M 0
#reconcile_fastq_to_fasta ${file_base}_R1_f.fastq ${file_base}_R12_30_f.fa.cdhit > ${file_base}_R1_fu.fastq
#reconcile_fastq_to_fasta ${file_base}_R2_f.fastq ${file_base}_R12_30_f.fa.cdhit > ${file_base}_R2_fu.fastq
#f1=${file_base}_R1_fu.fastq
#f2=${file_base}_R2_fu.fastq
} 2>&1 | tee -a $log # output tee'd to a logfile