### get qc for simulated reads (simulated using hiseq rapid run scores)

In [1]:
# Symlink both simulated read files and the barcode table into the working directory in one call.
# -f replaces an existing link of the same name, so the cell can be re-run.
!ln -fs ../../../raw_data/simulated_reads/chr1_1M_hiseq_rr_score_samples/simulated_R1.fastq ../../../raw_data/simulated_reads/chr1_1M_hiseq_rr_score_samples/simulated_R2.fastq ../../../resources/barcodes/miseq_barcodes/mwanga_barcodes_combined.txt ./

# List the directory to confirm each link and its target.
!ls -lh

total 16
-rw-r--r--  1 ryankuster  staff   7.5K Dec 28 10:24 motif_quality_simulated_hiseq_rr_scores.ipynb
lrwxr-xr-x  1 ryankuster  staff    71B Dec 28 10:25 mwanga_barcodes_combined.txt -> ../../../resources/barcodes/miseq_barcodes/mwanga_barcodes_combined.txt
lrwxr-xr-x  1 ryankuster  staff    83B Dec 28 10:25 simulated_R1.fastq -> ../../../raw_data/simulated_reads/chr1_1M_hiseq_rr_score_samples/simulated_R1.fastq
lrwxr-xr-x  1 ryankuster  staff    83B Dec 28 10:25 simulated_R2.fastq -> ../../../raw_data/simulated_reads/chr1_1M_hiseq_rr_score_samples/simulated_R2.fastq


In [2]:
# Run crinoid QC on each raw simulated read file; results are written to ./qc.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
# NOTE(review): the crinoid.py path is specific to one machine; adjust it for other environments.
!mkdir -p qc
!python3 /Users/ryankuster/github/ngscomposer/tools/crinoid.py -r1 simulated_R1.fastq -o qc
!python3 /Users/ryankuster/github/ngscomposer/tools/crinoid.py -r1 simulated_R2.fastq -o qc

### demultiplex to create flush-ended reads (beginning with motifs)

In [3]:
# Demultiplex the paired reads with anemone using the combined barcode table; output goes to ./demultiplexed.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
# NOTE(review): -f 6 and -m 1 are kept exactly as given; confirm their meaning against anemone's help text.
!mkdir -p demultiplexed

!python3 /Users/ryankuster/github/ngscomposer/tools/anemone.py -r1 simulated_R1.fastq -r2 simulated_R2.fastq -f 6 -m 1 -c mwanga_barcodes_combined.txt -o demultiplexed

redundant R1 barcodes detected
redundant R2 barcodes detected


In [4]:
# Run crinoid QC on the demultiplexed R1 and R2 files; results are written to ./demultiplexed/qc.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
!mkdir -p ./demultiplexed/qc
!python3 /Users/ryankuster/github/ngscomposer/tools/crinoid.py -r1 ./demultiplexed/combined.R1.fastq -o ./demultiplexed/qc
!python3 /Users/ryankuster/github/ngscomposer/tools/crinoid.py -r1 ./demultiplexed/combined.R2.fastq -o ./demultiplexed/qc

### run rotifer (modified to keep reads without RE motif)

In [5]:
# Run the modified rotifer script on each demultiplexed file, writing results to ./parsed.
# R1 is checked against motif TGCAT and R2 against motif CATG.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
!mkdir -p parsed
!python3 ../scripts/rotifer_fails.py -r1 ./demultiplexed/combined.R1.fastq -m1 TGCAT -o parsed
!python3 ../scripts/rotifer_fails.py -r1 ./demultiplexed/combined.R2.fastq -m1 CATG -o parsed

### check qc of failing vs. passing reads

In [6]:
# Run crinoid QC on every fastq file currently in ./parsed; results are written to ./parsed/qc.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
# "$file" is quoted so a path containing spaces or glob characters is passed as one argument.
!mkdir -p ./parsed/qc
!for file in ./parsed/*fastq; do python3 /Users/ryankuster/github/ngscomposer/tools/crinoid.py -r1 "$file" -o ./parsed/qc ; done

R1 - original read q scores after demultiplexing:

![title](demultiplexed/qc/qscores.combined.R1.fastq.csv.png)

R1 - q scores of reads containing intact RE motif 'TGCAT':

 ![title](parsed/qc/qscores.se.combined.R1.fastq.csv.png)

R1 - q scores of reads without intact RE motif 'TGCAT':

 ![title](parsed/qc/qscores.failed.combined.R1.fastq.csv.png)

R2 - original read q scores after demultiplexing:

![title](demultiplexed/qc/qscores.combined.R2.fastq.csv.png)

R2 - q scores of reads containing intact RE motif 'CATG':

 ![title](parsed/qc/qscores.se.combined.R2.fastq.csv.png)

R2 - q scores of reads without intact RE motif 'CATG':

 ![title](parsed/qc/qscores.failed.combined.R2.fastq.csv.png)

### test read compression of passing vs. failing reads (using 'gsed' on macOS Big Sur, use 'sed' on linux)  
first, the reads must all be the same length for compression; because variable-length barcodes were used, read lengths differ by up to 3 bases, so every read is trimmed to the shortest length (234 bases)

In [7]:
# Tally sequence lengths among the failed R1 reads.
# awk picks the sequence line of each 4-line FASTQ record (NR % 4 == 2) and prints its length.
# This avoids the GNU-sed-only '2~4p' address, so the cell runs unchanged on macOS and Linux.
!awk 'NR % 4 == 2 {print length}' ./parsed/failed.combined.R1.fastq | sort | uniq -c

1465 234
1702 235
1558 236
1583 237


In [8]:
!python3 /Users/ryankuster/github/ngscomposer/tools/scallop.py -r1 ./parsed/failed.combined.R1.fastq -b 234
!python3 /Users/ryankuster/github/ngscomposer/tools/scallop.py -r1 ./parsed/failed.combined.R2.fastq -b 234
!python3 /Users/ryankuster/github/ngscomposer/tools/scallop.py -r1 ./parsed/se.combined.R1.fastq -b 234
!python3 /Users/ryankuster/github/ngscomposer/tools/scallop.py -r1 ./parsed/se.combined.R2.fastq -b 234

grab only the fastq lines containing the DNA sequences (the second line of each 4-line record)

In [9]:
# Extract the sequence line (line 2 of every 4-line record) from each trimmed file into ./unique_reads.
# The ALL.* outputs keep a .fastq extension but hold one bare sequence per line.
# -p keeps the cell re-runnable: plain mkdir reports an error once the directory exists.
!mkdir -p unique_reads

!gsed -n '2~4p' ./parsed/trimmed_se.failed.combined.R1.fastq > ./unique_reads/ALL.failed.combined.R1.fastq
!gsed -n '2~4p' ./parsed/trimmed_se.failed.combined.R2.fastq > ./unique_reads/ALL.failed.combined.R2.fastq
!gsed -n '2~4p' ./parsed/trimmed_se.se.combined.R1.fastq > ./unique_reads/ALL.se.combined.R1.fastq
!gsed -n '2~4p' ./parsed/trimmed_se.se.combined.R2.fastq > ./unique_reads/ALL.se.combined.R2.fastq

In [10]:
!sort ./unique_reads/ALL.failed.combined.R1.fastq | uniq > ./unique_reads/UNIQ.failed.combined.R1.fastq
!sort ./unique_reads/ALL.failed.combined.R2.fastq | uniq > ./unique_reads/UNIQ.failed.combined.R2.fastq
!sort ./unique_reads/ALL.se.combined.R1.fastq | uniq > ./unique_reads/UNIQ.se.combined.R1.fastq
!sort ./unique_reads/ALL.se.combined.R2.fastq | uniq > ./unique_reads/UNIQ.se.combined.R2.fastq

In [11]:
# Compare total (ALL.*) and distinct (UNIQ.*) sequence counts for R1; each file holds one sequence per line.
!wc -l ./unique_reads/*R1.fastq

    6308 ./unique_reads/ALL.failed.combined.R1.fastq
  945794 ./unique_reads/ALL.se.combined.R1.fastq
    6300 ./unique_reads/UNIQ.failed.combined.R1.fastq
  297482 ./unique_reads/UNIQ.se.combined.R1.fastq
 1255884 total


In [12]:
# Compare total (ALL.*) and distinct (UNIQ.*) sequence counts for R2; each file holds one sequence per line.
!wc -l ./unique_reads/*R2.fastq

   17119 ./unique_reads/ALL.failed.combined.R2.fastq
  934983 ./unique_reads/ALL.se.combined.R2.fastq
   17058 ./unique_reads/UNIQ.failed.combined.R2.fastq
  502888 ./unique_reads/UNIQ.se.combined.R2.fastq
 1472048 total
