# Process and barcode sequencing reads
This script takes paired-end FASTQ files (read 1: barcode + UMI, 27 bp; read 2: staggers + UMI + partial barcode, 49 bp) and does the following:
1. trims read 2 adapter sequences to recover barcode+UMI sequence
2. merges read 1 and 2 with FLASH
3. identifies and counts barcodes

In [1]:
import glob
import os
import subprocess
import regex
import gzip
from Bio import SeqIO
import pandas as pd
import multiprocessing as mp
import numpy as np

# Check the number of CPU cores available to this process.
# os.sched_getaffinity(0) returns the set of CPUs the current process is
# restricted to, so its length is the number of usable cores.
len(os.sched_getaffinity(0))

20

In [2]:
# Adapt the ladder fastq files to the new sequencing format by truncating
# every read to a fixed length with `cutadapt -l` (R1: 27bp, R2: 49bp).
r1_len = 27
r2_len = 49

# working directory with fastq files
working_dir="/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/"
os.chdir(working_dir)

# Trim R1 first, then R2. Each "*_<read>_001.fastq.gz" input is written next
# to itself as "*_<read>_adapted.fastq.gz".
for read_tag, read_len in (("R1", r1_len), ("R2", r2_len)):
    fastq_list = sorted(
        os.path.abspath(x) for x in glob.glob("*" + read_tag + "_001.fastq.gz")
    )
    for fastq_path in fastq_list:
        output_path = fastq_path.replace(
            "_" + read_tag + "_001", "_" + read_tag + "_adapted"
        )
        cutadapt_cmd = ["cutadapt", "-l", str(read_len),
                        "-o", output_path, fastq_path]
        # check=True raises CalledProcessError if cutadapt exits non-zero, so a
        # failed trim stops the cell instead of leaving a missing or partial
        # output file for the downstream steps to pick up.
        subprocess.run(cutadapt_cmd, check=True)

## Inspect fastq files for quality control using FastQC

In [3]:
!mkdir ~/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc/
!fastqc -o ~/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc/ ~/scratch/yeast/tmp/ladder_pilot_adapted/fastq/*fastq.gz 

mkdir: cannot create directory ‘/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc/’: File exists
Started analysis of 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 15% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 20% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 25% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 30% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 35% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 40% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 45% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 50% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 55% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 60% complete for 18146FL-30-01-01_S1_L001_R1_001.fastq.gz
Approx 65% complete for 18146FL-30-01-01_S1_

Approx 85% complete for 18146FL-30-01-02_S2_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-02_S2_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-02_S2_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-02_S2_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 25% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 30% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 35% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 40% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 45% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 50% complete for 18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Approx 55% compl

Approx 70% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 15% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 20% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 25% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 30% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Approx 35% complete for 18146FL-30-01-04_S4_L001_R1_001.fastq.gz
Appr

Approx 55% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-05_S5_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-05_S5_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-05_S5_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-05_S5_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-05_S5_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-05_S5_L001_R2_001.f

Approx 45% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-06_S6_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-07_S7_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-07_S7_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-07_S7_L001_

Approx 35% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-08_S8_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-08

Approx 20% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-09_S9_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30

Started analysis of 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-11_S11_L001_R1_adapted.fastq.gz
Approx 70% complete for 1

Approx 90% complete for 18146FL-30-01-12_S12_L001_R2_001.fastq.gz
Approx 95% complete for 18146FL-30-01-12_S12_L001_R2_001.fastq.gz
Analysis complete for 18146FL-30-01-12_S12_L001_R2_001.fastq.gz
Started analysis of 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-12_S12_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-1

Approx 75% complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Approx 80% complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Approx 85% complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Approx 90% complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Approx 95% complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Analysis complete for 18146FL-30-01-14_S14_L001_R1_001.fastq.gz
Started analysis of 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-14_S14_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-14_S14_L001_R

Approx 50% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 55% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 60% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 65% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 70% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 75% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 80% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 85% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 90% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Approx 95% complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Analysis complete for 18146FL-30-01-15_S15_L001_R2_001.fastq.gz
Started analysis of 18146FL-30-01-15_S15_L001_R2_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-15_S15_L001_R2_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-15_S15_L001_R2_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-15_S15_L001_R2_adapted.fastq.gz
A

Approx 35% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 40% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 45% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 50% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 55% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 60% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 65% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 70% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 75% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 80% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 85% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 90% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Approx 95% complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Analysis complete for 18146FL-30-01-17_S17_L001_R1_001.fastq.gz
Started analysis of 18146FL-30-01-17_S17_L001_R1_adapted.fastq.gz
Approx 5% co

Started analysis of 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 25% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 30% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 35% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 40% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 45% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 50% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 55% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 60% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 65% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 70% complete for 18146FL-30-01-18_S18_L001_R2_001.fastq.gz
Approx 75% comp

Approx 85% complete for 18146FL-30-01-19_S19_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-19_S19_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-19_S19_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-19_S19_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 15% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 20% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 25% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 30% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 35% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 40% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 45% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
Approx 50% complete for 18146FL-30-01-20_S20_L001_R1_001.fastq.gz
A

Approx 70% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-21_S21_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 25% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 30% complete for 18146FL-30-01-21_S21_L001_R2_001.fastq.gz
Approx 35% complete for 18146FL-30-01-21_S21_L001_R2_00

Approx 55% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-22_S22_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-23_S23_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-23_S23_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-23_S23_L001_R1_001.fastq.gz
Approx 15% complete for 18146FL-30-01-23_S23_L001_R1_001.fastq.gz
Approx 20% complete for 18146FL-30-01-23_S2

Approx 35% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-24_S24_L001_R1_adapted.fastq.gz
Started analysis of 18

Approx 20% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-25_S25_L001_R2_adapted.fastq.gz
Approx 90% complete 

Started analysis of 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-27_S27_L001_R1_adapted.fastq.gz
Approx 70% complete for 1

Approx 85% complete for 18146FL-30-01-28_S28_L001_R2_001.fastq.gz
Approx 90% complete for 18146FL-30-01-28_S28_L001_R2_001.fastq.gz
Approx 95% complete for 18146FL-30-01-28_S28_L001_R2_001.fastq.gz
Analysis complete for 18146FL-30-01-28_S28_L001_R2_001.fastq.gz
Started analysis of 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-28_S28_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-28_S2

Approx 60% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 65% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 70% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 75% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 80% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 85% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 90% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Approx 95% complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Analysis complete for 18146FL-30-01-30_S30_L001_R1_001.fastq.gz
Started analysis of 18146FL-30-01-30_S30_L001_R1_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-30_S30_L001_R1_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-30_S30_L001_R1_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-30_S30_L001_R1_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-30_S30_L001_R1_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-30_S30_L001_R1_adapted.fa

Approx 40% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 45% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 50% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 55% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 60% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 65% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 70% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 75% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 80% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 85% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 90% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Approx 95% complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Analysis complete for 18146FL-30-01-31_S31_L001_R2_001.fastq.gz
Started analysis of 18146FL-30-01-31_S31_L001_R2_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-31_S31_L001_R2_adapted.fastq.gz
Approx 10

Approx 20% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 25% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 30% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 35% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 40% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 45% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 50% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 55% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 60% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 65% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 70% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 75% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 80% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 85% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 90% complete for 18146FL-30-01-33_S33_L001_R1_001.fastq.gz
Approx 95%

Approx 95% complete for 18146FL-30-01-34_S34_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-34_S34_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 25% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 30% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 35% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 40% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 45% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 50% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 55% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 60% complete for 18146FL-30-01-34_S34_L001_R2_001.fastq.gz
Approx 65

Approx 75% complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-35_S35_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 10% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 15% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 20% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 25% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 30% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 35% complete for 18146FL-30-01-36_S36_L001_R1_001.fastq.gz
Approx 40% complete for 18146FL-30-01-36_S36_L001_R1_001.fa

Approx 60% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Analysis complete for 18146FL-30-01-37_S37_L001_R1_adapted.fastq.gz
Started analysis of 18146FL-30-01-37_S37_L001_R2_001.fastq.gz
Approx 5% complete for 18146FL-30-01-37_S37_L001_R2_001.fastq.gz
Approx 10% complete for 18146FL-30-01-37_S37_L001_R2_001.fastq.gz
Approx 15% complete for 18146FL-30-01-37_S37_L001_R2_001.fastq.gz
Approx 20% complete for 18146FL-30-01-37_S37_L001_R2_001.fastq.gz
Approx 25% complete for 18146FL-30-01-37_S37_L0

Approx 40% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Approx 95% complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Analysis complete for 18146FL-30-01-38_S38_L001_R2_adapted.fastq.gz
Started analysis of 18146FL-30-01-39_S39_L001_R1_001.fastq.gz
Approx 5% complete for 18146FL

Approx 25% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 70% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 75% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 80% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 85% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 90% complete for 18146FL-30-01-40_S40_L001_R1_adapted.fastq.gz
Approx 95% complete 

Started analysis of 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 5% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 10% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 15% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 20% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 25% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 30% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 35% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 40% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 45% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 50% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 55% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 60% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 65% complete for 18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Approx 70% complete for 1

## Summarize FastQC output with MultiQC

In [4]:
# Aggregate the FastQC reports found in the fastqc/ directory into a single
# MultiQC report, written (-o) to that same directory.
!multiqc -o ~/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc/ ~/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc/

[1;30m[INFO   ][0m         multiqc : This is MultiQC v1.9
[1;30m[INFO   ][0m         multiqc : Template    : default
[1;30m[INFO   ][0m         multiqc : Searching   : /home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/fastqc
[?25lSearching 336 files..  [####################################]  100%          [?25h
[1;30m[INFO   ][0m          fastqc : Found 168 reports
[1;30m[ERROR  ][0m        bargraph : [31m############### Error making MatPlotLib figure! Falling back to HighCharts.[0m
[1;30m[ERROR  ][0m       linegraph : [31m############### Error making MatPlotLib figure! Falling back to HighCharts.[0m
[1;30m[ERROR  ][0m       linegraph : [31m############### Error making MatPlotLib figure! Falling back to HighCharts.[0m
[1;30m[ERROR  ][0m       linegraph : [31m############### Error making MatPlotLib figure! Falling back to HighCharts.[0m
[1;30m[ERROR  ][0m       linegraph : [31m############### Error making MatPlotLib figure! Falling back to HighCh

## Map fastq file names to sample names

In [5]:
# Key to map fastq names to output names: read a tab-separated sample key
# file (one header row, then "<seqID>\t<sampleName>" per line) into a dict.
seqID_to_sampleName = {}
sample_key_file = "/home/users/rang/crispey3/ladder_pilot_feb2021/SampleKey-18146-30.txt"
with open(sample_key_file, 'r') as sample_key:
    sample_key.readline() # skip header
    for line in sample_key:
        line = line.rstrip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line at the end of the
            # file); without this the two-field unpack below raises ValueError.
            continue
        # Exactly two tab-separated fields are expected; anything else still
        # raises ValueError so a malformed key file is noticed.
        seqID, sampleName = line.split("\t")
        # Underscores in sample names become hyphens -- presumably so "_" stays
        # a field delimiter in the output file names (confirm with downstream use).
        sampleName = sampleName.replace("_","-")
        seqID_to_sampleName[seqID] = sampleName


## Trim read 2 adapters with cutadapt
Remove staggers, leaving the UMI+partial barcode sequence (19-26bp, may be shorter depending on quality trimming)

In [6]:
# move into the directory holding the adapted fastq files
working_dir="/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/"
os.chdir(working_dir)

# absolute paths of all read 2 files to trim, in sorted order
fastq_list = sorted(map(os.path.abspath, glob.glob("*R2_adapted.fastq.gz")))

In [7]:
# settings used by the cutadapt call that trims read 2 down to barcode+UMI (27bp)
output_dir_name = 'trimmed' # subdirectory (inside the fastq directory) receiving trimmed files
adapter_5prime = 'GGCCAGTTTAAACTT' # 5' adapter handed to cutadapt -g
err = 0.2 # fraction tolerated for adapter matching
min_r2_length = 12 # R2 must contain at least UMI sequence and some of SphI linker

# number of cores this process is allowed to run on
num_of_cores = len(os.sched_getaffinity(0))

In [8]:
# store sample key in regex pattern
# IDs are escaped so they are matched literally, and longer IDs are listed first
# so that an ID which is a prefix of another ID cannot shadow the longer one
pattern = regex.compile('|'.join(regex.escape(seqID) for seqID in
                                 sorted(seqID_to_sampleName, key=len, reverse=True)))

# trim read 2, filter untrimmed read pairs
for fastq_path in fastq_list:
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)
    
    # rename output files by sample key stored in seqID_to_sampleName 
    fastq_file = os.path.basename(fastq_path)
    output_file_r2 = pattern.sub(lambda x: seqID_to_sampleName[x.group()], fastq_file).replace("_adapted.fastq.gz", "_adapted_trimmed.fastq.gz")
    output_file_r1 = output_file_r2.replace("_R2_", "_R1_")

    # paired read 1 input: swap the read tag in the file name only, so the lookup
    # works for any lane and never rewrites a directory name
    fastq_path_r1 = os.path.join(fastq_dir, fastq_file.replace("_R2_adapted", "_R1_adapted"))

    print('Trimming: ' + fastq_path)
    
    # read 2 is given as the first input, so -g / --pair-filter=first act on read 2
    # and -o / -p receive trimmed read 2 / read 1 respectively
    cutadapt_cmd = ["cutadapt", "-g", adapter_5prime, 
                    "-j", str(num_of_cores), 
                    "-e", str(err), 
                    "-q", "20", # use -q for miseq/hiseq quality trimming
                    #"--nextseq-trim", "20", # use this option for nextseq quality trimming
                    "--discard-untrimmed", "-m", str(min_r2_length), 
                    "--pair-filter=first", 
                    "-o", output_dir+output_file_r2, "-p", output_dir+output_file_r1,
                    fastq_path, fastq_path_r1]
    
    result = subprocess.run(cutadapt_cmd)
    if result.returncode != 0:
        # do not report output files for a failed run; keep going with the remaining samples
        print("WARNING: cutadapt exited with code " + str(result.returncode) + " for " + fastq_path)
        print()
        continue

    print("Output files:")
    print(output_file_r2)
    print(output_file_r1)
    print()
    
print('Done trimming!')

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-01_S1_L001_R2_adapted.fastq.gz
Output files:
t3-1_S1_L001_R2_adapted_trimmed.fastq.gz
t3-1_S1_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-02_S2_L001_R2_adapted.fastq.gz
Output files:
t1-3_S2_L001_R2_adapted_trimmed.fastq.gz
t1-3_S2_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-03_S3_L001_R2_adapted.fastq.gz
Output files:
t2-2_S3_L001_R2_adapted_trimmed.fastq.gz
t2-2_S3_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-04_S4_L001_R2_adapted.fastq.gz
Output files:
t1-2_S4_L001_R2_adapted_trimmed.fastq.gz
t1-2_S4_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-05_S5_L001_R2_adapted.fastq.gz
Output files:
t3-4_S5_L001_R2_adapted_trimmed.fastq.

Output files:
t2-1_S40_L001_R2_adapted_trimmed.fastq.gz
t2-1_S40_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-41_S41_L001_R2_adapted.fastq.gz
Output files:
t6-5_S41_L001_R2_adapted_trimmed.fastq.gz
t6-5_S41_L001_R1_adapted_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/18146FL-30-01-42_S42_L001_R2_adapted.fastq.gz
Output files:
t5-5_S42_L001_R2_adapted_trimmed.fastq.gz
t5-5_S42_L001_R1_adapted_trimmed.fastq.gz

Done trimming!


## Merge read 1 and read 2 with FLASH to produce final barcode+UMI sequence

In [9]:
# move into the directory holding the trimmed fastq files
working_dir="/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/"
os.chdir(working_dir)

# absolute paths of all trimmed read 1 files, in sorted order
fastq_list = sorted(map(os.path.abspath, glob.glob("*R1_adapted_trimmed.fastq.gz")))

In [10]:
# settings used by the FLASH call that merges read 1 and read 2
output_dir_name = 'merged' # subdirectory (inside the trimmed directory) receiving merged files
max_mismatch = 0.25 # handed to FLASH -x
min_overlap = 12 # handed to FLASH -m; min overlap cannot be longer than the shorter read.

In [11]:
# use FLASH to merge trimmed-filtered read 2 and read 1 data to produce final 27bp sequence containing barcode and UMI data
for fastq_path in fastq_list:
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)
    
    fastq_file = os.path.basename(fastq_path)
    # paired read 2 input: swap the read tag in the file name only, never in a directory name
    fastq_path_r2 = os.path.join(fastq_dir, fastq_file.replace("_R1_", "_R2_"))

    # sample name is the first "_"-separated field of the file name
    output_prefix = fastq_file.split("_")[0]+"_barcode" # check output file naming 
    print('Merging', fastq_path, 'and', fastq_path_r2)
    
    flash_cmd = ["flash", "-m", str(min_overlap), 
                 "-x", str(max_mismatch), #"-O", # use -O if innie-only merging does not work
                 "-o", output_prefix, "-d", output_dir, 
                 "--compress", 
                 fastq_path, fastq_path_r2]
    result = subprocess.run(flash_cmd)
    if result.returncode != 0:
        # do not claim success for a failed run; keep going with the remaining samples
        print("WARNING: flash exited with code " + str(result.returncode) + " for " + output_prefix)
        continue
    print(output_prefix, "merged")

print('Done merging!')

Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-1_S27_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-1_S27_L001_R2_adapted_trimmed.fastq.gz
t0-1_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-2_S18_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-2_S18_L001_R2_adapted_trimmed.fastq.gz
t0-2_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-3_S6_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-3_S6_L001_R2_adapted_trimmed.fastq.gz
t0-3_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-4_S10_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t0-4_S10_L001_R2_adapted_trimmed.fastq.gz
t0-4_barcode merged
Merging /scratch/u

t5-4_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t5-5_S42_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t5-5_S42_L001_R2_adapted_trimmed.fastq.gz
t5-5_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t5-6_S21_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t5-6_S21_L001_R2_adapted_trimmed.fastq.gz
t5-6_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t6-1_S23_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t6-1_S23_L001_R2_adapted_trimmed.fastq.gz
t6-1_barcode merged
Merging /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t6-2_S28_L001_R1_adapted_trimmed.fastq.gz and /scratch/users/rang/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/t6-2_S28_L001_R2_adapted_trimmed.fastq.gz
t6-2_barcode mer

## (optional) Downsample reads for analysis
Use seqtk in command line to downsample fastq files prior to assembling counts matrix.<br>
e.g. seqtk sample -s100 read1.fq 10000 > sub1.fq

## Count barcodes
Counting barcodes consists of several steps. First, parse each fastq file and count all sequences. After assembling these into an initial sequence counts matrix, extract the barcode and UMI sequences and map them to a reference table of barcodes and UMIs. Counts for identifiable sequences are consolidated into a final counts matrix for input to DESeq2.

In [12]:
def count_seqs(fastq_file, min_seq_length, max_seq_length):
    '''
    Tallies identical read sequences in a gzipped fastq file.
    Only reads whose length is within min_seq_length..max_seq_length (inclusive)
    and that contain no N's are counted. Returns dict of {sequence: count}
    '''
    tally = {}
    with gzip.open(fastq_file, 'rt') as handle:
        for record in SeqIO.parse(handle, "fastq"):
            read_seq = record.seq
            # skip reads outside the accepted length window
            if not (min_seq_length <= len(read_seq) <= max_seq_length):
                continue
            # skip reads containing N's
            if read_seq.count("N") != 0:
                continue
            key = str(read_seq)
            tally[key] = tally.get(key, 0) + 1

    return tally

    
def map_seq_to_barcode_umi(seq, barcode_table, umi_list, barcode_length, umi_length, linker_seq):
    '''
    Assigns an ID to a sequence: the sequence is split into barcode and UMI, the barcode
    is looked up in barcode_table and the UMI in umi_list.
    Returns the barcode ID, or "<barcode ID>-<UMI ID>" when umi_list is not empty
    (UMI mapping is skipped for an empty umi_list).
    Returns None if either part is too short or cannot be identified.
    '''
    barcode, umi = split_barcode_umi_from_seq(seq, barcode_length, umi_length, linker_seq)

    # each part must be at least half of its expected length
    barcode_long_enough = len(barcode) >= barcode_length/2
    umi_long_enough = len(umi) >= umi_length/2
    if not (barcode_long_enough and umi_long_enough):
        return None

    # barcode lookup, tolerating 1 error per 5 bases of barcode
    barcode_id = assign_barcode(barcode=barcode, barcode_table=barcode_table, error=len(barcode)//5)
    if not barcode_id:
        # Barcode cannot be identified
        return None

    # without a UMI list the barcode ID alone is the final ID
    if len(umi_list) == 0:
        return barcode_id

    # UMI lookup, tolerating 1 error per 4 bases of UMI
    umi_id = assign_umi(umi=umi, umi_list=umi_list, error=len(umi)//4)
    if not umi_id:
        # UMI cannot be identified
        return None

    return '-'.join([barcode_id, str(umi_id)])
    

def split_barcode_umi_from_seq(seq, barcode_length, umi_length, linker_seq):
    '''
    Returns (barcode, umi) extracted from seq.
    The sequence is cut at linker_seq when that yields exactly two pieces.
    Otherwise (linker missing, e.g. sequencing error, or found more than once)
    the pieces are taken by base position: the UMI is the last umi_length bases
    and the barcode is whatever precedes linker+UMI.
    '''
    try:
        pieces = seq.split(linker_seq)
    except ValueError:
        # an empty linker_seq is not a usable separator
        pieces = ()

    if len(pieces) == 2:
        return (pieces[0], pieces[1])

    # positional fallback
    if umi_length == 0:
        return (seq[:barcode_length], '')

    tail_length = umi_length + len(linker_seq)
    # may return partial barcodes for short sequences
    return (seq[:-tail_length], seq[-umi_length:])


def assign_barcode(barcode, barcode_table, error):
    '''
    Searches barcode table for barcode sequence and returns unique barcode ID
    Tries perfect match first, then error-tolerant regex

    barcode: barcode sequence read from the fastq (may be partial)
    barcode_table: DataFrame indexed by reference barcode sequence, with a 'Unique_ID' column
    error: max number of errors tolerated by the fuzzy search

    Returns the 'Unique_ID' value of the matching reference barcode, or None if the
    barcode is empty or does not match exactly one reference barcode.
    '''
    if not barcode:
        # nothing to match; an empty fuzzy pattern would match every reference barcode
        return None

    try:
        # search for perfect match
        return barcode_table.loc[barcode, 'Unique_ID']
    except KeyError:
        pass

    # search by error-tolerant regex; compiled once and reused for every reference barcode
    pattern = regex.compile("("+barcode+"){e<="+str(error)+"}")
    hits = [pos for pos, reference in enumerate(barcode_table.index) if pattern.search(reference)]
    if len(hits) != 1:
        # barcode cannot be identified (no match, or ambiguous)
        return None

    # positional scalar lookup: indexing with a boolean list would return a
    # one-element Series instead of the ID itself
    return barcode_table['Unique_ID'].iloc[hits[0]]


def assign_umi(umi, umi_list, error):
    '''
    Looks up a UMI sequence in umi_list and returns its 1-based position as the UMI ID.
    A perfect match is preferred; otherwise an error-tolerant regex (up to `error` errors)
    is searched in every list entry and accepted only if exactly one entry matches.
    Returns None for an empty UMI or when no unique match exists.
    '''
    if umi == '':
        return None

    # perfect match
    if umi in umi_list:
        return umi_list.index(umi)+1

    # error-tolerant match
    fuzzy_pattern = "("+umi+"){e<="+str(error)+"}"
    hit_ids = [position for position, candidate in enumerate(umi_list, start=1)
               if regex.search(fuzzy_pattern, candidate)]
    if len(hit_ids) == 1:
        return hit_ids[0]

    # UMI cannot be identified
    return None


In [13]:
# move into the directory holding the merged reads
working_dir="/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/trimmed/merged/"
os.chdir(working_dir)

# merged reads to count barcodes from
fastq_list = sorted(glob.glob("*extendedFrags.fastq.gz")) # check for file name
# one sample name per fastq: the text before the first "_" of the file name
sample_name_list = [name.partition("_")[0] for name in fastq_list] # adjust accordingly to generate sample name for counts matrix

# where the count tables are written
output_dir = "/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/counts/"
seq_counts_filename = "seq_counts.txt" # sequence counts file (before combining)
barcode_counts_filename = "barcode_counts.txt" # mapped barcode-UMI counts file

# barcode reference table, indexed by its second column
barcode_reference_file = '/home/users/rang/crispey3/library_design/Input/12BP_PBCs_well_grouped.csv'
barcode_table = pd.read_csv(barcode_reference_file, index_col=1)

# approved list of UMIs used in cloning CRISPEY3 plasmid, kept in alphabetical order
umi_list = sorted(['ACGCGTGAA',
                   'ATGTGGCTC',
                   'CAGAGGATC',
                   'CTGTGGCAA',
                   'GTGTGATTC',
                   'TAGAGGACT', #] # only first 6 UMIs were included in cloning CRISPEY3 libraries
                   'AAGAGCCTC',
                   'AAGAGGAGG',
                   'ATGTGCGAA',
                   'ATGTGTAGG',
                   'CAGAGCCAA',
                   'CTGTGATGG',
                   'CTGTGTATC',
                   'GAGAGGAAA',
                   'TCGCGGTAA',
                   'TTGTGCGTC'])


In [14]:
# count sequences in each fastq file, one worker task per file
fastq_dict = dict(zip(sample_name_list, fastq_list))
# a Pool needs at least 1 process, even when no fastq files were found
num_of_workers = max(1, min(len(os.sched_getaffinity(0)), len(fastq_list)))
with mp.Pool(num_of_workers) as pool:
    # submit everything first, then collect, so the files are processed in parallel
    pending_counts = {sample_name : pool.apply_async(count_seqs, (fastq_file, 20, 27)) for sample_name, fastq_file in fastq_dict.items()}
    seq_counts_df = {sample_name : res.get() for sample_name, res in pending_counts.items()}
    
# merge to dataframe: one row per sequence, one column per sample
# (a sequence absent from a sample gets NaN)
seq_counts_df = pd.DataFrame.from_dict(seq_counts_df, orient="columns")
seq_counts_df.index.name = 'sequence'

# write to file to inspect
# (the output directory may not exist yet at this point)
os.makedirs(output_dir, exist_ok=True)
seq_counts_df.to_csv(output_dir+seq_counts_filename, sep="\t")


In [15]:
# map each sequence in seq_counts_df to barcode-UMI ID
# reset_index() returns a new frame with 'sequence' as a regular column
mapped_counts_df = seq_counts_df.reset_index()

# filter out singletons and doubletons (computationally expensive to map these rare barcodes, minimal impact to total count)
# numeric_only=True keeps the 'sequence' string column out of the row totals;
# .copy() makes the filtered frame independent so a column can be added to it below
mapped_counts_df = mapped_counts_df.loc[mapped_counts_df.sum(axis=1, numeric_only=True)>=3].copy()

# split sequences up for multiprocessing
def map_sequences(seq_list, barcode_table, umi_list, barcode_length, umi_length, linker_seq):
    '''
    helper function for multiprocessing of barcode-umi mapping
    applies map_seq_to_barcode_umi to every sequence of seq_list (a pandas Series)
    and returns a Series of assigned IDs (None where no ID could be assigned) with the same index
    '''
    return seq_list.apply(map_seq_to_barcode_umi, args=(barcode_table, umi_list, barcode_length, umi_length, linker_seq))

num_of_cores = len(os.sched_getaffinity(0))
with mp.Pool(num_of_cores) as pool:
    # split row positions rather than the Series itself, so every chunk stays a
    # pandas Series that keeps its index labels for the concat below
    sequences = mapped_counts_df['sequence']
    sequences_chunks = [sequences.iloc[positions] for positions in np.array_split(np.arange(len(sequences)), num_of_cores)]
    # barcode length 12, UMI length 9, linker 'GCATGC'
    starmap_args = [[seq_list, barcode_table, umi_list, 12, 9, 'GCATGC'] for seq_list in sequences_chunks]
    barcode_umi_id_lists = pool.starmap(map_sequences, starmap_args)

# chunks are re-assembled and aligned to the rows by index label
mapped_counts_df['barcode_umi_id'] = pd.concat(barcode_umi_id_lists)
display(mapped_counts_df)

# consolidate counts: add up all sequences assigned to the same ID
# (the 'sequence' strings are dropped first so that only count columns are summed;
# rows without an assigned ID are left out by groupby)
mapped_counts_df = mapped_counts_df.drop(columns='sequence').groupby('barcode_umi_id').sum().fillna(0).astype(int)
mapped_counts_df.index.name = 'barcode'
display(mapped_counts_df)

# write all counts to output file
os.makedirs(output_dir, exist_ok=True)
mapped_counts_df.to_csv(output_dir+barcode_counts_filename, sep="\t")



# # one-liner to map sequences to barcode-umi IDs
# # warning: single-threaded, expected to be slow
# mapped_counts_df['barcode_umi_id'] = mapped_counts_df['sequence'].apply(map_seq_to_barcode_umi, args=(barcode_table, umi_list, 12, 9, 'GCATGC'))


Unnamed: 0,sequence,t0-1,t0-2,t0-3,t0-4,t0-5,t0-6,t1-1,t1-2,t1-3,...,t5-4,t5-5,t5-6,t6-1,t6-2,t6-3,t6-4,t6-5,t6-6,barcode_umi_id
0,ACTAATGCCCGCGCATGCATGTGTAGG,116.0,122.0,109.0,88.0,131.0,115.0,87.0,114.0,82.0,...,104.0,121.0,121.0,126.0,119.0,116.0,134.0,120.0,122.0,Ladder_023-6
1,CTAACAAGGCACGCATGCCAGAGGATC,164.0,215.0,235.0,178.0,245.0,191.0,217.0,262.0,203.0,...,237.0,254.0,261.0,324.0,298.0,263.0,334.0,289.0,285.0,Ladder_024-8
2,GAGCAACCAGTAGCATGCGTGTGATTC,151.0,166.0,153.0,129.0,186.0,174.0,153.0,167.0,127.0,...,147.0,185.0,208.0,195.0,202.0,194.0,253.0,185.0,183.0,Ladder_025-13
3,GTGTATTCTTGCGCATGCCAGAGGATC,107.0,96.0,103.0,76.0,114.0,101.0,88.0,101.0,66.0,...,55.0,66.0,86.0,62.0,60.0,63.0,71.0,66.0,52.0,Ladder_039-8
4,CCAGGCAGCGACGCATGCCTGTGGCAA,176.0,238.0,239.0,203.0,279.0,256.0,206.0,266.0,174.0,...,233.0,283.0,302.0,264.0,269.0,276.0,315.0,309.0,267.0,Ladder_021-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,CTAACAAGGCACGCATGCCAGAGGACC,,,,,,,,,,...,,1.0,,,,1.0,,1.0,,
9811,TGAAGTCCTTTAGCATGCTCGTGCGTC,,,,,,,,,,...,,1.0,,,,2.0,,,,Ladder_040-16
10115,CAATAAGTATTCGCATGCGAGAGGAAA,,,,,,,,,,...,,,,1.0,2.0,,,,,
10435,ACTAATGCCCGCGCATGCCTGTGGCGA,,,,,,,,,,...,,,,,,2.0,,1.0,,


Unnamed: 0_level_0,t0-1,t0-2,t0-3,t0-4,t0-5,t0-6,t1-1,t1-2,t1-3,t1-4,...,t5-3,t5-4,t5-5,t5-6,t6-1,t6-2,t6-3,t6-4,t6-5,t6-6
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ladder_001-1,20,28,21,21,31,31,19,26,24,20,...,13,14,14,17,14,17,20,16,23,21
Ladder_001-10,103,114,132,95,129,133,108,128,103,120,...,86,90,90,107,116,99,89,119,108,97
Ladder_001-11,81,73,85,56,95,79,59,82,59,60,...,56,43,48,61,42,54,40,63,60,58
Ladder_001-12,66,88,72,70,100,70,62,72,64,54,...,42,58,76,47,56,67,63,64,79,55
Ladder_001-13,83,100,80,72,100,79,76,88,74,78,...,94,65,76,94,118,88,83,98,78,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ladder_043-5,24,24,29,23,46,27,28,20,24,27,...,22,12,20,20,16,17,17,17,19,15
Ladder_043-6,11,18,19,17,16,16,13,9,19,12,...,15,9,10,18,4,7,14,16,15,9
Ladder_043-7,16,22,16,26,28,27,16,18,20,17,...,20,12,12,16,13,14,15,13,23,14
Ladder_043-8,17,21,26,14,32,27,26,20,12,27,...,20,19,24,28,19,21,12,15,23,16


## (optional) Combine counts across UMIs per barcode
The counts of different UMIs of the same barcode can be added together to produce a stacked counts matrix

In [16]:
# combine counts from different UMIs of the same barcode
stacked_counts_filename = "stacked_barcode_counts.txt"

# row labels have the form "<barcode ID>-<UMI ID>"; the UMI ID is the last
# "-"-separated field, so cut at the last hyphen only (a barcode ID may itself contain "-")
stacked_counts_df = mapped_counts_df.groupby(by=lambda x: x.rsplit('-', 1)[0]).sum()
stacked_counts_df.to_csv(output_dir+stacked_counts_filename, sep="\t")

In [None]:
# # one-step count_barcodes function
# # warning: less efficient since barcode-UMI mapping is done per sample, rather than a single time after all sequences are counted

# def count_barcodes(fastq_file, barcode_table, umi_list, 
#                    min_seq_length=20, barcode_length=12, umi_length=9, linker_seq='GCATGC'):
#     '''
#     parses fastq file to count sequences, then extracts barcode-UMI info from sequences and assigns ID based
#     on provided reference barcode_table and umi_list. Finally, consolidates counts by assigned ID and returns
#     a dict of barcode counts.
#     does NOT do UMI mapping if umi_list is empty.
#     '''
#     # set max_seq_length
#     max_seq_length = barcode_length+umi_length+len(linker_seq) # this could be adjusted to allow insertions
#     # alphabetical sort umi_list
#     umi_list = sorted(umi_list)
    
#     # count raw sequences
#     seq_counts = count_seqs(fastq_file, min_seq_length, max_seq_length)
    
#     # consolidate barcode counts
#     barcode_counts_dict = {}
#     for seq, count in seq_counts.items():
#         # assign barcode-UMI ID
#         assigned_id = map_seq_to_barcode_umi(seq, barcode_table, umi_list, barcode_length, umi_length, linker_seq)
#         if assigned_id:
#             try:
#                 barcode_counts_dict[assigned_id] += count
#             except KeyError:
#                 barcode_counts_dict[assigned_id] = count
    
#     return barcode_counts_dict


# fastq_dict = dict(zip(sample_name_list, fastq_list))
# with mp.Pool(min(len(os.sched_getaffinity(0)), len(fastq_list))) as pool:
#     all_counts_df = {sample_name : pool.apply_async(count_barcodes, (fastq_file, barcode_table, umi_list)) for sample_name, fastq_file in fastq_dict.items()}
#     all_counts_df = {sample_name : res.get() for sample_name, res in all_counts_df.items()}
    
# # write all counts to output file
# os.makedirs(output_dir, exist_ok=True)
# all_counts_df = pd.DataFrame.from_dict(all_counts_df, orient="columns")
# all_counts_df.index.name = 'barcode'
# all_counts_df.to_csv(output_dir+barcode_counts_filename, sep="\t")