# Process and barcode sequencing reads
This script takes paired-end fastq files (read 1: barcode only (12bp) or barcode+UMI (27bp); read 2: barcode+UMI+staggers) and does the following:
1. trims read 2 adapter sequences to recover barcode+UMI sequence
2. merges read 1 and 2 with FLASH
3. counts barcodes

In [1]:
import glob
import os
import subprocess
import re
import gzip
from Bio import SeqIO
import pandas as pd

# check number of available cores: size of the set of CPUs the current
# process (pid 0 = this process) is allowed to run on
len(os.sched_getaffinity(0))

20

## Trim read 2 adapters with cutadapt

In [2]:
# key to map fastq names to output names
# The sample key file is read as tab-separated text: one header line, then one
# "<seqID>\t<sampleName>" pair per line.
seqID_to_sampleName = {}
sample_key_file = "/home/users/rang/crispey3/ladder_pilot_feb2021/SampleKey-18146-30.txt"
with open(sample_key_file, 'r') as sample_key:
    sample_key.readline() # skip header
    for line in sample_key:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the
            # file), which would otherwise break the two-value unpacking below
            continue
        seqID, sampleName = line.rstrip().split("\t")
        # later steps recover the sample name with split("_")[0], so the name
        # itself must not contain underscores
        sampleName = sampleName.replace("_","-")
        seqID_to_sampleName[seqID] = sampleName


In [3]:
# move into the directory that holds the raw fastq files
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/"
os.chdir(working_dir)

# absolute paths of all read 2 files, in sorted order, for trimming
fastq_list = sorted(map(os.path.abspath, glob.glob("*R2_001.fastq.gz")))

In [4]:
# settings for the cutadapt run that trims read 2 down to barcode+UMI
output_dir_name = 'trimmed'  # subdirectory (next to the input fastq) for trimmed reads
num_of_cores = 4  # fixed value; len(os.sched_getaffinity(0)) would use every available core
err = 0.2  # fraction tolerated for adapter matching

# sequences expected on either side of the barcode+UMI in read 2
adapter_5prime = 'GGCCAGTTTAAACTT'
adapter_3prime = 'GCATGGC'

# barcode (12bp) + SphI site (6bp) + UMI (9bp)
barcode_len = 12 + 6 + 9

In [5]:
# store sample key in regex pattern
# IDs are escaped so regex metacharacters in an ID are matched literally, and
# longer IDs are tried first so that an ID which is a prefix of another ID
# cannot shadow it in the alternation.
pattern = re.compile('|'.join(re.escape(seq_id)
                              for seq_id in sorted(seqID_to_sampleName, key=len, reverse=True)))

# trim read 2, filter untrimmed read pairs
for fastq_path in fastq_list:
    # trimmed reads go into a subdirectory next to the input fastq
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)

    # rename output files by sample key stored in seqID_to_sampleName
    fastq_file = os.path.basename(fastq_path)
    output_file_r2 = pattern.sub(lambda x: seqID_to_sampleName[x.group()], fastq_file).replace("_001.fastq.gz", "_001_trimmed.fastq.gz")
    output_file_r1 = output_file_r2.replace("_R2_", "_R1_")

    print('Trimming: ' + fastq_path)

    # read 2 is passed as the first input file, so the linked adapter (-g),
    # the -o output and --pair-filter=first refer to read 2; -p receives the
    # paired read 1 records
    cutadapt_cmd = ["cutadapt", "-g", adapter_5prime+"..."+adapter_3prime,
                    "-j", str(num_of_cores),
                    "-e", str(err),
                    "-q", "20", # for miseq/hiseq use -q option for quality trimming ("--nextseq-trim", "20" for nextseq)
                    "--discard-untrimmed", "-m", str(barcode_len),
                    "--pair-filter=first",
                    "-o", output_dir+output_file_r2, "-p", output_dir+output_file_r1,
                    fastq_path, fastq_path.replace("L001_R2_", "L001_R1_")]

    # check=True: stop on a failed cutadapt run instead of silently carrying
    # on with missing or partial output files
    subprocess.run(cutadapt_cmd, check=True)

    print("Output files:")
    print(output_file_r2)
    print(output_file_r1)
    print()

print('Done trimming!')

Trimming: /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/18146FL-30-01-01_S1_L001_R2_001.fastq.gz
Output files:
t3-1_S1_L001_R2_001_trimmed.fastq.gz
t3-1_S1_L001_R1_001_trimmed.fastq.gz

Trimming: /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/18146FL-30-01-02_S2_L001_R2_001.fastq.gz
Output files:
t1-3_S2_L001_R2_001_trimmed.fastq.gz
t1-3_S2_L001_R1_001_trimmed.fastq.gz

Trimming: /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/18146FL-30-01-03_S3_L001_R2_001.fastq.gz
Output files:
t2-2_S3_L001_R2_001_trimmed.fastq.gz
t2-2_S3_L001_R1_001_trimmed.fastq.gz

Trimming: /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/18146FL-30-01-04_S4_L001_R2_001.fastq.gz
Output files:
t1-2_S4_L001_R2_001_trimmed.fastq.gz
t1-2_S4_L001_R1_001_trimmed.fastq.gz

Trimming: /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/18146FL-30-01-05_S5_L001_R2_001.fastq.gz
Output files:
t3-4_S5_L001_R2_001_trimmed.fastq.gz
t3-4_S5_L001_R1_001_trimmed.fastq.gz

Trimming: /home/users/rang/crispey3

## Merge read 1 and read 2 with FLASH to produce final barcode+UMI sequence

In [6]:
# switch to the cutadapt output directory; only read 1 files are listed here,
# the matching read 2 path is derived from each name when merging
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/"
os.chdir(working_dir)

fastq_list = sorted(map(os.path.abspath, glob.glob("*R1_001_trimmed.fastq.gz")))

In [7]:
# FLASH parameters -- please tune for CRISPEY3!
output_dir_name = 'merged'  # subdirectory (next to the trimmed fastq) for merged reads
min_overlap = 14  # passed to FLASH as -m
max_mismatch = 0.25  # passed to FLASH as -x

In [8]:
# use FLASH to merge trimmed-filtered read 2 and read 1 data to produce final 27bp sequence containing barcode and UMI data
for fastq_path in fastq_list:
    # merged reads go into a subdirectory next to the trimmed fastq
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)

    # the read 2 file name differs from read 1 only in the _R1_/_R2_ tag
    read2_path = fastq_path.replace("_R1_", "_R2_")

    # sample name is the file name up to the first "_"
    output_prefix = os.path.basename(fastq_path).split("_")[0]+"_barcode" # check output file naming
    print('Merging', fastq_path, 'and', read2_path)

    flash_cmd = ["flash", "-m", str(min_overlap),
                 "-x", str(max_mismatch), "-O", # use -O if innie-only merging does not work
                 "-o", output_prefix, "-d", output_dir,
                 "--compress",
                 fastq_path, read2_path]
    # check=True: a failed FLASH run raises here instead of being reported
    # as "merged" below
    subprocess.run(flash_cmd, check=True)
    print(output_prefix, "merged")

print('Done merging!')

Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-1_S27_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-1_S27_L001_R2_001_trimmed.fastq.gz
t0-1_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-2_S18_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-2_S18_L001_R2_001_trimmed.fastq.gz
t0-2_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-3_S6_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-3_S6_L001_R2_001_trimmed.fastq.gz
t0-3_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-4_S10_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-4_S10_L001_R2_001_trimmed.fastq.gz
t0-4_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t0-5_S31_L001

t5-6_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-1_S23_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-1_S23_L001_R2_001_trimmed.fastq.gz
t6-1_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-2_S28_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-2_S28_L001_R2_001_trimmed.fastq.gz
t6-2_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-3_S29_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-3_S29_L001_R2_001_trimmed.fastq.gz
t6-3_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-4_S16_L001_R1_001_trimmed.fastq.gz and /home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/t6-4_S16_L001_R2_001_trimmed.fastq.gz
t6-4_barcode merged
Merging /home/users/rang/crispey3/ladder_pilot_feb2021/fastq

In [9]:
# usually the (correctly) merged barcode should be at the correct length, but if read 1 was sequenced longer,
# the merged fastq will need to be trimmed back to the correct length
barcode_length = 27

working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/merged"
os.chdir(working_dir)

# FLASH writes the merged reads to <prefix>.extendedFrags.fastq.gz
fastq_list = sorted(os.path.abspath(x) for x in glob.glob("*barcode.extendedFrags.fastq.gz"))

for fastq_path in fastq_list:
    # e.g. t0-1_barcode.extendedFrags.fastq.gz -> t0-1_barcode_final.fastq.gz
    output_path = fastq_path.replace(".extendedFrags", "_final")
    # -l shortens every read to at most barcode_length bases
    cutadapt_cmd = ["cutadapt", "-l", str(barcode_length),
                    "-o", output_path, fastq_path]

    # check=True: stop on a failed cutadapt run instead of leaving a missing
    # or partial *_final file for the counting step
    subprocess.run(cutadapt_cmd, check=True)

## Count barcodes

In [10]:
def count_barcodes(fastq_file, barcode_length=27):
    '''
    Count the barcode sequences in a gzipped fastq file.

    Only reads that are exactly `barcode_length` long and contain no N's are
    counted. Reads of the wrong length and full-length reads containing N's
    are tallied and reported on stdout, but are left out of the counts.
    IMPROVE FUNCTION TO ASSIGN BARCODES CONTAINING Ns?

    fastq_file: path to a gzip-compressed fastq file
    barcode_length: exact read length required for a read to be counted

    Returns a dict mapping each barcode sequence (str) to its read count.
    '''
    barcode_dict = {}
    reads_with_ns = 0
    read_too_long = 0
    read_too_short = 0
    with gzip.open(fastq_file, 'rt') as fastq:
        for read in SeqIO.parse(fastq, "fastq"):
            sequence = str(read.seq)
            read_length = len(sequence)
            if read_length > barcode_length:
                read_too_long += 1
            elif read_length < barcode_length:
                read_too_short += 1
            elif "N" in sequence:
                # only the number of N-containing reads is reported, so there
                # is no need to keep the sequences themselves in memory
                reads_with_ns += 1
            else:
                barcode_dict[sequence] = barcode_dict.get(sequence, 0) + 1

    print("Reads longer than {}bp: {}".format(barcode_length, read_too_long))
    print("Reads shorter than {}bp: {}".format(barcode_length, read_too_short))
    # the format string previously had no placeholder for the count, so the
    # number of N-containing reads was never printed
    print("{}bp reads with 1 or more Ns: {}".format(barcode_length, reads_with_ns))

    return barcode_dict


In [11]:
# directory holding the length-trimmed merged reads
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/merged/"
os.chdir(working_dir)

# destination directory and file name for the combined counts table
output_dir = "/home/users/rang/crispey3/ladder_pilot_feb2021/counts/"
counts_filename = "all_barcode_counts.txt"

# merged reads to count barcodes from, in sorted order
fastq_list = glob.glob("*barcode_final*") # check for file name
fastq_list.sort()


In [12]:
# per-sample barcode counts, keyed by sample name
all_counts_df = {}
for fastq_file in fastq_list:
    print(fastq_file)
    # sample name is the file name up to the first "_"; adjust if naming changes
    sample_name = fastq_file.partition("_")[0]
    all_counts_df[sample_name] = count_barcodes(fastq_file, barcode_length=27)

# write all counts to output file: one row per barcode, one column per sample
os.makedirs(output_dir, exist_ok=True)
all_counts_df = pd.DataFrame(all_counts_df).rename_axis('barcode')
all_counts_df.to_csv(output_dir+counts_filename, sep="\t")


t0-1_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-2_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-3_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-4_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-5_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-6_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-1_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-2_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-3_barcode_final.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t

## (optional) Downsample reads for analysis
Use seqtk on the command line.<br>
The downsampled fastq files are stored in ./sampleXXX/ directories.

## Count barcodes for downsampled reads

In [21]:
# name of the downsampled set; also the name of its subdirectory
downsample_name = "sample050"

# destination directory and file name for the combined counts table
output_dir = "/home/users/rang/crispey3/ladder_pilot_feb2021/counts/"
counts_filename = "all_barcode_counts_{}.txt".format(downsample_name)

# directory holding the downsampled merged reads
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/merged/{}/".format(downsample_name)
os.chdir(working_dir)

# merged reads to count barcodes from, in sorted order
fastq_list = glob.glob("*barcode_{}*".format(downsample_name)) # check for file name
fastq_list.sort()


In [22]:
# per-sample barcode counts for the downsampled reads, keyed by sample name
all_counts_df = {}
for fastq_file in fastq_list:
    print(fastq_file)
    # sample name is the file name up to the first "_"; adjust if naming changes
    sample_name = fastq_file.partition("_")[0]
    all_counts_df[sample_name] = count_barcodes(fastq_file, barcode_length=27)

# write all counts to output file: one row per barcode, one column per sample
os.makedirs(output_dir, exist_ok=True)
all_counts_df = pd.DataFrame(all_counts_df).rename_axis('barcode')
all_counts_df.to_csv(output_dir+counts_filename, sep="\t")


t0-1_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-2_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-3_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-4_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-5_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t0-6_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-1_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-2_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27bp: 0
27bp reads with 1 or more Ns:
t1-3_barcode_sample050.fastq.gz
Reads longer than 27bp: 0
Reads shorter than 27b