# Process barcode sequencing reads
This script takes paired-end fastq files (read 1: barcode only (12bp) or barcode+UMI (27bp); read 2: barcode+UMI+staggers) and does the following:
1. trims read 2 adapter sequences to recover barcode+UMI sequence
2. merges read 1 and 2 with FLASH
3. identifies and counts barcodes

In [None]:
import glob
import gzip
import multiprocessing as mp
import os
import re
import subprocess

import pandas as pd
import regex
from Bio import SeqIO

# check number of available cores
# (counts the CPUs the current process is allowed to run on; the value is shown as the cell output)
len(os.sched_getaffinity(0))

## Inspect fastq files for quality control using FastQC

In [None]:
!mkdir ~/crispey3/ladder_pilot_feb2021/fastq/fastqc/
!fastqc -o ~/crispey3/ladder_pilot_feb2021/fastq/fastqc/ ~/crispey3/ladder_pilot_feb2021/fastq/*fastq.gz 

## Summarize FastQC output with MultiQC

In [None]:
# summarize the FastQC output found in the fastqc directory; the MultiQC results are written to the same directory
!multiqc -o ~/crispey3/ladder_pilot_feb2021/fastq/fastqc/ ~/crispey3/ladder_pilot_feb2021/fastq/fastqc/

## Proceed to trim read 2 adapters with cutadapt

In [None]:
# key to map fastq names to output names
# the sample key is read as a tab-delimited file with one header line and two columns per row:
# sequencing ID (as it appears in the fastq file names) and sample name
seqID_to_sampleName = {}
sample_key_file = "/home/users/rang/crispey3/ladder_pilot_feb2021/SampleKey-18146-30.txt"
with open(sample_key_file, 'r') as sample_key:
    sample_key.readline() # skip header
    for line_number, line in enumerate(sample_key, start=2):
        line = line.rstrip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError("{}, line {}: expected 2 tab-separated columns, found {}".format(
                sample_key_file, line_number, len(fields)))
        seqID, sampleName = fields
        # "_" is replaced so that the sample name is the first "_"-separated field of the
        # renamed fastq files (later cells recover it with split("_")[0])
        sampleName = sampleName.replace("_","-")
        seqID_to_sampleName[seqID] = sampleName


In [None]:
# directory holding the raw fastq files; the glob pattern below is resolved relative to it
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/"
os.chdir(working_dir)

# collect the read 2 files to trim, as absolute paths in sorted order
fastq_list = [os.path.abspath(r2_file) for r2_file in glob.glob("*R2_001.fastq.gz")]
fastq_list.sort()

In [None]:
# cutadapt parameters to trim read 2 to get barcode+UMI (27bp)
# the two adapter sequences are joined as 5'...3' and passed together to cutadapt's -g option
adapter_5prime = 'GGCCAGTTTAAACTT'
adapter_3prime = 'GCATGGC'
# number of cores passed to cutadapt (-j); the commented-out expression would instead use
# every core available to this process
num_of_cores = 4 #len(os.sched_getaffinity(0))
# passed to cutadapt -e
err = 0.2 # fraction tolerated for adapter matching
# passed to cutadapt -m (minimum read length to keep)
barcode_len = 27 # barcode (12bp) + SphI site (6bp) + UMI (9bp)
# name of the sub-directory, created next to the input fastq files, that receives the trimmed files
output_dir_name = 'trimmed'

In [None]:
# store sample key in regex pattern
# IDs are escaped so they only match literally, and longer IDs are listed first so that an ID which
# is a prefix of another one (e.g. "A-1" and "A-10") cannot claim the longer ID's file name
pattern = re.compile('|'.join(re.escape(seqID)
                              for seqID in sorted(seqID_to_sampleName, key=len, reverse=True)))

# file name endings of a read 2 input file and of its read 1 mate
r2_suffix = "_R2_001.fastq.gz"
r1_suffix = "_R1_001.fastq.gz"

# trim read 2, filter untrimmed read pairs
for fastq_path in fastq_list:
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)

    fastq_file = os.path.basename(fastq_path)
    if not fastq_file.endswith(r2_suffix):
        raise ValueError("Not a read 2 fastq file: " + fastq_path)

    # read 1 mate: swap the read label in the file name only (any lane, directory names untouched)
    fastq_path_r1 = os.path.join(fastq_dir, fastq_file[:-len(r2_suffix)] + r1_suffix)

    # rename output files by sample key stored in seqID_to_sampleName
    if seqID_to_sampleName:
        renamed_file = pattern.sub(lambda x: seqID_to_sampleName[x.group()], fastq_file)
    else:
        # an empty sample key gives a pattern that matches the empty string; keep the name as is
        renamed_file = fastq_file
    output_file_r2 = renamed_file.replace("_001.fastq.gz", "_001_trimmed.fastq.gz")
    output_file_r1 = output_file_r2.replace("_R2_", "_R1_")

    print('Trimming: ' + fastq_path)

    # read 2 is given as the first input, so with --pair-filter=first the --discard-untrimmed and -m
    # filters are decided on read 2 alone; the read 1 mate is kept or dropped together with it
    cutadapt_cmd = ["cutadapt", "-g", adapter_5prime+"..."+adapter_3prime,
                    "-j", str(num_of_cores),
                    "-e", str(err),
                    "-q", "20", # use -q for miseq/hiseq quality trimming
                    #"--nextseq-trim", "20", # use this option for nextseq quality trimming
                    "--discard-untrimmed", "-m", str(barcode_len),
                    "--pair-filter=first",
                    "-o", output_dir+output_file_r2, "-p", output_dir+output_file_r1,
                    fastq_path, fastq_path_r1]

    # check=True: stop with CalledProcessError instead of silently continuing after a failed run
    subprocess.run(cutadapt_cmd, check=True)

    print("Output files:")
    print(output_file_r2)
    print(output_file_r1)
    print()

print('Done trimming!')

## Merge read 1 and read 2 with FLASH to produce final barcode+UMI sequence

In [None]:
# directory holding the trimmed fastq files; the glob pattern below is resolved relative to it
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/"
os.chdir(working_dir)

# collect the trimmed read 1 files, as absolute paths in sorted order
fastq_list = [os.path.abspath(r1_file) for r1_file in glob.glob("*R1_001_trimmed.fastq.gz")]
fastq_list.sort()

In [None]:
# FLASH parameters
# minimum overlap between read 1 and read 2, passed to flash -m
min_overlap = 14 # this number should be no greater than the length of the shorter read
# passed to flash -x (maximum mismatch setting for the overlap)
max_mismatch = 0.25
# name of the sub-directory, created next to the trimmed fastq files, that receives the merged files
output_dir_name = 'merged'

In [None]:
# use FLASH to merge trimmed-filtered read 2 and read 1 data to produce final 27bp sequence containing barcode and UMI data
for fastq_path in fastq_list:
    fastq_dir, fastq_file = os.path.split(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)

    # read 2 mate: swap the read label in the file name only, never in a directory name
    fastq_path_r2 = os.path.join(fastq_dir, fastq_file.replace("_R1_", "_R2_"))

    # the sample name is the first "_"-separated field of the trimmed file name
    output_prefix = fastq_file.split("_")[0]+"_barcode" # check output file naming
    print('Merging', fastq_path, 'and', fastq_path_r2)

    flash_cmd = ["flash", "-m", str(min_overlap),
                 "-x", str(max_mismatch), "-O", # use -O if innie-only merging does not work
                 "-o", output_prefix, "-d", output_dir,
                 "--compress",
                 fastq_path, fastq_path_r2]
    # check=True: stop with CalledProcessError instead of reporting a failed merge as done
    subprocess.run(flash_cmd, check=True)
    print(output_prefix, "merged")

print('Done merging!')

## (optional) Trim back merged read to desired barcode length
If read 1 was sequenced longer than the barcode+UMI, trim the merged read back to the correct length. This step is not required if read 1 is shorter than the barcode length.

In [None]:
# trim merged read to desired barcode length
barcode_length = 27

working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/merged"
os.chdir(working_dir)

# merged reads written by FLASH (absolute paths, sorted)
fastq_list = sorted([os.path.abspath(x) for x in glob.glob("*barcode.extendedFrags.fastq.gz")])

for fastq_path in fastq_list:
    # rename within the file name only: <prefix>_barcode.extendedFrags.fastq.gz -> <prefix>_barcode_final.fastq.gz
    merged_dir, merged_file = os.path.split(fastq_path)
    output_path = os.path.join(merged_dir, merged_file.replace(".extendedFrags", "_final"))

    # -l shortens every read to at most barcode_length bases
    cutadapt_cmd = ["cutadapt", "-l", str(barcode_length),
                    "-o", output_path, fastq_path]

    # check=True: stop with CalledProcessError instead of silently continuing after a failed run
    subprocess.run(cutadapt_cmd, check=True)

## (optional) Downsample reads for analysis
Use seqtk in command line to downsample fastq files prior to assembling counts matrix.<br>
e.g. seqtk sample -s100 read1.fq 10000 > sub1.fq

## Count barcodes
Counting barcodes consists of several steps. For each read:
1. Filter reads that are too long/short, or contain N's
2. Extract barcode and UMI sequences
3. Assign barcode-UMI ID according to reference lists of barcodes and UMIs
4. Count barcode by ID

In [None]:
def _split_barcode_umi(sequence, barcode_length, umi_length, linker_seq):
    '''
    splits a read into its (barcode, umi) parts.

    the read is cut at the linker sequence when the linker occurs exactly once (no error in the linker
    is tolerated). Otherwise (linker missing, found several times, or empty) the read is split by base
    position, which assumes that the UMI and the linker contain no indels.
    '''
    if linker_seq:
        parts = sequence.split(linker_seq)
        if len(parts) == 2:
            return parts[0], parts[1]

    if umi_length == 0:
        return sequence[:barcode_length], ''
    # for shorter sequences, partial barcodes are retrieved and can still be potentially ID-ed
    return sequence[:-(umi_length+len(linker_seq))], sequence[-umi_length:]


def _unique_fuzzy_match(query, references, max_errors):
    '''
    returns the only entry of references in which query is found with at most max_errors errors
    (substitutions, insertions or deletions), or None if no entry or more than one entry matches.
    '''
    if not query:
        return None
    # the fuzzy constraint applies to the preceding item only, so the whole query has to be grouped;
    # without the group only the last base of the query would be allowed to differ
    pattern = regex.compile('(?:' + regex.escape(query) + '){e<=' + str(max_errors) + '}')
    hits = [reference for reference in references if pattern.search(reference)]
    return hits[0] if len(hits) == 1 else None


def count_barcodes(fastq_file, sample_name, barcode_table, umi_list,
                   min_seq_length=20, barcode_length=12, umi_length=9, linker_seq='GCATGC'):
    '''
    parses a gzipped fastq file, extracts barcode and UMI sequences and identifies their ID according to
    the reference table. Counts per ID are returned as a dict.

    in each sequence, barcodes and UMIs are extracted by removing the middle linker sequence.
    barcodes and UMIs are ID-ed by exact match first, then by error tolerant regex. Error tolerance is set
    according to minimum Hamming distance between designed barcode and UMI sequences (barcode: 1 error per
    5 bases, UMI: 1 error per 4 bases). Adjust accordingly. A fuzzy match is only accepted when exactly one
    reference sequence matches.

    length parameters should be adjusted based on what is included in the fastq reads. For example, if the
    reads contain only barcode sequence without UMIs or linker, set umi_length=0 and linker_seq=''.

    to exclude umi info and counting, set umi_list = []

    parameters:
        fastq_file: path of the gzipped fastq file to parse
        sample_name: not used by the function; kept so existing callers keep working
        barcode_table: pandas DataFrame indexed by barcode sequence, with a 'Unique_ID' column
        umi_list: list of reference UMI sequences; a UMI's ID is its 1-based rank in the sorted list
        min_seq_length: shortest read that is processed
        barcode_length, umi_length: designed lengths of the barcode and the UMI
        linker_seq: sequence that separates barcode and UMI in the read

    returns:
        dict mapping the barcode ID (or "<barcode ID>-<UMI ID>" when UMIs are counted) to its read count.
        Reads that fail the length/N filter are skipped silently; reads whose barcode or UMI is too short
        or cannot be assigned are skipped with a printed message.
    '''
    barcode_counts_dict = {}

    # set max_seq_length
    max_seq_length = barcode_length+umi_length+len(linker_seq) # this could be adjusted to allow insertions
    # alphabetical sort umi_list
    umi_list = sorted(umi_list)

    # exact-match lookups, built once instead of querying the table/list for every read
    barcode_ids = barcode_table['Unique_ID'].to_dict()
    reference_barcodes = list(barcode_ids)
    umi_ids = {}
    for position, reference_umi in enumerate(umi_list, start=1):
        umi_ids.setdefault(reference_umi, position) # first occurrence wins, like list.index

    # results of the error-tolerant search per distinct sequence (None = could not be assigned),
    # so a recurring sequencing error is only searched for once
    fuzzy_barcodes = {}
    fuzzy_umis = {}

    # parse fastq
    with gzip.open(fastq_file, 'rt') as fastq:
        for read in SeqIO.parse(fastq, "fastq"):
            sequence = str(read.seq)

            # filter for reads within min/max length that contain no N's
            if not (min_seq_length <= len(sequence) <= max_seq_length) or 'N' in sequence:
                continue

            # extract barcode and UMI sequence from read
            barcode, umi = _split_barcode_umi(sequence, barcode_length, umi_length, linker_seq)

            # skip read if barcode or UMI is too short
            if len(barcode)<barcode_length/2 or len(umi)<umi_length/2:
                print("Skipping {}: Barcode/UMI too short".format(sequence))
                continue

            # assign barcode ID
            # first search for barcode with exact match. If not possible, use error-tolerant regex
            barcode_id = barcode_ids.get(barcode)
            if barcode_id is None:
                if barcode not in fuzzy_barcodes:
                    fuzzy_barcodes[barcode] = _unique_fuzzy_match(
                        barcode, reference_barcodes, len(barcode)//5) # max error allowed: 0.2
                matched_barcode = fuzzy_barcodes[barcode]
                if matched_barcode is None:
                    # skip read if barcode cannot be identified
                    print("Skipping {}: Unable to assign barcode ID".format(sequence))
                    continue
                # look the ID up by the matched reference sequence so that a single value is obtained
                barcode_id = barcode_ids[matched_barcode]
            final_id = barcode_id

            # assign UMI ID and append to final ID (if applicable)
            # first search for UMI with exact match in UMI list. assign number based on order in sorted UMI list
            # If not possible, use error-tolerant regex to assign
            if umi and umi_list:
                umi_id = umi_ids.get(umi)
                if umi_id is None:
                    if umi not in fuzzy_umis:
                        fuzzy_umis[umi] = _unique_fuzzy_match(
                            umi, umi_list, len(umi)//4) # max error allowed: 0.25
                    matched_umi = fuzzy_umis[umi]
                    if matched_umi is None:
                        # skip read if UMI cannot be identified
                        print("Skipping {}: Unable to assign UMI ID".format(sequence))
                        continue
                    umi_id = umi_ids[matched_umi]

                # add UMI ID to final ID (str() so that non-string barcode IDs can be joined as well)
                final_id = '-'.join([str(barcode_id), str(umi_id)])

            # add newly assigned barcode-UMI to counts
            barcode_counts_dict[final_id] = barcode_counts_dict.get(final_id, 0) + 1

    return barcode_counts_dict


In [None]:
# directory holding the final merged reads; the glob pattern below is resolved relative to it
working_dir="/home/users/rang/crispey3/ladder_pilot_feb2021/fastq/trimmed/merged/"
os.chdir(working_dir)

# merged reads to count barcodes from (relative file names, sorted)
fastq_list = glob.glob("*barcode_final*") # check for file name
fastq_list.sort()
# sample name of each fastq = first "_"-separated field of its file name
sample_name_list = [merged_file.split("_")[0] for merged_file in fastq_list] # adjust accordingly to generate sample name for counts matrix

# where the counts matrix is written, and under which file name
output_dir = "/home/users/rang/crispey3/ladder_pilot_feb2021/counts/"
counts_filename = "all_barcode_counts.txt"

# barcode reference table; the second column of the CSV (index_col=1) becomes the index
barcode_reference_file = '/home/users/rang/crispey3/library_design/Input/12BP_PBCs_well_grouped.csv'
barcode_table = pd.read_csv(barcode_reference_file, index_col=1)

# approved list of UMIs used in cloning CRISPEY3 plasmid, kept in alphabetical order
umi_list = sorted(['ACGCGTGAA',
                   'ATGTGGCTC',
                   'CAGAGGATC',
                   'CTGTGGCAA',
                   'GTGTGATTC',
                   'TAGAGGACT'])


In [None]:
# identify and count barcodes for each fastq file, one worker task per file
fastq_dict = dict(zip(sample_name_list, fastq_list))
# never start more workers than there are files or cores available to this process
with mp.Pool(min(len(os.sched_getaffinity(0)), len(fastq_list))) as pool:
    # submit every file first so they are processed in parallel ...
    pending_counts = {
        sample_name : pool.apply_async(count_barcodes, (fastq_file, sample_name, barcode_table, umi_list))
        for sample_name, fastq_file in fastq_dict.items()
    }
    # ... then wait for each result (must happen before the pool is closed)
    all_counts_df = {sample_name : pending.get() for sample_name, pending in pending_counts.items()}

# write all counts to output file: one row per barcode ID, one column per sample, tab-delimited
os.makedirs(output_dir, exist_ok=True)
all_counts_df = pd.DataFrame.from_dict(all_counts_df, orient="columns")
all_counts_df.index.name = 'barcode'
all_counts_df.to_csv(output_dir+counts_filename, sep="\t")

## (optional) Combine counts across UMIs per barcode
The counts of different UMIs of the same barcode can be added together to produce a stacked counts matrix.

In [None]:
# combine counts from different UMIs of the same barcode
stacked_counts_filename = "stacked_barcode_counts.txt"


def _barcode_part(final_id):
    '''returns the part of a counts-matrix row label that precedes the first "-"'''
    return final_id.partition('-')[0]


# rows whose labels share the same leading part are summed into a single row
stacked_counts_df = all_counts_df.groupby(by=_barcode_part).sum()
stacked_counts_df.to_csv(output_dir+stacked_counts_filename, sep="\t")