# Process and barcode sequencing reads
This script takes paired-end fastq files (read 1: barcode + UMI (27bp); read 2: staggers + UMI + partial barcode (49bp)) and does the following:
1. trims read 2 adapter sequences to recover barcode+UMI sequence
2. merges read 1 and 2 with FLASH
3. identifies and counts barcodes

In [1]:
import glob
import os
import subprocess
import regex
import gzip
from Bio import SeqIO
import pandas as pd
import multiprocessing as mp
import numpy as np

# check number of CPU cores this process is allowed to run on
# (the same expression is used later to size cutadapt/FLASH threads and multiprocessing pools)
len(os.sched_getaffinity(0))

20

## Inspect fastq files for quality control using FastQC

In [2]:
# !mkdir ~/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/
# !fastqc -q -o ~/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/ ~/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/*fastq.gz 
!mkdir ~/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/fastqc/
!fastqc -q -o ~/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/fastqc/ ~/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/*fastq.gz

mkdir: cannot create directory ‘/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/’: File exists


## Summarize FastQC output with MultiQC

In [3]:
# !multiqc -o ~/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/ ~/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/
# summarize the FastQC reports found in the fastqc/ directory into one MultiQC report,
# written to that same directory
!multiqc -o ~/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/fastqc/ ~/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/fastqc/

[1;30m[INFO   ][0m         multiqc : This is MultiQC v1.9
[1;30m[INFO   ][0m         multiqc : Template    : default
[1;30m[INFO   ][0m         multiqc : Searching   : /home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc
[?25lSearching 145 files..  [####################################]  100%          [?25h
[1;30m[INFO   ][0m          fastqc : Found 72 reports
[1;30m[INFO   ][0m         multiqc : Compressing plot data
[1;30m[INFO   ][0m         multiqc : Report      : ../../../scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/multiqc_report_1.html
[1;30m[INFO   ][0m         multiqc : Data        : ../../../scratch/yeast/crispey3/pool1_scm_aug2021/fastq/fastqc/multiqc_data_1
[1;30m[INFO   ][0m         multiqc : MultiQC complete


## Map fastq file names to sample names

In [4]:
# key to map fastq names to output names
# the sample key file is read as tab-delimited text: one header line, then
# one "seqID<TAB>sampleName" row per sample
seqID_to_sampleName = {}
# sample_key_file = "/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/SampleKey-18146-36.txt"
sample_key_file = "/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/SampleKey-18146-35.txt"
with open(sample_key_file, 'r') as sample_key:
    sample_key.readline() # skip header
    for line in sample_key:
        line = line.rstrip()
        if not line:
            # tolerate blank lines (e.g. an empty last line) instead of failing on the unpack below
            continue
        seqID, sampleName = line.split("\t")
        # use "-" inside sample names so that "_" stays the field delimiter of the fastq file
        # names (later cells recover the sample name with split("_")[0])
        sampleName = sampleName.replace("_","-")
        seqID_to_sampleName[seqID] = sampleName


## Trim read 2 adapters with cutadapt
Remove staggers, leaving the UMI+partial barcode sequence (19-26bp, may be shorter depending on quality trimming)

In [47]:
# directory holding the raw fastq files; every relative glob below is resolved against it
# working_dir="/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/"
working_dir="/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/"
os.chdir(working_dir)

# absolute paths of all read 2 files, in sorted order, as input for trimming
fastq_list = sorted(map(os.path.abspath, glob.glob("*R2_001.fastq.gz")))

In [49]:
# cutadapt parameters to trim read 2 (removes the staggers, leaving the UMI + partial barcode)
# 5' and 3' adapter sequences flanking the insert; joined as "5prime...3prime" for cutadapt -g
adapter_5prime = 'GGCCAGTTTAAACTT'
adapter_3prime = 'GCATGGC'

# number of CPU cores available to this process; passed to cutadapt via -j
num_of_cores = len(os.sched_getaffinity(0))
err = 0.2 # fraction tolerated for adapter matching
min_r2_length = 12 # R2 must contain at least UMI sequence and some of SphI linker
# name of the subdirectory (created next to the input fastq files) that receives trimmed output
output_dir_name = 'trimmed'

In [50]:
# store sample key in regex pattern
# IDs are escaped so they match literally, and longer IDs are tried first so that an ID
# which is a prefix of another one cannot shadow it in the alternation
pattern = regex.compile('|'.join(regex.escape(seq_id)
                                 for seq_id in sorted(seqID_to_sampleName, key=len, reverse=True)))

# trim read 2, filter untrimmed read pairs
for fastq_path in fastq_list:
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)
    
    # rename output files by sample key stored in seqID_to_sampleName 
    fastq_file = os.path.basename(fastq_path)
    output_file_r2 = pattern.sub(lambda x: seqID_to_sampleName[x.group()], fastq_file).replace("_001.fastq.gz", "_001_trimmed.fastq.gz")
    output_file_r1 = output_file_r2.replace("_R2_", "_R1_")

    print('Trimming: ' + fastq_path)
    
    # read 2 is given first so that adapter trimming and the filters apply to it;
    # read 1 is passed as the paired file and written alongside via -p
    cutadapt_cmd = ["cutadapt", "-g", adapter_5prime+"..."+adapter_3prime, #adapter_5prime, 
                    "-j", str(num_of_cores), 
                    "-e", str(err), 
                    #"-q", "20", # use -q for miseq/hiseq quality trimming
                    "--nextseq-trim", "20", # use this option for nextseq quality trimming
                    "--discard-untrimmed", "-m", str(min_r2_length), 
                    "--pair-filter=first", 
                    "-o", output_dir+output_file_r2, "-p", output_dir+output_file_r1,
                    fastq_path, fastq_path.replace("L001_R2_", "L001_R1_")]
    
    # check=True: raise CalledProcessError if cutadapt exits non-zero, rather than
    # silently continuing and reporting success with missing or incomplete output
    subprocess.run(cutadapt_cmd, check=True)

    print("Output files:")
    print(output_file_r2)
    print(output_file_r1)
    print()
    
print('Done trimming!')

Trimming: /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/18146FL-35-01V1-01_S1_L001_R2_001.fastq.gz
Output files:
BYA-t1-1_S1_L001_R2_001_trimmed.fastq.gz
BYA-t1-1_S1_L001_R1_001_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/18146FL-35-01V1-02_S2_L001_R2_001.fastq.gz
Output files:
BYA-t1-2_S2_L001_R2_001_trimmed.fastq.gz
BYA-t1-2_S2_L001_R1_001_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/18146FL-35-01V1-03_S3_L001_R2_001.fastq.gz
Output files:
BYA-t1-3_S3_L001_R2_001_trimmed.fastq.gz
BYA-t1-3_S3_L001_R1_001_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/18146FL-35-01V1-04_S4_L001_R2_001.fastq.gz
Output files:
BYA-t4-1_S4_L001_R2_001_trimmed.fastq.gz
BYA-t4-1_S4_L001_R1_001_trimmed.fastq.gz

Trimming: /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/18146FL-35-01V1-05_S5_L001_R2_001.fastq.gz
Output files:
BYA-t4-2_S5_L001_R2_001_trimmed.fastq.gz
BYA-t4-

## Merge read 1 and read 2 with FLASH to produce final barcode+UMI sequence

In [8]:
# directory holding the trimmed fastq files; the glob below is resolved against it
# working_dir="/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/trimmed/"
working_dir="/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/"
os.chdir(working_dir)

# absolute paths of the trimmed read 1 files, in sorted order
# (the matching read 2 path is derived from each of these at merge time)
fastq_list = sorted(map(os.path.abspath, glob.glob("*R1_001_trimmed.fastq.gz")))

In [9]:
# FLASH parameters (each value is passed to the flash command line in the merge cell)
min_overlap = 8 # passed as -m; cannot be longer than the shorter read.
max_overlap = 12 # passed as -M; cannot be longer than the shorter read.
max_mismatch = 0.25 # passed as -x; default is 0.25, set 0.4 to tolerate lower seq quality
num_of_cores = len(os.sched_getaffinity(0)) # passed as -t
output_dir_name = 'merged' # subdirectory of the trimmed fastq directory that receives FLASH output (-d)

In [10]:
# use FLASH to merge trimmed-filtered read 2 and read 1 data to produce final 27bp sequence containing barcode and UMI data
for fastq_path in fastq_list:
    fastq_dir = os.path.dirname(fastq_path)
    output_dir = fastq_dir + "/"+output_dir_name+"/"
    os.makedirs(output_dir, exist_ok=True)
    
    # check output file naming 
    # the sample name is the text before the first "_" of the file name
    output_prefix = os.path.basename(fastq_path).split("_")[0]+"_barcode"
    print('Merging', fastq_path, 'and', fastq_path.replace("_R1_", "_R2_"))
    
    flash_cmd = ["flash", "-m", str(min_overlap), "-M", str(max_overlap),
                 "-x", str(max_mismatch), #"-O", # use -O if innie-only merging does not work
                 "-t", str(num_of_cores),
                 "-o", output_prefix, "-d", output_dir, 
                 "--compress", 
                 fastq_path, fastq_path.replace("_R1_", "_R2_")]
    # check=True: raise CalledProcessError if FLASH exits non-zero, rather than
    # reporting the sample as merged when no usable output was produced
    subprocess.run(flash_cmd, check=True)
    print(output_prefix, "merged")

print('Done merging!')

Merging /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-1_S1_L001_R1_001_trimmed.fastq.gz and /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-1_S1_L001_R2_001_trimmed.fastq.gz
BYA-t1-1_barcode merged
Merging /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-2_S2_L001_R1_001_trimmed.fastq.gz and /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-2_S2_L001_R2_001_trimmed.fastq.gz
BYA-t1-2_barcode merged
Merging /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-3_S3_L001_R1_001_trimmed.fastq.gz and /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t1-3_S3_L001_R2_001_trimmed.fastq.gz
BYA-t1-3_barcode merged
Merging /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t10-1_S10_L001_R1_001_trimmed.fastq.gz and /scratch/users/rang/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/BYA-t10-1_S10_L001_R2_001_trimmed.fastq.gz
BYA-t10-1_barcode merged
Mer

## (optional) Downsample reads for analysis
Use seqtk in command line to downsample fastq files prior to assembling counts matrix.<br>
e.g. seqtk sample -s100 read1.fq 10000 > sub1.fq

## Count barcodes
Counting barcodes consists of several steps. First, parse each fastq file and count all sequences. After assembling these into an initial sequence counts matrix, extract the barcode and UMI sequences and map them to a reference table of barcodes and UMIs. Counts for identifiable sequences are consolidated into a final counts matrix for input to DESeq2.

In [11]:
def count_seqs(fastq_file, min_seq_length, max_seq_length):
    '''
    Tallies the reads of a gzip-compressed fastq file by sequence.
    Only reads whose length lies within [min_seq_length, max_seq_length] and
    which contain no "N" are counted.
    Returns a dict mapping sequence string -> number of occurrences.
    '''
    tally = {}
    with gzip.open(fastq_file, 'rt') as handle:
        for record in SeqIO.parse(handle, "fastq"):
            n_bases = len(record.seq)
            # skip reads outside the accepted length window
            if n_bases < min_seq_length or n_bases > max_seq_length:
                continue
            # skip reads with ambiguous base calls
            if record.seq.count("N") != 0:
                continue
            key = str(record.seq)
            tally[key] = tally.get(key, 0) + 1

    return tally

    
def map_seq_to_barcode_umi(seq, barcode_table, umi_list, barcode_length, umi_length, linker_seq):
    '''
    Assigns an ID to a merged read sequence.
    The sequence is split into barcode and UMI, the barcode is looked up in
    barcode_table and, when umi_list is not empty, the UMI is looked up in umi_list.
    Returns "<barcode ID>-<UMI ID>" (or just the barcode ID when umi_list is empty),
    or 'UNKNOWN' if either part is too short or cannot be identified.
    '''
    barcode, umi = split_barcode_umi_from_seq(seq, barcode_length, umi_length, linker_seq)

    # each part must cover at least half of its expected length
    barcode_too_short = len(barcode) < barcode_length/2
    umi_too_short = len(umi) < umi_length/2
    if barcode_too_short or umi_too_short:
        return 'UNKNOWN'

    # barcode lookup (one substitution tolerated)
    barcode_id = assign_barcode(barcode=barcode, barcode_table=barcode_table, error=1)
    if barcode_id == 'UNKNOWN':
        return barcode_id

    # no UMI reference given: the barcode ID alone is the final ID
    if len(umi_list) == 0:
        return barcode_id

    # UMI lookup (one substitution tolerated)
    umi_id = assign_umi(umi=umi, umi_list=umi_list, error=1)
    if umi_id == 'UNKNOWN':
        return umi_id

    return '-'.join([barcode_id, str(umi_id)])
    

def split_barcode_umi_from_seq(seq, barcode_length, umi_length, linker_seq):
    '''
    Returns (barcode, umi) extracted from seq.
    When linker_seq occurs exactly once, the text before it is the barcode and the
    text after it is the UMI. Otherwise (linker missing, repeated, or empty) the
    sequence is cut by base position instead.
    '''
    try:
        pieces = seq.split(linker_seq)
    except ValueError:
        # str.split rejects an empty separator; treat it like "linker not found"
        pieces = []

    if len(pieces) == 2:
        return (pieces[0], pieces[1])

    # positional fallback
    if barcode_length > 0 and umi_length == 0:
        # barcode-only layout: no UMI to extract
        return (seq[:barcode_length], '')

    # UMI is the tail of the read; the barcode is whatever precedes linker + UMI
    # (may be a partial barcode for short sequences)
    tail_length = umi_length + len(linker_seq)
    return (seq[:-tail_length], seq[-umi_length:])


def assign_barcode(barcode, barcode_table, error):
    '''
    Searches barcode table for barcode sequence and returns unique barcode ID
    Tries perfect match first, then error-tolerant regex

    barcode: observed barcode sequence (may be partial)
    barcode_table: DataFrame indexed by reference barcode sequence, with a 'Unique_ID' column
    error: maximum number of substitutions tolerated in the fuzzy search
    Returns the 'Unique_ID' of the matching reference barcode, or 'UNKNOWN' when the
    barcode is empty or does not match exactly one reference barcode.
    '''
    if barcode == '':
        return 'UNKNOWN'

    try:
        # search for perfect match
        return barcode_table.loc[barcode, 'Unique_ID']
    except KeyError:
        pass

    # search by error-tolerant regex; compiled once and reused for every reference barcode
    fuzzy_pattern = regex.compile("(?:"+barcode+"){s<="+str(error)+"}")
    search = [bool(fuzzy_pattern.search(x)) for x in barcode_table.index]
    if sum(search) != 1:
        # no match, or ambiguous match to several reference barcodes
        return 'UNKNOWN'

    # .iloc[0] takes the single hit by position; plain [0] on a label-indexed Series
    # relies on deprecated positional fallback
    return barcode_table.loc[search, 'Unique_ID'].iloc[0]


def assign_umi(umi, umi_list, error):
    '''
    Looks up a UMI sequence in umi_list and returns its 1-based position as the UMI ID.
    An exact match is preferred; otherwise a regex search tolerating up to `error`
    substitutions is used, and accepted only if exactly one entry matches.
    Returns 'UNKNOWN' for an empty UMI or when no unique match exists.
    '''
    if umi == '':
        return 'UNKNOWN'

    # exact match
    if umi in umi_list:
        return umi_list.index(umi)+1

    # error-tolerant match: collect the 1-based positions of all matching entries
    pattern = "(?:"+umi+"){s<="+str(error)+"}"
    positions = [position for position, reference in enumerate(umi_list, start=1)
                 if regex.search(pattern, reference)]
    if len(positions) == 1:
        return positions[0]

    # UMI cannot be identified
    return 'UNKNOWN'


In [12]:
# directory holding the FLASH-merged fastq files; the glob below is resolved against it
# working_dir="/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/fastq/trimmed/merged/"
working_dir="/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/fastq/trimmed/merged/"
os.chdir(working_dir)

#output directory
# (file names are appended to it by plain string concatenation, so keep the trailing "/")
# output_dir = "/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/counts/"
output_dir = "/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/counts/"

# merged reads to count barcodes from
fastq_list = sorted(glob.glob("*extendedFrags.fastq.gz")) # check for file name
# sample names for each fastq
# (text before the first "_" of each file name; same order as fastq_list)
sample_name_list = [fastq_file.split("_")[0] for fastq_file in fastq_list] # adjust accordingly to generate sample name for counts matrix

# sequence counts file (before combining)
seq_counts_filename = "seq_counts.txt"

# mapped barcode-UMI counts file
barcode_counts_filename = "barcode_counts.txt"


# open barcode reference file
# the second column becomes the index; assign_barcode looks barcode sequences up in this
# index and reads the 'Unique_ID' column
barcode_reference_file = '/home/users/rang/crispey3/library_design/Input/12BP_PBCs_well_grouped.csv'
barcode_table = pd.read_csv(barcode_reference_file, index_col=1)

# approved list of UMIs used in cloning CRISPEY3 plasmid
umi_list = ['ACGCGTGAA',
            'ATGTGGCTC',
            'CAGAGGATC',
            'CTGTGGCAA',
            'GTGTGATTC',
            'TAGAGGACT',] # only first 6 UMIs were included in cloning CRISPEY3 libraries
#             'AAGAGCCTC',
#             'AAGAGGAGG',
#             'ATGTGCGAA',
#             'ATGTGTAGG',
#             'CAGAGCCAA',
#             'CTGTGATGG',
#             'CTGTGTATC',
#             'GAGAGGAAA',
#             'TCGCGGTAA',
#             'TTGTGCGTC']
# alphabetical order; UMI IDs are the 1-based positions in this sorted list (see assign_umi)
umi_list = sorted(umi_list)


In [13]:
# tally the sequences of every merged fastq file in parallel, one task per sample
fastq_dict = dict(zip(sample_name_list, fastq_list))
n_workers = min(len(os.sched_getaffinity(0)), len(fastq_list))
with mp.Pool(n_workers) as pool:
    # submit all tasks first, then collect the results while the pool is still open
    pending = {}
    for sample_name, fastq_file in fastq_dict.items():
        pending[sample_name] = pool.apply_async(count_seqs, (fastq_file, 20, 27))
    seq_counts_df = {sample_name : task.get() for sample_name, task in pending.items()}

# one column per sample, one row per distinct sequence
seq_counts_df = pd.DataFrame.from_dict(seq_counts_df, orient="columns")
seq_counts_df.index.name = 'sequence'

# save the raw sequence counts so they can be inspected
os.makedirs(output_dir, exist_ok=True)
seq_counts_df.to_csv(output_dir+seq_counts_filename, sep="\t")


In [13]:
# seq_counts_df = pd.read_csv(output_dir+seq_counts_filename, sep="\t", index_col='sequence')

In [14]:
# map each sequence in seq_counts_df to barcode-UMI ID
mapped_counts_df = seq_counts_df.copy()
mapped_counts_df = mapped_counts_df.reset_index()

# filter out low count barcodes (computationally expensive to map these rare barcodes, minimal impact to total count (~2%))
cutoff = 3
# total only the per-sample count columns: the string 'sequence' column must be left out
# of the row sum (recent pandas no longer drops non-numeric columns silently)
total_counts = mapped_counts_df.drop(columns='sequence').sum(axis=1)
# .copy() so that the ID column added below is written to an independent frame, not a slice
mapped_counts_df = mapped_counts_df.loc[total_counts>=cutoff].copy()

# do multiprocessing for sequence mapping
# (barcode length 12, UMI length 9, linker 'GCATGC'); results keep the order of the sequences
num_of_cores = len(os.sched_getaffinity(0))
with mp.Pool(num_of_cores) as pool:
    barcode_umi_id_list = [pool.apply_async(map_seq_to_barcode_umi, (seq, barcode_table, umi_list, 12, 9, 'GCATGC')) for seq in mapped_counts_df['sequence']]
    barcode_umi_id_list = [res.get() for res in barcode_umi_id_list]
    
mapped_counts_df['barcode_umi_id'] = barcode_umi_id_list
display(mapped_counts_df)

# remove unknowns
mapped_counts_df = mapped_counts_df.query('barcode_umi_id!="UNKNOWN"')

# consolidate counts
# drop the raw sequences first so that only numeric count columns are summed per ID
# (otherwise the sequence strings are summed too and the integer conversion fails)
mapped_counts_df = mapped_counts_df.drop(columns='sequence').groupby('barcode_umi_id').sum().fillna(0).astype(int)
mapped_counts_df.index.name = 'barcode'
display(mapped_counts_df)

# write all counts to output file
os.makedirs(output_dir, exist_ok=True)
mapped_counts_df.to_csv(output_dir+barcode_counts_filename, sep="\t")


# # one-liner to map sequences to barcode-umi IDs
# # warning: single-threaded, expected to be slow
# mapped_counts_df['barcode_umi_id'] = mapped_counts_df['sequence'].apply(map_seq_to_barcode_umi, args=(barcode_table, umi_list, 12, 9, 'GCATGC'))


Unnamed: 0,sequence,BYA-t1-1,BYA-t1-2,BYA-t1-3,BYA-t10-1,BYA-t10-2,BYA-t10-3,BYA-t13-1,BYA-t13-2,BYA-t13-3,BYA-t4-1,BYA-t4-2,BYA-t4-3,BYA-t7-1,BYA-t7-2,BYA-t7-3,barcode_umi_id
0,TATCGTCTTCATGCATGCACGCGTGAA,4302.0,5068.0,4616.0,4728.0,5121.0,5249.0,5065.0,5002.0,5508.0,4307.0,6222.0,4616.0,4677.0,4721.0,4993.0,054_037-1
1,AGCTGGACTCGTGCATGCCTGTGGCAA,12467.0,14109.0,13828.0,13657.0,14528.0,15180.0,13899.0,13459.0,15149.0,13201.0,17873.0,14381.0,13410.0,13931.0,15180.0,065_058-4
2,TATTAAGAGCGCGCATGCCAGAGGATC,9801.0,10695.0,10322.0,11307.0,11981.0,11757.0,11664.0,11432.0,12113.0,10351.0,14210.0,10831.0,10809.0,11083.0,11452.0,076_066-3
3,CTCCTTGTTCGCGCATGCGTGTGATTC,216.0,284.0,245.0,176.0,143.0,265.0,116.0,133.0,250.0,221.0,310.0,230.0,187.0,214.0,301.0,048_064-5
4,GCCGAAACTAGAGCATGCCAGAGGATC,12874.0,14597.0,14325.0,13189.0,13620.0,12758.0,12596.0,12369.0,12304.0,13450.0,18026.0,14225.0,13128.0,13413.0,14133.0,035_004-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424829,TCGCTCACAAACGCACGCATGTGGCTC,,,,,,,,,,,,,,,3.0,040_038-2
2425244,TCCGTGTCGGAAATTACCCC,,,,,,,,,,,,,,,3.0,UNKNOWN
2425708,ATCGGTATCATTGGCTTAGA,,,,,,,,,,,,,,,4.0,UNKNOWN
2426570,GTCGCACGGTGACGGAGGAA,,,,,,,,,,,,,,,3.0,UNKNOWN


Unnamed: 0_level_0,BYA-t1-1,BYA-t1-2,BYA-t1-3,BYA-t10-1,BYA-t10-2,BYA-t10-3,BYA-t13-1,BYA-t13-2,BYA-t13-3,BYA-t4-1,BYA-t4-2,BYA-t4-3,BYA-t7-1,BYA-t7-2,BYA-t7-3
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001_022-6,5,4,5,34,20,21,37,18,28,13,4,13,10,17,20
001_045-6,7,6,2,0,0,0,0,0,0,2,2,1,2,6,0
001_119-6,7,0,2,0,0,0,0,0,0,2,2,4,0,0,2
002_017-2,1,10,5,0,1,2,0,0,0,2,11,6,0,6,3
002_019-6,2,3,2,0,0,3,0,0,0,0,0,5,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ladder_042-3,0,1,2,0,0,0,0,0,0,0,0,1,0,0,0
Ladder_043-1,19,9,11,3,11,3,0,7,2,20,1,13,5,14,12
Ladder_043-2,8,10,11,0,2,1,0,3,0,4,12,9,5,13,0
Ladder_043-3,9,13,10,3,4,9,8,3,8,4,8,5,17,5,10


## (optional) Combine counts across UMIs per barcode
The counts of different UMIs of the same barcode can be added together to produce a stacked counts matrix

In [15]:
# combine counts from different UMIs of the same barcode
stacked_counts_filename = "stacked_barcode_counts.txt"

# IDs are built as "<barcode ID>-<UMI ID>", so strip only the last "-"-separated field;
# this keeps barcode IDs intact even if they contain "-" themselves
stacked_counts_df = mapped_counts_df.groupby(by=lambda x: x.rsplit('-', 1)[0]).sum()
stacked_counts_df.index.name = 'barcode'
stacked_counts_df.to_csv(output_dir+stacked_counts_filename, sep="\t")

In [2]:
# # adapt ladder fastq files by trimming to new seq format (R1: 27bp, R2: 49bp)
# r1_len = 27
# r2_len = 49

# # working directory with fastq files
# working_dir="/home/users/rang/scratch/yeast/tmp/ladder_pilot_adapted/fastq/"
# os.chdir(working_dir)

# # trim R1
# fastq_list = sorted([os.path.abspath(x) for x in glob.glob("*R1_001.fastq.gz")])
# for fastq_path in fastq_list:
#     output_path = fastq_path.replace("_R1_001", "_R1_adapted")
#     cutadapt_cmd = ["cutadapt", "-l", str(r1_len),
#                     "-o", output_path, fastq_path]
#     subprocess.run(cutadapt_cmd)
    
# # trim R2
# fastq_list = sorted([os.path.abspath(x) for x in glob.glob("*R2_001.fastq.gz")])
# for fastq_path in fastq_list:
#     output_path = fastq_path.replace("_R2_001", "_R2_adapted")
#     cutadapt_cmd = ["cutadapt", "-l", str(r2_len),
#                     "-o", output_path, fastq_path]
#     subprocess.run(cutadapt_cmd)

In [None]:
# # one-step count_barcodes function
# # warning: less efficient since barcode-UMI mapping is done per sample, rather than a single time after all sequences are counted

# def count_barcodes(fastq_file, barcode_table, umi_list, 
#                    min_seq_length=20, barcode_length=12, umi_length=9, linker_seq='GCATGC'):
#     '''
#     parses fastq file to count sequences, then extracts barcode-UMI info from sequences and assigns ID based
#     on provided reference barcode_table and umi_list. Finally, consolidates counts by assigned ID and returns
#     a dict of barcode counts.
#     does NOT do UMI mapping if umi_list is empty.
#     '''
#     # set max_seq_length
#     max_seq_length = barcode_length+umi_length+len(linker_seq) # this could be adjusted to allow insertions
#     # alphabetical sort umi_list
#     umi_list = sorted(umi_list)
    
#     # count raw sequences
#     seq_counts = count_seqs(fastq_file, min_seq_length, max_seq_length)
    
#     # consolidate barcode counts
#     barcode_counts_dict = {}
#     for seq, count in seq_counts.items():
#         # assign barcode-UMI ID
#         assigned_id = map_seq_to_barcode_umi(seq, barcode_table, umi_list, barcode_length, umi_length, linker_seq)
#         if assigned_id:
#             try:
#                 barcode_counts_dict[assigned_id] += count
#             except KeyError:
#                 barcode_counts_dict[assigned_id] = count
    
#     return barcode_counts_dict


# fastq_dict = dict(zip(sample_name_list, fastq_list))
# with mp.Pool(min(len(os.sched_getaffinity(0)), len(fastq_list))) as pool:
#     all_counts_df = {sample_name : pool.apply_async(count_barcodes, (fastq_file, barcode_table, umi_list)) for sample_name, fastq_file in fastq_dict.items()}
#     all_counts_df = {sample_name : res.get() for sample_name, res in all_counts_df.items()}
    
# # write all counts to output file
# os.makedirs(output_dir, exist_ok=True)
# all_counts_df = pd.DataFrame.from_dict(all_counts_df, orient="columns")
# all_counts_df.index.name = 'barcode'
# all_counts_df.to_csv(output_dir+counts_filename, sep="\t")