# Launch Association sequencing on Wynton

In [1]:
import argparse
import json
import os
import sys

# MPRAflow user inputs

## global, association, and count parameters

In [11]:
###
# global values
###

# Parse the run configuration from the command line.
# NOTE(review): this cell reads sys.argv, so it is presumably meant to be run
# as an exported script. Inside a live Jupyter kernel sys.argv usually carries
# the kernel's own `-f <connection_file>` flag, which would collide with
# `-f/--min_frac` below -- confirm how this notebook is executed.
parser = argparse.ArgumentParser()

# Required arguments. Every downstream path is built from these values, so a
# missing one is reported here by argparse instead of surfacing later as a
# TypeError in os.path.join or as a literal "None" inside a file name.
parser.add_argument('--experiment_name', type=str, required=True,
                    help="name of MPRA experiment")
parser.add_argument('--data_path', type=str, required=True,
                    help="base path where fastq files will be stored")
parser.add_argument('--design_fasta', type=str, required=True,
                    help="name of design.fasta file")
parser.add_argument('--fastq_prefix', type=str, required=True,
                    help="prefix for fastq files")

# Optional arguments. They are kept as strings because they are only ever
# written to the json config and spliced into command lines.
parser.add_argument('-f', '--min_frac', type=str, default="0.7",
                    help="min fraction of overlap with library during bwa alignment in association sequencing")
parser.add_argument('-mq', '--mapq', type=str, default="0",
                    help="min mapq quality during association sequencing")
parser.add_argument('-bc', '--bc_threshold', type=str, default="10",
                    help="min barcode threshold during count sequencing")
parser.add_argument('-bcl', '--bc_length', type=str, default="15",
                    help="barcode length")

args = parser.parse_args()

# required values
EXP_NAME = args.experiment_name
DATAPATH = args.data_path
DESIGN_FILENAME = args.design_fasta
FASTQ_PREFIX = args.fastq_prefix

# optional values
MIN_FRAC = args.min_frac
MAPQ = args.mapq
MIN_BC_THRESH = args.bc_threshold
BC_LEN = args.bc_length

# run a test? (not exposed on the command line)
TEST = False

# Example values from the hard-coded configuration this cell replaced:
#
#   EXP_NAME = "null_mpra"
#   DATAPATH = "/wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/"
#   DESIGN_FILENAME = "15mer.fo.pam.scaffold.ext200.library.TWIST.fa"
#   FASTQ_PREFIX = "SF_asso_S1"
#   TEST = False
#
#   # Association parameters
#   MIN_FRAC = "0.7"
#   MAPQ = "0"  # 3
#
#   # Count parameters
#   BC_LEN = 15
#   MIN_BC_THRESH = 10  # min 10 barcodes

NameError: name 'argparse' is not defined

## paths

In [None]:
###
# paths
###

# MPRAflow checkout that holds the nextflow pipelines
SRC = "/wynton/group/ahituv/MPRAflow/"  # change me from "/wynton/group/ahituv/fongsl/src/MPRAflow/"

# library / association / count folders all live directly under DATAPATH
LIBRARY_DIR, ASSOC_DIR, COUNT_DIR = (
    os.path.join(DATAPATH, subdir) for subdir in ("library", "assoc", "count")
)

###
# json config (written next to the data folders)
###
JSON = os.path.join(DATAPATH, 'config.json')

###
# design file (looked up inside the library folder)
###
DESIGN = os.path.join(LIBRARY_DIR, DESIGN_FILENAME)  # change me

In [None]:
###
# Association sequencing fastq files
###

# Remember the association directory before any test redirect: the barcode
# dictionary below is always looked up under this directory. The previous
# `ASSOC_DIR.strip("test")` removed any leading/trailing 't', 'e', 's'
# characters (str.strip takes a character set, not a suffix), which could
# mangle unrelated paths.
ASSOC_BASE_DIR = ASSOC_DIR

if TEST is True:
    ASSOC_DIR = os.path.join(ASSOC_BASE_DIR, "test")
    FASTQ_PREFIX = "test"

R1 = os.path.join(ASSOC_DIR, f"{FASTQ_PREFIX}_R1_001.fastq.gz")
R2 = os.path.join(ASSOC_DIR, f"{FASTQ_PREFIX}_R4_001.fastq.gz")
# note: SF_asso_S1_R3_001.fastq.gz is common i5 GCCAGCGCCG
BC = os.path.join(ASSOC_DIR, f"{FASTQ_PREFIX}_R2_001.fastq.gz")


###
# Count function
###

if TEST is True:
    COUNT_DIR = os.path.join(COUNT_DIR, "test")  # path to DNA, RNA fastq files

EXP_CSV = os.path.join(COUNT_DIR, "experiment.csv")  # experiment file

# sequence-barcode dictionary, read from the non-test association directory
BARCODE_PATH = ASSOC_BASE_DIR
BARCODE_DICT = os.path.join(BARCODE_PATH,
                            'MPRAflow_results',
                            f"{EXP_NAME}_filtered_coords_to_barcodes.pickle")

# json config 

In [5]:
# json config: one section of global settings plus one section per pipeline

# settings shared by every step
global_dict = dict(
    exp_name=EXP_NAME,
    path=DATAPATH,
    library_path=LIBRARY_DIR,
    design_fa=DESIGN,
    src_scripts=SRC,
    test=TEST,
)

# settings for the association step
assoc_dict = dict(
    path=ASSOC_DIR,
    fastq_prefix=FASTQ_PREFIX,
    ins=R1,
    inspe=R2,
    design_fa=DESIGN,
    src=SRC,
    min_frac=str(MIN_FRAC),
    mapq=str(MAPQ),
)

# settings for the count step
count_dict = dict(
    path=COUNT_DIR,
    exp_csv=EXP_CSV,
    barcode_path=BARCODE_PATH,
    barcode_dict=BARCODE_DICT,
    barcode_len=str(BC_LEN),
    design_fa=DESIGN,
    src=SRC,
    threshold=MIN_BC_THRESH,
)

# test runs are stored under "<section>_test" keys, real runs under "<section>"
section_suffix = "_test" if TEST is True else ""
jsondata = {
    "global": global_dict,
    f"assoc_seq{section_suffix}": assoc_dict,
    f"count_seq{section_suffix}": count_dict,
}


# main

# Command line functions - nextflow 

In [9]:
def assocCmd(src, ASSOC_DIR, name, insert_fq, insert_pe_fq, design_fa, bc_fq, min_frac=0.7, mapq=0, qsub=False,
             qsub_script="/wynton/home/ahituv/fongsl/nullomers/bin-lock/assoc.sh",
             qsub_resources="-pe smp 6 -l mem_free=12G"):
    r"""Build (and print) the command line that launches association sequencing.

    Two forms are produced:

    * ``qsub=True``: a ``qsub`` submission of ``qsub_script`` with the eight
      values below passed positionally ($1..$8).
    * ``qsub=False``: ``cd`` into ``src`` followed by a direct nextflow call:

        nextflow run association.nf
            --w $1 \
            --name $2 \
            --fastq-insert $3 \
            --fastq-insertPE $4 \
            --design $5 \
            --fastq-bc $6 \
            --min-frac $7 \
            --mapq $8

    Parameters
    ----------
    src : str
        Directory to ``cd`` into before running nextflow (unused when qsub=True).
    ASSOC_DIR : str
        Value passed as ``--w`` / first positional argument.
    name : str
        Value passed as ``--name``.
    insert_fq, insert_pe_fq : str
        Values passed as ``--fastq-insert`` and ``--fastq-insertPE``.
    design_fa : str
        Value passed as ``--design``.
    bc_fq : str
        Value passed as ``--fastq-bc``.
    min_frac, mapq :
        Values passed as ``--min-frac`` and ``--mapq``; converted with str().
    qsub : bool
        Any truthy value selects the qsub form.
    qsub_script : str
        Script submitted by qsub (site specific -- change for your setup).
    qsub_resources : str
        Resource flags placed between ``qsub`` and the script; may be empty.

    Returns
    -------
    str
        The command line. It is only printed and returned, never executed here.
    """
    if qsub:
        # skip an empty resource string so the command has no double space
        launcher = " ".join(part for part in ("qsub", qsub_resources, qsub_script) if part)
        parts = [
            launcher,
            ASSOC_DIR,
            name,
            insert_fq,
            insert_pe_fq,
            design_fa,
            bc_fq,
            str(min_frac),
            str(mapq),
        ]
    else:
        parts = [
            "cd", src,
            "\n",
            "nextflow run association.nf",
            "--w", ASSOC_DIR,
            "--name", name,
            "--fastq-insert", insert_fq,
            "--fastq-insertPE", insert_pe_fq,
            "--design", design_fa,
            "--fastq-bc", bc_fq,
            "--min-frac", str(min_frac),  # no leading space: join() adds the separator
            "--mapq", str(mapq),
        ]

    cmd = " ".join(parts)
    print("REMEMBER TO ACTIVATE YOUR MPRAflow CONDA ENVIRONMENT!\n\n", cmd)
    return cmd


def countCmd(src, count_dir, exp_csv, design_fa, barcode_dict, bc_len, name, bc_thresh=10, qsub=False,
             qsub_script="/wynton/home/ahituv/fongsl/nullomers/bin-lock/count.sh",
             qsub_resources="-pe smp 6 -l mem_free=10G"):
    r"""Build (and print) the command line that launches count sequencing.

    Two forms are produced:

    * ``qsub=True``: a ``qsub`` submission of ``qsub_script`` with the seven
      values below passed positionally ($1..$7).
    * ``qsub=False``: ``cd`` into ``src`` followed by a direct nextflow call:

        nextflow run count.nf \
            --dir $1 \
            --e  $2 \
            --design $3 \
            --association $4 \
            --bc-length $5 \
            --thresh $6 \
            --name $7

    Parameters
    ----------
    src : str
        Directory to ``cd`` into before running nextflow (unused when qsub=True).
    count_dir : str
        Value passed as ``--dir``.
    exp_csv : str
        Value passed as ``--e``.
    design_fa : str
        Value passed as ``--design``.
    barcode_dict : str
        Value passed as ``--association``.
    bc_len :
        Value passed as ``--bc-length``; converted with str().
    name : str
        Value passed as ``--name``.
    bc_thresh :
        Value passed as ``--thresh`` (default 10); converted with str().
    qsub : bool
        Any truthy value selects the qsub form.
    qsub_script : str
        Script submitted by qsub (site specific -- change for your setup).
    qsub_resources : str
        Resource flags placed between ``qsub`` and the script; may be empty.

    Returns
    -------
    str
        The command line. It is only printed and returned, never executed here.
    """
    if qsub:
        # skip an empty resource string so the command has no double space
        launcher = " ".join(part for part in ("qsub", qsub_resources, qsub_script) if part)
        parts = [
            launcher,
            count_dir,
            exp_csv,
            design_fa,
            barcode_dict,
            str(bc_len),
            str(bc_thresh),
            name,
        ]
    else:
        parts = [
            "cd", src,
            "\n",
            "nextflow run count.nf",
            "--dir", count_dir,
            "--e", exp_csv,
            "--design", design_fa,
            "--association", barcode_dict,
            "--bc-length", str(bc_len),
            # "--umi-length", umi_length  (not wired up)
            "--thresh", str(bc_thresh),
            "--name", name,
        ]

    cmd = " ".join(parts)
    print("REMEMBER TO ACTIVATE YOUR MPRAflow CONDA ENVIRONMENT!\n\n", cmd)
    return cmd

## check that paths exist

In [None]:
# Create the data, association and count directories when they are missing.
# makedirs also creates missing parents (e.g. "assoc/" for "assoc/test"), and
# real failures such as permission errors now raise instead of being reported
# as "path already exists".
for path in [DATAPATH, ASSOC_DIR, COUNT_DIR]:
    if os.path.isdir(path):
        print("path already exists", path)
    else:
        os.makedirs(path, exist_ok=True)

## association and count sequencing commands

In [10]:
# Build both launch commands and store them in the json config under "runs".
cmds = {}

cmds["assoc_cmd"] = assocCmd(SRC, ASSOC_DIR, EXP_NAME, R1, R2, DESIGN, BC, MIN_FRAC, MAPQ, qsub=True)

# Pass the user's barcode threshold explicitly; without it countCmd falls back
# to its default of 10 and --bc_threshold is silently ignored.
cmds["count_cmd"] = countCmd(SRC, COUNT_DIR, EXP_CSV, DESIGN, BARCODE_DICT, BC_LEN, EXP_NAME,
                             bc_thresh=MIN_BC_THRESH, qsub=True)

jsondata["runs"] = cmds

REMEMBER TO ACTIVATE YOUR MPRAflow CONDA ENVIRONMENT!

 qsub /wynton/home/ahituv/fongsl/nullomers/bin-lock/assoc.sh /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20230815_nullomer_MPRA_assoc null_mpra /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20230815_nullomer_MPRA_assoc/SF_asso_S1_R1_001.fastq.gz /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20230815_nullomer_MPRA_assoc/SF_asso_S1_R4_001.fastq.gz /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/library/15mer.fo.pam.scaffold.ext200.library.TWIST.fa /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20230815_nullomer_MPRA_assoc/SF_asso_S1_R2_001.fastq.gz 0.7 0
REMEMBER TO ACTIVATE YOUR MPRAflow CONDA ENVIRONMENT!

 qsub -pe smp 6 -l mem_free=10G /wynton/home/ahituv/fongsl/nullomers/bin-lock/count.sh /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20240430_MPRA_sequencing /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/20240430_

## write json

In [8]:
# Serialize the assembled configuration and write it to the config path.
serialized_config = json.dumps(jsondata)
with open(JSON, 'w') as config_handle:
    config_handle.write(serialized_config)
print("wrote json", JSON)

wrote json /wynton/group/ahituv/fongsl/projects/CRISPR_nullomer_MPRA/data/config.json
