# nf-core ATAC-seq pipeline 
20240711 

Sarah Fong
# How to 
1. Set up environment for running nf-core atac-seq pipeline
2. Run nf-core ATAC-seq pipeline using parallelized jobs on Wynton
   
See nf-core documentation: https://nf-co.re/atacseq/2.1.2/

# Setting up environment

1. Add miniconda into your .bashrc (do once)

        /wynton/group/ahituv/bin/miniconda3/bin/conda init bash

2. source the new bashrc

        source ~/.bashrc 

3. create nf core virtual environment

       conda create -n env_nf -f /wynton/group/ahituv/bin/pipelines/env/env_nf.yml

4. activate the environment

       conda activate env_nf

5. make qsub output dir
    
        mkdir $HOME/qsub_log/

# Run nf-core ATAC-seq pipeline

## Make sample.csv
   
    2. headers: sample,fastq_1,fastq_2,replicate
        3. sample: sample label (e.g. control/treated)
        4. fastq_1: PE file 1
        5. fastq_2: PE file 2

    example:

            sample,fastq_1,fastq_2,replicate
            control,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl1/ctrl1_CKDL240018717-1A_HHC5NDSXC
            _L2_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl1/ctrl1_CKDL240018717-1A_HHC5N
            DSXC_L2_2.fq.gz,1
            control,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl3/ctrl3_CKDL240018718-1A_HHC5NDSXC
            _L2_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl3/ctrl3_CKDL240018718-1A_HHC5N
            DSXC_L2_2.fq.gz,2
            control,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl4/ctrl4_CKDL240018719-1A_HHC5NDSXC
            _L2_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./ctrl4/ctrl4_CKDL240018719-1A_HHC5N
            DSXC_L2_2.fq.gz,3
            PEMF,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF1/PEMF1_CKDL240018720-1A_HHC5NDSXC_L2
            _1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF1/PEMF1_CKDL240018720-1A_HHC5NDSX
            C_L2_2.fq.gz,1
            PEMF,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF2/PEMF2_CKDL240018721-1A_HHC5NDSXC_L2
            _1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF2/PEMF2_CKDL240018721-1A_HHC5NDSX
            C_L2_2.fq.gz,2
            PEMF,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF3/PEMF3_CKDL240018722-1A_HHC5NDSXC_L2
            _1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF3/PEMF3_CKDL240018722-1A_HHC5NDSX
            C_L2_2.fq.gz,3
            PEMF,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF4/PEMF4_CKDL240018723-1A_HHC5NDSXC_L2
            _1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/./PEMF4/PEMF4_CKDL240018723-1A_HHC5NDSX
            C_L2_2.fq.gz,4
       

##### Save this in a directory that you can find again later!!

## Fill in variables — sample_file, outdir
- Run parallelized SGE pipeline with nfcore-atac-wynton.sh 
- NOTE: If your dataset is larger than 8 samples you will need to increase the L_RT time variable 
- For more info on parallel job submission, see https://wynton.ucsf.edu/hpc/scheduler/submit-jobs.html

In [1]:
# Input/output files
SAMPLE_FILE =  # <<FULL_PATH_TO_SAMPLE_FILE>>
# e.g. SAMPLE_FILE = "/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/samples.csv"

OUTDIR =  # <<FULL_PATH_TO_OUTPUT_DIRECTORY>>
# e.g. OUTDIR = "/wynton/group/ahituv/fongsl/projects/EMF/results/ATAC"


# Sequencing params
READ_LEN = 150  # PE is 150bp


# Wynton params
N_SLOTS = 8  # n slots to run in parallel 
MEM_PER_SLOT = 8  # Gb per slot. If 8 slots at 8G/slot = 64G total
SH_QSUB_SCRIPT = "/wynton/group/ahituv/bin/pipelines/nf-core-atacseq-dev/nfcore-atac-wynton.sh"  # custom script to run nf-core ATAC-seq pipeline
H_RT=08:20:00   # amount of time, HH:MM:SS


# put the command together. 
cmd = f"qsub -pe smp {N_SLOTS} -l mem_free={MEM_PER_SLOT}G -l h_rt={H_RT} {SH_QSUB_SCRIPT} {SAMPLE_FILE} {OUTDIR} {READ_LEN}"

# the pipeline command
print(cmd)

qsub -pe smp 4 -l mem_free=8G nfcore-atac-wynton.sh /wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/samples.csv  /wynton/group/ahituv/fongsl/projects/EMF/results/ATAC 150


### e.g. 

    qsub -pe smp 4 -l mem_free=8G nfcore-atac-wynton.sh /wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/samples.csv  /wynton/group/ahituv/fongsl/projects/EMF/results/ATAC 150

### copy the command above and run in command line. 

if increasing time argument